Auto-clean the board after every crawl (no manual cleanup clicks)
RunAsync now calls a new RunPostIngestCleanupAsync at the end of each crawl. It archives out-of-scope/duplicate listings, merges duplicate facilities, folds junk facilities, and backfills missing Tehran coordinates. Everything happens in place: listings are archived rather than deleted (so the step is reversible), facility changes are guarded, and the whole pass is pure DB + CPU (no AI, no network), so it is cheap to run on every ingest. The cleanup counts are appended to the run-log detail. This keeps both legacy and freshly arrived junk from accumulating without the admin having to click the cleanup buttons after each run.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -149,11 +149,16 @@ public class IngestionService
|
|||||||
|
|
||||||
await DedupeTalentAsync(ct); // collapse same-ad reposts the exact-hash dedup can't catch
|
await DedupeTalentAsync(ct); // collapse same-ad reposts the exact-hash dedup can't catch
|
||||||
|
|
||||||
|
// Self-clean after every crawl so the board stays tidy with no manual admin clicks: archive
|
||||||
|
// out-of-scope/duplicate listings, merge duplicate + fold junk facilities, backfill coords.
|
||||||
|
var cleanup = results.Count > 0 ? await RunPostIngestCleanupAsync(ct) : default;
|
||||||
|
|
||||||
// Persist a run-log row so admins get a crawl history (with a per-source breakdown).
|
// Persist a run-log row so admins get a crawl history (with a per-source breakdown).
|
||||||
if (results.Count > 0)
|
if (results.Count > 0)
|
||||||
{
|
{
|
||||||
var detail = string.Join("؛ ", results.Select(r =>
|
var detail = string.Join("؛ ", results.Select(r =>
|
||||||
$"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}"));
|
$"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}"))
|
||||||
|
+ $" || پاکسازیِ خودکار: {cleanup.archived} بایگانی، {cleanup.dedupedJobs} استخدامِ تکراری، {cleanup.mergedFac} مرکزِ ادغام، {cleanup.cleanedFac} مرکزِ حذف، {cleanup.coords} مختصات";
|
||||||
_db.IngestionRuns.Add(new IngestionRun
|
_db.IngestionRuns.Add(new IngestionRun
|
||||||
{
|
{
|
||||||
Fetched = summary.TotalFetched,
|
Fetched = summary.TotalFetched,
|
||||||
@@ -336,6 +341,24 @@ public class IngestionService
|
|||||||
return filled;
|
return filled;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Post-crawl housekeeping pass. It is invoked at the end of a crawl and can also be called on
/// demand. Three steps run strictly in sequence, each awaited before the next starts:
/// <list type="number">
///   <item><description><c>PurgeInvalidAggregatedAsync</c> — archives out-of-scope/duplicate listings
///   (archive, not delete, so it stays reversible).</description></item>
///   <item><description><c>MergeAndCleanFacilitiesAsync</c> — merges duplicate facilities and folds
///   junk ones (guarded: employer/verified facilities are left alone).</description></item>
///   <item><description><c>BackfillCoordsAsync</c> — fills in missing Tehran map coordinates.</description></item>
/// </list>
/// The pass is meant to be DB + CPU only (no AI, no network), so it is cheap enough to run after
/// every ingest and spares the admin from clicking the cleanup buttons by hand.
/// </summary>
/// <param name="ct">Cancellation token forwarded to each cleanup step.</param>
/// <returns>
/// The counters reported by the three steps, in order: archived listings, de-duplicated jobs,
/// merged facilities, cleaned facilities, and backfilled coordinates.
/// </returns>
public async Task<(int archived, int dedupedJobs, int mergedFac, int cleanedFac, int coords)>
    RunPostIngestCleanupAsync(CancellationToken ct = default)
{
    // Step 1: listings.
    var (archivedListings, duplicateJobs) = await PurgeInvalidAggregatedAsync(ct);

    // Step 2: facilities.
    var (mergedFacilities, cleanedFacilities) = await MergeAndCleanFacilitiesAsync(ct);

    // Step 3: map coordinates.
    var backfilledCoords = await BackfillCoordsAsync(ct);

    // One structured log line per pass; the same numbers are handed back to the caller.
    _log.LogInformation(
        "Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C}",
        archivedListings,
        duplicateJobs,
        mergedFacilities,
        cleanedFacilities,
        backfilledCoords);

    return (archivedListings, duplicateJobs, mergedFacilities, cleanedFacilities, backfilledCoords);
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each Open
|
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each Open
|
||||||
/// listing's stored text through the CURRENT validator and ARCHIVE (Status → Archived, not delete)
|
/// listing's stored text through the CURRENT validator and ARCHIVE (Status → Archived, not delete)
|
||||||
|
|||||||
Reference in New Issue
Block a user