From b48e7dbc653dca98ce8ae4cb36e4265ad1fc572e Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sun, 21 Jun 2026 13:19:11 +0330 Subject: [PATCH] Auto-clean the board after every crawl (no manual cleanup clicks) RunAsync now calls a new RunPostIngestCleanupAsync at the end of each crawl: archive out-of-scope/duplicate listings, merge duplicate + fold junk facilities, and backfill missing Tehran coords. All in-place, reversible for listings, guarded for facilities, and pure DB+CPU (no AI/network) so it is cheap to run every ingest. The cleanup counts are appended to the run-log detail. This keeps legacy + freshly-arrived junk from accumulating without the admin having to click the cleanup buttons after each run. Co-Authored-By: Claude Opus 4.8 --- .../Services/Scraping/IngestionService.cs | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs index d70e945..54b442a 100644 --- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs +++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs @@ -149,11 +149,16 @@ public class IngestionService await DedupeTalentAsync(ct); // collapse same-ad reposts the exact-hash dedup can't catch + // Self-clean after every crawl so the board stays tidy with no manual admin clicks: archive + // out-of-scope/duplicate listings, merge duplicate + fold junk facilities, backfill coords. + var cleanup = results.Count > 0 ? await RunPostIngestCleanupAsync(ct) : default; + // Persist a run-log row so admins get a crawl history (with a per-source breakdown). 
if (results.Count > 0) { var detail = string.Join("؛ ", results.Select(r => - $"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}")); + $"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}")) + + $" || پاک‌سازیِ خودکار: {cleanup.archived} بایگانی، {cleanup.dedupedJobs} استخدامِ تکراری، {cleanup.mergedFac} مرکزِ ادغام، {cleanup.cleanedFac} مرکزِ حذف، {cleanup.coords} مختصات"; _db.IngestionRuns.Add(new IngestionRun { Fetched = summary.TotalFetched, @@ -336,6 +341,24 @@ public class IngestionService return filled; } + /// <summary> + /// The self-cleaning pass run automatically at the end of every crawl (and available on demand): + /// archive out-of-scope/duplicate listings, merge duplicate + fold junk facilities, and backfill + /// missing Tehran map coords. All in-place — reversible (archive, not delete) for listings, guarded + /// (never touches employer/verified facilities) — and pure DB + CPU (no AI, no network), so it's + /// cheap to run on every ingest. Keeps the board tidy without the admin clicking the cleanup buttons. The three steps are awaited one after another (PurgeInvalidAggregatedAsync, MergeAndCleanFacilitiesAsync, BackfillCoordsAsync), each receiving the same cancellation token; their counts are logged at Information level and returned as the tuple (archived, dedupedJobs, mergedFac, cleanedFac, coords). + /// </summary> + public async Task<(int archived, int dedupedJobs, int mergedFac, int cleanedFac, int coords)> + RunPostIngestCleanupAsync(CancellationToken ct = default) + { + var (archived, dedupedJobs) = await PurgeInvalidAggregatedAsync(ct); + var (mergedFac, cleanedFac) = await MergeAndCleanFacilitiesAsync(ct); + var coords = await BackfillCoordsAsync(ct); + _log.LogInformation("Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C}", + archived, dedupedJobs, mergedFac, cleanedFac, coords); + return (archived, dedupedJobs, mergedFac, cleanedFac, coords); + } + /// /// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each Open /// listing's stored text through the CURRENT validator and ARCHIVE (Status → Archived, not delete)