Auto-clean the board after every crawl (no manual cleanup clicks)
RunAsync now calls a new RunPostIngestCleanupAsync at the end of each crawl that produced results. The pass archives out-of-scope and duplicate listings, merges duplicate facilities, folds junk facilities, and backfills missing Tehran coordinates. Every step is in-place: reversible for listings, guarded for facilities, and pure DB + CPU (no AI or network calls), so it is cheap to run on every ingest. The cleanup counts are appended to the run-log detail. This keeps legacy and freshly arrived junk from accumulating without the admin having to click the cleanup buttons after each run.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -149,11 +149,16 @@ public class IngestionService
|
||||
|
||||
await DedupeTalentAsync(ct); // collapse same-ad reposts the exact-hash dedup can't catch
|
||||
|
||||
// Self-clean after every crawl so the board stays tidy with no manual admin clicks: archive
|
||||
// out-of-scope/duplicate listings, merge duplicate + fold junk facilities, backfill coords.
|
||||
var cleanup = results.Count > 0 ? await RunPostIngestCleanupAsync(ct) : default;
|
||||
|
||||
// Persist a run-log row so admins get a crawl history (with a per-source breakdown).
|
||||
if (results.Count > 0)
|
||||
{
|
||||
var detail = string.Join("؛ ", results.Select(r =>
|
||||
$"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}"));
|
||||
$"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}"))
|
||||
+ $" || پاکسازیِ خودکار: {cleanup.archived} بایگانی، {cleanup.dedupedJobs} استخدامِ تکراری، {cleanup.mergedFac} مرکزِ ادغام، {cleanup.cleanedFac} مرکزِ حذف، {cleanup.coords} مختصات";
|
||||
_db.IngestionRuns.Add(new IngestionRun
|
||||
{
|
||||
Fetched = summary.TotalFetched,
|
||||
@@ -336,6 +341,24 @@ public class IngestionService
|
||||
return filled;
|
||||
}
|
||||
|
||||
/// <summary>
/// Housekeeping pass that the crawl invokes automatically once ingestion has finished; it is public,
/// so it can also be triggered on demand. Three maintenance steps run strictly one after another:
/// <list type="number">
///   <item><description><c>PurgeInvalidAggregatedAsync</c> — archives out-of-scope/duplicate listings.</description></item>
///   <item><description><c>MergeAndCleanFacilitiesAsync</c> — merges duplicate facilities and folds junk ones.</description></item>
///   <item><description><c>BackfillCoordsAsync</c> — fills in missing Tehran map coordinates.</description></item>
/// </list>
/// The pass is designed to be in-place and safe to repeat: listings are archived rather than deleted
/// (so the step is reversible), employer/verified facilities are left untouched, and nothing here
/// calls AI or the network — only DB + CPU work — which keeps it cheap enough for every ingest and
/// spares the admin from clicking the manual cleanup buttons.
/// </summary>
/// <param name="ct">Cancellation token forwarded unchanged to each of the three cleanup steps.</param>
/// <returns>
/// The counters reported by the steps: <c>archived</c> and <c>dedupedJobs</c> from the listing purge,
/// <c>mergedFac</c> and <c>cleanedFac</c> from the facility merge/clean, and <c>coords</c> from the
/// coordinate backfill.
/// </returns>
public async Task<(int archived, int dedupedJobs, int mergedFac, int cleanedFac, int coords)>
    RunPostIngestCleanupAsync(CancellationToken ct = default)
{
    // Step 1: listings. Yields the archived count and the duplicate-job count.
    var (archivedListings, duplicateJobs) = await PurgeInvalidAggregatedAsync(ct);

    // Step 2: facilities. Yields the merged count and the cleaned (folded) count.
    var (mergedFacilities, cleanedFacilities) = await MergeAndCleanFacilitiesAsync(ct);

    // Step 3: map coordinates that were missing and could be filled in.
    var backfilledCoords = await BackfillCoordsAsync(ct);

    // One structured log line per pass so the cleanup totals are traceable alongside the crawl.
    _log.LogInformation(
        "Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C}",
        archivedListings,
        duplicateJobs,
        mergedFacilities,
        cleanedFacilities,
        backfilledCoords);

    return (
        archived: archivedListings,
        dedupedJobs: duplicateJobs,
        mergedFac: mergedFacilities,
        cleanedFac: cleanedFacilities,
        coords: backfilledCoords);
}
|
||||
|
||||
/// <summary>
|
||||
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each Open
|
||||
/// listing's stored text through the CURRENT validator and ARCHIVE (Status → Archived, not delete)
|
||||
|
||||
Reference in New Issue
Block a user