From 8d0a403b366fad8d92edc32a1958704dc9d1e059 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sat, 20 Jun 2026 17:54:26 +0330 Subject: [PATCH] Near-duplicate applicant detection (collapse source reposts) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exact ContentHash dedup misses the same ad reposted with slightly different text (e.g. the ~18 repeated «کمک‌یار آقا»). DedupeTalentAsync collapses open aggregated applicants by two high-precision signals — identical phone, or identical (role, city, normalized description core with digits/«… پیش» time-phrases stripped) — keeping the newest of each group. Runs at the end of both RunAsync and ReprocessAsync; removed count surfaces in the run log. Improvement 1 of the data-quality/SEO backlog. Co-Authored-By: Claude Opus 4.8 --- .../Services/Scraping/IngestionService.cs | 52 +++++++++++++++++-- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs index 8984b3f..d4edf83 100644 --- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs +++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs @@ -147,6 +147,8 @@ public class IngestionService var summary = new IngestionSummary(results); + await DedupeTalentAsync(ct); // collapse same-ad reposts the exact-hash dedup can't catch + // Persist a run-log row so admins get a crawl history (with a per-source breakdown). 
if (results.Count > 0) { @@ -246,19 +248,59 @@ public class IngestionService } await _db.SaveChangesAsync(ct); + var deduped = await DedupeTalentAsync(ct); // collapse reposts the exact-hash dedup missed + _db.IngestionRuns.Add(new IngestionRun { - Fetched = fetched, Queued = queued, Published = published, Flagged = flagged, Spam = spam, Duplicates = 0, - Detail = $"پردازش مجدد آیتم‌های ذخیره‌شده — {fetched} آیتم: {published} منتشر، {queued} صف، {flagged} پرچم، {spam} ردشده/قدیمی", + Fetched = fetched, Queued = queued, Published = published, Flagged = flagged, Spam = spam, Duplicates = deduped, + Detail = $"پردازش مجدد آیتم‌های ذخیره‌شده — {fetched} آیتم: {published} منتشر، {queued} صف، {flagged} پرچم، {spam} ردشده/قدیمی، {deduped} تکراریِ حذف‌شده", }); await _db.SaveChangesAsync(ct); - _log.LogInformation("Reprocess done: items={F} published={P} queued={Q} flagged={Fl} discarded={S}", - fetched, published, queued, flagged, spam); + _log.LogInformation("Reprocess done: items={F} published={P} queued={Q} flagged={Fl} discarded={S} deduped={D}", + fetched, published, queued, flagged, spam, deduped); return new IngestionSummary(new List - { new("پردازش مجدد", fetched, queued, published, flagged, spam, 0) }); + { new("پردازش مجدد", fetched, queued, published, flagged, spam, deduped) }); } + /// + /// Collapse near-duplicate aggregated APPLICANTS left when a source reposts the same ad (different + /// text → different ContentHash, so exact dedup missed them). Two high-precision signals: an + /// identical phone, or identical (role, city, normalized description core with digits/«… پیش» + /// time-phrases removed). Keeps the NEWEST of each group, deletes the rest. Returns the count removed. 
+    /// </summary>
+    /// <param name="ct">Cancellation token passed to the candidate query and to the bulk delete.</param>
+    /// <returns>The row count reported by the bulk delete, or 0 when no duplicates were found.</returns>
+    public async Task<int> DedupeTalentAsync(CancellationToken ct = default)
+    {
+        // Candidates are open, aggregated listings only; project just the columns the signature needs.
+        var rows = await _db.TalentListings
+            .Where(t => t.Status == ShiftStatus.Open && t.Source == ShiftSource.Aggregated)
+            .Select(t => new { t.Id, t.Phone, t.RoleId, t.CityId, t.Description, t.CreatedAt })
+            .ToListAsync(ct);
+
+        // Grouping key for one row; null means "not enough signal, never treat as a duplicate".
+        string? Sig(string? phone, int roleId, int cityId, string? desc)
+        {
+            var p = DigitsOnly(phone ?? "");
+            if (p.Length >= 7) return "p:" + p;            // same number = same person/repost
+            var core = NormalizeFa(DedupeNoiseRegex.Replace(desc ?? "", " ")).Trim();
+            if (core.Length < 15) return null;             // too little to call it a dup safely
+            // Compare on (role, city, first 100 chars of the cleaned description).
+            return $"t:{roleId}:{cityId}:{(core.Length > 100 ? core[..100] : core)}";
+        }
+
+        var toRemove = rows
+            .Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.Phone, r.RoleId, r.CityId, r.Description) })
+            .Where(x => x.Key is not null)
+            .GroupBy(x => x.Key)
+            .SelectMany(g => g
+                .OrderByDescending(x => x.CreatedAt)
+                .ThenByDescending(x => x.Id)   // tie-break: equal CreatedAt must not leave the survivor to row order
+                .Skip(1)                       // keep the newest, drop the rest of the group
+                .Select(x => x.Id))
+            .ToList();
+
+        if (toRemove.Count == 0) return 0;
+        var removed = await _db.TalentListings.Where(t => toRemove.Contains(t.Id)).ExecuteDeleteAsync(ct);
+        _log.LogInformation("Deduped {N} near-duplicate applicants.", removed);
+        return removed;
+    }
+
+    // Text that varies between reposts of one ad: digit runs (\p{Nd} = every Unicode decimal digit,
+    // so Latin, Persian and Arabic-Indic numerals are all stripped) and relative-time phrases.
+    private static readonly Regex DedupeNoiseRegex = new(
+        @"\p{Nd}+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", RegexOptions.Compiled);
+
+    // Runs the input through HtmlUtil.ToLatinDigits and keeps only the characters char.IsDigit accepts.
+    private static string DigitsOnly(string s) => new(HtmlUtil.ToLatinDigits(s).Where(char.IsDigit).ToArray());
+
     private static (RawListingStatus status, string? reason, int confidence) Decide(
         AppSetting s, ValidationResult val, AiAuditResult? ai)
     {