Near-duplicate applicant detection (collapse source reposts)
CI/CD / CI · dotnet build (push) Successful in 1m57s
CI/CD / Deploy · hamkadr (push) Successful in 1m9s

Exact ContentHash dedup misses the same ad reposted with slightly different text
(e.g. the ~18 repeated «کمک‌یار آقا»). DedupeTalentAsync collapses open aggregated
applicants by two high-precision signals — identical phone, or identical
(role, city, normalized description core with digits/«… پیش» time-phrases
stripped) — keeping the newest of each group. Runs at the end of both RunAsync
and ReprocessAsync; removed count surfaces in the run log.

Improvement 1 of the data-quality/SEO backlog.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-20 17:54:26 +03:30
parent 21befd5b1e
commit 8d0a403b36
@@ -147,6 +147,8 @@ public class IngestionService
var summary = new IngestionSummary(results); var summary = new IngestionSummary(results);
await DedupeTalentAsync(ct); // collapse same-ad reposts the exact-hash dedup can't catch
// Persist a run-log row so admins get a crawl history (with a per-source breakdown). // Persist a run-log row so admins get a crawl history (with a per-source breakdown).
if (results.Count > 0) if (results.Count > 0)
{ {
@@ -246,19 +248,59 @@ public class IngestionService
} }
await _db.SaveChangesAsync(ct); await _db.SaveChangesAsync(ct);
var deduped = await DedupeTalentAsync(ct); // collapse reposts the exact-hash dedup missed
_db.IngestionRuns.Add(new IngestionRun _db.IngestionRuns.Add(new IngestionRun
{ {
Fetched = fetched, Queued = queued, Published = published, Flagged = flagged, Spam = spam, Duplicates = 0, Fetched = fetched, Queued = queued, Published = published, Flagged = flagged, Spam = spam, Duplicates = deduped,
Detail = $"پردازش مجدد آیتم‌های ذخیره‌شده — {fetched} آیتم: {published} منتشر، {queued} صف، {flagged} پرچم، {spam} ردشده/قدیمی", Detail = $"پردازش مجدد آیتم‌های ذخیره‌شده — {fetched} آیتم: {published} منتشر، {queued} صف، {flagged} پرچم، {spam} ردشده/قدیمی، {deduped} تکراریِ حذف‌شده",
}); });
await _db.SaveChangesAsync(ct); await _db.SaveChangesAsync(ct);
_log.LogInformation("Reprocess done: items={F} published={P} queued={Q} flagged={Fl} discarded={S}", _log.LogInformation("Reprocess done: items={F} published={P} queued={Q} flagged={Fl} discarded={S} deduped={D}",
fetched, published, queued, flagged, spam); fetched, published, queued, flagged, spam, deduped);
return new IngestionSummary(new List<SourceResult> return new IngestionSummary(new List<SourceResult>
{ new("پردازش مجدد", fetched, queued, published, flagged, spam, 0) }); { new("پردازش مجدد", fetched, queued, published, flagged, spam, deduped) });
} }
/// <summary>
/// Collapses near-duplicate aggregated applicants that remain when a source reposts the same ad
/// with slightly different text (different text → different ContentHash, so exact-hash dedup
/// cannot catch them). Only listings that are <c>Open</c> and <c>Aggregated</c> are considered.
/// Two signals are used to build a grouping key:
/// <list type="number">
/// <item><description>an identical phone number (digits only, at least 7 digits);</description></item>
/// <item><description>otherwise an identical (role, city, normalized description core) triple, where the
/// core is the description with digit runs and relative-time phrases («… پیش», «دیروز», «پریروز»)
/// removed and truncated to its first 100 characters.</description></item>
/// </list>
/// Within each group the newest row (by <c>CreatedAt</c>, ties broken by the highest <c>Id</c>) is
/// kept and the rest are deleted in bulk.
/// </summary>
/// <param name="ct">Cancellation token forwarded to the load query and the bulk delete.</param>
/// <returns>The number of rows reported as deleted; 0 when no duplicate group was found.</returns>
public async Task<int> DedupeTalentAsync(CancellationToken ct = default)
{
    // A phone with fewer digits than this is treated as missing/garbage and not used as a key.
    const int MinPhoneDigits = 7;
    // A description core shorter than this carries too little text to call two rows duplicates.
    const int MinCoreLength = 15;
    // Only the leading part of the core is compared, so differing tails do not split a group.
    const int MaxCoreLength = 100;

    // Project just the columns the signature needs; grouping happens in memory.
    var rows = await _db.TalentListings
        .Where(t => t.Status == ShiftStatus.Open && t.Source == ShiftSource.Aggregated)
        .Select(t => new { t.Id, t.Phone, t.RoleId, t.CityId, t.Description, t.CreatedAt })
        .ToListAsync(ct);

    // Builds the grouping key for one row, or null when neither signal is strong enough.
    string? Sig(string? phone, int roleId, int cityId, string? desc)
    {
        var p = DigitsOnly(phone ?? "");
        if (p.Length >= MinPhoneDigits) return "p:" + p; // same number = same person/repost

        // \d matches every Unicode decimal digit (ASCII, Persian ۰-۹ and Arabic-Indic ٠-٩), so a
        // repost that differs only in numbers or "N days ago" style phrases yields the same core.
        var core = NormalizeFa(Regex.Replace(desc ?? "",
            @"\d+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim();
        if (core.Length < MinCoreLength) return null; // too little to call it a dup safely

        return $"t:{roleId}:{cityId}:{(core.Length > MaxCoreLength ? core[..MaxCoreLength] : core)}";
    }

    var toRemove = rows
        .Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.Phone, r.RoleId, r.CityId, r.Description) })
        .Where(x => x.Key is not null)
        .GroupBy(x => x.Key)
        // Keep the first row after sorting (newest); everything after it is a duplicate.
        // The Id tie-break makes the survivor deterministic when timestamps are equal, because
        // the rows are loaded without an ORDER BY.
        .SelectMany(g => g
            .OrderByDescending(x => x.CreatedAt)
            .ThenByDescending(x => x.Id)
            .Skip(1)
            .Select(x => x.Id))
        .ToList();

    if (toRemove.Count == 0) return 0;

    // Bulk delete issued directly against the database; returns the affected-row count.
    var removed = await _db.TalentListings.Where(t => toRemove.Contains(t.Id)).ExecuteDeleteAsync(ct);
    _log.LogInformation("Deduped {N} near-duplicate applicants.", removed);
    return removed;
}
/// <summary>
/// Returns only the digit characters of <paramref name="s"/>, after passing it through
/// <c>HtmlUtil.ToLatinDigits</c>. Every character for which <see cref="char.IsDigit(char)"/>
/// is false (spaces, dashes, plus signs, letters, …) is dropped.
/// </summary>
/// <param name="s">Raw text, e.g. a phone number as scraped from a listing.</param>
/// <returns>The digits in their original order; an empty string when there are none.</returns>
private static string DigitsOnly(string s)
{
    var latin = HtmlUtil.ToLatinDigits(s);
    var digits = latin.Where(ch => char.IsDigit(ch));
    return string.Concat(digits);
}
private static (RawListingStatus status, string? reason, int confidence) Decide( private static (RawListingStatus status, string? reason, int confidence) Decide(
AppSetting s, ValidationResult val, AiAuditResult? ai) AppSetting s, ValidationResult val, AiAuditResult? ai)
{ {