Near-duplicate applicant detection (collapse source reposts)
Exact ContentHash dedup misses the same ad reposted with slightly different text (e.g. the ~18 repeated «کمکیار آقا»). DedupeTalentAsync collapses open aggregated applicants by two high-precision signals — identical phone, or identical (role, city, normalized description core with digits/«… پیش» time-phrases stripped) — keeping the newest of each group. Runs at the end of both RunAsync and ReprocessAsync; the removed count is stored in the ReprocessAsync run-log row (Duplicates) and written to the application log whenever anything is removed (RunAsync itself discards the returned count). Improvement 1 of the data-quality/SEO backlog. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -147,6 +147,8 @@ public class IngestionService
|
|||||||
|
|
||||||
var summary = new IngestionSummary(results);
|
var summary = new IngestionSummary(results);
|
||||||
|
|
||||||
|
await DedupeTalentAsync(ct); // collapse same-ad reposts the exact-hash dedup can't catch
|
||||||
|
|
||||||
// Persist a run-log row so admins get a crawl history (with a per-source breakdown).
|
// Persist a run-log row so admins get a crawl history (with a per-source breakdown).
|
||||||
if (results.Count > 0)
|
if (results.Count > 0)
|
||||||
{
|
{
|
||||||
@@ -246,19 +248,59 @@ public class IngestionService
|
|||||||
}
|
}
|
||||||
await _db.SaveChangesAsync(ct);
|
await _db.SaveChangesAsync(ct);
|
||||||
|
|
||||||
|
var deduped = await DedupeTalentAsync(ct); // collapse reposts the exact-hash dedup missed
|
||||||
|
|
||||||
_db.IngestionRuns.Add(new IngestionRun
|
_db.IngestionRuns.Add(new IngestionRun
|
||||||
{
|
{
|
||||||
Fetched = fetched, Queued = queued, Published = published, Flagged = flagged, Spam = spam, Duplicates = 0,
|
Fetched = fetched, Queued = queued, Published = published, Flagged = flagged, Spam = spam, Duplicates = deduped,
|
||||||
Detail = $"پردازش مجدد آیتمهای ذخیرهشده — {fetched} آیتم: {published} منتشر، {queued} صف، {flagged} پرچم، {spam} ردشده/قدیمی",
|
Detail = $"پردازش مجدد آیتمهای ذخیرهشده — {fetched} آیتم: {published} منتشر، {queued} صف، {flagged} پرچم، {spam} ردشده/قدیمی، {deduped} تکراریِ حذفشده",
|
||||||
});
|
});
|
||||||
await _db.SaveChangesAsync(ct);
|
await _db.SaveChangesAsync(ct);
|
||||||
_log.LogInformation("Reprocess done: items={F} published={P} queued={Q} flagged={Fl} discarded={S}",
|
_log.LogInformation("Reprocess done: items={F} published={P} queued={Q} flagged={Fl} discarded={S} deduped={D}",
|
||||||
fetched, published, queued, flagged, spam);
|
fetched, published, queued, flagged, spam, deduped);
|
||||||
|
|
||||||
return new IngestionSummary(new List<SourceResult>
|
return new IngestionSummary(new List<SourceResult>
|
||||||
{ new("پردازش مجدد", fetched, queued, published, flagged, spam, 0) });
|
{ new("پردازش مجدد", fetched, queued, published, flagged, spam, deduped) });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Collapses near-duplicate aggregated applicants that remain when a source reposts the same ad
/// with slightly different text (different text gives a different ContentHash, so the exact-hash
/// dedup does not catch them).
/// </summary>
/// <remarks>
/// Only rows with <c>Status == ShiftStatus.Open</c> and <c>Source == ShiftSource.Aggregated</c> are
/// considered. Each row gets at most one signature:
/// <list type="bullet">
///   <item>a phone signature when the phone contains at least 7 digits; otherwise</item>
///   <item>a text signature built from role, city and the first 100 characters of the normalized
///   description with digits and relative-time phrases removed. Descriptions whose normalized core
///   is shorter than 15 characters get no signature and are never removed.</item>
/// </list>
/// Within each signature group the newest row (by <c>CreatedAt</c>, ties broken by the highest
/// <c>Id</c>) is kept and every other row is deleted.
/// </remarks>
/// <param name="ct">Token used to cancel the database read and the delete.</param>
/// <returns>The number of rows deleted, or 0 when no duplicates were found.</returns>
public async Task<int> DedupeTalentAsync(CancellationToken ct = default)
{
    // Project just the columns the signature needs instead of loading full entities.
    var candidates = await _db.TalentListings
        .Where(t => t.Status == ShiftStatus.Open && t.Source == ShiftSource.Aggregated)
        .Select(t => new { t.Id, t.Phone, t.RoleId, t.CityId, t.Description, t.CreatedAt })
        .ToListAsync(ct);

    // Builds the grouping key for one row; null means "not enough signal, leave the row alone".
    string? Sig(string? phone, int roleId, int cityId, string? desc)
    {
        var digits = DigitsOnly(phone ?? "");
        // NOTE(review): the same number written with and without a country/trunk prefix still
        // yields different keys here — confirm whether that matters for the sources in use.
        if (digits.Length >= 7) return "p:" + digits;   // same number = same person/repost

        // NOTE(review): the noise regex runs before NormalizeFa, so time phrases spelled with
        // character variants that NormalizeFa would unify are not stripped — confirm intent.
        var core = NormalizeFa(DedupNoiseRegex.Replace(desc ?? "", " ")).Trim();
        if (core.Length < 15) return null;              // too little to call it a dup safely

        var head = core.Length > 100 ? core[..100] : core;
        return $"t:{roleId}:{cityId}:{head}";
    }

    var toRemove = candidates
        .Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.Phone, r.RoleId, r.CityId, r.Description) })
        .Where(x => x.Key is not null)
        .GroupBy(x => x.Key)
        // Keep the newest row; the Id tie-break makes the survivor deterministic when several
        // reposts share the same CreatedAt value.
        .SelectMany(g => g
            .OrderByDescending(x => x.CreatedAt)
            .ThenByDescending(x => x.Id)
            .Skip(1)
            .Select(x => x.Id))
        .ToList();

    if (toRemove.Count == 0) return 0;

    // Set-based delete executed directly in the database (no entity tracking involved).
    var removed = await _db.TalentListings
        .Where(t => toRemove.Contains(t.Id))
        .ExecuteDeleteAsync(ct);

    _log.LogInformation("Deduped {N} near-duplicate applicants.", removed);
    return removed;
}

/// <summary>
/// Volatile fragments removed from a description before it is compared: runs of decimal digits
/// (<c>\d</c> matches every Unicode decimal digit in .NET, so ASCII, Persian and Arabic-Indic
/// digits are all covered) and relative-time phrases such as «روز پیش». Built once and reused for
/// every row.
/// </summary>
private static readonly Regex DedupNoiseRegex = new(
    @"\d+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز",
    RegexOptions.Compiled | RegexOptions.CultureInvariant);
|
||||||
|
|
||||||
|
/// <summary>
/// Returns <paramref name="s"/> reduced to its digit characters, after passing it through
/// <c>HtmlUtil.ToLatinDigits</c> (presumably a Persian-to-Latin digit conversion — verify against
/// that helper).
/// </summary>
/// <param name="s">Raw text, e.g. a phone number with separators.</param>
/// <returns>A string containing only the characters for which <c>char.IsDigit</c> is true.</returns>
private static string DigitsOnly(string s)
{
    var latin = HtmlUtil.ToLatinDigits(s);
    return string.Concat(latin.Where(char.IsDigit));
}
|
||||||
|
|
||||||
private static (RawListingStatus status, string? reason, int confidence) Decide(
|
private static (RawListingStatus status, string? reason, int confidence) Decide(
|
||||||
AppSetting s, ValidationResult val, AiAuditResult? ai)
|
AppSetting s, ValidationResult val, AiAuditResult? ai)
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user