From aaeb37e1af9d5434e4d1f1fe27c8169703c877af Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Fri, 26 Jun 2026 03:26:25 +0330 Subject: [PATCH] Make dedupe stricter to avoid wrongly archiving distinct placeholder-facility ads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The job/talent duplicate-detector compared only the first 100–120 chars of the normalized description. Since ~82% of jobs share the «نامشخص» placeholder facility, two genuinely different ads that merely open the same way could collapse — wrongly archiving a valid listing (→ a 410 like /Jobs/Details/7032). Compare a 400-char slice instead, so only near-identical full texts dedupe; true reposts/fan-out (identical text) are still caught. Co-Authored-By: Claude Opus 4.8 --- src/JobsMedical.Web/Services/Scraping/IngestionService.cs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs index 60246b4..ee149c3 100644 --- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs +++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs @@ -288,7 +288,7 @@ public class IngestionService var core = NormalizeFa(Regex.Replace(desc ?? "", @"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim(); if (core.Length < 15) return null; // too little to call it a dup safely - return $"t:{roleId}:{cityId}:{(core.Length > 100 ? core[..100] : core)}"; + return $"t:{roleId}:{cityId}:{(core.Length > 400 ? core[..400] : core)}"; } var toRemove = rows @@ -497,7 +497,9 @@ public class IngestionService var core = NormalizeFa(Regex.Replace(desc ?? "", @"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim(); if (core.Length < 15) return null; // too little to call it a dup safely - return $"j:{facId}:{(core.Length > 120 ? core[..120] : core)}"; + // Compare a LONG slice (not 120) — most jobs share the «نامشخص» facility, so a short + // prefix could collapse two different placeholder ads that merely open the same way. + return $"j:{facId}:{(core.Length > 400 ? core[..400] : core)}"; } // Keep one per group — prefer a non-«پزشک عمومی» role (the fan-out's GP copy is the usual