Make dedupe stricter to avoid wrongly archiving distinct placeholder-facility ads
The job/talent duplicate detector compared only a short prefix of the normalized description: the first 100 characters for `t:` keys and the first 120 for `j:` keys. Since ~82% of jobs share the «نامشخص» placeholder facility, two genuinely different ads that merely open the same way could collapse into a single dedupe key, wrongly archiving a valid listing (which then returns a 410, like /Jobs/Details/7032). Compare a 400-character slice instead, so that only ads whose normalized text matches across the first 400 characters are deduped; true reposts/fan-out (identical text) are still caught.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -288,7 +288,7 @@ public class IngestionService
|
||||
var core = NormalizeFa(Regex.Replace(desc ?? "",
|
||||
@"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim();
|
||||
if (core.Length < 15) return null; // too little to call it a dup safely
|
||||
return $"t:{roleId}:{cityId}:{(core.Length > 100 ? core[..100] : core)}";
|
||||
return $"t:{roleId}:{cityId}:{(core.Length > 400 ? core[..400] : core)}";
|
||||
}
|
||||
|
||||
var toRemove = rows
|
||||
@@ -497,7 +497,9 @@ public class IngestionService
|
||||
var core = NormalizeFa(Regex.Replace(desc ?? "",
|
||||
@"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim();
|
||||
if (core.Length < 15) return null; // too little to call it a dup safely
|
||||
return $"j:{facId}:{(core.Length > 120 ? core[..120] : core)}";
|
||||
// Compare a LONG slice (not 120) — most jobs share the «نامشخص» facility, so a short
|
||||
// prefix could collapse two different placeholder ads that merely open the same way.
|
||||
return $"j:{facId}:{(core.Length > 400 ? core[..400] : core)}";
|
||||
}
|
||||
|
||||
// Keep one per group — prefer a non-«پزشک عمومی» role (the fan-out's GP copy is the usual
|
||||
|
||||
Reference in New Issue
Block a user