Facility data hygiene: merge duplicates, drop junk-named facilities
Cleans up the crawl-generated facility table that surfaced garbage on /Facilities («بیمارستان هستم», «... از مدجابز», bare «کلینیک», «سازمان برنامه جنوبی» x3):
- FacilityMatcher.IsJunkName: shared detector for non-names — bare type words, cores made only of filler/verb tokens, and leaked crawl-source/placeholder text. Added داروخانه/آسایشگاه to the generic type words so that bare occurrences of them are caught and duplicates merge more reliably.
- HeuristicListingParser.ExtractFacilityName now rejects junk candidates (and emoji), so new ingests fall back to the shared placeholder instead of forging a fake facility.
- IngestionService.MergeAndCleanFacilitiesAsync (+ admin button): folds junk facilities into the placeholder and merges Persian-fuzzy duplicates into one keeper, repointing their shifts/jobs first.
Hard guard: only purely crawl-generated, unmanaged facilities are removed — employer-owned and verified facilities are never touched.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -17,7 +17,7 @@ public static class FacilityMatcher
|
||||
{
|
||||
"بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
|
||||
"مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
|
||||
"آزمایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
|
||||
"آزمایشگاه", "داروخانه", "آسایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
|
||||
};
|
||||
|
||||
/// <summary>Lower-cased, Arabic→Persian folded, punctuation-stripped, whitespace-collapsed.</summary>
|
||||
@@ -47,6 +47,42 @@ public static class FacilityMatcher
|
||||
return Regex.Replace(n, @"\s+", " ").Trim();
|
||||
}
|
||||
|
||||
// Words that carry no identity of their own (verbs, quantifiers, locators, generic staffing
// terms). The listing parser tends to pick these up when an ad names no facility, producing
// pseudo-names such as «بیمارستان هستم», «مطب نیازمندیم سه» or «کلینیک های فقط منطقه».
private static readonly string[] JunkCoreWords = new string[]
{
    // "to be" / "to become" verb forms
    "هستم", "هستیم", "هستش",
    "میشوم", "میشم", "بشوم",
    "میباشد", "باشد", "میباشم",

    // "need" / "have" / "want" verb forms
    "نیازمندیم", "نیازمند", "نیازمندم",
    "داریم", "دارم",
    "میخواهیم", "میخوام",

    // qualifiers, locators and adjectives
    "حتی", "تعدادی", "فقط", "منطقه", "واقع", "های", "مبتدی", "محترم", "خوب",

    // numerals, determiners and generic staffing words
    "سه", "دو", "یک", "چند", "این", "آن", "همکار", "نیرو",
};
|
||||
|
||||
// Fragments that reveal where a listing was crawled from, together with the wording of the
// shared placeholder. A name containing any of them (e.g. «مرکز درمانی (از مدجابز)») must never
// be published as a facility.
private static readonly string[] SourceMarkers = new string[]
{
    // crawl-source names
    "مدجابز",
    "مدجاب",
    "از تلگرام",
    "از دیوار",
    "از بله",
    "از کانال",

    // placeholder wording
    "ثبت نشده",
    "نامشخص",
};
|
||||
|
||||
// Normalised forms of SourceMarkers, built once on first use instead of on every call.
// A marker that normalises to the empty string is dropped: an empty needle is contained in
// every string and would flag every name as junk.
private static readonly Lazy<string[]> NormalizedSourceMarkers = new Lazy<string[]>(
    () => SourceMarkers
        .Select(marker => Normalize(marker))
        .Where(marker => marker.Length > 0)
        .ToArray());

// Normalised forms of JunkCoreWords as a set. The tokens they are compared against come from
// Core(...), which is assumed to return normalised text (TODO confirm), so the word list has to
// be normalised the same way for the comparison to be like-for-like.
private static readonly Lazy<System.Collections.Generic.HashSet<string>> NormalizedJunkCoreWords =
    new Lazy<System.Collections.Generic.HashSet<string>>(
        () => new System.Collections.Generic.HashSet<string>(
            JunkCoreWords.Select(word => Normalize(word))));

/// <summary>
/// Tells whether <paramref name="name"/> is NOT usable as a facility name. That is the case when
/// it normalises to nothing, when it contains a crawl-source or placeholder marker
/// («... از مدجابز», «نامشخص»), when it is a bare type word with no distinctive core
/// («بیمارستان»), or when every token of its core is a single character or a filler/verb word
/// («بیمارستان هستم» → «هستم»). Callers should fall back to the shared placeholder for such
/// names instead of creating a fake facility.
/// </summary>
/// <param name="name">Candidate facility name; may be <c>null</c>.</param>
/// <returns><c>true</c> when the name should be rejected as junk; otherwise <c>false</c>.</returns>
public static bool IsJunkName(string? name)
{
    var normalized = Normalize(name);
    if (normalized.Length == 0)
    {
        return true;
    }

    // Substring match (ordinal) so a marker is caught wherever it sits inside the name.
    foreach (var marker in NormalizedSourceMarkers.Value)
    {
        if (normalized.Contains(marker))
        {
            return true;
        }
    }

    var core = Core(name);
    if (core.Length == 0)
    {
        // Nothing left besides generic type words («بیمارستان»، «کلینیک»).
        return true;
    }

    // One token that is longer than a single character and is not a filler word is enough to
    // treat the name as real.
    var junkWords = NormalizedJunkCoreWords.Value;
    foreach (var token in core.Split(' ', StringSplitOptions.RemoveEmptyEntries))
    {
        if (token.Length > 1 && !junkWords.Contains(token))
        {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
/// <summary>True when two names almost certainly denote the same facility.</summary>
|
||||
public static bool IsSame(string? a, string? b)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user