Match crawled listings to existing facilities (fuzzy) before creating new
When publishing a scraped listing we now look for a facility we already have that is exactly or closely the same, and only create a new one when there is no match — avoiding duplicates like «بیمارستان میلاد» vs «میلاد».

- ListingParser: extract a facility name (keyword + distinctive words) from the post and surface it in the parser notes.
- FacilityMatcher: Persian-aware normalization (ي/ك, ZWNJ, punctuation), type-word stripping for a "core" name, contains + Levenshtein similarity, and FindBest (same-city exact → any-city exact → same-city fuzzy → fuzzy).
- Review (manual publish): auto-select a matching facility or prefill the new-facility name; resolve-or-create uses fuzzy match; dropdown preselects.
- IngestionService (auto-publish): reuse FacilityMatcher against a run-wide facility list (grows as new ones are created) instead of exact-name only.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ public class ParsedListing
|
||||
public Gender Gender { get; set; } = Gender.Any; // جنسیت مورد نیاز
|
||||
public string? CityName { get; set; }
|
||||
public string? DistrictName { get; set; }
|
||||
public string? FacilityName { get; set; } // hospital/clinic name guessed from the text
|
||||
public string? Phone { get; set; }
|
||||
public List<string> Notes { get; set; } = new(); // what was/wasn't detected (shown to admin)
|
||||
}
|
||||
@@ -107,6 +108,10 @@ public class HeuristicListingParser : IListingParser
|
||||
else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
|
||||
}
|
||||
|
||||
// --- Facility name (بیمارستان/درمانگاه/کلینیک ... + the distinctive name) ---
|
||||
p.FacilityName = ExtractFacilityName(text);
|
||||
if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");
|
||||
|
||||
// --- Phone ---
|
||||
var phone = Regex.Match(ToLatinDigits(text), @"0?9\d{9}");
|
||||
if (phone.Success) p.Phone = phone.Value;
|
||||
@@ -114,6 +119,48 @@ public class HeuristicListingParser : IListingParser
|
||||
return p;
|
||||
}
|
||||
|
||||
// Words that introduce a facility name. ExtractFacilityName tries them in array order and
// returns on the first usable hit, so a longer / more specific phrase must come before any
// shorter keyword it contains (e.g. «مرکز درمانی» before «مرکز», «پلی کلینیک» before «کلینیک»).
private static readonly string[] FacilityKeywords =
{
    "بیمارستان", "زایشگاه",
    // «پلی کلینیک» appears with a space, with a zero-width non-joiner (U+200C), or fully
    // joined. The ZWNJ form is written as an escape so the invisible character cannot be
    // dropped unnoticed by an editor or a copy/paste.
    "پلی کلینیک", "پلی\u200cکلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
    "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
    "آزمایشگاه", "مطب", "خانه سالمندان", "سرای سالمندان",
};
|
||||
|
||||
// Tokens that cannot belong to a facility's name. Name collection ends at the first word
// that equals one of these.
private static readonly string[] NameStops = new[]
{
    // prepositions and connectives
    "جهت", "برای", "به", "با", "در", "از", "که",
    // hiring vocabulary
    "نیاز", "نیازمند", "استخدام", "جذب", "دعوت", "همکاری",
    // location and contact details
    "واقع", "آدرس", "تلفن", "شماره",
    // shift, pay and time-of-day words
    "شیفت", "ساعت", "حقوق", "روز", "شب", "صبح", "عصر",
    // particles and filler
    "می", "ها", "این", "یک", "محترم",
};
|
||||
|
||||
// Characters that end a facility name once it has started (sentence punctuation, brackets,
// quotes and line breaks). Before the first name word they are simply skipped.
private static readonly char[] FacilityNameBreaks =
{
    '\n', '\r', '،', ',', '.', '؛', ':', '(', ')', '-', '/', '«', '»', '"',
};

/// <summary>Best-effort hospital/clinic name: a facility keyword plus up to three name words.</summary>
/// <remarks>
/// Keywords are tried in <c>FacilityKeywords</c> order, and for each keyword every occurrence
/// in the text is examined, so a bare first mention does not hide a later, named one.
/// An occurrence only counts when it stands as a word of its own, which keeps «مرکز» from
/// matching inside «تمرکز» and skips suffixed forms of a keyword.
/// </remarks>
/// <param name="text">Free text of the listing.</param>
/// <returns>
/// The keyword followed by the collected name words, or <c>null</c> when no keyword is
/// followed by at least one usable name word.
/// </returns>
private static string? ExtractFacilityName(string text)
{
    if (string.IsNullOrEmpty(text)) return null;

    foreach (var kw in FacilityKeywords)
    {
        for (var idx = text.IndexOf(kw, StringComparison.Ordinal);
             idx >= 0;
             idx = text.IndexOf(kw, idx + 1, StringComparison.Ordinal))
        {
            var end = idx + kw.Length;
            if (!IsStandaloneKeyword(text, idx, end)) continue;

            var picked = CollectNameWords(text, end);
            if (picked.Count == 0) continue; // bare keyword (e.g. just «بیمارستان») isn't useful

            return (kw + " " + string.Join(" ", picked)).Trim();
        }
    }

    return null;
}

/// <summary>True when <c>text[start..end]</c> is not glued to a neighbouring word.</summary>
private static bool IsStandaloneKeyword(string text, int start, int end)
{
    // A letter directly before the keyword means we matched the tail of a longer word.
    if (start > 0 && char.IsLetter(text[start - 1])) return false;

    // A letter or a zero-width non-joiner directly after it means a suffix is attached.
    if (end < text.Length && (char.IsLetter(text[end]) || text[end] == '\u200c')) return false;

    return true;
}

/// <summary>
/// Collects up to three name words starting at <paramref name="start"/>, stopping at a stop
/// word, a token containing a digit, a single-character token, or — once at least one word
/// has been collected — at punctuation or a line break.
/// </summary>
private static List<string> CollectNameWords(string text, int start)
{
    var picked = new List<string>();
    var pos = start;

    while (pos < text.Length && picked.Count < 3)
    {
        var c = text[pos];
        if (c == ' ' || c == '\t') { pos++; continue; }

        if (Array.IndexOf(FacilityNameBreaks, c) >= 0)
        {
            // The name must not run into the next clause or line.
            if (picked.Count > 0) break;
            pos++;
            continue;
        }

        var wordStart = pos;
        while (pos < text.Length
               && text[pos] != ' ' && text[pos] != '\t'
               && Array.IndexOf(FacilityNameBreaks, text[pos]) < 0)
        {
            pos++;
        }

        var w = text[wordStart..pos];
        if (NameStops.Contains(w)) break;
        if (Regex.IsMatch(w, @"\d")) break;   // numbers/phones aren't names
        if (w.Length == 1) break;             // stray letters
        picked.Add(w);
    }

    return picked;
}
|
||||
|
||||
/// <summary>Pull a Toman figure out of free text, handling «میلیون» and Persian digits.</summary>
|
||||
private static long? ExtractAmount(string text)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user