Files
hamkadr/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs
T
soroush.asadi 3e65c88765
CI/CD / CI · dotnet build (push) Successful in 49s
CI/CD / Deploy · hamkadr (push) Successful in 1m1s
Strip generic facility descriptors so distinctive names don't false-merge
FacilityMatcher treated «شبانه روزی»/«خیریه»/«دولتی»/«خصوصی» as part of a name, so a real
facility merged into a generic one when they shared a descriptor — «درمانگاه شبانه‌روزی اسفند»
collapsed into the existing «پلی کلینیک شبانه روزی», losing «اسفند». Add these descriptors to
the stripped type-words so matching compares the distinctive core («اسفند») instead. Side
benefit: bare descriptor-only names («پلی کلینیک شبانه روزی») now resolve to junk and get
folded into the placeholder by the cleanup, rather than masquerading as a real facility.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-21 14:00:00 +03:30

149 lines
7.2 KiB
C#
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Text;
using System.Text.RegularExpressions;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Persian-aware fuzzy matching for facility names, so the same hospital written slightly
/// differently — spacing, ي/ك vs ی/ک, ZWNJ, with or without «بیمارستان» — resolves to one
/// record instead of creating a duplicate. Used by both the manual review/publish flow and
/// the auto-publish ingestion pipeline.
/// </summary>
public static class FacilityMatcher
{
    /// <summary>Minimum normalized edit-distance similarity for two names to count as the same.</summary>
    private const double MinSimilarity = 0.86;

    // Character class used to build letter/digit-boundary lookarounds around type words.
    private const string WordChar = @"[\p{L}\p{N}]";

    // Declared before the word lists below: their normalized forms are built by static
    // initializers that call Normalize, which needs this instance to exist already.
    private static readonly Regex WhitespaceRun = new(@"\s+", RegexOptions.Compiled);

    // Generic type words stripped to compare the distinctive core of a name.
    private static readonly string[] TypeWords =
    {
        "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
        "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
        "آزمایشگاه", "داروخانه", "آسایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
        // Generic descriptors — never the distinctive part of a name. Stripping them stops false
        // merges like «درمانگاه شبانه‌روزی اسفند» → «پلی کلینیک شبانه روزی» (they share «شبانه روزی»).
        // The ZWNJ spelling is written with an escape so the invisible character is reviewable.
        "شبانه روزی", "شبانه\u200cروزی", "خیریه", "دولتی", "خصوصی", "۲۴ ساعته", "24 ساعته", "تامین اجتماعی",
    };

    // One prebuilt pattern per normalized type word, LONGEST FIRST. Order matters: stripping
    // «تخصصی» before «فوق تخصصی» (or «کلینیک» before «دی کلینیک») would leave «فوق» / «دی»
    // behind as a fake distinctive token.
    private static readonly Regex[] TypeWordPatterns = NormalizeAll(TypeWords)
        .OrderByDescending(w => w.Length)
        .Select(w => new Regex($"(?<!{WordChar}){Regex.Escape(w)}(?!{WordChar})", RegexOptions.Compiled))
        .ToArray();

    // Filler/verb/locator tokens that are never a real facility name — the parser sweeps these in
    // when an ad has no named facility («بیمارستان هستم», «مطب نیازمندیم سه», «کلینیک های فقط منطقه»).
    private static readonly string[] JunkCoreWords =
    {
        "هستم", "هستیم", "هستش", "میشوم", "میشم", "بشوم", "میباشد", "باشد", "میباشم",
        "نیازمندیم", "نیازمند", "نیازمندم", "داریم", "دارم", "میخواهیم", "میخوام",
        "حتی", "تعدادی", "فقط", "منطقه", "واقع", "های", "مبتدی", "محترم", "خوب",
        "سه", "دو", "یک", "چند", "این", "آن", "همکار", "نیرو",
    };

    // Junk words in the same normalized form as the tokens they are compared with; without
    // this, a word that Normalize folds (e.g. «آن» → «ان») could never match.
    private static readonly HashSet<string> JunkCoreTokens =
        new(NormalizeAll(JunkCoreWords), StringComparer.Ordinal);

    // Crawl-source names that must never appear as a public facility («مرکز درمانی (از مدجابز)»),
    // plus the shared placeholder text.
    private static readonly string[] SourceMarkers =
    {
        "مدجابز", "مدجاب", "از تلگرام", "از دیوار", "از بله", "از کانال", "ثبت نشده", "نامشخص",
    };

    // Source markers normalized once instead of on every IsJunkName call.
    private static readonly string[] NormalizedSourceMarkers = NormalizeAll(SourceMarkers);

    /// <summary>Lower-cased, Arabic→Persian folded, punctuation-stripped, whitespace-collapsed.</summary>
    /// <param name="s">Raw name; may be null.</param>
    /// <returns>The normalized name, or an empty string for null/blank input.</returns>
    public static string Normalize(string? s)
    {
        if (string.IsNullOrWhiteSpace(s)) return "";
        // NOTE(review): the original literal here was an invisible character; it is assumed to be
        // ZWNJ (U+200C). Either way the filter below turns every non-letter/digit — including
        // zero-width format characters — into a space, so the result is the same.
        var t = s.Replace('ي', 'ی').Replace('ك', 'ک').Replace('ۀ', 'ه').Replace('ة', 'ه')
            .Replace('أ', 'ا').Replace('إ', 'ا').Replace('آ', 'ا').Replace('ئ', 'ی')
            .Replace('\u200c', ' ').ToLowerInvariant();
        var sb = new StringBuilder(t.Length);
        foreach (var ch in t)
            sb.Append(char.IsLetterOrDigit(ch) || ch == ' ' ? ch : ' ');
        return WhitespaceRun.Replace(sb.ToString(), " ").Trim();
    }

    /// <summary>Normalized name with generic type words removed — the distinctive part.</summary>
    /// <param name="s">Raw name; may be null.</param>
    /// <returns>The distinctive core, or an empty string when nothing but type words remains.</returns>
    public static string Core(string? s) => CoreOfNormalized(Normalize(s));

    /// <summary>
    /// True when a name is NOT a usable facility name: a bare type word («بیمارستان»), a name whose
    /// distinctive core is only filler/verb tokens («بیمارستان هستم» → «هستم»), or a leaked crawl
    /// source / placeholder («... از مدجابز», «نامشخص»). Such an ad has no real named facility and
    /// should fall back to the shared placeholder instead of forging a fake one.
    /// </summary>
    public static bool IsJunkName(string? name)
    {
        var normalized = Normalize(name);
        if (normalized.Length == 0) return true;
        if (NormalizedSourceMarkers.Any(m => normalized.Contains(m, StringComparison.Ordinal))) return true;
        var core = CoreOfNormalized(normalized);
        if (core.Length == 0) return true; // bare type word only («بیمارستان», «کلینیک»)
        var tokens = core.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        // Single characters are treated as noise; everything else must be a known filler token.
        return tokens.All(t => t.Length <= 1 || JunkCoreTokens.Contains(t));
    }

    /// <summary>True when two names almost certainly denote the same facility.</summary>
    /// <remarks>Names that normalize to an empty string never match anything.</remarks>
    public static bool IsSame(string? a, string? b)
    {
        var na = Normalize(a);
        var nb = Normalize(b);
        if (na.Length == 0 || nb.Length == 0) return false;
        if (na == nb) return true;
        return CoresMatch(na, CoreOfNormalized(na), nb, CoreOfNormalized(nb));
    }

    /// <summary>
    /// Best existing facility for <paramref name="name"/>: same-city exact match first, then
    /// any-city exact, then same-city fuzzy, then any-city fuzzy. Null when nothing matches.
    /// Within a tier the first facility in enumeration order wins.
    /// </summary>
    /// <param name="facilities">Candidate facilities; enumerated at most once.</param>
    /// <param name="name">Name to look up; null, blank or punctuation-only yields null.</param>
    /// <param name="cityId">City to prefer; when null, the same-city tiers are skipped.</param>
    /// <exception cref="ArgumentNullException"><paramref name="facilities"/> is null.</exception>
    public static Facility? FindBest(IEnumerable<Facility> facilities, string? name, int? cityId)
    {
        if (string.IsNullOrWhiteSpace(name)) return null;
        if (facilities is null) throw new ArgumentNullException(nameof(facilities));

        var target = Normalize(name);
        // A punctuation-only name normalizes to "" and would otherwise "exactly" match every
        // facility whose own name normalizes to "".
        if (target.Length == 0) return null;

        string? targetCore = null; // computed once, and only if a fuzzy comparison is needed
        Facility? anyCityExact = null;
        Facility? sameCityFuzzy = null;
        Facility? anyCityFuzzy = null;

        foreach (var f in facilities)
        {
            var inCity = cityId.HasValue && f.CityId == cityId;
            var candidate = Normalize(f.Name);

            if (candidate == target)
            {
                if (inCity) return f; // top tier: nothing later can beat it
                anyCityExact ??= f;
                continue;
            }

            // Fuzzy tiers only matter while no exact match exists and the relevant slot is empty.
            if (anyCityExact is not null || candidate.Length == 0) continue;
            if (sameCityFuzzy is not null || (anyCityFuzzy is not null && !inCity)) continue;

            targetCore ??= CoreOfNormalized(target);
            if (!CoresMatch(candidate, CoreOfNormalized(candidate), target, targetCore)) continue;

            anyCityFuzzy ??= f;
            if (inCity) sameCityFuzzy = f;
        }

        return anyCityExact ?? sameCityFuzzy ?? anyCityFuzzy;
    }

    /// <summary>Normalizes every word, dropping empties and duplicates (order preserved).</summary>
    private static string[] NormalizeAll(IEnumerable<string> words) =>
        words.Select(w => Normalize(w)).Where(w => w.Length > 0).Distinct().ToArray();

    /// <summary>Strips type words from an ALREADY normalized name.</summary>
    private static string CoreOfNormalized(string n)
    {
        if (n.Length == 0) return "";
        foreach (var pattern in TypeWordPatterns)
            n = pattern.Replace(n, " ");
        return WhitespaceRun.Replace(n, " ").Trim();
    }

    /// <summary>
    /// Fuzzy comparison of two distinct, non-empty normalized names (<paramref name="na"/>,
    /// <paramref name="nb"/>) given their cores (<paramref name="ca"/>, <paramref name="cb"/>).
    /// </summary>
    private static bool CoresMatch(string na, string ca, string nb, string cb)
    {
        if (ca.Length >= 2 && ca == cb) return true;
        // one core fully contains the other (e.g. «میلاد» vs «میلاد ۱»)
        // NOTE(review): this is raw substring containment, so «مهر» also matches «مهرگان»;
        // consider whole-token containment if such merges show up in real data.
        if (ca.Length >= 3 && cb.Length >= 3 && (ca.Contains(cb) || cb.Contains(ca))) return true;
        // edit-distance similarity on the most informative basis
        // NOTE(review): with a core shorter than 3 chars this falls back to the full names, where
        // a shared type word dominates the score («بیمارستان دی» vs «بیمارستان ری» ≈ 0.92).
        var (x, y) = ca.Length >= 3 && cb.Length >= 3 ? (ca, cb) : (na, nb);
        return Similarity(x, y) >= MinSimilarity;
    }

    /// <summary>1 − (edit distance / longer length); 1 for identical strings (including two empties).</summary>
    private static double Similarity(string a, string b)
    {
        if (a == b) return 1;
        var max = Math.Max(a.Length, b.Length);
        return max == 0 ? 1 : 1.0 - (double)Levenshtein(a, b) / max;
    }

    /// <summary>Levenshtein edit distance over UTF-16 code units, using a single rolling row.</summary>
    private static int Levenshtein(string a, string b)
    {
        var dp = new int[b.Length + 1];
        for (var j = 0; j <= b.Length; j++) dp[j] = j;
        for (var i = 1; i <= a.Length; i++)
        {
            var prev = dp[0]; // value diagonally up-left of the cell being computed
            dp[0] = i;
            for (var j = 1; j <= b.Length; j++)
            {
                var tmp = dp[j];
                dp[j] = Math.Min(Math.Min(dp[j] + 1, dp[j - 1] + 1), prev + (a[i - 1] == b[j - 1] ? 0 : 1));
                prev = tmp;
            }
        }
        return dp[b.Length];
    }
}