3e65c88765
FacilityMatcher treated «شبانه روزی»/«خیریه»/«دولتی»/«خصوصی» as part of a name, so a real facility merged into a generic one when they shared a descriptor — «درمانگاه شبانهروزی اسفند» collapsed into the existing «پلی کلینیک شبانه روزی», losing «اسفند». Add these descriptors to the stripped type-words so matching compares the distinctive core («اسفند») instead. Side benefit: bare descriptor-only names («پلی کلینیک شبانه روزی») now resolve to junk and get folded into the placeholder by the cleanup, rather than masquerading as a real facility. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
149 lines
7.2 KiB
C#
149 lines
7.2 KiB
C#
using System.Text;
|
||
using System.Text.RegularExpressions;
|
||
using JobsMedical.Web.Models;
|
||
|
||
namespace JobsMedical.Web.Services.Scraping;
|
||
|
||
/// <summary>
/// Persian-aware fuzzy matching for facility names, so the same hospital written slightly
/// differently — spacing, Arabic vs Persian letter forms (ي/ك vs ی/ک), ZWNJ, with or without
/// «بیمارستان» — resolves to one record instead of creating a duplicate. Used by both the
/// manual review/publish flow and the auto-publish ingestion pipeline.
/// </summary>
public static class FacilityMatcher
{
    /// <summary>Minimum edit-distance similarity (0..1) for two names to count as the same facility.</summary>
    private const double FuzzyThreshold = 0.86;

    // Generic type words stripped to compare the distinctive core of a name.
    private static readonly string[] TypeWords =
    {
        "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
        "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
        "آزمایشگاه", "داروخانه", "آسایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
        // Generic descriptors — never the distinctive part of a name. Stripping them stops false
        // merges like «درمانگاه شبانه روزی اسفند» → «پلی کلینیک شبانه روزی» (they share «شبانه روزی»).
        "شبانه روزی", "شبانهروزی", "خیریه", "دولتی", "خصوصی", "۲۴ ساعته", "24 ساعته", "تامین اجتماعی",
    };

    // One whole-word pattern per normalized type word, built once and ordered LONGEST FIRST so a
    // multi-word phrase («فوق تخصصی», «دی کلینیک») is stripped before the shorter word it contains
    // («تخصصی», «کلینیک»); otherwise the leftover fragment («فوق», «دی») pollutes the core.
    // Caching the instances also avoids re-parsing ~30 patterns on every Core() call.
    private static readonly Regex[] TypeWordPatterns = TypeWords
        .Select(w => Normalize(w))
        .Where(w => w.Length > 0)
        .Distinct()
        .OrderByDescending(w => w.Length) // stable: equal-length words keep declaration order
        .Select(w => new Regex($@"(?<![\p{{L}}\p{{N}}]){Regex.Escape(w)}(?![\p{{L}}\p{{N}}])"))
        .ToArray();

    // Filler/verb/locator tokens that are never a real facility name — the parser sweeps these in
    // when an ad has no named facility («بیمارستان هستم», «مطب نیازمندیم سه», «کلینیک های فقط منطقه»).
    private static readonly string[] JunkCoreWords =
    {
        "هستم", "هستیم", "هستش", "میشوم", "میشم", "بشوم", "میباشد", "باشد", "میباشم",
        "نیازمندیم", "نیازمند", "نیازمندم", "داریم", "دارم", "میخواهیم", "میخوام",
        "حتی", "تعدادی", "فقط", "منطقه", "واقع", "های", "مبتدی", "محترم", "خوب",
        "سه", "دو", "یک", "چند", "این", "آن", "همکار", "نیرو",
    };

    // Junk words in the same normalized form as the tokens they are compared with. Without this,
    // entries such as «آن» can never match because Normalize folds «آ» to «ا». An entry that
    // normalizes to several tokens contributes each token separately.
    private static readonly HashSet<string> JunkCoreTokens = JunkCoreWords
        .SelectMany(w => Normalize(w).Split(' ', StringSplitOptions.RemoveEmptyEntries))
        .ToHashSet(StringComparer.Ordinal);

    // Crawl-source names that must never appear as a public facility («مرکز درمانی (از مدجابز)»),
    // plus the shared placeholder text.
    private static readonly string[] SourceMarkers =
    {
        "مدجابز", "مدجاب", "از تلگرام", "از دیوار", "از بله", "از کانال", "ثبت نشده", "نامشخص",
    };

    // Markers pre-normalized once; empty results are dropped because Contains("") is always true.
    private static readonly string[] NormalizedSourceMarkers = SourceMarkers
        .Select(m => Normalize(m))
        .Where(m => m.Length > 0)
        .ToArray();

    /// <summary>Lower-cased, Arabic→Persian folded, punctuation-stripped, whitespace-collapsed.</summary>
    /// <param name="s">Raw name; may be null or blank.</param>
    /// <returns>
    /// A string made only of letters, digits and single spaces, with no leading or trailing space;
    /// the empty string for null/blank input or input with no letters or digits.
    /// </returns>
    public static string Normalize(string? s)
    {
        if (string.IsNullOrWhiteSpace(s)) return "";

        var sb = new StringBuilder(s.Length);
        foreach (var raw in s.ToLowerInvariant())
        {
            var ch = Fold(raw);
            if (char.IsLetterOrDigit(ch))
            {
                sb.Append(ch);
            }
            else if (sb.Length > 0 && sb[sb.Length - 1] != ' ')
            {
                // Any run of non-alphanumeric characters collapses to one separator space.
                sb.Append(' ');
            }
        }

        // Drop the separator left behind by trailing punctuation/whitespace.
        if (sb.Length > 0 && sb[sb.Length - 1] == ' ') sb.Length--;
        return sb.ToString();
    }

    /// <summary>
    /// Maps Arabic letter variants onto their Persian equivalents and ZWNJ onto a space.
    /// Code points are written as escapes so invisible or look-alike characters cannot be
    /// silently lost or swapped by an editor or copy/paste.
    /// </summary>
    private static char Fold(char ch) => ch switch
    {
        '\u064A' or '\u0626' => '\u06CC',             // Arabic yeh, yeh with hamza → Persian yeh
        '\u0643' => '\u06A9',                         // Arabic kaf → Persian keheh
        '\u06C0' or '\u0629' => '\u0647',             // heh with yeh above, teh marbuta → heh
        '\u0623' or '\u0625' or '\u0622' => '\u0627', // alef with hamza above/below, alef madda → alef
        '\u200C' => ' ',                              // zero-width non-joiner → ordinary space
        _ => ch,
    };

    /// <summary>Normalized name with generic type words removed — the distinctive part.</summary>
    /// <param name="s">Raw name; may be null or blank.</param>
    /// <returns>The distinctive core, or the empty string when nothing but type words remains.</returns>
    public static string Core(string? s)
    {
        var n = Normalize(s);
        if (n.Length == 0) return "";

        foreach (var pattern in TypeWordPatterns)
            n = pattern.Replace(n, " ");

        // The normalized text contains only letters, digits and spaces, so splitting on ' '
        // collapses the gaps left by the removed words.
        return string.Join(' ', n.Split(' ', StringSplitOptions.RemoveEmptyEntries));
    }

    /// <summary>
    /// True when a name is NOT a usable facility name: a bare type word («بیمارستان»), a name whose
    /// distinctive core is only filler/verb tokens («بیمارستان هستم» → «هستم»), or a leaked crawl
    /// source / placeholder («... از مدجابز», «نامشخص»). Such an ad has no real named facility and
    /// should fall back to the shared placeholder instead of forging a fake one.
    /// </summary>
    /// <param name="name">Candidate facility name; null/blank counts as junk.</param>
    public static bool IsJunkName(string? name)
    {
        var normalized = Normalize(name);
        if (normalized.Length == 0) return true;

        // Source markers match as substrings of the normalized name.
        if (NormalizedSourceMarkers.Any(m => normalized.Contains(m, StringComparison.Ordinal))) return true;

        var core = Core(name);
        if (core.Length == 0) return true; // bare type word only («بیمارستان»، «کلینیک»)

        // Junk when every remaining token is a single character or a known filler word.
        var tokens = core.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        return tokens.All(t => t.Length <= 1 || JunkCoreTokens.Contains(t));
    }

    /// <summary>True when two names almost certainly denote the same facility.</summary>
    /// <param name="a">First name; null/blank never matches anything.</param>
    /// <param name="b">Second name; null/blank never matches anything.</param>
    public static bool IsSame(string? a, string? b)
    {
        var na = Normalize(a);
        var nb = Normalize(b);
        if (na.Length == 0 || nb.Length == 0) return false;
        if (na == nb) return true;

        var ca = Core(a);
        var cb = Core(b);
        if (ca.Length >= 2 && ca == cb) return true;

        // one core fully contains the other (e.g. «میلاد» vs «میلاد ۱»)
        if (ca.Length >= 3 && cb.Length >= 3 && (ca.Contains(cb) || cb.Contains(ca))) return true;

        // edit-distance similarity on the most informative basis: the cores when both are long
        // enough to be meaningful, otherwise the full normalized names.
        // NOTE(review): with short, differing cores the full-name fallback is dominated by the
        // shared type word, so e.g. two names differing by one letter after «بیمارستان» can pass
        // the threshold — confirm whether that is acceptable.
        var (x, y) = ca.Length >= 3 && cb.Length >= 3 ? (ca, cb) : (na, nb);
        return Similarity(x, y) >= FuzzyThreshold;
    }

    /// <summary>
    /// Best existing facility for <paramref name="name"/>: same-city exact match first, then
    /// any-city exact, then same-city fuzzy, then any-city fuzzy. Null when nothing matches.
    /// </summary>
    /// <param name="facilities">Candidate facilities; enumerated up to four times, so it is materialized if it is not already a list.</param>
    /// <param name="name">Name to look up; null/blank or a name with no letters/digits yields null.</param>
    /// <param name="cityId">City to prefer; when null only the any-city passes can match.</param>
    public static Facility? FindBest(IEnumerable<Facility> facilities, string? name, int? cityId)
    {
        if (string.IsNullOrWhiteSpace(name)) return null;

        var target = Normalize(name);
        // A punctuation-only name normalizes to "" and must not "exactly" match another
        // punctuation-only facility name; IsSame already rejects empty names.
        if (target.Length == 0) return null;

        var list = facilities as IList<Facility> ?? facilities.ToList();

        return list.FirstOrDefault(f => cityId.HasValue && f.CityId == cityId && Normalize(f.Name) == target)
            ?? list.FirstOrDefault(f => Normalize(f.Name) == target)
            ?? list.FirstOrDefault(f => cityId.HasValue && f.CityId == cityId && IsSame(f.Name, name))
            ?? list.FirstOrDefault(f => IsSame(f.Name, name));
    }

    /// <summary>
    /// Similarity in [0, 1]: 1 minus the Levenshtein distance divided by the longer length.
    /// Identical strings (including two empty strings) score 1.
    /// </summary>
    private static double Similarity(string a, string b)
    {
        if (a == b) return 1;
        var max = Math.Max(a.Length, b.Length);
        return max == 0 ? 1 : 1.0 - (double)Levenshtein(a, b) / max;
    }

    /// <summary>
    /// Levenshtein edit distance (insert/delete/substitute, each cost 1) over UTF-16 code units,
    /// computed with a single rolling row of length <c>b.Length + 1</c>.
    /// </summary>
    private static int Levenshtein(string a, string b)
    {
        var dp = new int[b.Length + 1];
        for (var j = 0; j <= b.Length; j++) dp[j] = j;

        for (var i = 1; i <= a.Length; i++)
        {
            // prev holds the diagonal cell (row i-1, column j-1) while the row is overwritten in place.
            var prev = dp[0];
            dp[0] = i;
            for (var j = 1; j <= b.Length; j++)
            {
                var tmp = dp[j];
                dp[j] = Math.Min(Math.Min(dp[j] + 1, dp[j - 1] + 1), prev + (a[i - 1] == b[j - 1] ? 0 : 1));
                prev = tmp;
            }
        }

        return dp[b.Length];
    }
}
|