using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;
using JobsMedical.Web.Models;

namespace JobsMedical.Web.Services.Scraping;

/// <summary>
/// Persian-aware fuzzy matching for facility names, so the same hospital written slightly
/// differently — spacing, ي/ك vs ی/ک, ZWNJ, with or without «بیمارستان» — resolves to one
/// record instead of creating a duplicate. Used by both the manual review/publish flow and
/// the auto-publish ingestion pipeline.
/// </summary>
public static class FacilityMatcher
{
    /// <summary>Minimum edit-distance similarity (0..1) for two names to count as the same.</summary>
    private const double FuzzyThreshold = 0.86;

    /// <summary>Minimum core length for the containment and core-based similarity checks.</summary>
    private const int MinInformativeCoreLength = 3;

    private const char ZeroWidthNonJoiner = '\u200C';
    private const char Tatweel = '\u0640';

    // Generic type words stripped to compare the distinctive core of a name.
    private static readonly string[] TypeWords =
    {
        "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
        "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
        "آزمایشگاه", "داروخانه", "آسایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی",
        "عمومی", "دکتر", "دی کلینیک",
        // Generic descriptors — never the distinctive part of a name. Stripping them stops false
        // merges like «درمانگاه شبانه‌روزی اسفند» → «پلی کلینیک شبانه روزی» (they share «شبانه روزی»).
        "شبانه روزی", "شبانه\u200Cروزی", "خیریه", "دولتی", "خصوصی", "۲۴ ساعته", "24 ساعته",
        "تامین اجتماعی",
    };

    // One whole-token pattern per distinct normalized type word, longest first. The ordering
    // matters: stripping «تخصصی» or «کلینیک» before «فوق تخصصی» / «دی کلینیک» would leave the
    // stray tokens «فوق» / «دی» behind in the core. OrderByDescending is stable, so equal-length
    // entries keep their declaration order.
    // Must be declared after TypeWords: static initializers run in textual order.
    private static readonly Regex[] TypeWordPatterns = TypeWords
        .Select(Normalize)
        .Where(w => w.Length > 0)
        .Distinct()
        .OrderByDescending(w => w.Length)
        .Select(w => new Regex($@"(?<![^ ]){Regex.Escape(w)}(?![^ ])", RegexOptions.CultureInvariant))
        .ToArray();

    // NOTE(review): the original declarations of SourceMarkers and JunkCoreWords were lost in the
    // copy of this file that was reviewed. The entries below are ONLY the examples quoted in the
    // IsJunkName documentation; restore the complete lists from version control before merging.
    private static readonly string[] SourceMarkers = { "مدجابز", "نامشخص" };

    // Normalized once; empty results are dropped because Contains("") is always true and would
    // flag every name as junk.
    private static readonly string[] NormalizedSourceMarkers = SourceMarkers
        .Select(Normalize)
        .Where(m => m.Length > 0)
        .Distinct()
        .ToArray();

    // NOTE(review): reconstructed from the documentation example only — see the note above.
    private static readonly HashSet<string> JunkCoreWords =
        new(new[] { "هستم" }.Select(Normalize), StringComparer.Ordinal);

    /// <summary>
    /// Lower-cased, Arabic→Persian folded, punctuation-stripped, whitespace-collapsed.
    /// Combining marks (harakat) and tatweel are removed outright so that a vocalised or
    /// stretched spelling normalizes to the same text as the plain one.
    /// </summary>
    /// <param name="s">Raw name; may be null or blank.</param>
    /// <returns>The normalized name, or an empty string for null/blank input.</returns>
    public static string Normalize(string? s)
    {
        if (string.IsNullOrWhiteSpace(s))
        {
            return "";
        }

        var sb = new StringBuilder(s.Length);
        foreach (var raw in s.ToLowerInvariant())
        {
            var ch = Fold(raw);

            // Marks sit inside a word; turning them into a separator would split the word.
            if (ch == Tatweel || char.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark)
            {
                continue;
            }

            if (char.IsLetterOrDigit(ch))
            {
                sb.Append(ch);
            }
            else if (sb.Length > 0 && sb[sb.Length - 1] != ' ')
            {
                // Any other character is a separator; emit at most one space per run
                // and none at the start.
                sb.Append(' ');
            }
        }

        if (sb.Length > 0 && sb[sb.Length - 1] == ' ')
        {
            sb.Length--;
        }

        return sb.ToString();
    }

    /// <summary>Normalized name with generic type words removed — the distinctive part.</summary>
    /// <param name="s">Raw name; may be null or blank.</param>
    /// <returns>The distinctive core, or an empty string when nothing distinctive remains.</returns>
    public static string Core(string? s) => StripTypeWords(Normalize(s));

    /// <summary>
    /// True when a name is NOT a usable facility name: a bare type word («بیمارستان»), a name whose
    /// distinctive core is only filler/verb tokens («بیمارستان هستم» → «هستم»), or a leaked crawl
    /// source / placeholder («... از مدجابز», «نامشخص»). Such an ad has no real named facility and
    /// should fall back to the shared placeholder instead of forging a fake one.
    /// </summary>
    /// <param name="name">Raw name; may be null or blank (both count as junk).</param>
    public static bool IsJunkName(string? name)
    {
        var normalized = Normalize(name);
        if (normalized.Length == 0)
        {
            return true;
        }

        if (NormalizedSourceMarkers.Any(m => normalized.Contains(m, StringComparison.Ordinal)))
        {
            return true;
        }

        var core = StripTypeWords(normalized);
        if (core.Length == 0)
        {
            return true; // bare type word only («بیمارستان»، «کلینیک»)
        }

        var tokens = core.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        return tokens.All(t => t.Length <= 1 || JunkCoreWords.Contains(t));
    }

    /// <summary>True when two names almost certainly denote the same facility.</summary>
    /// <param name="a">First raw name.</param>
    /// <param name="b">Second raw name.</param>
    /// <returns>False when either name normalizes to empty; otherwise the match verdict.</returns>
    public static bool IsSame(string? a, string? b)
    {
        var na = Normalize(a);
        var nb = Normalize(b);
        if (na.Length == 0 || nb.Length == 0)
        {
            return false;
        }

        if (na == nb)
        {
            return true;
        }

        var ca = StripTypeWords(na);
        var cb = StripTypeWords(nb);
        if (ca.Length >= 2 && ca == cb)
        {
            return true;
        }

        var bothInformative = ca.Length >= MinInformativeCoreLength && cb.Length >= MinInformativeCoreLength;

        // one core fully contains the other (e.g. «میلاد» vs «میلاد ۱»)
        if (bothInformative
            && (ca.Contains(cb, StringComparison.Ordinal) || cb.Contains(ca, StringComparison.Ordinal)))
        {
            return true;
        }

        // edit-distance similarity on the most informative basis
        var (x, y) = bothInformative ? (ca, cb) : (na, nb);
        return Similarity(x, y) >= FuzzyThreshold;
    }

    /// <summary>
    /// Best existing facility for <paramref name="name"/>: same-city exact match first, then
    /// any-city exact, then same-city fuzzy, then any-city fuzzy. Null when nothing matches.
    /// </summary>
    /// <param name="facilities">Candidate facilities; enumerated once, in order.</param>
    /// <param name="name">Raw name to look up; null or blank yields null.</param>
    /// <param name="cityId">Preferred city, or null to ignore the city preference.</param>
    public static Facility? FindBest(IEnumerable<Facility> facilities, string? name, int? cityId)
    {
        if (string.IsNullOrWhiteSpace(name))
        {
            return null;
        }

        var list = facilities as IList<Facility> ?? facilities.ToList();
        var target = Normalize(name);

        // Pass 1 — exact matches. A same-city hit wins immediately; otherwise the first
        // any-city hit in list order is remembered.
        Facility? anyCityExact = null;
        foreach (var f in list)
        {
            if (Normalize(f.Name) != target)
            {
                continue;
            }

            if (cityId.HasValue && f.CityId == cityId)
            {
                return f;
            }

            anyCityExact ??= f;
        }

        if (anyCityExact is not null)
        {
            return anyCityExact;
        }

        // Pass 2 — fuzzy matches, same priority rule. Each facility is compared once.
        Facility? anyCityFuzzy = null;
        foreach (var f in list)
        {
            if (!IsSame(f.Name, name))
            {
                continue;
            }

            if (cityId.HasValue && f.CityId == cityId)
            {
                return f;
            }

            anyCityFuzzy ??= f;
        }

        return anyCityFuzzy;
    }

    /// <summary>Maps Arabic letter variants to their Persian form and ZWNJ to a plain space.</summary>
    private static char Fold(char ch) => ch switch
    {
        '\u064A' or '\u0626' => '\u06CC',             // ي / ئ → ی
        '\u0643' => '\u06A9',                         // ك → ک
        '\u06C0' or '\u0629' => '\u0647',             // ۀ / ة → ه
        '\u0623' or '\u0625' or '\u0622' => '\u0627', // أ / إ / آ → ا
        ZeroWidthNonJoiner => ' ',
        _ => ch,
    };

    /// <summary>Removes every whole-token type word from an already-normalized name.</summary>
    private static string StripTypeWords(string normalized)
    {
        if (normalized.Length == 0)
        {
            return "";
        }

        foreach (var pattern in TypeWordPatterns)
        {
            normalized = pattern.Replace(normalized, " ");
        }

        // Removal leaves runs of spaces behind; collapse and trim them.
        return string.Join(' ', normalized.Split(' ', StringSplitOptions.RemoveEmptyEntries));
    }

    /// <summary>Edit-distance similarity in [0, 1]; 1 means identical (two empty strings included).</summary>
    private static double Similarity(string a, string b)
    {
        if (a == b)
        {
            return 1;
        }

        var max = Math.Max(a.Length, b.Length);
        return max == 0 ? 1 : 1.0 - (double)Levenshtein(a, b) / max;
    }

    /// <summary>Levenshtein distance over UTF-16 code units, using a single rolling row.</summary>
    private static int Levenshtein(string a, string b)
    {
        var row = new int[b.Length + 1];
        for (var j = 0; j <= b.Length; j++)
        {
            row[j] = j;
        }

        for (var i = 1; i <= a.Length; i++)
        {
            var diagonal = row[0]; // value of the previous row at column j - 1
            row[0] = i;
            for (var j = 1; j <= b.Length; j++)
            {
                var above = row[j];
                var substitution = diagonal + (a[i - 1] == b[j - 1] ? 0 : 1);
                row[j] = Math.Min(Math.Min(above + 1, row[j - 1] + 1), substitution);
                diagonal = above;
            }
        }

        return row[b.Length];
    }
}