From e6a796ab27b8bf335cf62a5475443cdc45e7f3f9 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Mon, 8 Jun 2026 07:14:48 +0330 Subject: [PATCH] Match crawled listings to existing facilities (fuzzy) before creating new MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When publishing a scraped listing we now look for a facility we already have that is exactly or closely the same, and only create a new one when there is no match — avoiding duplicates like «بیمارستان میلاد» vs «میلاد». - ListingParser: extract a facility name (keyword + distinctive words) from the post and surface it in the parser notes. - FacilityMatcher: Persian-aware normalization (ي/ك, ZWNJ, punctuation), type-word stripping for a "core" name, contains + Levenshtein similarity, and FindBest (same-city exact → any-city exact → same-city fuzzy → fuzzy). - Review (manual publish): auto-select a matching facility or prefill the new-facility name; resolve-or-create uses fuzzy match; dropdown preselects. - IngestionService (auto-publish): reuse FacilityMatcher against a run-wide facility list (grows as new ones are created) instead of exact-name only. Co-Authored-By: Claude Opus 4.8 --- src/JobsMedical.Web/Pages/Admin/Review.cshtml | 4 +- .../Pages/Admin/Review.cshtml.cs | 30 ++++- src/JobsMedical.Web/Services/ListingParser.cs | 47 ++++++++ .../Services/Scraping/FacilityMatcher.cs | 109 ++++++++++++++++++ .../Services/Scraping/IngestionService.cs | 11 +- 5 files changed, 191 insertions(+), 10 deletions(-) create mode 100644 src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs diff --git a/src/JobsMedical.Web/Pages/Admin/Review.cshtml b/src/JobsMedical.Web/Pages/Admin/Review.cshtml index 6ae6115..6824f2a 100644 --- a/src/JobsMedical.Web/Pages/Admin/Review.cshtml +++ b/src/JobsMedical.Web/Pages/Admin/Review.cshtml @@ -51,10 +51,10 @@
diff --git a/src/JobsMedical.Web/Pages/Admin/Review.cshtml.cs b/src/JobsMedical.Web/Pages/Admin/Review.cshtml.cs index 1832c05..0923a1b 100644 --- a/src/JobsMedical.Web/Pages/Admin/Review.cshtml.cs +++ b/src/JobsMedical.Web/Pages/Admin/Review.cshtml.cs @@ -1,6 +1,7 @@ using JobsMedical.Web.Data; using JobsMedical.Web.Models; using JobsMedical.Web.Services; +using JobsMedical.Web.Services.Scraping; using Microsoft.AspNetCore.Authorization; using Microsoft.AspNetCore.Mvc; using Microsoft.AspNetCore.Mvc.RazorPages; @@ -72,6 +73,25 @@ public class ReviewModel : PageModel if (Parsed.PayAmount is not null) { PayAmount = Parsed.PayAmount; SalaryMin = Parsed.PayAmount; } Description = Raw.RawText; Title = Parsed.RoleName is not null ? $"استخدام {Parsed.RoleName}" : "موقعیت استخدامی"; + + // Facility: try to match the listing's facility to one we already have; otherwise + // prefill the "new facility" box so publishing creates it. + if (!string.IsNullOrWhiteSpace(Parsed.FacilityName)) + { + var cityId = await _db.Cities.Where(c => c.Name == Parsed.CityName) + .Select(c => (int?)c.Id).FirstOrDefaultAsync(); + var match = FacilityMatcher.FindBest(Facilities, Parsed.FacilityName, cityId); + if (match is not null) + { + FacilityId = match.Id; + Parsed.Notes.Add($"مرکز منطبق در سیستم: «{match.Name}» — همین انتخاب شد."); + } + else + { + NewFacilityName = Parsed.FacilityName; + Parsed.Notes.Add($"مرکز جدید پیشنهادی: «{Parsed.FacilityName}» — هنگام انتشار ساخته می‌شود."); + } + } return Page(); } @@ -181,15 +201,17 @@ public class ReviewModel : PageModel if (string.IsNullOrWhiteSpace(NewFacilityName)) return null; - // Reuse a same-named facility if one already exists, else create it. 
var name = NewFacilityName.Trim(); - var existing = await _db.Facilities.FirstOrDefaultAsync(f => f.Name == name); - if (existing is not null) return existing.Id; - var cityId = await _db.Cities.OrderByDescending(c => c.IsActive) .Select(c => (int?)c.Id).FirstOrDefaultAsync(); if (cityId is null) return null; // no cities seeded — cannot create a facility + // Reuse an existing facility that's exactly or closely the same (Persian-aware fuzzy + // match), so we don't create duplicates like «بیمارستان میلاد» vs «میلاد». + var all = await _db.Facilities.ToListAsync(); + var match = FacilityMatcher.FindBest(all, name, cityId); + if (match is not null) return match.Id; + var facility = new Facility { Name = name, diff --git a/src/JobsMedical.Web/Services/ListingParser.cs b/src/JobsMedical.Web/Services/ListingParser.cs index 03a90cc..1ffc506 100644 --- a/src/JobsMedical.Web/Services/ListingParser.cs +++ b/src/JobsMedical.Web/Services/ListingParser.cs @@ -16,6 +16,7 @@ public class ParsedListing public Gender Gender { get; set; } = Gender.Any; // جنسیت مورد نیاز public string? CityName { get; set; } public string? DistrictName { get; set; } + public string? FacilityName { get; set; } // hospital/clinic name guessed from the text public string? Phone { get; set; } public List Notes { get; set; } = new(); // what was/wasn't detected (shown to admin) } @@ -107,6 +108,10 @@ public class HeuristicListingParser : IListingParser else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد"); } + // --- Facility name (بیمارستان/درمانگاه/کلینیک ... + the distinctive name) --- + p.FacilityName = ExtractFacilityName(text); + if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}"); + // --- Phone --- var phone = Regex.Match(ToLatinDigits(text), @"0?9\d{9}"); if (phone.Success) p.Phone = phone.Value; @@ -114,6 +119,48 @@ public class HeuristicListingParser : IListingParser return p; } + // Words that introduce a facility name, longest/most-specific first. 
// Facility-introducing keywords, tried in array order: a multi-word phrase must stay ahead of
// the shorter keyword it contains (e.g. «مرکز درمانی» before «مرکز»).
private static readonly string[] FacilityKeywords =
{
    "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
    "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
    "آزمایشگاه", "مطب", "خانه سالمندان", "سرای سالمندان",
};

// Words that clearly aren't part of a facility's name — stop collecting here.
private static readonly string[] NameStops =
{
    "جهت", "برای", "به", "با", "در", "از", "که", "نیاز", "نیازمند", "استخدام", "جذب",
    "دعوت", "همکاری", "واقع", "آدرس", "تلفن", "شماره", "شیفت", "ساعت", "حقوق", "روز",
    "شب", "صبح", "عصر", "می", "ها", "این", "یک", "محترم",
};

// Characters that split the text following a keyword into candidate name words.
private static readonly char[] NameSeparators =
{
    ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/', '«', '»', '"',
};

/// <summary>
/// Best-effort hospital/clinic name: a facility keyword plus up to three name words.
/// </summary>
/// <param name="text">Free text of the listing.</param>
/// <returns>
/// "<c>keyword name…</c>" for the first keyword (in <see cref="FacilityKeywords"/> order) that
/// is followed by at least one usable name word; <c>null</c> when no keyword qualifies.
/// </returns>
private static string? ExtractFacilityName(string text)
{
    if (string.IsNullOrWhiteSpace(text)) return null;

    foreach (var kw in FacilityKeywords)
    {
        // Try every occurrence: a bare first mention («بیمارستان جهت …») must not hide a
        // later, named one («بیمارستان میلاد»).
        for (var idx = text.IndexOf(kw, StringComparison.Ordinal);
             idx >= 0;
             idx = text.IndexOf(kw, idx + kw.Length, StringComparison.Ordinal))
        {
            var end = idx + kw.Length;

            // The keyword has to be a whole word. Without this, an inflected form such as
            // «بیمارستان‌های» (keyword + ZWNJ + plural suffix) leaks its suffix into the name.
            if (!IsWordBoundary(text, idx - 1) || !IsWordBoundary(text, end)) continue;

            var picked = new List<string>();
            foreach (var w in text[end..].Split(NameSeparators, StringSplitOptions.RemoveEmptyEntries))
            {
                if (NameStops.Contains(w)) break;
                if (w.Any(char.IsDigit)) break;   // numbers/phones aren't names
                if (w.Length == 1) break;         // stray letters
                picked.Add(w);
                if (picked.Count >= 3) break;
            }

            // A bare keyword (e.g. just «بیمارستان») isn't useful — keep looking.
            if (picked.Count > 0) return kw + " " + string.Join(" ", picked);
        }
    }
    return null;
}

/// <summary>
/// True when <paramref name="index"/> lies outside <paramref name="text"/> or points at a
/// character that cannot continue a word (anything but a letter, a digit or ZWNJ).
/// </summary>
private static bool IsWordBoundary(string text, int index)
{
    if (index < 0 || index >= text.Length) return true;
    var ch = text[index];
    return !char.IsLetterOrDigit(ch) && ch != '\u200C';
}
using System.Text;
using System.Text.RegularExpressions;
using JobsMedical.Web.Models;

namespace JobsMedical.Web.Services.Scraping;

/// <summary>
/// Persian-aware fuzzy matching for facility names, so the same hospital written slightly
/// differently — spacing, ي/ك vs ی/ک, ZWNJ, with or without «بیمارستان» — resolves to one
/// record instead of creating a duplicate. Used by both the manual review/publish flow and
/// the auto-publish ingestion pipeline.
/// </summary>
public static class FacilityMatcher
{
    // Minimum edit-distance similarity (0..1) for two names to count as the same facility.
    private const double MinSimilarity = 0.86;

    // Generic type words stripped to compare the distinctive core of a name.
    // Array order is irrelevant: the pattern below sorts them longest-first.
    private static readonly string[] TypeWords =
    {
        "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
        "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
        "آزمایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
    };

    // Declared before TypeWordPattern: static initializers run in textual order and
    // BuildTypeWordPattern() calls Normalize(), which uses this regex.
    private static readonly Regex WhitespaceRun = new(@"\s+", RegexOptions.CultureInvariant);

    // One whole-word alternation over all normalized type words, longest first.
    private static readonly Regex TypeWordPattern = BuildTypeWordPattern();

    /// <summary>Lower-cased, Arabic→Persian folded, punctuation-stripped, whitespace-collapsed.</summary>
    /// <param name="s">Raw name; may be null or blank.</param>
    /// <returns>The normalized text, or an empty string for null/blank input.</returns>
    public static string Normalize(string? s)
    {
        if (string.IsNullOrWhiteSpace(s)) return "";
        var folded = s.Replace('ي', 'ی').Replace('ك', 'ک').Replace('ۀ', 'ه').Replace('ة', 'ه')
                      .Replace('أ', 'ا').Replace('إ', 'ا').Replace('آ', 'ا').Replace('ئ', 'ی')
                      .Replace('\u200C', ' ') // ZWNJ behaves like a space for matching
                      .ToLowerInvariant();
        var sb = new StringBuilder(folded.Length);
        foreach (var ch in folded)
            sb.Append(char.IsLetterOrDigit(ch) || ch == ' ' ? ch : ' ');
        return WhitespaceRun.Replace(sb.ToString(), " ").Trim();
    }

    /// <summary>Normalized name with generic type words removed — the distinctive part.</summary>
    /// <param name="s">Raw name; may be null or blank.</param>
    /// <returns>
    /// The remaining words, or an empty string when the input is blank or consists only of
    /// type words.
    /// </returns>
    public static string Core(string? s)
    {
        var n = Normalize(s);
        if (n.Length == 0) return "";
        // Single pass with longest-first alternatives, so «فوق تخصصی» / «دی کلینیک» are removed
        // as a unit instead of leaving «فوق» / «دی» behind after their shorter sub-word is
        // stripped first.
        return WhitespaceRun.Replace(TypeWordPattern.Replace(n, " "), " ").Trim();
    }

    /// <summary>True when two names almost certainly denote the same facility.</summary>
    /// <param name="a">First name.</param>
    /// <param name="b">Second name.</param>
    /// <returns>
    /// <c>true</c> for equal normalized names, equal cores, one core containing the other as
    /// whole words, or edit-distance similarity of at least 0.86; <c>false</c> otherwise or when
    /// either name normalizes to empty.
    /// </returns>
    public static bool IsSame(string? a, string? b)
    {
        var na = Normalize(a);
        var nb = Normalize(b);
        if (na.Length == 0 || nb.Length == 0) return false;
        if (na == nb) return true;

        var ca = Core(a);
        var cb = Core(b);
        if (ca.Length >= 2 && ca == cb) return true;

        // One core contains the other (e.g. «میلاد» vs «میلاد ۱»). Containment is checked on
        // whole words: a raw substring test would equate «رضا» with «رضایی».
        var bothInformative = ca.Length >= 3 && cb.Length >= 3;
        if (bothInformative && (ContainsWords(ca, cb) || ContainsWords(cb, ca))) return true;

        // Edit-distance similarity on the most informative basis.
        var (x, y) = bothInformative ? (ca, cb) : (na, nb);
        return Similarity(x, y) >= MinSimilarity;
    }

    /// <summary>
    /// Best existing facility for <paramref name="name"/>: same-city exact match first, then
    /// any-city exact, then same-city fuzzy, then any-city fuzzy. Null when nothing matches.
    /// </summary>
    /// <param name="facilities">Candidates; enumerated once if not already a list.</param>
    /// <param name="name">Name to look for.</param>
    /// <param name="cityId">Preferred city, or null for no city preference.</param>
    public static Facility? FindBest(IEnumerable<Facility> facilities, string? name, int? cityId)
    {
        if (string.IsNullOrWhiteSpace(name)) return null;
        var target = Normalize(name);
        // A punctuation-only name normalizes to "" and would otherwise "exactly" match any
        // facility whose own name normalizes to "".
        if (target.Length == 0) return null;

        var list = facilities as IList<Facility> ?? facilities.ToList();

        bool InCity(Facility f) => cityId.HasValue && f.CityId == cityId;
        bool Exact(Facility f) => Normalize(f.Name) == target;
        bool Fuzzy(Facility f) => IsSame(f.Name, name);

        return list.FirstOrDefault(f => InCity(f) && Exact(f))
            ?? list.FirstOrDefault(Exact)
            ?? list.FirstOrDefault(f => InCity(f) && Fuzzy(f))
            ?? list.FirstOrDefault(Fuzzy);
    }

    // Builds "(?<!\S)(?:w1|w2|…)(?!\S)" from the normalized type words, longest first, so a
    // phrase always wins over a shorter type word it contains.
    private static Regex BuildTypeWordPattern()
    {
        var alternatives = TypeWords
            .Select(w => Normalize(w))
            .Where(w => w.Length > 0)
            .Distinct()
            .OrderByDescending(w => w.Length)
            .Select(w => Regex.Escape(w));
        var body = string.Join("|", alternatives);
        return new Regex(@"(?<!\S)(?:" + body + @")(?!\S)",
            RegexOptions.CultureInvariant | RegexOptions.Compiled);
    }

    // True when `needle` occurs in `haystack` aligned to word boundaries. Both arguments are
    // already normalized (single spaces, trimmed).
    private static bool ContainsWords(string haystack, string needle) =>
        (" " + haystack + " ").Contains(" " + needle + " ", StringComparison.Ordinal);

    // 1.0 for identical strings, falling towards 0 as the edit distance approaches the
    // length of the longer string.
    private static double Similarity(string a, string b)
    {
        if (a == b) return 1;
        var max = Math.Max(a.Length, b.Length);
        return max == 0 ? 1 : 1.0 - (double)Levenshtein(a, b) / max;
    }

    // Classic Levenshtein distance using a single rolling row of length b.Length + 1.
    private static int Levenshtein(string a, string b)
    {
        var dp = new int[b.Length + 1];
        for (var j = 0; j <= b.Length; j++) dp[j] = j;
        for (var i = 1; i <= a.Length; i++)
        {
            var prev = dp[0]; // value of the diagonal cell from the previous row
            dp[0] = i;
            for (var j = 1; j <= b.Length; j++)
            {
                var tmp = dp[j];
                dp[j] = Math.Min(Math.Min(dp[j] + 1, dp[j - 1] + 1), prev + (a[i - 1] == b[j - 1] ? 0 : 1));
                prev = tmp;
            }
        }
        return dp[b.Length];
    }
}
parsed.RoleName; @@ -170,9 +171,10 @@ public class IngestionService var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id); var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim() + : !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim() : $"مرکز درمانی (از {raw.SourceChannel})"; - var facility = _db.Facilities.Local.FirstOrDefault(f => f.Name == facilityName && f.CityId == city.Id) - ?? _db.Facilities.FirstOrDefault(f => f.Name == facilityName && f.CityId == city.Id); + // Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one. + var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id); if (facility is null) { facility = new Facility @@ -181,6 +183,7 @@ public class IngestionService Phone = parsed.Phone, IsVerified = false, }; _db.Facilities.Add(facility); + facilities.Add(facility); // so later listings in this run match it too } var kind = (d?.Kind ?? parsed.Kind.ToString()).ToLowerInvariant();