From 88eca92333d58b4769b04871d64842e4e3b57185 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sun, 21 Jun 2026 05:40:29 +0330 Subject: [PATCH] Facility data hygiene: merge duplicates, drop junk-named facilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleans up the crawl-generated facility table that surfaced garbage on /Facilities («بیمارستان هستم», «... از مدجابز», bare «کلینیک», «سازمان برنامه جنوبی» x3): - FacilityMatcher.IsJunkName: shared detector for non-names — bare type words, cores made only of filler/verb tokens, and leaked crawl-source/placeholder text. Added داروخانه/آسایشگاه to the generic type words so bare ones are caught and dedupe better. - HeuristicListingParser.ExtractFacilityName now rejects junk candidates (and emoji), so new ingests fall back to the shared placeholder instead of forging a fake facility. - IngestionService.MergeAndCleanFacilitiesAsync (+ admin button): folds junk facilities into the placeholder and merges Persian-fuzzy duplicates into one keeper, repointing their shifts/jobs first. Hard guard: only purely crawl-generated, unmanaged facilities are removed — employer-owned and verified facilities are never touched. Co-Authored-By: Claude Opus 4.8 --- src/JobsMedical.Web/Pages/Admin/Index.cshtml | 9 +++ .../Pages/Admin/Index.cshtml.cs | 12 +++ src/JobsMedical.Web/Services/ListingParser.cs | 7 +- .../Services/Scraping/FacilityMatcher.cs | 38 +++++++++- .../Services/Scraping/IngestionService.cs | 73 +++++++++++++++++++ 5 files changed, 137 insertions(+), 2 deletions(-) diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml b/src/JobsMedical.Web/Pages/Admin/Index.cshtml index c52986e..68c3c19 100644 --- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml +++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml @@ -76,6 +76,15 @@ فقط آگهی‌هایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده می‌شوند (نه صرفاً ناقص) و استخدام‌های تکراری بایگانی می‌شوند (وضعیت «بایگانی»، نه حذف). 
آگهی‌های معتبر دست‌نخورده‌اند، پس آدرسِ ایندکس‌شده‌شان تغییر نمی‌کند؛ صفحهٔ موارد بایگانی‌شده ۴۱۰ Gone می‌دهد تا گوگل تمیز حذفشان کند.

+
+ +
+

+ مراکز تکراری (با تطبیقِ فارسی) در یک رکورد ادغام و مراکزِ بدونِ نامِ واقعی به «نامشخص» منتقل می‌شوند. آگهی‌ها حفظ می‌شوند؛ فقط مراکزِ جمع‌آوری‌شده و مدیریت‌نشده پاک می‌شوند. +

+

افزودن دستی

diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs
index 35c8c8f..a099017 100644
--- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs
+++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs
@@ -145,6 +145,18 @@ public class IndexModel : PageModel
         return RedirectToPage();
     }
 
+    /// <summary>
+    /// Clean up the crawl-generated facility table: merge Persian-fuzzy duplicate facilities and fold
+    /// junk-named ones («بیمارستان هستم»، «... از مدجابز»، bare «کلینیک») into the shared placeholder,
+    /// repointing their listings first. Employer-owned / verified facilities are never touched.
+    /// </summary>
+    public async Task<IActionResult> OnPostCleanFacilitiesAsync()
+    {
+        var (merged, cleaned) = await _ingest.MergeAndCleanFacilitiesAsync();
+        IngestMessage = $"پاک‌سازی مراکز: {merged} مرکزِ تکراری ادغام و {cleaned} مرکزِ بی‌نام/نامعتبر حذف شد (آگهی‌هایشان به مرکزِ معتبر یا «نامشخص» منتقل شد). مراکز ثبت‌شده توسط کارفرما/تأییدشده دست‌نخورده ماند.";
+        return RedirectToPage();
+    }
+
     private async Task LoadAsync()
     {
         Queue = await _db.RawListings
diff --git a/src/JobsMedical.Web/Services/ListingParser.cs b/src/JobsMedical.Web/Services/ListingParser.cs
index 9854c38..a13c445 100644
--- a/src/JobsMedical.Web/Services/ListingParser.cs
+++ b/src/JobsMedical.Web/Services/ListingParser.cs
@@ -218,12 +218,17 @@ public class HeuristicListingParser : IListingParser
             {
                 if (NameStops.Contains(w)) break;
                 if (Regex.IsMatch(w, @"\d")) break; // numbers/phones aren't names
+                if (!w.Any(char.IsLetter)) break; // emoji / punctuation («📍») isn't a name
                 if (w.Length == 1) break; // stray letters
                 picked.Add(w);
                 if (picked.Count >= 3) break;
             }
             if (picked.Count == 0) continue; // bare keyword (e.g. just «بیمارستان») isn't useful
-            return (kw + " " + string.Join(" ", picked)).Trim();
+            var candidate = (kw + " " + string.Join(" ", picked)).Trim();
+            // Reject names that are only filler/verb/source noise («بیمارستان هستم», «... از مدجابز») —
+            // a real name couldn't be extracted, so fall back to the shared placeholder downstream.
+            if (Scraping.FacilityMatcher.IsJunkName(candidate)) continue;
+            return candidate;
         }
         return null;
     }
diff --git a/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs b/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs
index 5b97f2e..4eca8cd 100644
--- a/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs
+++ b/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs
@@ -17,7 +17,7 @@ public static class FacilityMatcher
     {
         "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
         "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
-        "آزمایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
+        "آزمایشگاه", "داروخانه", "آسایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
     };
 
     /// <summary>Lower-cased, Arabic→Persian folded, punctuation-stripped, whitespace-collapsed.</summary>
@@ -47,6 +47,42 @@ public static class FacilityMatcher
         return Regex.Replace(n, @"\s+", " ").Trim();
     }
 
+    // Filler/verb/locator tokens that are never a real facility name — the parser sweeps these in
+    // when an ad has no named facility («بیمارستان هستم», «مطب نیازمندیم سه», «کلینیک های فقط منطقه»).
+    private static readonly string[] JunkCoreWords =
+    {
+        "هستم", "هستیم", "هستش", "میشوم", "میشم", "بشوم", "میباشد", "باشد", "میباشم",
+        "نیازمندیم", "نیازمند", "نیازمندم", "داریم", "دارم", "میخواهیم", "میخوام",
+        "حتی", "تعدادی", "فقط", "منطقه", "واقع", "های", "مبتدی", "محترم", "خوب",
+        "سه", "دو", "یک", "چند", "این", "آن", "همکار", "نیرو",
+    };
+
+    // Crawl-source names that must never appear as a public facility («مرکز درمانی (از مدجابز)»),
+    // plus the shared placeholder text.
+    private static readonly string[] SourceMarkers =
+    {
+        "مدجابز", "مدجاب", "از تلگرام", "از دیوار", "از بله", "از کانال", "ثبت نشده", "نامشخص",
+    };
+
+    /// <summary>
+    /// True when a name is NOT a usable facility name: a bare type word («بیمارستان»), a name whose
+    /// distinctive core is only filler/verb tokens («بیمارستان هستم» → «هستم»), or a leaked crawl
+    /// source / placeholder («... از مدجابز», «نامشخص»). Such an ad has no real named facility and
+    /// should fall back to the shared placeholder instead of forging a fake one.
+    /// </summary>
+    public static bool IsJunkName(string? name)
+    {
+        var normalized = Normalize(name);
+        if (normalized.Length == 0) return true;
+        if (SourceMarkers.Any(m => normalized.Contains(Normalize(m)))) return true;
+
+        var core = Core(name);
+        if (core.Length == 0) return true; // bare type word only («بیمارستان»، «کلینیک»)
+
+        var tokens = core.Split(' ', StringSplitOptions.RemoveEmptyEntries);
+        return tokens.All(t => t.Length <= 1 || JunkCoreWords.Contains(t));
+    }
+
     /// <summary>True when two names almost certainly denote the same facility.</summary>
     public static bool IsSame(string? a, string? b)
     {
diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
index 21a7742..8483649 100644
--- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
+++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
@@ -418,6 +418,79 @@ public class IngestionService
         return archived;
     }
 
+    /// <summary>
+    /// Clean up the crawl-generated facility table: (1) fold listings of junk-named facilities
+    /// («بیمارستان هستم», «... از مدجابز», bare «کلینیک») into the shared placeholder and delete the
+    /// junk record; (2) merge Persian-fuzzy duplicates («سازمان برنامه جنوبی» ×3) into one keeper,
+    /// repointing their shifts/jobs. HARD GUARD: only ever removes facilities that are purely
+    /// crawl-generated (no owner, not verified, Unverified) and never the placeholder — employer- and
+    /// admin-managed facilities are untouched. Listings are always repointed first, so no ad is lost.
+    /// Returns (merged, cleaned).
+    /// </summary>
+    public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default)
+    {
+        var facilities = await _db.Facilities.ToListAsync(ct);
+        var placeholder = facilities.FirstOrDefault(f => f.Name == UnknownFacilityName);
+
+        var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId)
+            .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
+        var shiftCounts = await _db.Shifts.GroupBy(s => s.FacilityId)
+            .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
+        int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id);
+
+        // Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a
+        // verified facility (those carry real employer data / verification).
+        bool Removable(Facility f) => f.OwnerUserId is null && !f.IsVerified
+            && f.Verification == VerificationStatus.Unverified
+            && (placeholder is null || f.Id != placeholder.Id);
+
+        async Task AbsorbAsync(int fromId, int toId)
+        {
+            await _db.Shifts.Where(s => s.FacilityId == fromId)
+                .ExecuteUpdateAsync(u => u.SetProperty(s => s.FacilityId, toId), ct);
+            await _db.JobOpenings.Where(j => j.FacilityId == fromId)
+                .ExecuteUpdateAsync(u => u.SetProperty(j => j.FacilityId, toId), ct);
+            await _db.Facilities.Where(f => f.Id == fromId).ExecuteDeleteAsync(ct); // cascades stray docs/reviews
+        }
+
+        int merged = 0, cleaned = 0;
+
+        // 1) Junk-named crawl facilities → fold into the shared placeholder.
+        if (placeholder is not null)
+            foreach (var f in facilities.Where(f => Removable(f) && FacilityMatcher.IsJunkName(f.Name)).ToList())
+            {
+                await AbsorbAsync(f.Id, placeholder.Id);
+                cleaned++;
+            }
+
+        // 2) Merge same-city Persian-fuzzy duplicates into the best keeper.
+        var remaining = await _db.Facilities.Where(f => f.Name != UnknownFacilityName).ToListAsync(ct);
+        var done = new HashSet<int>();
+        foreach (var f in remaining)
+        {
+            if (done.Contains(f.Id)) continue;
+            done.Add(f.Id);
+            var cluster = remaining.Where(o => o.Id != f.Id && !done.Contains(o.Id)
+                && o.CityId == f.CityId && FacilityMatcher.IsSame(o.Name, f.Name)).ToList();
+            if (cluster.Count == 0) continue;
+            cluster.Add(f);
+            // keeper: verified > owned > most listings > lowest id (oldest).
+            var keeper = cluster.OrderByDescending(x => x.IsVerified)
+                .ThenByDescending(x => x.OwnerUserId.HasValue)
+                .ThenByDescending(x => Listings(x.Id)).ThenBy(x => x.Id).First();
+            foreach (var dup in cluster.Where(x => x.Id != keeper.Id))
+            {
+                done.Add(dup.Id);
+                if (!Removable(dup)) continue; // never delete an employer/verified facility
+                await AbsorbAsync(dup.Id, keeper.Id);
+                merged++;
+            }
+        }
+
+        _log.LogInformation("Facility cleanup: merged {M} duplicates, removed {C} junk facilities.", merged, cleaned);
+        return (merged, cleaned);
+    }
+
     private static string DigitsOnly(string s) => new(HtmlUtil.ToLatinDigits(s).Where(char.IsDigit).ToArray());
 
     private static (RawListingStatus status, string? reason, int confidence) Decide(