From 3e65c887659496376fb317f6a86cab60d78b9d86 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sun, 21 Jun 2026 14:00:00 +0330 Subject: [PATCH] Strip generic facility descriptors so distinctive names don't false-merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FacilityMatcher treated «شبانه روزی»/«خیریه»/«دولتی»/«خصوصی» as part of a name, so a real facility merged into a generic one when they shared a descriptor — «درمانگاه شبانه‌روزی اسفند» collapsed into the existing «پلی کلینیک شبانه روزی», losing «اسفند». Add these descriptors (along with «۲۴ ساعته»/«24 ساعته» and «تامین اجتماعی») to the stripped type-words so matching compares the distinctive core («اسفند») instead. Side benefit: bare descriptor-only names («پلی کلینیک شبانه روزی») now resolve to junk and get folded into the placeholder by the cleanup, rather than masquerading as a real facility. Co-Authored-By: Claude Opus 4.8 --- src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs b/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs index 4eca8cd..30b6ea0 100644 --- a/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs +++ b/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs @@ -18,6 +18,9 @@ public static class FacilityMatcher "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک", "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع", "آزمایشگاه", "داروخانه", "آسایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک", + // Generic descriptors — never the distinctive part of a name. Stripping them stops false + // merges like «درمانگاه شبانه‌روزی اسفند» → «پلی کلینیک شبانه روزی» (they share «شبانه روزی»). + "شبانه روزی", "شبانه‌روزی", "خیریه", "دولتی", "خصوصی", "۲۴ ساعته", "24 ساعته", "تامین اجتماعی", }; /// Lower-cased, Arabic→Persian folded, punctuation-stripped, whitespace-collapsed.