From 1c580e0f7ac22f797084e35c71eaf27cf67eea4d Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sun, 21 Jun 2026 13:29:43 +0330 Subject: [PATCH] Fix role + contact mislabels seen on a live iranestekhdam ad MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (1) Specialist guard: the AI sometimes labels a clearly-specialist ad («پزشک متخصص گوش و حلق و بینی»، «فلوشیپ»، «فوق تخصص») as «پزشک عمومی», so an ENT post published as «استخدام پزشک عمومی». When the primary role is GP but the ad text names a specialist, swap it to «پزشک متخصص» (the subspecialty stays as a tag). (2) Phone type: the landline regex 0\d{2,3} also matched 09xx MOBILE numbers and labeled them «تلفن ثابت». Iranian landline area codes are 0[1-8]xx (021/026/…), never 09 — restrict it so mobiles are no longer mislabeled as landlines. Both apply to new ingests; existing mislabeled rows correct on turnover/reprocess. Co-Authored-By: Claude Opus 4.8 --- src/JobsMedical.Web/Services/ListingParser.cs | 4 +++- .../Services/Scraping/IngestionService.cs | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/JobsMedical.Web/Services/ListingParser.cs b/src/JobsMedical.Web/Services/ListingParser.cs index a13c445..ea067e2 100644 --- a/src/JobsMedical.Web/Services/ListingParser.cs +++ b/src/JobsMedical.Web/Services/ListingParser.cs @@ -361,7 +361,9 @@ public class HeuristicListingParser : IListingParser if (d.Length == 10 && d[0] == '9') d = "0" + d; Add(ContactType.Mobile, d); } - foreach (Match m in Regex.Matches(latin, @"(? c.Name == cityName) ?? cities.FirstOrDefault(c => c.IsActive) ?? cities.First(); var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id); @@ -835,6 +841,17 @@ public class IngestionService /// Greater-Tehran bounding box — rejects out-of-area (hallucinated) AI coordinates. 
private static bool InTehran(double lat, double lng) => lat is >= 35.4 and <= 35.95 && lng is >= 51.0 and <= 51.8;
 
+    // Persian phrases that mark a doctor role as a SPECIALIST rather than a GP. Used to correct a
+    // «پزشک عمومی» mislabel on a clearly-specialist ad (e.g. an ENT post showing as
+    // «استخدام پزشک عمومی»). Each marker is matched as a substring of the ad text after both
+    // sides have gone through NormalizeFa.
+    private static readonly string[] SpecialistMarkers =
+        { "متخصص", "فوق تخصص", "فوقتخصص", "فلوشیپ", "فلوشیب", "بورد تخصصی", "ساب اسپشالیتی" };
+
+    // The Latin abbreviation "ENT" as a stand-alone token (any casing). It is deliberately NOT part of
+    // SpecialistMarkers: a plain substring test for "ent" also fires on "patient", "center",
+    // "dental", "management", ... and would flip an ordinary GP ad to a specialist one.
+    private static readonly System.Text.RegularExpressions.Regex EntToken = new(
+        @"(?<![A-Za-z])ent(?![A-Za-z])",
+        System.Text.RegularExpressions.RegexOptions.IgnoreCase
+        | System.Text.RegularExpressions.RegexOptions.CultureInvariant);
+
+    // Returns true when the ad text names a specialist: it contains one of SpecialistMarkers
+    // (compared after NormalizeFa) or the stand-alone abbreviation "ENT". Null or blank text
+    // never looks specialist.
+    private static bool LooksSpecialist(string? rawText)
+    {
+        // Nothing to inspect. This also keeps null away from NormalizeFa, whose null handling
+        // is not visible from here.
+        if (string.IsNullOrWhiteSpace(rawText))
+        {
+            return false;
+        }
+
+        var normalized = NormalizeFa(rawText);
+        if (SpecialistMarkers.Any(marker => normalized.Contains(NormalizeFa(marker))))
+        {
+            return true;
+        }
+
+        // Checked against the raw text so that letter boundaries and casing are those of the ad
+        // itself, independent of whatever NormalizeFa does to Latin characters and spacing.
+        return EntToken.IsMatch(rawText);
+    }
+
     // Gender/seniority tokens that don't belong in a role name (they go to tags / the Gender field).
     private static readonly string[] RoleModifierWords =
         { "آقا", "خانم", "خانوم", "بانو", "مرد", "زن", "کارآموز", "کارورز", "ارشد", "مبتدی" };