From baa617daa97189346ccbd9bfe5add2023931dd64 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sat, 20 Jun 2026 19:58:06 +0330 Subject: [PATCH] =?UTF-8?q?Strip=20=C2=AB=D8=A2=D9=85=D8=A7=D8=AF=D9=87=20?= =?UTF-8?q?=D8=A8=D9=87=20=DA=A9=D8=A7=D8=B1=C2=BB=20from=20role=20names?= =?UTF-8?q?=20+=20reject=20domestic-helper=20ads?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-check of live applicants found two gaps: - «کمک بهیار آماده به کار» — the availability phrase glued onto the role. StripRoleModifiers now removes «آماده به کار / آماده همکاری / جویای کار / جهت همکاری» phrases before token-stripping, so the role collapses to «کمک بهیار». - «خانم امورسبک منزل» — light-housework domestic helpers (not کادر درمان). Validator now discards ads with «امور منزل / نظافت منزل / خدمتکار / مستخدم …» markers. Both take effect for existing data on the next applicant reprocess. Co-Authored-By: Claude Opus 4.8 --- .../Services/Scraping/IngestionService.cs | 13 ++++++++++--- .../Services/Scraping/ListingValidator.cs | 15 +++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs index d4edf83..732d5d4 100644 --- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs +++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs @@ -610,11 +610,18 @@ public class IngestionService private static readonly string[] RoleModifierWords = { "آقا", "خانم", "خانوم", "بانو", "مرد", "زن", "کارآموز", "کارورز", "ارشد", "مبتدی" }; - /// Remove modifier tokens from a role name, keeping the base profession. Never strips to - /// empty (falls back to the original). + // Availability phrases that the model sometimes glues onto the role («کمک بهیار آماده به کار»); + // removed as whole substrings before token-stripping (so «به»/«کار» tokens stay safe elsewhere). + private static readonly string[] RolePhraseNoise = + { "آماده به کار", "آماده همکاری", "آماده بکار", "آماده به همکاری", "جویای کار", "دنبال کار", "جهت همکاری" }; + + /// Remove availability phrases + gender/seniority tokens from a role name, keeping the + /// base profession. Never strips to empty (falls back to the original). private static string StripRoleModifiers(string name) { - var kept = NormalizeFa(name).Split(' ', StringSplitOptions.RemoveEmptyEntries) + var n = NormalizeFa(name); + foreach (var p in RolePhraseNoise) n = n.Replace(NormalizeFa(p), " "); + var kept = n.Split(' ', StringSplitOptions.RemoveEmptyEntries) .Where(t => !RoleModifierWords.Any(m => NormalizeFa(m) == t)).ToList(); return kept.Count > 0 ? string.Join(" ", kept) : name.Trim(); } diff --git a/src/JobsMedical.Web/Services/Scraping/ListingValidator.cs b/src/JobsMedical.Web/Services/Scraping/ListingValidator.cs index e818e5c..7392d48 100644 --- a/src/JobsMedical.Web/Services/Scraping/ListingValidator.cs +++ b/src/JobsMedical.Web/Services/Scraping/ListingValidator.cs @@ -39,6 +39,14 @@ public class ListingValidator "بوتاکس و فیلر", "مزوتراپی", "فیلر صورت", }; + // Domestic-helper ads (housekeeping/cleaning/servant) — not کادر درمان, even when they also + // mention سالمند/نگهداری. The «امور منزل / نظافت» phrasing is the giveaway. + private static readonly string[] DomesticMarkers = + { + "امور منزل", "امور سبک منزل", "امورسبک منزل", "کارهای منزل", "کار منزل", "نظافت منزل", + "نظافتچی", "خدمتکار", "کارگر منزل", "خدمات منزل", "مستخدم", + }; + // Words that signal a real staffing post (hiring, shift, or availability). private static readonly string[] StaffingIntent = { @@ -67,6 +75,13 @@ public class ListingValidator return new ValidationResult(false, true, 0, issues, looksMedical); // IsSpam → auto-discard } + // Domestic-helper / housekeeping ads — out of scope (not کادر درمان), discard. + if (DomesticMarkers.Any(text.Contains)) + { + issues.Add("آگهی خدماتِ منزل/نظافت است، نه کادر درمان"); + return new ValidationResult(false, true, 0, issues, looksMedical); // IsSpam → auto-discard + } + // «آماده به کار»: a worker offering themselves. No facility/shift-date expected; the role // and a contact number are what matter. if (parsed.Kind == ListingKind.Talent)