From 7740d9f8d71ac9babd5d7e4688b40a634a4860f8 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sun, 21 Jun 2026 09:56:25 +0330 Subject: [PATCH] iranestekhdam: restrict to Tehran for launch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep only ads located in Tehran: pre-drop slugs naming other major cities to save fetches, then authoritatively keep ads whose text states «تهران» (the og:description reliably says «شهر تهران»). Pool 5x candidates so the Tehran filter still yields a full batch. Validated against live data: ~16/18 clinical candidates are Tehran. Nationwide expansion later becomes a per-source city setting once the engine is proven. Co-Authored-By: Claude Opus 4.8 --- .../Scraping/IranEstekhdamListingSource.cs | 44 ++++++++++++++----- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/src/JobsMedical.Web/Services/Scraping/IranEstekhdamListingSource.cs b/src/JobsMedical.Web/Services/Scraping/IranEstekhdamListingSource.cs index 1d953ca..0f570f7 100644 --- a/src/JobsMedical.Web/Services/Scraping/IranEstekhdamListingSource.cs +++ b/src/JobsMedical.Web/Services/Scraping/IranEstekhdamListingSource.cs @@ -38,6 +38,18 @@ public class IranEstekhdamListingSource : IListingSource // Slugs that share a substring with a clinical role but are NOT کادر درمان — drop them. private static readonly string[] ExcludeSlugs = { "دامپزشک", "دام-پزشک", "دامپزشکی" }; + // LAUNCH = TEHRAN ONLY. We keep only ads located in Tehran (the ad's og:description reliably + // states «شهر تهران»). Other major cities named in the slug are pre-dropped to save fetches. + // When the engine is proven and we expand nationwide, make this a per-source city setting. + private const string Tehran = "تهران"; + private static readonly string[] OtherCitySlugs = + { + "شیراز", "اصفهان", "مشهد", "تبریز", "کرج", "اهواز", "قم", "یزد", "رشت", "کرمان", "اراک", + "اردبیل", "همدان", "کرمانشاه", "زنجان", "قزوین", "ساری", "گرگان", "بندرعباس", "بوشهر", + "سنندج", "خرم-آباد", "بیرجند", "سمنان", "شهرکرد", "ایلام", "یاسوج", "زاهدان", "ارومیه", + "نجف-آباد", "کاشان", "قائم-شهر", "بابل", "آمل", "دزفول", "ملارد", "پاکدشت", + }; + public async Task> FetchAsync(AppSetting s, CancellationToken ct = default) { if (!s.IranEstekhdamEnabled) return Array.Empty(); @@ -51,34 +63,40 @@ public class IranEstekhdamListingSource : IListingSource var monthly = Locs(index).Where(u => u.Contains("sitemap-ads-")).ToList(); if (monthly.Count == 0) { _log.LogWarning("iranestekhdam: no monthly ad sitemaps found"); return Array.Empty(); } - // 2. collect ad URLs, keeping only clinical-role slugs. Pull from successive monthly - // sitemaps until we have enough candidates (or run out). - var picked = new List(); + // 2. pool clinical-role candidate URLs, pre-dropping obvious non-Tehran slugs. We gather + // more than `max` because the authoritative Tehran check (on the ad text) trims further. + var pool = new List(); + var budget = max * 5; foreach (var sm in monthly) { - if (picked.Count >= max) break; + if (pool.Count >= budget) break; try { - var clinical = Locs(await client.GetStringAsync(sm, ct)).Where(IsClinicalSlug); - foreach (var u in clinical) { if (!picked.Contains(u)) picked.Add(u); if (picked.Count >= max) break; } + foreach (var u in Locs(await client.GetStringAsync(sm, ct))) + { + if (IsClinicalSlug(u) && !IsOtherCitySlug(u) && !pool.Contains(u)) pool.Add(u); + if (pool.Count >= budget) break; + } } catch (Exception ex) { _log.LogWarning(ex, "iranestekhdam: sitemap {Sm} failed", sm); } } - // 3. fetch each ad page → title + description (+ phone if present in the body) + // 3. fetch each ad → keep only Tehran ones (text must name «تهران»), up to `max`. var items = new List(); - foreach (var url in picked) + foreach (var url in pool) { + if (items.Count >= max) break; ct.ThrowIfCancellationRequested(); try { var html = await client.GetStringAsync(url, ct); var text = ExtractAd(html); - if (text.Length >= 25) items.Add(new ScrapedItem("ایران‌استخدام", text, url)); + if (text.Length < 25 || !text.Contains(Tehran)) continue; // Tehran-only launch filter + items.Add(new ScrapedItem("ایران‌استخدام", text, url)); } catch (Exception ex) { _log.LogWarning(ex, "iranestekhdam: ad {Url} failed", url); } } - _log.LogInformation("iranestekhdam: fetched {Count} clinical ads", items.Count); + _log.LogInformation("iranestekhdam: fetched {Count} Tehran clinical ads (from {Pool} pooled)", items.Count, pool.Count); return items; } catch (Exception ex) @@ -95,6 +113,12 @@ public class IranEstekhdamListingSource : IListingSource return RoleSlugs.Any(slug.Contains); } + private static bool IsOtherCitySlug(string url) + { + var slug = Uri.UnescapeDataString(url); + return OtherCitySlugs.Any(slug.Contains); + } + private static IEnumerable Locs(string xml) => Regex.Matches(xml, "([^<]+)").Select(m => m.Groups[1].Value.Trim());