iranestekhdam: restrict to Tehran for launch
CI/CD / CI · dotnet build (push) Successful in 2m0s
CI/CD / Deploy · hamkadr (push) Successful in 1m7s

Keep only ads located in Tehran: pre-drop slugs naming other major cities to save fetches,
then authoritatively keep ads whose text states «تهران» (the og:description reliably says
«شهر تهران»). Pool 5x candidates so the Tehran filter still yields a full batch. Validated
against live data: ~16/18 clinical candidates are Tehran. Nationwide expansion later becomes
a per-source city setting once the engine is proven.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 09:56:25 +03:30
parent f118db55ef
commit 7740d9f8d7
@@ -38,6 +38,18 @@ public class IranEstekhdamListingSource : IListingSource
// Slugs that share a substring with a clinical role but are NOT کادر درمان — drop them.
private static readonly string[] ExcludeSlugs = { "دامپزشک", "دام-پزشک", "دامپزشکی" };
// LAUNCH = TEHRAN ONLY. We keep only ads located in Tehran (the ad's og:description reliably
// states «شهر تهران»). Other major cities named in the slug are pre-dropped to save fetches.
// When the engine is proven and we expand nationwide, make this a per-source city setting.
// Marker that the extracted ad text must contain for an ad to be kept by FetchAsync.
private const string Tehran = "تهران";
// City names that make IsOtherCitySlug reject a candidate URL before it is fetched. They are
// compared against the URL-decoded slug; multi-word names are spelled with '-' (presumably
// mirroring how the site hyphenates its slugs — confirm against live URLs).
// NOTE(review): «ملارد» and «پاکدشت» appear to be towns inside Tehran province — presumably
// excluded on purpose because the launch targets the city itself; confirm.
private static readonly string[] OtherCitySlugs =
{
"شیراز", "اصفهان", "مشهد", "تبریز", "کرج", "اهواز", "قم", "یزد", "رشت", "کرمان", "اراک",
"اردبیل", "همدان", "کرمانشاه", "زنجان", "قزوین", "ساری", "گرگان", "بندرعباس", "بوشهر",
"سنندج", "خرم-آباد", "بیرجند", "سمنان", "شهرکرد", "ایلام", "یاسوج", "زاهدان", "ارومیه",
"نجف-آباد", "کاشان", "قائم-شهر", "بابل", "آمل", "دزفول", "ملارد", "پاکدشت",
};
public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
{
if (!s.IranEstekhdamEnabled) return Array.Empty<ScrapedItem>();
@@ -51,34 +63,40 @@ public class IranEstekhdamListingSource : IListingSource
var monthly = Locs(index).Where(u => u.Contains("sitemap-ads-")).ToList();
if (monthly.Count == 0) { _log.LogWarning("iranestekhdam: no monthly ad sitemaps found"); return Array.Empty<ScrapedItem>(); }
// 2. collect ad URLs, keeping only clinical-role slugs. Pull from successive monthly
// sitemaps until we have enough candidates (or run out).
var picked = new List<string>();
// 2. pool clinical-role candidate URLs, pre-dropping obvious non-Tehran slugs. We gather
// more than `max` because the authoritative Tehran check (on the ad text) trims further.
var pool = new List<string>();
var budget = max * 5;
foreach (var sm in monthly)
{
if (picked.Count >= max) break;
if (pool.Count >= budget) break;
try
{
var clinical = Locs(await client.GetStringAsync(sm, ct)).Where(IsClinicalSlug);
foreach (var u in clinical) { if (!picked.Contains(u)) picked.Add(u); if (picked.Count >= max) break; }
foreach (var u in Locs(await client.GetStringAsync(sm, ct)))
{
if (IsClinicalSlug(u) && !IsOtherCitySlug(u) && !pool.Contains(u)) pool.Add(u);
if (pool.Count >= budget) break;
}
}
catch (Exception ex) { _log.LogWarning(ex, "iranestekhdam: sitemap {Sm} failed", sm); }
}
// 3. fetch each ad page → title + description (+ phone if present in the body)
// 3. fetch each ad → keep only Tehran ones (text must name «تهران»), up to `max`.
var items = new List<ScrapedItem>();
foreach (var url in picked)
foreach (var url in pool)
{
if (items.Count >= max) break;
ct.ThrowIfCancellationRequested();
try
{
var html = await client.GetStringAsync(url, ct);
var text = ExtractAd(html);
if (text.Length >= 25) items.Add(new ScrapedItem("ایران‌استخدام", text, url));
if (text.Length < 25 || !text.Contains(Tehran)) continue; // Tehran-only launch filter
items.Add(new ScrapedItem("ایران‌استخدام", text, url));
}
catch (Exception ex) { _log.LogWarning(ex, "iranestekhdam: ad {Url} failed", url); }
}
_log.LogInformation("iranestekhdam: fetched {Count} clinical ads", items.Count);
_log.LogInformation("iranestekhdam: fetched {Count} Tehran clinical ads (from {Pool} pooled)", items.Count, pool.Count);
return items;
}
catch (Exception ex)
@@ -95,6 +113,12 @@ public class IranEstekhdamListingSource : IListingSource
return RoleSlugs.Any(slug.Contains);
}
// Returns true when the URL-decoded ad URL names one of OtherCitySlugs as a whole
// hyphen-delimited word (or word sequence, e.g. «خرم-آباد»), so the ad can be skipped
// without fetching it.
//
// Matching is done on word boundaries rather than raw substrings. A plain substring test
// lets short names fire inside unrelated words (e.g. «رشت» inside «رشته»), and a false
// positive here silently discards a candidate that the authoritative ad-text check never
// gets to see. A false negative only costs one extra fetch, so erring on that side is safe.
//
// url: absolute or relative ad URL, possibly percent-encoded. A null url still throws
// ArgumentNullException from Uri.UnescapeDataString, as before.
private static bool IsOtherCitySlug(string url)
{
    var decoded = Uri.UnescapeDataString(url);

    // Collapse every URL/path delimiter to '-' and pad both ends, so each word of the slug is
    // surrounded by '-' and a city can be located with a single delimiter-anchored search.
    var delimiters = new[] { '-', '/', '_', '.', ':', '?', '#', '&', '=', '+', ' ' };
    var words = decoded.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);
    var padded = "-" + string.Join("-", words) + "-";

    return OtherCitySlugs.Any(city =>
        padded.IndexOf("-" + city + "-", StringComparison.Ordinal) >= 0);
}
// Pulls the text content of every <loc>…</loc> element out of a sitemap document.
// Each value is whitespace-trimmed; the result is a lazily projected sequence in document
// order. The XML is scanned with a regex rather than parsed, so only literal <loc> tags
// whose content contains no '<' are recognised.
private static IEnumerable<string> Locs(string xml)
{
    const string locPattern = "<loc>([^<]+)</loc>";
    MatchCollection hits = Regex.Matches(xml, locPattern);
    return from Match hit in hits
           select hit.Groups[1].Value.Trim();
}