iranestekhdam: restrict to Tehran for launch
CI/CD / CI · dotnet build (push) Successful in 2m0s
CI/CD / Deploy · hamkadr (push) Successful in 1m7s

Keep only ads located in Tehran: first pre-drop slugs that name another major city (to save
fetches), then authoritatively keep only ads whose text states «تهران» (the og:description
reliably says «شهر تهران»). Pool up to 5x `max` candidates so the Tehran filter can still yield
a full batch. Validated against live data: ~16 of 18 clinical candidates are in Tehran. Once the
engine is proven, nationwide expansion becomes a per-source city setting.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 09:56:25 +03:30
parent f118db55ef
commit 7740d9f8d7
@@ -38,6 +38,18 @@ public class IranEstekhdamListingSource : IListingSource
// Slugs that share a substring with a clinical role but are NOT کادر درمان — drop them. // Slugs that share a substring with a clinical role but are NOT کادر درمان — drop them.
private static readonly string[] ExcludeSlugs = { "دامپزشک", "دام-پزشک", "دامپزشکی" }; private static readonly string[] ExcludeSlugs = { "دامپزشک", "دام-پزشک", "دامپزشکی" };
// LAUNCH = TEHRAN ONLY. We keep only ads located in Tehran (the ad's og:description reliably
// states «شهر تهران»). Other major cities named in the slug are pre-dropped to save fetches.
// When the engine is proven and we expand nationwide, make this a per-source city setting.

// City name that a fetched ad's extracted text must contain for the ad to be kept.
private const string Tehran = "تهران";

// City names that IsOtherCitySlug looks for in the percent-decoded ad URL. A hit means the
// URL is left out of the candidate pool, so the ad page is never downloaded.
// NOTE(review): multi-word names are written with '-' (e.g. "خرم-آباد"), presumably to mirror
// the site's hyphen-separated slugs — confirm against live URLs before adding entries.
private static readonly string[] OtherCitySlugs =
{
"شیراز", "اصفهان", "مشهد", "تبریز", "کرج", "اهواز", "قم", "یزد", "رشت", "کرمان", "اراک",
"اردبیل", "همدان", "کرمانشاه", "زنجان", "قزوین", "ساری", "گرگان", "بندرعباس", "بوشهر",
"سنندج", "خرم-آباد", "بیرجند", "سمنان", "شهرکرد", "ایلام", "یاسوج", "زاهدان", "ارومیه",
"نجف-آباد", "کاشان", "قائم-شهر", "بابل", "آمل", "دزفول", "ملارد", "پاکدشت",
};
public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default) public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
{ {
if (!s.IranEstekhdamEnabled) return Array.Empty<ScrapedItem>(); if (!s.IranEstekhdamEnabled) return Array.Empty<ScrapedItem>();
@@ -51,34 +63,40 @@ public class IranEstekhdamListingSource : IListingSource
var monthly = Locs(index).Where(u => u.Contains("sitemap-ads-")).ToList(); var monthly = Locs(index).Where(u => u.Contains("sitemap-ads-")).ToList();
if (monthly.Count == 0) { _log.LogWarning("iranestekhdam: no monthly ad sitemaps found"); return Array.Empty<ScrapedItem>(); } if (monthly.Count == 0) { _log.LogWarning("iranestekhdam: no monthly ad sitemaps found"); return Array.Empty<ScrapedItem>(); }
// 2. collect ad URLs, keeping only clinical-role slugs. Pull from successive monthly // 2. pool clinical-role candidate URLs, pre-dropping obvious non-Tehran slugs. We gather
// sitemaps until we have enough candidates (or run out). // more than `max` because the authoritative Tehran check (on the ad text) trims further.
var picked = new List<string>(); var pool = new List<string>();
var budget = max * 5;
foreach (var sm in monthly) foreach (var sm in monthly)
{ {
if (picked.Count >= max) break; if (pool.Count >= budget) break;
try try
{ {
var clinical = Locs(await client.GetStringAsync(sm, ct)).Where(IsClinicalSlug); foreach (var u in Locs(await client.GetStringAsync(sm, ct)))
foreach (var u in clinical) { if (!picked.Contains(u)) picked.Add(u); if (picked.Count >= max) break; } {
if (IsClinicalSlug(u) && !IsOtherCitySlug(u) && !pool.Contains(u)) pool.Add(u);
if (pool.Count >= budget) break;
}
} }
catch (Exception ex) { _log.LogWarning(ex, "iranestekhdam: sitemap {Sm} failed", sm); } catch (Exception ex) { _log.LogWarning(ex, "iranestekhdam: sitemap {Sm} failed", sm); }
} }
// 3. fetch each ad page → title + description (+ phone if present in the body) // 3. fetch each ad → keep only Tehran ones (text must name «تهران»), up to `max`.
var items = new List<ScrapedItem>(); var items = new List<ScrapedItem>();
foreach (var url in picked) foreach (var url in pool)
{ {
if (items.Count >= max) break;
ct.ThrowIfCancellationRequested(); ct.ThrowIfCancellationRequested();
try try
{ {
var html = await client.GetStringAsync(url, ct); var html = await client.GetStringAsync(url, ct);
var text = ExtractAd(html); var text = ExtractAd(html);
if (text.Length >= 25) items.Add(new ScrapedItem("ایران‌استخدام", text, url)); if (text.Length < 25 || !text.Contains(Tehran)) continue; // Tehran-only launch filter
items.Add(new ScrapedItem("ایران‌استخدام", text, url));
} }
catch (Exception ex) { _log.LogWarning(ex, "iranestekhdam: ad {Url} failed", url); } catch (Exception ex) { _log.LogWarning(ex, "iranestekhdam: ad {Url} failed", url); }
} }
_log.LogInformation("iranestekhdam: fetched {Count} clinical ads", items.Count); _log.LogInformation("iranestekhdam: fetched {Count} Tehran clinical ads (from {Pool} pooled)", items.Count, pool.Count);
return items; return items;
} }
catch (Exception ex) catch (Exception ex)
@@ -95,6 +113,12 @@ public class IranEstekhdamListingSource : IListingSource
return RoleSlugs.Any(slug.Contains); return RoleSlugs.Any(slug.Contains);
} }
/// <summary>
/// Cheap pre-filter that reports whether an ad URL's slug names a major city other than
/// Tehran, so the ad page does not have to be fetched at all.
/// </summary>
/// <param name="url">Ad URL as it appears in the sitemap; may be percent-encoded.</param>
/// <returns>
/// <c>true</c> when the decoded slug contains an <c>OtherCitySlugs</c> entry as a whole word
/// and does not also mention Tehran; otherwise <c>false</c>.
/// </returns>
private static bool IsOtherCitySlug(string url)
{
    var slug = Uri.UnescapeDataString(url);

    // A slug that also names Tehran (e.g. a multi-city posting) must reach the authoritative
    // text check instead of being discarded here; this pre-filter is only a fetch saver.
    if (slug.Contains(Tehran, StringComparison.Ordinal)) return false;

    return OtherCitySlugs.Any(city => ContainsSlugWord(slug, city));
}

/// <summary>
/// Ordinal search for <paramref name="word"/> inside <paramref name="text"/> that only accepts
/// occurrences not glued to a neighbouring letter. A plain substring test is too eager for
/// short city names: «رشت» would match inside «رشته» and «قم» inside «رقم».
/// </summary>
private static bool ContainsSlugWord(string text, string word)
{
    var at = text.IndexOf(word, StringComparison.Ordinal);
    while (at >= 0)
    {
        var end = at + word.Length;
        var cleanStart = at == 0 || !char.IsLetter(text[at - 1]);
        var cleanEnd = end == text.Length || !char.IsLetter(text[end]);
        if (cleanStart && cleanEnd) return true;

        // Keep scanning: a later occurrence may sit on proper word boundaries.
        at = text.IndexOf(word, at + 1, StringComparison.Ordinal);
    }
    return false;
}
/// <summary>
/// Lazily yields the content of every <c>&lt;loc&gt;</c> element found in a sitemap document.
/// </summary>
/// <param name="xml">Raw sitemap (index or URL set) XML text.</param>
/// <returns>
/// Each location with XML entities decoded and surrounding whitespace trimmed, in document
/// order. Empty when no <c>&lt;loc&gt;</c> element is present.
/// </returns>
private static IEnumerable<string> Locs(string xml)
    => Regex.Matches(xml, "<loc>([^<]+)</loc>")
        // Sitemap values are entity-escaped (e.g. "&amp;"); decode so the URL can be requested as-is.
        .Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim());