iranestekhdam: restrict to Tehran for launch
Keep only ads located in Tehran: pre-drop slugs naming other major cities to save fetches, then authoritatively keep ads whose text states «تهران» (the og:description reliably says «شهر تهران»). Pool 5x candidates so the Tehran filter still yields a full batch. Validated against live data: ~16/18 clinical candidates are Tehran. Nationwide expansion later becomes a per-source city setting once the engine is proven. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -38,6 +38,18 @@ public class IranEstekhdamListingSource : IListingSource
|
||||
// Slugs that share a substring with a clinical role but are NOT کادر درمان — drop them.
|
||||
private static readonly string[] ExcludeSlugs = { "دامپزشک", "دام-پزشک", "دامپزشکی" };
|
||||
|
||||
// LAUNCH = TEHRAN ONLY. We keep only ads located in Tehran (the ad's og:description reliably
|
||||
// states «شهر تهران»). Other major cities named in the slug are pre-dropped to save fetches.
|
||||
// When the engine is proven and we expand nationwide, make this a per-source city setting.
|
||||
private const string Tehran = "تهران";
|
||||
private static readonly string[] OtherCitySlugs =
|
||||
{
|
||||
"شیراز", "اصفهان", "مشهد", "تبریز", "کرج", "اهواز", "قم", "یزد", "رشت", "کرمان", "اراک",
|
||||
"اردبیل", "همدان", "کرمانشاه", "زنجان", "قزوین", "ساری", "گرگان", "بندرعباس", "بوشهر",
|
||||
"سنندج", "خرم-آباد", "بیرجند", "سمنان", "شهرکرد", "ایلام", "یاسوج", "زاهدان", "ارومیه",
|
||||
"نجف-آباد", "کاشان", "قائم-شهر", "بابل", "آمل", "دزفول", "ملارد", "پاکدشت",
|
||||
};
|
||||
|
||||
public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
|
||||
{
|
||||
if (!s.IranEstekhdamEnabled) return Array.Empty<ScrapedItem>();
|
||||
@@ -51,34 +63,40 @@ public class IranEstekhdamListingSource : IListingSource
|
||||
var monthly = Locs(index).Where(u => u.Contains("sitemap-ads-")).ToList();
|
||||
if (monthly.Count == 0) { _log.LogWarning("iranestekhdam: no monthly ad sitemaps found"); return Array.Empty<ScrapedItem>(); }
|
||||
|
||||
// 2. collect ad URLs, keeping only clinical-role slugs. Pull from successive monthly
|
||||
// sitemaps until we have enough candidates (or run out).
|
||||
var picked = new List<string>();
|
||||
// 2. pool clinical-role candidate URLs, pre-dropping obvious non-Tehran slugs. We gather
|
||||
// more than `max` because the authoritative Tehran check (on the ad text) trims further.
|
||||
var pool = new List<string>();
|
||||
var budget = max * 5;
|
||||
foreach (var sm in monthly)
|
||||
{
|
||||
if (picked.Count >= max) break;
|
||||
if (pool.Count >= budget) break;
|
||||
try
|
||||
{
|
||||
var clinical = Locs(await client.GetStringAsync(sm, ct)).Where(IsClinicalSlug);
|
||||
foreach (var u in clinical) { if (!picked.Contains(u)) picked.Add(u); if (picked.Count >= max) break; }
|
||||
foreach (var u in Locs(await client.GetStringAsync(sm, ct)))
|
||||
{
|
||||
if (IsClinicalSlug(u) && !IsOtherCitySlug(u) && !pool.Contains(u)) pool.Add(u);
|
||||
if (pool.Count >= budget) break;
|
||||
}
|
||||
}
|
||||
catch (Exception ex) { _log.LogWarning(ex, "iranestekhdam: sitemap {Sm} failed", sm); }
|
||||
}
|
||||
|
||||
// 3. fetch each ad page → title + description (+ phone if present in the body)
|
||||
// 3. fetch each ad → keep only Tehran ones (text must name «تهران»), up to `max`.
|
||||
var items = new List<ScrapedItem>();
|
||||
foreach (var url in picked)
|
||||
foreach (var url in pool)
|
||||
{
|
||||
if (items.Count >= max) break;
|
||||
ct.ThrowIfCancellationRequested();
|
||||
try
|
||||
{
|
||||
var html = await client.GetStringAsync(url, ct);
|
||||
var text = ExtractAd(html);
|
||||
if (text.Length >= 25) items.Add(new ScrapedItem("ایراناستخدام", text, url));
|
||||
if (text.Length < 25 || !text.Contains(Tehran)) continue; // Tehran-only launch filter
|
||||
items.Add(new ScrapedItem("ایراناستخدام", text, url));
|
||||
}
|
||||
catch (Exception ex) { _log.LogWarning(ex, "iranestekhdam: ad {Url} failed", url); }
|
||||
}
|
||||
_log.LogInformation("iranestekhdam: fetched {Count} clinical ads", items.Count);
|
||||
_log.LogInformation("iranestekhdam: fetched {Count} Tehran clinical ads (from {Pool} pooled)", items.Count, pool.Count);
|
||||
return items;
|
||||
}
|
||||
catch (Exception ex)
|
||||
@@ -95,6 +113,12 @@ public class IranEstekhdamListingSource : IListingSource
|
||||
return RoleSlugs.Any(slug.Contains);
|
||||
}
|
||||
|
||||
/// <summary>
/// Cheap pre-filter: tells whether the ad URL's slug names one of <c>OtherCitySlugs</c>.
/// </summary>
/// <param name="url">Ad URL as found in the sitemap (may be percent-encoded).</param>
/// <returns>
/// <c>true</c> when a listed city occurs in the decoded URL as a whole word, i.e. with no letter
/// directly before or after it; otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// Plain substring matching gives false positives on short names: «رشت» is contained in «رشته»
/// (field of study) and «قم» in «رقم». Requiring word boundaries avoids dropping such candidates.
/// </remarks>
private static bool IsOtherCitySlug(string url)
{
    var slug = Uri.UnescapeDataString(url);
    return OtherCitySlugs.Any(city => ContainsWholeWord(slug, city));
}

// True when `word` occurs in `text` without a letter immediately on either side of the match.
// Hyphens, slashes, digits and the string edges all count as boundaries.
private static bool ContainsWholeWord(string text, string word)
{
    if (word.Length == 0) return false;

    for (var at = text.IndexOf(word, StringComparison.Ordinal);
         at >= 0;
         at = text.IndexOf(word, at + 1, StringComparison.Ordinal))
    {
        var end = at + word.Length;
        var letterBefore = at > 0 && char.IsLetter(text[at - 1]);
        var letterAfter = end < text.Length && char.IsLetter(text[end]);
        if (!letterBefore && !letterAfter) return true;
    }

    return false;
}
|
||||
|
||||
/// <summary>Extracts the URL inside every <c>&lt;loc&gt;</c> element of a sitemap document.</summary>
/// <param name="xml">Raw sitemap (or sitemap index) XML text.</param>
/// <returns>The entity-decoded, trimmed URLs in document order, evaluated lazily.</returns>
private static IEnumerable<string> Locs(string xml)
    => Regex.Matches(xml, "<loc>([^<]+)</loc>")
        // Sitemap values are entity-escaped (e.g. "&amp;" for "&"), so decode before using them as URLs.
        .Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim());
|
||||
|
||||
|
||||
Reference in New Issue
Block a user