Add iranestekhdam.ir as an ingestion source (clinical job ads at named facilities)
CI/CD / CI · dotnet build (push) Successful in 1m43s
CI/CD / Deploy · hamkadr (push) Successful in 1m55s

New IranEstekhdamListingSource: reads the site's monthly ad sitemaps
(sitemap-ads.xml -> sitemap-ads-YYYY-M.xml), keeps only ad URLs whose Persian slug names a
clinical role (veterinary/non-clinical excluded), then extracts each ad title + description
(+ phone). These are employer ads at NAMED facilities, so they directly improve the
unknown-facility problem the classifieds content has.

Wired in like Medjobs: AppSetting toggles (IranEstekhdamEnabled/MaxAds/UseProxy) + EF
migration, SettingsService persistence, admin Settings UI, and DI registration. Off by
default; the medical-gate validator + AI auditor + junk filters screen results downstream.

Note: e-estekhdam / jobinja / jobvision are JS-rendered SPAs whose ad lists are not in static
HTML, so they need API reverse-engineering (a separate effort), not this static-scrape path.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 07:39:39 +03:30
parent da55f82c6c
commit f118db55ef
9 changed files with 1869 additions and 0 deletions
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,51 @@
using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable
namespace JobsMedical.Web.Migrations
{
    /// <summary>
    /// Adds (and on rollback removes) the three iranestekhdam.ir ingestion columns on the
    /// AppSettings table: the enable flag, the per-run ad cap and the proxy flag.
    /// </summary>
    public partial class IranEstekhdamSource : Migration
    {
        // Every operation in this migration targets the same table.
        private const string SettingsTable = "AppSettings";

        /// <inheritdoc />
        protected override void Up(MigrationBuilder migrationBuilder)
        {
            AddFlagColumn(migrationBuilder, "IranEstekhdamEnabled");

            migrationBuilder.AddColumn<int>(
                name: "IranEstekhdamMaxAds",
                table: SettingsTable,
                type: "integer",
                nullable: false,
                defaultValue: 40);

            AddFlagColumn(migrationBuilder, "IranEstekhdamUseProxy");
        }

        /// <inheritdoc />
        protected override void Down(MigrationBuilder migrationBuilder)
        {
            // Dropped in the same order the original migration listed them.
            var columns = new[] { "IranEstekhdamEnabled", "IranEstekhdamMaxAds", "IranEstekhdamUseProxy" };
            foreach (var column in columns)
            {
                migrationBuilder.DropColumn(
                    name: column,
                    table: SettingsTable);
            }
        }

        /// <summary>Adds a non-nullable boolean column that defaults to false.</summary>
        private static void AddFlagColumn(MigrationBuilder builder, string column)
        {
            builder.AddColumn<bool>(
                name: column,
                table: SettingsTable,
                type: "boolean",
                nullable: false,
                defaultValue: false);
        }
    }
}
@@ -103,6 +103,15 @@ namespace JobsMedical.Web.Migrations
.HasMaxLength(1000)
.HasColumnType("character varying(1000)");
b.Property<bool>("IranEstekhdamEnabled")
.HasColumnType("boolean");
b.Property<int>("IranEstekhdamMaxAds")
.HasColumnType("integer");
b.Property<bool>("IranEstekhdamUseProxy")
.HasColumnType("boolean");
b.Property<bool>("MedjobsEnabled")
.HasColumnType("boolean");
+6
View File
@@ -81,6 +81,12 @@ public class AppSetting
/// <summary>Max ads to fetch per ingestion run (be polite; dedupe skips already-seen).</summary>
public int MedjobsMaxAds { get; set; } = 40;
/// <summary>Scrape iranestekhdam.ir clinical job ads (crawled via its monthly ad sitemaps;
/// employer ads at named facilities, filtered to clinical-role slugs).</summary>
public bool IranEstekhdamEnabled { get; set; } = false;
/// <summary>Max iranestekhdam.ir ads to fetch per ingestion run. Both the listing source and
/// the settings service clamp this value to the range 1–500.</summary>
public int IranEstekhdamMaxAds { get; set; } = 40;
/// <summary>Handed to the scrape HTTP-client selector when fetching iranestekhdam.ir;
/// presumably routes the requests through the configured proxy — confirm in ScrapeHttpClients.</summary>
public bool IranEstekhdamUseProxy { get; set; } = false;
// --- SMS OTP (Kavenegar). When off, the code is shown on screen (dev only). ---
public bool SmsEnabled { get; set; } = false;
[MaxLength(200)] public string? SmsApiKey { get; set; }
@@ -147,6 +147,16 @@
</div>
</div>
<div class="source-box">
<label class="toggle-row">
<input type="checkbox" name="IranEstekhdamEnabled" value="true" checked="@Model.IranEstekhdamEnabled" />
<span class="t-body"><span>🏥 ایران‌استخدام (iranestekhdam.ir)</span><span class="t-hint">آگهی‌های استخدامِ مراکز درمانیِ نام‌دار از سایت‌مپِ ماهانه؛ فقط نقش‌های بالینی.</span></span>
</label>
<div class="filter-group"><label>حداکثر آگهی در هر اجرا</label><input type="number" name="IranEstekhdamMaxAds" min="1" max="500" value="@Model.IranEstekhdamMaxAds" dir="ltr" />
<label class="proxy-toggle"><input type="checkbox" name="IranEstekhdamUseProxy" value="true" checked="@Model.IranEstekhdamUseProxy" /> از پروکسی استفاده شود</label>
</div>
</div>
<div class="source-box">
<label class="toggle-row">
<input type="checkbox" name="WebsitesEnabled" value="true" checked="@Model.WebsitesEnabled" />
@@ -47,6 +47,9 @@ public class SettingsModel : PageModel
[BindProperty] public string? DivarQueries { get; set; }
[BindProperty] public bool MedjobsEnabled { get; set; }
[BindProperty] public int MedjobsMaxAds { get; set; } = 40;
[BindProperty] public bool IranEstekhdamEnabled { get; set; }
[BindProperty] public int IranEstekhdamMaxAds { get; set; } = 40;
[BindProperty] public bool IranEstekhdamUseProxy { get; set; }
[BindProperty] public bool SmsEnabled { get; set; }
[BindProperty] public string? SmsApiKey { get; set; }
[BindProperty] public string? SmsTemplate { get; set; }
@@ -95,6 +98,9 @@ public class SettingsModel : PageModel
DivarQueries = s.DivarQueries;
MedjobsEnabled = s.MedjobsEnabled;
MedjobsMaxAds = s.MedjobsMaxAds;
IranEstekhdamEnabled = s.IranEstekhdamEnabled;
IranEstekhdamMaxAds = s.IranEstekhdamMaxAds;
IranEstekhdamUseProxy = s.IranEstekhdamUseProxy;
SmsEnabled = s.SmsEnabled;
SmsApiKey = s.SmsApiKey;
SmsTemplate = s.SmsTemplate;
@@ -140,6 +146,9 @@ public class SettingsModel : PageModel
DivarQueries = DivarQueries,
MedjobsEnabled = MedjobsEnabled,
MedjobsMaxAds = MedjobsMaxAds,
IranEstekhdamEnabled = IranEstekhdamEnabled,
IranEstekhdamMaxAds = IranEstekhdamMaxAds,
IranEstekhdamUseProxy = IranEstekhdamUseProxy,
SmsEnabled = SmsEnabled,
SmsApiKey = SmsApiKey,
SmsTemplate = SmsTemplate,
+2
View File
@@ -60,6 +60,8 @@ builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
JobsMedical.Web.Services.Scraping.DivarListingSource>();
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
JobsMedical.Web.Services.Scraping.MedjobsListingSource>();
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
JobsMedical.Web.Services.Scraping.IranEstekhdamListingSource>();
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
JobsMedical.Web.Services.Scraping.WebsiteListingSource>();
builder.Services.AddScoped<JobsMedical.Web.Services.Scraping.ListingArchiver>();
@@ -0,0 +1,135 @@
using System.Text.RegularExpressions;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Scrapes clinical job ads from iranestekhdam.ir. It reads the site's monthly ad sitemaps
/// (sitemap-ads.xml → sitemap-ads-YYYY-M.xml) to enumerate ad URLs, keeps only those whose
/// readable Persian slug names a CLINICAL role (veterinary / non-clinical excluded), then fetches
/// each ad page and extracts its title + description (+ any phone). These are EMPLOYER ads at NAMED
/// facilities (بیمارستان/درمانگاه/کلینیک/آزمایشگاه …) — far higher quality than classifieds, so they
/// directly improve the «نامشخص»-facility problem. Content-hash dedupe ingests each ad once; the
/// medical-gate validator + AI auditor + junk filters do the final screening on top.
/// </summary>
public class IranEstekhdamListingSource : IListingSource
{
    private const string SitemapIndex = "https://iranestekhdam.ir/sitemap-ads.xml";

    // Upper bound on the extracted ad text (title + body) before any phone line is appended.
    private const int MaxTextLength = 1800;

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<IranEstekhdamListingSource> _log;

    public IranEstekhdamListingSource(ScrapeHttpClients clients, ILogger<IranEstekhdamListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    public string Name => "ایران‌استخدام (iranestekhdam.ir)";

    // Clinical-role markers matched against the DECODED Persian URL slug. Words are hyphen-joined in
    // the slug, so substring matching works on the decoded form.
    private static readonly string[] RoleSlugs =
    {
        "پرستار", "بهیار", "کمک-پرستار", "کمک-بهیار", "پزشک", "دندان", "مامایی", "ماما", "تکنسین",
        "رادیولوژ", "سونوگراف", "فیزیوتراپ", "کاردرمان", "گفتاردرمان", "شنوایی", "بینایی", "اپتومتر",
        "دیالیز", "اتاق-عمل", "بیهوش", "تزریقات", "فوریت", "اورژانس", "داروساز", "نسخه", "سالمند",
    };

    // Slugs that share a substring with a clinical role but are NOT کادر درمان — drop them.
    private static readonly string[] ExcludeSlugs = { "دامپزشک", "دام-پزشک", "دامپزشکی" };

    /// <summary>
    /// Fetches up to <c>IranEstekhdamMaxAds</c> (clamped to 1–500) clinical ads.
    /// </summary>
    /// <param name="s">Current settings; the source is a no-op unless <c>IranEstekhdamEnabled</c> is set.</param>
    /// <param name="ct">Caller's cancellation token.</param>
    /// <returns>The extracted ads; an empty list when disabled or when the crawl fails.</returns>
    /// <exception cref="OperationCanceledException">
    /// The caller cancelled <paramref name="ct"/>. Network failures and HTTP timeouts are still
    /// logged and swallowed (best effort), but caller-requested cancellation is no longer
    /// reported as a "fetch failed" warning with an empty result.
    /// </exception>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        if (!s.IranEstekhdamEnabled) return Array.Empty<ScrapedItem>();

        var max = Math.Clamp(s.IranEstekhdamMaxAds, 1, 500);
        var client = _clients.For(s, s.IranEstekhdamUseProxy);
        try
        {
            // 1. sitemap index → the monthly ad sitemaps (newest first as listed by the site)
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var monthly = Locs(index).Where(u => u.Contains("sitemap-ads-")).ToList();
            if (monthly.Count == 0)
            {
                _log.LogWarning("iranestekhdam: no monthly ad sitemaps found");
                return Array.Empty<ScrapedItem>();
            }

            // 2. collect ad URLs, keeping only clinical-role slugs. Pull from successive monthly
            //    sitemaps until we have enough candidates (or run out). The list keeps sitemap
            //    order; the set makes the duplicate check O(1).
            var picked = new List<string>();
            var seen = new HashSet<string>(StringComparer.Ordinal);
            foreach (var sm in monthly)
            {
                if (picked.Count >= max) break;
                try
                {
                    var clinical = Locs(await client.GetStringAsync(sm, ct)).Where(IsClinicalSlug);
                    foreach (var u in clinical)
                    {
                        if (seen.Add(u)) picked.Add(u);
                        if (picked.Count >= max) break;
                    }
                }
                catch (Exception ex) when (!IsCallerCancellation(ex, ct))
                {
                    _log.LogWarning(ex, "iranestekhdam: sitemap {Sm} failed", sm);
                }
            }

            // 3. fetch each ad page → title + description (+ phone if present in the body)
            var items = new List<ScrapedItem>();
            foreach (var url in picked)
            {
                ct.ThrowIfCancellationRequested();
                try
                {
                    var html = await client.GetStringAsync(url, ct);
                    var text = ExtractAd(html);
                    if (text.Length >= 25) items.Add(new ScrapedItem("ایران‌استخدام", text, url));
                }
                catch (Exception ex) when (!IsCallerCancellation(ex, ct))
                {
                    _log.LogWarning(ex, "iranestekhdam: ad {Url} failed", url);
                }
            }

            _log.LogInformation("iranestekhdam: fetched {Count} clinical ads", items.Count);
            return items;
        }
        catch (Exception ex) when (!IsCallerCancellation(ex, ct))
        {
            _log.LogWarning(ex, "iranestekhdam fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>True only for a cancellation the caller asked for. An HTTP timeout also surfaces
    /// as an <see cref="OperationCanceledException"/> but leaves <paramref name="ct"/> untouched,
    /// so it is still treated as an ordinary per-request failure.</summary>
    private static bool IsCallerCancellation(Exception ex, CancellationToken ct)
        => ex is OperationCanceledException && ct.IsCancellationRequested;

    /// <summary>True when the percent-decoded URL contains a clinical-role marker and none of the
    /// excluded (veterinary) markers.</summary>
    private static bool IsClinicalSlug(string url)
    {
        var slug = Uri.UnescapeDataString(url);
        if (ExcludeSlugs.Any(slug.Contains)) return false;
        return RoleSlugs.Any(slug.Contains);
    }

    /// <summary>Yields the text of every <c>&lt;loc&gt;</c> element in a sitemap document.
    /// Sitemap URLs are XML-escaped (e.g. <c>&amp;amp;</c>), so entities are decoded before the
    /// value is used as a request URL.</summary>
    private static IEnumerable<string> Locs(string xml)
        => Regex.Matches(xml, "<loc>([^<]+)</loc>")
            .Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim());

    /// <summary>Title (site suffix stripped) + the ad's description. iranestekhdam puts a complete,
    /// structured summary (facility + city + district + role) in og:description, with the full
    /// requirements in the .single-ad container — prefer whichever yields more text.</summary>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        if (title is not null)
        {
            // Cut at the first '|' only when it sits past index 10; earlier bars are left alone.
            var bar = title.IndexOf('|');
            if (bar > 10) title = title[..bar].Trim();
        }

        var ogBody = Meta(html, "og:description");
        var single = BetweenClass(html, "single-ad");
        var singleText = single is null ? null : HtmlUtil.ToPlainText(single);
        var body = (singleText?.Length ?? 0) > (ogBody?.Length ?? 0) ? singleText : ogBody;

        var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p))));
        if (text.Length > MaxTextLength)
        {
            // Never cut between the two halves of a surrogate pair (e.g. an emoji in the ad).
            var cut = char.IsHighSurrogate(text[MaxTextLength - 1]) ? MaxTextLength - 1 : MaxTextLength;
            text = text[..cut];
        }

        // Phones are harvested from the untruncated body so a number past the cut is not lost.
        var phones = HtmlUtil.HarvestPhones(body ?? "");
        if (phones.Count > 0 && !phones.Any(text.Contains))
            text += "\nشماره تماس: " + string.Join("، ", phones);
        return text;
    }

    /// <summary>Returns the HTML-decoded <c>content</c> of the first <c>&lt;meta&gt;</c> tag whose
    /// <c>property</c> equals <paramref name="prop"/>, or null when there is none. The two
    /// attributes may appear in either order, and the value is read up to its own closing quote,
    /// so an apostrophe inside a double-quoted value no longer truncates it.</summary>
    private static string? Meta(string html, string prop)
    {
        foreach (Match tag in Regex.Matches(html, "<meta\\b[^>]*>", RegexOptions.IgnoreCase))
        {
            if (!string.Equals(Attr(tag.Value, "property"), prop, StringComparison.Ordinal)) continue;
            var content = Attr(tag.Value, "content");
            if (content is not null) return System.Net.WebUtility.HtmlDecode(content);
        }
        return null;
    }

    /// <summary>Reads one quoted attribute value out of a single tag's markup (raw, not decoded).</summary>
    private static string? Attr(string tag, string name)
    {
        // Leading \s keeps "content" from matching inside e.g. "data-content".
        var m = Regex.Match(tag, $"\\s{Regex.Escape(name)}\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')", RegexOptions.IgnoreCase);
        if (!m.Success) return null;
        return m.Groups[1].Success ? m.Groups[1].Value : m.Groups[2].Value;
    }

    /// <summary>Returns the inner HTML of the first div/article/section whose class attribute
    /// contains <paramref name="cls"/>, or null when no such element (or no matching close tag)
    /// exists. Nested elements of the same tag name are balanced, so the result spans the whole
    /// container instead of stopping at the first inner closing tag.</summary>
    private static string? BetweenClass(string html, string cls)
    {
        var open = Regex.Match(html,
            $"<(div|article|section)\\b[^>]*\\sclass=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>");
        if (!open.Success) return null;

        var start = open.Index + open.Length;
        var sameTag = new Regex($"<(/?){Regex.Escape(open.Groups[1].Value)}\\b[^>]*>");
        var depth = 1;
        for (var m = sameTag.Match(html, start); m.Success; m = m.NextMatch())
        {
            var isClose = m.Groups[1].Length > 0;
            // A self-closed opener never gets a matching close tag, so it must not add depth.
            if (!isClose && m.Value.EndsWith("/>", StringComparison.Ordinal)) continue;
            depth += isClose ? -1 : 1;
            if (depth == 0) return html[start..m.Index];
        }

        // Unbalanced markup: report "not found" so the caller falls back to og:description.
        return null;
    }
}
@@ -55,6 +55,9 @@ public class SettingsService
s.DivarQueries = incoming.DivarQueries?.Trim();
s.MedjobsEnabled = incoming.MedjobsEnabled;
s.MedjobsMaxAds = Math.Clamp(incoming.MedjobsMaxAds, 1, 500);
s.IranEstekhdamEnabled = incoming.IranEstekhdamEnabled;
s.IranEstekhdamMaxAds = Math.Clamp(incoming.IranEstekhdamMaxAds, 1, 500);
s.IranEstekhdamUseProxy = incoming.IranEstekhdamUseProxy;
s.SmsEnabled = incoming.SmsEnabled;
s.SmsApiKey = incoming.SmsApiKey?.Trim();
s.SmsTemplate = incoming.SmsTemplate?.Trim();