Add iranestekhdam.ir as an ingestion source (clinical job ads at named facilities)
New IranEstekhdamListingSource: reads the site's monthly ad sitemaps (sitemap-ads.xml -> sitemap-ads-YYYY-M.xml), keeps only ad URLs whose Persian slug names a clinical role (veterinary/non-clinical excluded), then extracts each ad's title + description (+ phone). These are employer ads at NAMED facilities, so they directly improve the unknown-facility problem the classifieds content has. Wired in like Medjobs: AppSetting toggles (IranEstekhdamEnabled/MaxAds/UseProxy) + EF migration, SettingsService persistence, admin Settings UI, and DI registration. Off by default; the medical-gate validator + AI auditor + junk filters screen results downstream. Note: e-estekhdam / jobinja / jobvision are JS-rendered SPAs whose ad lists are not in static HTML, so they need API reverse-engineering (a separate effort), not this static-scrape path. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+1644
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,51 @@
|
||||
using Microsoft.EntityFrameworkCore.Migrations;
|
||||
|
||||
#nullable disable
|
||||
|
||||
namespace JobsMedical.Web.Migrations
{
    /// <summary>
    /// Schema migration for the iranestekhdam.ir ingestion source: adds the three
    /// <c>IranEstekhdam*</c> setting columns to the <c>AppSettings</c> table.
    /// </summary>
    public partial class IranEstekhdamSource : Migration
    {
        // Every operation in this migration targets the same table.
        private const string SettingsTable = "AppSettings";

        /// <summary>
        /// Adds <c>IranEstekhdamEnabled</c> (boolean, default false), <c>IranEstekhdamMaxAds</c>
        /// (integer, default 40) and <c>IranEstekhdamUseProxy</c> (boolean, default false),
        /// all non-nullable.
        /// </summary>
        /// <param name="migrationBuilder">Builder that records the schema operations.</param>
        protected override void Up(MigrationBuilder migrationBuilder)
        {
            migrationBuilder.AddColumn<bool>(
                name: "IranEstekhdamEnabled",
                table: SettingsTable,
                type: "boolean",
                nullable: false,
                defaultValue: false);

            migrationBuilder.AddColumn<int>(
                name: "IranEstekhdamMaxAds",
                table: SettingsTable,
                type: "integer",
                nullable: false,
                defaultValue: 40);

            migrationBuilder.AddColumn<bool>(
                name: "IranEstekhdamUseProxy",
                table: SettingsTable,
                type: "boolean",
                nullable: false,
                defaultValue: false);
        }

        /// <summary>Reverts <see cref="Up"/> by dropping the three columns it added.</summary>
        /// <param name="migrationBuilder">Builder that records the schema operations.</param>
        protected override void Down(MigrationBuilder migrationBuilder)
        {
            migrationBuilder.DropColumn(name: "IranEstekhdamEnabled", table: SettingsTable);

            migrationBuilder.DropColumn(name: "IranEstekhdamMaxAds", table: SettingsTable);

            migrationBuilder.DropColumn(name: "IranEstekhdamUseProxy", table: SettingsTable);
        }
    }
}
|
||||
@@ -103,6 +103,15 @@ namespace JobsMedical.Web.Migrations
|
||||
.HasMaxLength(1000)
|
||||
.HasColumnType("character varying(1000)");
|
||||
|
||||
b.Property<bool>("IranEstekhdamEnabled")
|
||||
.HasColumnType("boolean");
|
||||
|
||||
b.Property<int>("IranEstekhdamMaxAds")
|
||||
.HasColumnType("integer");
|
||||
|
||||
b.Property<bool>("IranEstekhdamUseProxy")
|
||||
.HasColumnType("boolean");
|
||||
|
||||
b.Property<bool>("MedjobsEnabled")
|
||||
.HasColumnType("boolean");
|
||||
|
||||
|
||||
@@ -81,6 +81,12 @@ public class AppSetting
|
||||
/// <summary>Max ads to fetch per ingestion run (be polite; dedupe skips already-seen).</summary>
|
||||
public int MedjobsMaxAds { get; set; } = 40;
|
||||
|
||||
/// <summary>Scrape iranestekhdam.ir clinical job ads (crawled via its monthly ad sitemaps;
|
||||
/// employer ads at named facilities, filtered to clinical-role slugs).</summary>
|
||||
public bool IranEstekhdamEnabled { get; set; } = false;
|
||||
public int IranEstekhdamMaxAds { get; set; } = 40;
|
||||
public bool IranEstekhdamUseProxy { get; set; } = false;
|
||||
|
||||
// --- SMS OTP (Kavenegar). When off, the code is shown on screen (dev only). ---
|
||||
public bool SmsEnabled { get; set; } = false;
|
||||
[MaxLength(200)] public string? SmsApiKey { get; set; }
|
||||
|
||||
@@ -147,6 +147,16 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="source-box">
|
||||
<label class="toggle-row">
|
||||
<input type="checkbox" name="IranEstekhdamEnabled" value="true" checked="@Model.IranEstekhdamEnabled" />
|
||||
<span class="t-body"><span>🏥 ایراناستخدام (iranestekhdam.ir)</span><span class="t-hint">آگهیهای استخدامِ مراکز درمانیِ نامدار از سایتمپِ ماهانه؛ فقط نقشهای بالینی.</span></span>
|
||||
</label>
|
||||
<div class="filter-group"><label>حداکثر آگهی در هر اجرا</label><input type="number" name="IranEstekhdamMaxAds" min="1" max="500" value="@Model.IranEstekhdamMaxAds" dir="ltr" />
|
||||
<label class="proxy-toggle"><input type="checkbox" name="IranEstekhdamUseProxy" value="true" checked="@Model.IranEstekhdamUseProxy" /> از پروکسی استفاده شود</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="source-box">
|
||||
<label class="toggle-row">
|
||||
<input type="checkbox" name="WebsitesEnabled" value="true" checked="@Model.WebsitesEnabled" />
|
||||
|
||||
@@ -47,6 +47,9 @@ public class SettingsModel : PageModel
|
||||
[BindProperty] public string? DivarQueries { get; set; }
|
||||
[BindProperty] public bool MedjobsEnabled { get; set; }
|
||||
[BindProperty] public int MedjobsMaxAds { get; set; } = 40;
|
||||
[BindProperty] public bool IranEstekhdamEnabled { get; set; }
|
||||
[BindProperty] public int IranEstekhdamMaxAds { get; set; } = 40;
|
||||
[BindProperty] public bool IranEstekhdamUseProxy { get; set; }
|
||||
[BindProperty] public bool SmsEnabled { get; set; }
|
||||
[BindProperty] public string? SmsApiKey { get; set; }
|
||||
[BindProperty] public string? SmsTemplate { get; set; }
|
||||
@@ -95,6 +98,9 @@ public class SettingsModel : PageModel
|
||||
DivarQueries = s.DivarQueries;
|
||||
MedjobsEnabled = s.MedjobsEnabled;
|
||||
MedjobsMaxAds = s.MedjobsMaxAds;
|
||||
IranEstekhdamEnabled = s.IranEstekhdamEnabled;
|
||||
IranEstekhdamMaxAds = s.IranEstekhdamMaxAds;
|
||||
IranEstekhdamUseProxy = s.IranEstekhdamUseProxy;
|
||||
SmsEnabled = s.SmsEnabled;
|
||||
SmsApiKey = s.SmsApiKey;
|
||||
SmsTemplate = s.SmsTemplate;
|
||||
@@ -140,6 +146,9 @@ public class SettingsModel : PageModel
|
||||
DivarQueries = DivarQueries,
|
||||
MedjobsEnabled = MedjobsEnabled,
|
||||
MedjobsMaxAds = MedjobsMaxAds,
|
||||
IranEstekhdamEnabled = IranEstekhdamEnabled,
|
||||
IranEstekhdamMaxAds = IranEstekhdamMaxAds,
|
||||
IranEstekhdamUseProxy = IranEstekhdamUseProxy,
|
||||
SmsEnabled = SmsEnabled,
|
||||
SmsApiKey = SmsApiKey,
|
||||
SmsTemplate = SmsTemplate,
|
||||
|
||||
@@ -60,6 +60,8 @@ builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
|
||||
JobsMedical.Web.Services.Scraping.DivarListingSource>();
|
||||
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
|
||||
JobsMedical.Web.Services.Scraping.MedjobsListingSource>();
|
||||
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
|
||||
JobsMedical.Web.Services.Scraping.IranEstekhdamListingSource>();
|
||||
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
|
||||
JobsMedical.Web.Services.Scraping.WebsiteListingSource>();
|
||||
builder.Services.AddScoped<JobsMedical.Web.Services.Scraping.ListingArchiver>();
|
||||
|
||||
@@ -0,0 +1,135 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using JobsMedical.Web.Models;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>
|
||||
/// Scrapes clinical job ads from iranestekhdam.ir. It reads the site's monthly ad sitemaps
|
||||
/// (sitemap-ads.xml → sitemap-ads-YYYY-M.xml) to enumerate ad URLs, keeps only those whose
|
||||
/// readable Persian slug names a CLINICAL role (veterinary / non-clinical excluded), then fetches
|
||||
/// each ad page and extracts its title + description (+ any phone). These are EMPLOYER ads at NAMED
|
||||
/// facilities (بیمارستان/درمانگاه/کلینیک/آزمایشگاه …) — far higher quality than classifieds, so they
|
||||
/// directly improve the «نامشخص»-facility problem. Content-hash dedupe ingests each ad once; the
|
||||
/// medical-gate validator + AI auditor + junk filters do the final screening on top.
|
||||
/// </summary>
|
||||
public class IranEstekhdamListingSource : IListingSource
{
    private const string SitemapIndex = "https://iranestekhdam.ir/sitemap-ads.xml";

    // Matches one <loc>…</loc> entry of a sitemap / sitemap index.
    private static readonly Regex LocPattern = new("<loc>([^<]+)</loc>", RegexOptions.Compiled);

    // Matches any opening or closing container tag; used to find the closer that balances an opener.
    private static readonly Regex ContainerTag = new("</?(?:div|article|section)\\b[^>]*>", RegexOptions.Compiled);

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<IranEstekhdamListingSource> _log;

    public IranEstekhdamListingSource(ScrapeHttpClients clients, ILogger<IranEstekhdamListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    /// <summary>Display name of this source.</summary>
    public string Name => "ایراناستخدام (iranestekhdam.ir)";

    // Clinical-role markers matched against the DECODED Persian URL slug. Words are hyphen-joined in
    // the slug, so substring matching works on the decoded form.
    private static readonly string[] RoleSlugs =
    {
        "پرستار", "بهیار", "کمک-پرستار", "کمک-بهیار", "پزشک", "دندان", "مامایی", "ماما", "تکنسین",
        "رادیولوژ", "سونوگراف", "فیزیوتراپ", "کاردرمان", "گفتاردرمان", "شنوایی", "بینایی", "اپتومتر",
        "دیالیز", "اتاق-عمل", "بیهوش", "تزریقات", "فوریت", "اورژانس", "داروساز", "نسخه", "سالمند",
    };

    // Slugs that share a substring with a clinical role but are NOT clinical staff (veterinary) — drop them.
    private static readonly string[] ExcludeSlugs = { "دامپزشک", "دام-پزشک", "دامپزشکی" };

    /// <summary>
    /// Enumerates ad URLs from the monthly ad sitemaps, keeps the clinical-role ones (up to the
    /// configured maximum, clamped to 1..500), then downloads each ad and extracts its text.
    /// </summary>
    /// <param name="s">Settings; supplies the enabled flag, the per-run ad cap and the proxy choice.</param>
    /// <param name="ct">Cancellation token passed to every HTTP request.</param>
    /// <returns>
    /// The extracted ads. Empty when the source is disabled, when no monthly sitemap is listed,
    /// or when the run fails or is cancelled — this method logs and returns rather than throwing.
    /// </returns>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        if (!s.IranEstekhdamEnabled) return Array.Empty<ScrapedItem>();

        var max = Math.Clamp(s.IranEstekhdamMaxAds, 1, 500);
        var client = _clients.For(s, s.IranEstekhdamUseProxy);

        try
        {
            // 1. sitemap index → the monthly ad sitemaps (in the order the site lists them)
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var monthly = Locs(index).Where(u => u.Contains("sitemap-ads-")).ToList();
            if (monthly.Count == 0)
            {
                _log.LogWarning("iranestekhdam: no monthly ad sitemaps found");
                return Array.Empty<ScrapedItem>();
            }

            // 2. collect ad URLs, keeping only clinical-role slugs. Pull from successive monthly
            //    sitemaps until we have enough candidates (or run out). `picked` keeps discovery
            //    order; `seen` makes the duplicate check O(1).
            var picked = new List<string>();
            var seen = new HashSet<string>(StringComparer.Ordinal);
            foreach (var sm in monthly)
            {
                if (picked.Count >= max) break;
                try
                {
                    var xml = await client.GetStringAsync(sm, ct);
                    foreach (var u in Locs(xml).Where(IsClinicalSlug))
                    {
                        if (seen.Add(u)) picked.Add(u);
                        if (picked.Count >= max) break;
                    }
                }
                // One bad sitemap must not end the run. A cancelled run is different: let it fall
                // through to the outer handler instead of failing every remaining sitemap in turn.
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "iranestekhdam: sitemap {Sm} failed", sm);
                }
            }

            // 3. fetch each ad page → title + description (+ phone if present in the body)
            var items = new List<ScrapedItem>();
            foreach (var url in picked)
            {
                ct.ThrowIfCancellationRequested();
                try
                {
                    var html = await client.GetStringAsync(url, ct);
                    var text = ExtractAd(html);
                    // Very short extractions are not treated as usable ads.
                    if (text.Length >= 25) items.Add(new ScrapedItem("ایراناستخدام", text, url));
                }
                // Same policy as above: skip a single failing ad, but stop promptly on cancellation.
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "iranestekhdam: ad {Url} failed", url);
                }
            }

            _log.LogInformation("iranestekhdam: fetched {Count} clinical ads", items.Count);
            return items;
        }
        catch (Exception ex)
        {
            // Best-effort source: any failure (including cancellation) yields an empty result.
            _log.LogWarning(ex, "iranestekhdam fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>
    /// True when the percent-decoded URL contains a clinical-role marker and none of the
    /// excluded (veterinary) markers.
    /// </summary>
    private static bool IsClinicalSlug(string url)
    {
        var slug = Uri.UnescapeDataString(url);
        if (ExcludeSlugs.Any(slug.Contains)) return false;
        return RoleSlugs.Any(slug.Contains);
    }

    /// <summary>
    /// Yields the trimmed content of every <c>&lt;loc&gt;</c> element in a sitemap document.
    /// Entity references are decoded so that an escaped URL (e.g. <c>&amp;amp;</c>) is requestable.
    /// </summary>
    private static IEnumerable<string> Locs(string xml)
        => LocPattern.Matches(xml).Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim());

    /// <summary>Title (site suffix stripped) + the ad's description. iranestekhdam puts a complete,
    /// structured summary (facility + city + district + role) in og:description, with the full
    /// requirements in the .single-ad container — prefer whichever yields more text.</summary>
    /// <param name="html">Raw HTML of one ad page.</param>
    /// <returns>Plain text capped at 1800 characters, plus a phone line when phones were found
    /// in the body but are not already present in that text.</returns>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        if (title is not null)
        {
            // Cut the trailing "| site name" suffix; a bar within the first 10 chars is kept as title text.
            var bar = title.IndexOf('|');
            if (bar > 10) title = title[..bar].Trim();
        }

        var ogBody = Meta(html, "og:description");
        var single = BetweenClass(html, "single-ad");
        var singleText = single is null ? null : HtmlUtil.ToPlainText(single);
        var body = (singleText?.Length ?? 0) > (ogBody?.Length ?? 0) ? singleText : ogBody;

        var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p))));
        if (text.Length > 1800) text = text[..1800];

        // Phones are harvested from the untruncated body, so one lost to the cap is re-appended.
        var phones = HtmlUtil.HarvestPhones(body ?? "");
        if (phones.Count > 0 && !phones.Any(text.Contains))
            text += "\nشماره تماس: " + string.Join("، ", phones);

        return text;
    }

    /// <summary>
    /// Returns the HTML-decoded <c>content</c> of the first <c>&lt;meta property="prop"&gt;</c> tag,
    /// or null when there is none. The value ends at the quote character that opened it, so an
    /// apostrophe inside a double-quoted value (or the reverse) does not cut it short.
    /// </summary>
    private static string? Meta(string html, string prop)
    {
        var m = Regex.Match(
            html,
            $"<meta[^>]+property=[\"']{Regex.Escape(prop)}[\"'][^>]+content=(?:\"([^\"]*)\"|'([^']*)')");
        if (!m.Success) return null;

        var raw = m.Groups[1].Success ? m.Groups[1].Value : m.Groups[2].Value;
        return System.Net.WebUtility.HtmlDecode(raw);
    }

    /// <summary>
    /// Returns the inner HTML of the first div/article/section whose class attribute contains
    /// <paramref name="cls"/>, or null when no such element (or no closing tag) exists.
    /// Nested containers are balanced so the result runs to the element's own closing tag rather
    /// than to the first nested one; if the markup never balances, it ends at the first closing tag.
    /// </summary>
    private static string? BetweenClass(string html, string cls)
    {
        var open = Regex.Match(
            html,
            $"<(?:div|article|section)[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>");
        if (!open.Success) return null;

        var start = open.Index + open.Length;
        var depth = 1;
        int? firstClose = null;
        foreach (Match tag in ContainerTag.Matches(html, start))
        {
            if (tag.Value[1] == '/')
            {
                firstClose ??= tag.Index;
                if (--depth == 0) return html[start..tag.Index];
            }
            else if (!tag.Value.EndsWith("/>", StringComparison.Ordinal))
            {
                // A self-closing tag opens nothing; any other opener nests one level deeper.
                depth++;
            }
        }

        return firstClose is int end ? html[start..end] : null;
    }
}
|
||||
@@ -55,6 +55,9 @@ public class SettingsService
|
||||
s.DivarQueries = incoming.DivarQueries?.Trim();
|
||||
s.MedjobsEnabled = incoming.MedjobsEnabled;
|
||||
s.MedjobsMaxAds = Math.Clamp(incoming.MedjobsMaxAds, 1, 500);
|
||||
s.IranEstekhdamEnabled = incoming.IranEstekhdamEnabled;
|
||||
s.IranEstekhdamMaxAds = Math.Clamp(incoming.IranEstekhdamMaxAds, 1, 500);
|
||||
s.IranEstekhdamUseProxy = incoming.IranEstekhdamUseProxy;
|
||||
s.SmsEnabled = incoming.SmsEnabled;
|
||||
s.SmsApiKey = incoming.SmsApiKey?.Trim();
|
||||
s.SmsTemplate = incoming.SmsTemplate?.Trim();
|
||||
|
||||
Reference in New Issue
Block a user