Add medboom.ir as an ingestion source (doctor/dentist-heavy, VPN-free)
CI/CD / CI · dotnet build (push) Successful in 31s
CI/CD / Deploy · hamkadr (push) Successful in 3m15s

New MedboomListingSource: a WordPress medical-classifieds board crawled like medjobs
(wp-sitemap.xml -> posts-post-N.xml, newest first), filtered to clinical-role slugs and
Tehran-only for launch. medboom skews toward doctors/dentists/pharmacists and carries both
hiring and availability posts, so it directly broadens the role mix the nurse-heavy Divar
content lacks. Iranian-hosted -> no proxy/VPN needed (relevant now that Telegram is off).

Wired like the other sources: AppSetting toggles (MedboomEnabled/MaxAds/UseProxy) + EF
migration, SettingsService persistence, admin Settings UI, DI registration. Off by default.
Validated against live data: Tehran clinical ads at named clinics (pharmacy/dental/etc.).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 11:18:56 +03:30
parent 7740d9f8d7
commit bb8c6c3be5
9 changed files with 1898 additions and 0 deletions
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,51 @@
using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable
namespace JobsMedical.Web.Migrations
{
    /// <summary>
    /// Schema migration that adds the three medboom.ir ingestion settings columns
    /// (enabled flag, per-run ad cap, proxy flag) to the settings table.
    /// </summary>
    public partial class MedboomSource : Migration
    {
        // Table and column identifiers, named once so Up and Down cannot drift apart.
        private const string SettingsTable = "AppSettings";
        private const string EnabledColumn = "MedboomEnabled";
        private const string MaxAdsColumn = "MedboomMaxAds";
        private const string UseProxyColumn = "MedboomUseProxy";

        /// <summary>Adds the three non-nullable columns, each with a default value.</summary>
        protected override void Up(MigrationBuilder migrationBuilder)
        {
            // Source toggle; defaults to false.
            migrationBuilder.AddColumn<bool>(name: EnabledColumn, table: SettingsTable,
                type: "boolean", nullable: false, defaultValue: false);

            // Per-run ad cap; defaults to 40.
            migrationBuilder.AddColumn<int>(name: MaxAdsColumn, table: SettingsTable,
                type: "integer", nullable: false, defaultValue: 40);

            // Proxy toggle; defaults to false.
            migrationBuilder.AddColumn<bool>(name: UseProxyColumn, table: SettingsTable,
                type: "boolean", nullable: false, defaultValue: false);
        }

        /// <summary>Reverts <see cref="Up"/> by dropping the three columns again.</summary>
        protected override void Down(MigrationBuilder migrationBuilder)
        {
            migrationBuilder.DropColumn(name: EnabledColumn, table: SettingsTable);
            migrationBuilder.DropColumn(name: MaxAdsColumn, table: SettingsTable);
            migrationBuilder.DropColumn(name: UseProxyColumn, table: SettingsTable);
        }
    }
}
@@ -112,6 +112,15 @@ namespace JobsMedical.Web.Migrations
b.Property<bool>("IranEstekhdamUseProxy")
.HasColumnType("boolean");
b.Property<bool>("MedboomEnabled")
.HasColumnType("boolean");
b.Property<int>("MedboomMaxAds")
.HasColumnType("integer");
b.Property<bool>("MedboomUseProxy")
.HasColumnType("boolean");
b.Property<bool>("MedjobsEnabled")
.HasColumnType("boolean");
+6
View File
@@ -87,6 +87,12 @@ public class AppSetting
public int IranEstekhdamMaxAds { get; set; } = 40;
public bool IranEstekhdamUseProxy { get; set; } = false;
/// <summary>Scrape medboom.ir clinical ads (WordPress board; doctor/dentist-heavy, hiring +
/// availability; crawled via its WP sitemap, Tehran-only for launch).</summary>
public bool MedboomEnabled { get; set; } = false;
public int MedboomMaxAds { get; set; } = 40;
public bool MedboomUseProxy { get; set; } = false;
// --- SMS OTP (Kavenegar). When off, the code is shown on screen (dev only). ---
public bool SmsEnabled { get; set; } = false;
[MaxLength(200)] public string? SmsApiKey { get; set; }
@@ -157,6 +157,16 @@
</div>
</div>
<div class="source-box">
<label class="toggle-row">
<input type="checkbox" name="MedboomEnabled" value="true" checked="@Model.MedboomEnabled" />
<span class="t-body"><span>🩺 مدبوم (medboom.ir)</span><span class="t-hint">آگهی‌های علوم پزشکی (بیشتر پزشک/دندانپزشک)، استخدام و آماده‌به‌کار؛ بدون نیاز به فیلترشکن.</span></span>
</label>
<div class="filter-group"><label>حداکثر آگهی در هر اجرا</label><input type="number" name="MedboomMaxAds" min="1" max="500" value="@Model.MedboomMaxAds" dir="ltr" />
<label class="proxy-toggle"><input type="checkbox" name="MedboomUseProxy" value="true" checked="@Model.MedboomUseProxy" /> از پروکسی استفاده شود</label>
</div>
</div>
<div class="source-box">
<label class="toggle-row">
<input type="checkbox" name="WebsitesEnabled" value="true" checked="@Model.WebsitesEnabled" />
@@ -50,6 +50,9 @@ public class SettingsModel : PageModel
[BindProperty] public bool IranEstekhdamEnabled { get; set; }
[BindProperty] public int IranEstekhdamMaxAds { get; set; } = 40;
[BindProperty] public bool IranEstekhdamUseProxy { get; set; }
[BindProperty] public bool MedboomEnabled { get; set; }
[BindProperty] public int MedboomMaxAds { get; set; } = 40;
[BindProperty] public bool MedboomUseProxy { get; set; }
[BindProperty] public bool SmsEnabled { get; set; }
[BindProperty] public string? SmsApiKey { get; set; }
[BindProperty] public string? SmsTemplate { get; set; }
@@ -101,6 +104,9 @@ public class SettingsModel : PageModel
IranEstekhdamEnabled = s.IranEstekhdamEnabled;
IranEstekhdamMaxAds = s.IranEstekhdamMaxAds;
IranEstekhdamUseProxy = s.IranEstekhdamUseProxy;
MedboomEnabled = s.MedboomEnabled;
MedboomMaxAds = s.MedboomMaxAds;
MedboomUseProxy = s.MedboomUseProxy;
SmsEnabled = s.SmsEnabled;
SmsApiKey = s.SmsApiKey;
SmsTemplate = s.SmsTemplate;
@@ -149,6 +155,9 @@ public class SettingsModel : PageModel
IranEstekhdamEnabled = IranEstekhdamEnabled,
IranEstekhdamMaxAds = IranEstekhdamMaxAds,
IranEstekhdamUseProxy = IranEstekhdamUseProxy,
MedboomEnabled = MedboomEnabled,
MedboomMaxAds = MedboomMaxAds,
MedboomUseProxy = MedboomUseProxy,
SmsEnabled = SmsEnabled,
SmsApiKey = SmsApiKey,
SmsTemplate = SmsTemplate,
+2
View File
@@ -62,6 +62,8 @@ builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
JobsMedical.Web.Services.Scraping.MedjobsListingSource>();
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
JobsMedical.Web.Services.Scraping.IranEstekhdamListingSource>();
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
JobsMedical.Web.Services.Scraping.MedboomListingSource>();
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
JobsMedical.Web.Services.Scraping.WebsiteListingSource>();
builder.Services.AddScoped<JobsMedical.Web.Services.Scraping.ListingArchiver>();
@@ -0,0 +1,155 @@
using System.Text.RegularExpressions;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Scrapes clinical ads from medboom.ir («مرجع استخدام و نیازمندی علوم پزشکی»), a WordPress
/// ad-listing site. Ad posts are enumerated through the WP sitemap
/// (wp-sitemap.xml → wp-sitemap-posts-post-N.xml), highest-numbered sitemap page first, then
/// filtered by clinical-role markers in the decoded URL slug. Each surviving ad page is fetched
/// and reduced to title + description (+ phone numbers), and only ads whose text mentions
/// Tehran are returned (launch filter).
/// </summary>
/// <remarks>
/// <see cref="FetchAsync"/> is best-effort: a failing sitemap or ad page is logged and skipped,
/// and any other failure yields an empty result instead of an exception. De-duplication across
/// runs and content validation are not done here.
/// </remarks>
public class MedboomListingSource : IListingSource
{
    private const string SitemapIndex = "https://medboom.ir/wp-sitemap.xml";

    // Candidate URLs pooled per requested ad; the surplus absorbs ads later dropped by the
    // Tehran / minimum-length filters.
    private const int PoolFactor = 6;

    // Ads shorter than this are treated as empty/garbage extractions.
    private const int MinAdLength = 25;

    // Hard cap on extracted ad text (phone numbers may be appended after the cap).
    private const int MaxAdLength = 1800;

    private const string Tehran = "تهران";

    // Fixed patterns are compiled once instead of being re-parsed on every call.
    private static readonly Regex LocRegex = new("<loc>([^<]+)</loc>", RegexOptions.Compiled);
    private static readonly Regex PostMapPageRegex = new(@"posts-post-([0-9]+)\.xml", RegexOptions.Compiled);

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<MedboomListingSource> _log;

    public MedboomListingSource(ScrapeHttpClients clients, ILogger<MedboomListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    /// <summary>Display name of this source.</summary>
    public string Name => "مدبوم (medboom.ir)";

    // Clinical-role markers matched against the decoded Persian ad slug.
    private static readonly string[] RoleSlugs =
    {
        "پزشک", "دندان", "پرستار", "بهیار", "مامایی", "ماما", "تکنسین", "رادیولوژ", "سونوگراف",
        "فیزیوتراپ", "کاردرمان", "گفتاردرمان", "شنوایی", "بینایی", "اپتومتر", "دیالیز", "اتاق-عمل",
        "بیهوش", "هوشبری", "تزریقات", "فوریت", "اورژانس", "داروساز", "داروخانه", "نسخه", "سالمند",
        "علوم-آزمایشگاهی", "آزمایشگاه", "مسئول-فنی", "مامو", "تغذیه", "روانشناس", "اپتیک",
    };

    // Veterinary + obvious non-staffing categories medboom also carries (equipment sale, real estate).
    private static readonly string[] ExcludeSlugs =
    {
        "دامپزشک", "دام-پزشک", "دامپزشکی", "فروش", "اجاره", "املاک", "دستگاه", "تجهیزات", "ملک",
    };

    // City names that, when present in the slug, drop the ad before it is even fetched.
    private static readonly string[] OtherCitySlugs =
    {
        "شیراز", "اصفهان", "مشهد", "تبریز", "کرج", "قم", "یزد", "رشت", "کرمان", "اراک", "اردبیل",
        "همدان", "کرمانشاه", "زنجان", "قزوین", "ساری", "گرگان", "بندرعباس", "بوشهر", "سنندج",
        "بیرجند", "سمنان", "شهرکرد", "ایلام", "یاسوج", "زاهدان", "ارومیه", "البرز", "اهواز", "کاشان",
    };

    /// <summary>
    /// Fetches up to <c>MedboomMaxAds</c> (clamped to 1..500) Tehran clinical ads.
    /// </summary>
    /// <param name="s">Settings; reads <c>MedboomEnabled</c>, <c>MedboomMaxAds</c> and <c>MedboomUseProxy</c>.</param>
    /// <param name="ct">Cancellation token passed to every HTTP request.</param>
    /// <returns>
    /// The scraped ads; empty when the source is disabled, no post sitemaps are found, or the
    /// run fails or is cancelled. This method does not throw for fetch failures.
    /// </returns>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        if (!s.MedboomEnabled) return Array.Empty<ScrapedItem>();

        var max = Math.Clamp(s.MedboomMaxAds, 1, 500);
        var client = _clients.For(s, s.MedboomUseProxy);
        try
        {
            // 1. WP sitemap index → the ad-post sitemaps, newest (highest page number) first.
            //    Ordering is numeric: a plain string sort would rank "post-9" above "post-10".
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var postMaps = Locs(index)
                .Where(u => u.Contains("posts-post-"))
                .OrderByDescending(u => SitemapPage(u))
                .ThenByDescending(u => u, StringComparer.Ordinal)
                .ToList();
            if (postMaps.Count == 0)
            {
                _log.LogWarning("medboom: no ad-post sitemaps found");
                return Array.Empty<ScrapedItem>();
            }

            // 2. Pool clinical candidate URLs (newest first within each map), pre-dropping other cities.
            //    `seen` gives O(1) duplicate checks while `pool` keeps discovery order.
            var pool = new List<string>();
            var seen = new HashSet<string>(StringComparer.Ordinal);
            var budget = max * PoolFactor;
            foreach (var sm in postMaps)
            {
                if (pool.Count >= budget) break;
                try
                {
                    var urls = Locs(await client.GetStringAsync(sm, ct)).Reverse(); // newest ads last → take from end
                    foreach (var u in urls)
                    {
                        if (IsClinicalSlug(u) && !IsOtherCitySlug(u) && seen.Add(u)) pool.Add(u);
                        if (pool.Count >= budget) break;
                    }
                }
                // Per-sitemap failures (including request timeouts) are skipped; caller
                // cancellation is not handled here so the run stops instead of warning per sitemap.
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "medboom: sitemap {Sm} failed", sm);
                }
            }

            // 3. Fetch each ad → keep only Tehran ones, up to `max`.
            var items = new List<ScrapedItem>();
            foreach (var url in pool)
            {
                if (items.Count >= max) break;
                ct.ThrowIfCancellationRequested();
                try
                {
                    var html = await client.GetStringAsync(url, ct);
                    var text = ExtractAd(html);
                    if (text.Length < MinAdLength || !text.Contains(Tehran)) continue; // Tehran-only launch filter
                    items.Add(new ScrapedItem("مدبوم", text, url));
                }
                // Same policy as above: skip a broken ad page, but let cancellation end the run.
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "medboom: ad {Url} failed", url);
                }
            }

            _log.LogInformation("medboom: fetched {Count} Tehran clinical ads (from {Pool} pooled)", items.Count, pool.Count);
            return items;
        }
        catch (Exception ex)
        {
            // Best-effort source: any failure (cancellation included) yields an empty result.
            _log.LogWarning(ex, "medboom fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>
    /// Page number N from a ".../wp-sitemap-posts-post-N.xml" URL, or 0 when the URL does not
    /// match or the number does not fit an <see cref="int"/>.
    /// </summary>
    private static int SitemapPage(string url)
    {
        var m = PostMapPageRegex.Match(url);
        return m.Success
            && int.TryParse(m.Groups[1].Value, System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture, out var page)
            ? page
            : 0;
    }

    /// <summary>True when the decoded URL has a role marker and no exclusion marker.</summary>
    private static bool IsClinicalSlug(string url)
    {
        var slug = Uri.UnescapeDataString(url);
        if (ExcludeSlugs.Any(slug.Contains)) return false;
        return RoleSlugs.Any(slug.Contains);
    }

    /// <summary>True when the decoded URL mentions one of the non-Tehran cities.</summary>
    private static bool IsOtherCitySlug(string url)
    {
        var slug = Uri.UnescapeDataString(url);
        return OtherCitySlugs.Any(slug.Contains);
    }

    /// <summary>
    /// Yields the trimmed contents of every <c>&lt;loc&gt;</c> element, in document order.
    /// Values are entity-decoded because sitemap URLs are XML-escaped (e.g. <c>&amp;amp;</c>).
    /// </summary>
    private static IEnumerable<string> Locs(string xml)
        => LocRegex.Matches(xml).Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim());

    /// <summary>
    /// Builds the ad text from an ad page: og:title (cut at a '|' separator when one appears
    /// after the first 10 characters) plus the longer of og:description and the plain-text
    /// "entry-content" element, capped at <see cref="MaxAdLength"/> characters. Phone numbers
    /// harvested from the body are appended when none of them is already in the text.
    /// </summary>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        if (title is not null)
        {
            var bar = title.IndexOf('|');
            if (bar > 10) title = title[..bar].Trim();
        }

        var ogBody = Meta(html, "og:description");
        var entry = BetweenClass(html, "entry-content");
        var entryText = entry is null ? null : HtmlUtil.ToPlainText(entry);
        var body = (entryText?.Length ?? 0) > (ogBody?.Length ?? 0) ? entryText : ogBody;

        var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p))));
        if (text.Length > MaxAdLength) text = text[..MaxAdLength];

        // Phones come from the untruncated body, so a number beyond the cap is still kept.
        var phones = HtmlUtil.HarvestPhones(body ?? "");
        if (phones.Count > 0 && !phones.Any(text.Contains))
            text += "\nشماره تماس: " + string.Join("، ", phones);
        return text;
    }

    /// <summary>
    /// Returns the HTML-decoded <c>content</c> of the first <c>&lt;meta property="…"&gt;</c> tag
    /// for <paramref name="prop"/>, or null when absent. Only matches tags where
    /// <c>property</c> precedes <c>content</c>.
    /// </summary>
    private static string? Meta(string html, string prop)
    {
        // The closing quote must equal the opening one (\1), so an apostrophe inside a
        // double-quoted value (or vice versa) no longer truncates the content.
        var m = Regex.Match(html,
            $"<meta[^>]+property=[\"']{Regex.Escape(prop)}[\"'][^>]+content=([\"'])(.*?)\\1",
            RegexOptions.Singleline);
        return m.Success ? System.Net.WebUtility.HtmlDecode(m.Groups[2].Value) : null;
    }

    /// <summary>
    /// Returns the inner HTML of the first div/article/section whose class attribute contains
    /// <paramref name="cls"/>, or null when there is none.
    /// </summary>
    private static string? BetweenClass(string html, string cls)
    {
        // NOTE(review): the lazy match stops at the FIRST closing div/article/section, so an
        // element with nested containers is cut short. ExtractAd compensates by preferring
        // og:description when that is longer.
        var m = Regex.Match(html, $"<(?:div|article|section)[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>(.*?)</(?:div|article|section)>",
            RegexOptions.Singleline);
        return m.Success ? m.Groups[1].Value : null;
    }
}
@@ -58,6 +58,9 @@ public class SettingsService
s.IranEstekhdamEnabled = incoming.IranEstekhdamEnabled;
s.IranEstekhdamMaxAds = Math.Clamp(incoming.IranEstekhdamMaxAds, 1, 500);
s.IranEstekhdamUseProxy = incoming.IranEstekhdamUseProxy;
s.MedboomEnabled = incoming.MedboomEnabled;
s.MedboomMaxAds = Math.Clamp(incoming.MedboomMaxAds, 1, 500);
s.MedboomUseProxy = incoming.MedboomUseProxy;
s.SmsEnabled = incoming.SmsEnabled;
s.SmsApiKey = incoming.SmsApiKey?.Trim();
s.SmsTemplate = incoming.SmsTemplate?.Trim();