Add medboom.ir as an ingestion source (doctor/dentist-heavy, VPN-free)
New MedboomListingSource: a WordPress medical-classifieds board crawled like medjobs (wp-sitemap.xml -> posts-post-N.xml, newest first), filtered to clinical-role slugs and Tehran-only for launch. medboom skews toward doctors/dentists/pharmacists and carries both hiring and availability posts, so it directly broadens the role mix the nurse-heavy Divar content lacks. Iranian-hosted -> no proxy/VPN needed (relevant now that Telegram is off). Wired like the other sources: AppSetting toggles (MedboomEnabled/MaxAds/UseProxy) + EF migration, SettingsService persistence, admin Settings UI, DI registration. Off by default. Validated against live data: Tehran clinical ads at named clinics (pharmacy/dental/etc.). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,51 @@
|
||||
using Microsoft.EntityFrameworkCore.Migrations;
|
||||
|
||||
#nullable disable
|
||||
|
||||
namespace JobsMedical.Web.Migrations
|
||||
{
|
||||
/// <summary>
/// Adds the three medboom.ir source settings to the <c>AppSettings</c> table:
/// an enabled flag, a per-run ad cap and a use-proxy flag.
/// </summary>
public partial class MedboomSource : Migration
{
    // Table and column names touched by this migration.
    private const string SettingsTable = "AppSettings";
    private const string EnabledColumn = "MedboomEnabled";
    private const string MaxAdsColumn = "MedboomMaxAds";
    private const string UseProxyColumn = "MedboomUseProxy";

    /// <inheritdoc />
    protected override void Up(MigrationBuilder migrationBuilder)
    {
        // Same column order as the model: enabled flag, max ads, proxy flag.
        AddFlagColumn(migrationBuilder, EnabledColumn);

        migrationBuilder.AddColumn<int>(
            name: MaxAdsColumn,
            table: SettingsTable,
            type: "integer",
            nullable: false,
            defaultValue: 40);

        AddFlagColumn(migrationBuilder, UseProxyColumn);
    }

    /// <inheritdoc />
    protected override void Down(MigrationBuilder migrationBuilder)
    {
        // Drop exactly the columns Up() created, in the same order.
        foreach (var column in new[] { EnabledColumn, MaxAdsColumn, UseProxyColumn })
        {
            migrationBuilder.DropColumn(
                name: column,
                table: SettingsTable);
        }
    }

    /// <summary>Adds a non-nullable boolean column defaulting to <c>false</c>.</summary>
    private static void AddFlagColumn(MigrationBuilder migrationBuilder, string column)
    {
        migrationBuilder.AddColumn<bool>(
            name: column,
            table: SettingsTable,
            type: "boolean",
            nullable: false,
            defaultValue: false);
    }
}
|
||||
}
|
||||
@@ -112,6 +112,15 @@ namespace JobsMedical.Web.Migrations
|
||||
b.Property<bool>("IranEstekhdamUseProxy")
|
||||
.HasColumnType("boolean");
|
||||
|
||||
b.Property<bool>("MedboomEnabled")
|
||||
.HasColumnType("boolean");
|
||||
|
||||
b.Property<int>("MedboomMaxAds")
|
||||
.HasColumnType("integer");
|
||||
|
||||
b.Property<bool>("MedboomUseProxy")
|
||||
.HasColumnType("boolean");
|
||||
|
||||
b.Property<bool>("MedjobsEnabled")
|
||||
.HasColumnType("boolean");
|
||||
|
||||
|
||||
@@ -87,6 +87,12 @@ public class AppSetting
|
||||
public int IranEstekhdamMaxAds { get; set; } = 40;
|
||||
public bool IranEstekhdamUseProxy { get; set; } = false;
|
||||
|
||||
/// <summary>Scrape medboom.ir clinical ads (WordPress board; doctor/dentist-heavy, hiring +
|
||||
/// availability; crawled via its WP sitemap, Tehran-only for launch).</summary>
|
||||
public bool MedboomEnabled { get; set; } = false;
|
||||
public int MedboomMaxAds { get; set; } = 40;
|
||||
public bool MedboomUseProxy { get; set; } = false;
|
||||
|
||||
// --- SMS OTP (Kavenegar). When off, the code is shown on screen (dev only). ---
|
||||
public bool SmsEnabled { get; set; } = false;
|
||||
[MaxLength(200)] public string? SmsApiKey { get; set; }
|
||||
|
||||
@@ -157,6 +157,16 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="source-box">
|
||||
<label class="toggle-row">
|
||||
<input type="checkbox" name="MedboomEnabled" value="true" checked="@Model.MedboomEnabled" />
|
||||
<span class="t-body"><span>🩺 مدبوم (medboom.ir)</span><span class="t-hint">آگهیهای علوم پزشکی (بیشتر پزشک/دندانپزشک)، استخدام و آمادهبهکار؛ بدون نیاز به فیلترشکن.</span></span>
|
||||
</label>
|
||||
<div class="filter-group"><label>حداکثر آگهی در هر اجرا</label><input type="number" name="MedboomMaxAds" min="1" max="500" value="@Model.MedboomMaxAds" dir="ltr" />
|
||||
<label class="proxy-toggle"><input type="checkbox" name="MedboomUseProxy" value="true" checked="@Model.MedboomUseProxy" /> از پروکسی استفاده شود</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="source-box">
|
||||
<label class="toggle-row">
|
||||
<input type="checkbox" name="WebsitesEnabled" value="true" checked="@Model.WebsitesEnabled" />
|
||||
|
||||
@@ -50,6 +50,9 @@ public class SettingsModel : PageModel
|
||||
[BindProperty] public bool IranEstekhdamEnabled { get; set; }
|
||||
[BindProperty] public int IranEstekhdamMaxAds { get; set; } = 40;
|
||||
[BindProperty] public bool IranEstekhdamUseProxy { get; set; }
|
||||
[BindProperty] public bool MedboomEnabled { get; set; }
|
||||
[BindProperty] public int MedboomMaxAds { get; set; } = 40;
|
||||
[BindProperty] public bool MedboomUseProxy { get; set; }
|
||||
[BindProperty] public bool SmsEnabled { get; set; }
|
||||
[BindProperty] public string? SmsApiKey { get; set; }
|
||||
[BindProperty] public string? SmsTemplate { get; set; }
|
||||
@@ -101,6 +104,9 @@ public class SettingsModel : PageModel
|
||||
IranEstekhdamEnabled = s.IranEstekhdamEnabled;
|
||||
IranEstekhdamMaxAds = s.IranEstekhdamMaxAds;
|
||||
IranEstekhdamUseProxy = s.IranEstekhdamUseProxy;
|
||||
MedboomEnabled = s.MedboomEnabled;
|
||||
MedboomMaxAds = s.MedboomMaxAds;
|
||||
MedboomUseProxy = s.MedboomUseProxy;
|
||||
SmsEnabled = s.SmsEnabled;
|
||||
SmsApiKey = s.SmsApiKey;
|
||||
SmsTemplate = s.SmsTemplate;
|
||||
@@ -149,6 +155,9 @@ public class SettingsModel : PageModel
|
||||
IranEstekhdamEnabled = IranEstekhdamEnabled,
|
||||
IranEstekhdamMaxAds = IranEstekhdamMaxAds,
|
||||
IranEstekhdamUseProxy = IranEstekhdamUseProxy,
|
||||
MedboomEnabled = MedboomEnabled,
|
||||
MedboomMaxAds = MedboomMaxAds,
|
||||
MedboomUseProxy = MedboomUseProxy,
|
||||
SmsEnabled = SmsEnabled,
|
||||
SmsApiKey = SmsApiKey,
|
||||
SmsTemplate = SmsTemplate,
|
||||
|
||||
@@ -62,6 +62,8 @@ builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
|
||||
JobsMedical.Web.Services.Scraping.MedjobsListingSource>();
|
||||
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
|
||||
JobsMedical.Web.Services.Scraping.IranEstekhdamListingSource>();
|
||||
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
|
||||
JobsMedical.Web.Services.Scraping.MedboomListingSource>();
|
||||
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IListingSource,
|
||||
JobsMedical.Web.Services.Scraping.WebsiteListingSource>();
|
||||
builder.Services.AddScoped<JobsMedical.Web.Services.Scraping.ListingArchiver>();
|
||||
|
||||
@@ -0,0 +1,155 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using JobsMedical.Web.Models;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>
/// Scrapes clinical ads from medboom.ir, a WordPress ad-listing site similar to medjobs.ir.
/// It enumerates ad posts through the WordPress sitemap
/// (wp-sitemap.xml → wp-sitemap-posts-post-N.xml), newest first, keeps URLs whose slug names a
/// clinical role, and extracts each ad's title and description (plus phone numbers).
/// Per the original author, medboom skews toward doctors/dentists and carries both hiring and
/// availability posts. Only ads whose text mentions Tehran are kept (launch filter).
/// </summary>
public class MedboomListingSource : IListingSource
{
    private const string SitemapIndex = "https://medboom.ir/wp-sitemap.xml";

    /// <summary>
    /// Over-sampling factor: this many candidate URLs are pooled per requested ad, because a
    /// pooled URL can still be dropped by the length/Tehran checks once its page is downloaded.
    /// </summary>
    private const int PoolFactor = 6;

    /// <summary>Ads whose extracted text is shorter than this are discarded.</summary>
    private const int MinAdChars = 25;

    /// <summary>Extracted ad text is truncated to this many characters.</summary>
    private const int MaxAdChars = 1800;

    private static readonly Regex LocRx = new("<loc>([^<]+)</loc>", RegexOptions.Compiled);

    // Captures N from ".../wp-sitemap-posts-post-N.xml". ASCII digits only: \d would also accept
    // other Unicode digits.
    private static readonly Regex PostMapPageRx = new("posts-post-([0-9]+)", RegexOptions.Compiled);

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<MedboomListingSource> _log;

    public MedboomListingSource(ScrapeHttpClients clients, ILogger<MedboomListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    public string Name => "مدبوم (medboom.ir)";

    // Clinical-role markers matched against the decoded Persian ad slug.
    private static readonly string[] RoleSlugs =
    {
        "پزشک", "دندان", "پرستار", "بهیار", "مامایی", "ماما", "تکنسین", "رادیولوژ", "سونوگراف",
        "فیزیوتراپ", "کاردرمان", "گفتاردرمان", "شنوایی", "بینایی", "اپتومتر", "دیالیز", "اتاق-عمل",
        "بیهوش", "هوشبری", "تزریقات", "فوریت", "اورژانس", "داروساز", "داروخانه", "نسخه", "سالمند",
        "علوم-آزمایشگاهی", "آزمایشگاه", "مسئول-فنی", "مامو", "تغذیه", "روانشناس", "اپتیک",
    };

    // Veterinary + obvious non-staffing categories medboom also carries (equipment sale, real estate).
    // Checked before RoleSlugs, so a veterinary slug is rejected even though it contains a role marker.
    private static readonly string[] ExcludeSlugs =
    {
        "دامپزشک", "دام-پزشک", "دامپزشکی", "فروش", "اجاره", "املاک", "دستگاه", "تجهیزات", "ملک",
    };

    private const string Tehran = "تهران";

    // City names that, when present in the slug, let us drop an ad without downloading it.
    private static readonly string[] OtherCitySlugs =
    {
        "شیراز", "اصفهان", "مشهد", "تبریز", "کرج", "قم", "یزد", "رشت", "کرمان", "اراک", "اردبیل",
        "همدان", "کرمانشاه", "زنجان", "قزوین", "ساری", "گرگان", "بندرعباس", "بوشهر", "سنندج",
        "بیرجند", "سمنان", "شهرکرد", "ایلام", "یاسوج", "زاهدان", "ارومیه", "البرز", "اهواز", "کاشان",
    };

    /// <summary>
    /// Fetches up to <see cref="AppSetting.MedboomMaxAds"/> (clamped to 1..500) Tehran clinical ads.
    /// </summary>
    /// <param name="s">Settings; the source is a no-op unless <c>MedboomEnabled</c> is set.</param>
    /// <param name="ct">Cancellation token passed to every HTTP request.</param>
    /// <returns>
    /// The scraped ads. Never throws: any failure, including cancellation, is logged as a warning
    /// and yields an empty list; a single failing sitemap or ad page is logged and skipped.
    /// </returns>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        if (!s.MedboomEnabled) return Array.Empty<ScrapedItem>();
        var max = Math.Clamp(s.MedboomMaxAds, 1, 500);
        var client = _clients.For(s, s.MedboomUseProxy);

        try
        {
            // 1. WP sitemap index → the ad-post sitemaps, newest (highest page number) first.
            //    Sort by the parsed number: a plain string sort would rank "post-9" above "post-10".
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var postMaps = Locs(index)
                .Where(u => u.Contains("posts-post-", StringComparison.Ordinal))
                .OrderByDescending(SitemapPage)
                .ThenByDescending(u => u, StringComparer.Ordinal)
                .ToList();
            if (postMaps.Count == 0) { _log.LogWarning("medboom: no ad-post sitemaps found"); return Array.Empty<ScrapedItem>(); }

            // 2. Pool clinical candidate URLs (newest first within each map), pre-dropping other cities.
            var pool = new List<string>();
            var seen = new HashSet<string>(StringComparer.Ordinal); // O(1) duplicate check
            var budget = max * PoolFactor;
            foreach (var sm in postMaps)
            {
                if (pool.Count >= budget) break;
                ct.ThrowIfCancellationRequested();
                try
                {
                    // Within a sitemap the newest ads come last, so walk it from the end.
                    foreach (var u in Locs(await client.GetStringAsync(sm, ct)).Reverse())
                    {
                        if (IsCandidate(u) && seen.Add(u)) pool.Add(u);
                        if (pool.Count >= budget) break;
                    }
                }
                // Best-effort per sitemap, but do not keep hitting the remaining sitemaps once
                // cancellation was requested: let that reach the outer handler instead.
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "medboom: sitemap {Sm} failed", sm);
                }
            }

            // 3. Fetch each ad → keep only Tehran ones, up to `max`.
            var items = new List<ScrapedItem>();
            foreach (var url in pool)
            {
                if (items.Count >= max) break;
                ct.ThrowIfCancellationRequested();
                try
                {
                    var html = await client.GetStringAsync(url, ct);
                    var text = ExtractAd(html);
                    if (text.Length < MinAdChars || !text.Contains(Tehran)) continue; // Tehran-only launch filter
                    items.Add(new ScrapedItem("مدبوم", text, url));
                }
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "medboom: ad {Url} failed", url);
                }
            }
            _log.LogInformation("medboom: fetched {Count} Tehran clinical ads (from {Pool} pooled)", items.Count, pool.Count);
            return items;
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "medboom fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>
    /// True when the percent-decoded URL names a clinical role, contains no excluded
    /// (veterinary / sale / real-estate) marker, and names none of the listed non-Tehran cities.
    /// </summary>
    private static bool IsCandidate(string url)
    {
        var slug = Uri.UnescapeDataString(url);
        if (ExcludeSlugs.Any(slug.Contains)) return false;
        if (!RoleSlugs.Any(slug.Contains)) return false;
        return !OtherCitySlugs.Any(slug.Contains);
    }

    /// <summary>Page number N of a ".../posts-post-N.xml" sitemap URL, or 0 when none is found.</summary>
    private static int SitemapPage(string url)
    {
        var m = PostMapPageRx.Match(url);
        return m.Success && int.TryParse(m.Groups[1].Value, System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture, out var page) ? page : 0;
    }

    /// <summary>
    /// Yields every <c>&lt;loc&gt;</c> value of a sitemap document, entity-decoded (sitemap URLs
    /// are XML-escaped, e.g. <c>&amp;amp;</c>) and trimmed.
    /// </summary>
    private static IEnumerable<string> Locs(string xml)
        => LocRx.Matches(xml).Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim());

    /// <summary>
    /// Builds the plain-text ad from a post page: the og:title (site-name suffix after '|' removed)
    /// followed by the longer of the entry-content text and the og:description, truncated to
    /// <see cref="MaxAdChars"/>. Phone numbers found in the body are appended when none of them
    /// already appears in the (possibly truncated) text.
    /// </summary>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        // Cut at the first '|' only when it sits past index 10.
        // NOTE(review): presumably this avoids truncating a title that starts with a short
        // pipe-delimited prefix — confirm the intent of the threshold.
        if (title is not null) { var bar = title.IndexOf('|'); if (bar > 10) title = title[..bar].Trim(); }

        var ogBody = Meta(html, "og:description");
        var entry = BetweenClass(html, "entry-content");
        var entryText = entry is null ? null : HtmlUtil.ToPlainText(entry);
        var body = (entryText?.Length ?? 0) > (ogBody?.Length ?? 0) ? entryText : ogBody;

        var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p))));
        if (text.Length > MaxAdChars)
        {
            // Do not leave a dangling high surrogate at the cut point.
            var cut = char.IsHighSurrogate(text[MaxAdChars - 1]) ? MaxAdChars - 1 : MaxAdChars;
            text = text[..cut];
        }

        var phones = HtmlUtil.HarvestPhones(body ?? "");
        if (phones.Count > 0 && !phones.Any(text.Contains))
            text += "\nشماره تماس: " + string.Join("، ", phones);
        return text;
    }

    /// <summary>
    /// Returns the HTML-decoded <c>content</c> of the first <c>&lt;meta property="prop" …&gt;</c>
    /// tag, or null when there is none. Only matches tags that list <c>property</c> before
    /// <c>content</c>.
    /// </summary>
    private static string? Meta(string html, string prop)
    {
        // The value runs to the SAME quote character that opened it (back-reference \1), so an
        // apostrophe inside a double-quoted value no longer truncates it.
        var m = Regex.Match(html,
            $"<meta[^>]+property=[\"']{Regex.Escape(prop)}[\"'][^>]+content=([\"'])(.*?)\\1",
            RegexOptions.Singleline);
        return m.Success ? System.Net.WebUtility.HtmlDecode(m.Groups[2].Value) : null;
    }

    /// <summary>
    /// Returns the inner HTML of the first div/article/section whose class attribute contains
    /// <paramref name="cls"/>, or null when there is none.
    /// </summary>
    private static string? BetweenClass(string html, string cls)
    {
        // NOTE(review): the lazy match ends at the FIRST closing div/article/section, so content
        // after a nested block is cut off. ExtractAd falls back to og:description when that is longer.
        var m = Regex.Match(html, $"<(?:div|article|section)[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>(.*?)</(?:div|article|section)>",
            RegexOptions.Singleline);
        return m.Success ? m.Groups[1].Value : null;
    }
}
|
||||
@@ -58,6 +58,9 @@ public class SettingsService
|
||||
s.IranEstekhdamEnabled = incoming.IranEstekhdamEnabled;
|
||||
s.IranEstekhdamMaxAds = Math.Clamp(incoming.IranEstekhdamMaxAds, 1, 500);
|
||||
s.IranEstekhdamUseProxy = incoming.IranEstekhdamUseProxy;
|
||||
s.MedboomEnabled = incoming.MedboomEnabled;
|
||||
s.MedboomMaxAds = Math.Clamp(incoming.MedboomMaxAds, 1, 500);
|
||||
s.MedboomUseProxy = incoming.MedboomUseProxy;
|
||||
s.SmsEnabled = incoming.SmsEnabled;
|
||||
s.SmsApiKey = incoming.SmsApiKey?.Trim();
|
||||
s.SmsTemplate = incoming.SmsTemplate?.Trim();
|
||||
|
||||
Reference in New Issue
Block a user