using System.Text.RegularExpressions;
using JobsMedical.Web.Models;

namespace JobsMedical.Web.Services.Scraping;

/// <summary>
/// Scrapes clinical ads from medboom.ir («مرجع استخدام و نیازمندی علوم پزشکی») — a WordPress
/// ad-listing site like medjobs.ir. It enumerates ad posts via the WP sitemap
/// (wp-sitemap.xml → wp-sitemap-posts-post-N.xml), newest first, keeps clinical-role slugs, and
/// extracts each ad's title + description (+ phone). medboom skews toward DOCTORS/DENTISTS and
/// carries BOTH hiring («نیازمند…») and availability («آماده همکاری / جویای کار») posts, so it
/// directly broadens the role mix the nurse-heavy classifieds sources miss. Tehran-only for launch.
/// VPN-free (Iranian-hosted). Content-hash dedupe ingests each ad once; the validator/AI screen on top.
/// </summary>
public class MedboomListingSource : IListingSource
{
    private const string SitemapIndex = "https://medboom.ir/wp-sitemap.xml";

    // Launch filter: an ad is kept only when its extracted text mentions this city.
    private const string Tehran = "تهران";

    // Pulls the numeric suffix out of ".../wp-sitemap-posts-post-N.xml" (ASCII digits only).
    private static readonly Regex SitemapNumberRegex = new("posts-post-([0-9]+)", RegexOptions.Compiled);

    // Captures the text of every <loc> element in a sitemap document.
    private static readonly Regex LocRegex = new("<loc>([^<]+)</loc>", RegexOptions.Compiled);

    private readonly ScrapeHttpClients _clients;

    // NOTE(review): logger category type reconstructed as the enclosing class — confirm.
    private readonly ILogger<MedboomListingSource> _log;

    public MedboomListingSource(ScrapeHttpClients clients, ILogger<MedboomListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    /// <summary>Display name of this listing source.</summary>
    public string Name => "مدبوم (medboom.ir)";

    // Clinical-role markers matched against the decoded Persian ad slug.
    private static readonly string[] RoleSlugs =
    {
        "پزشک", "دندان", "پرستار", "بهیار", "مامایی", "ماما", "تکنسین", "رادیولوژ", "سونوگراف",
        "فیزیوتراپ", "کاردرمان", "گفتاردرمان", "شنوایی", "بینایی", "اپتومتر", "دیالیز", "اتاق-عمل",
        "بیهوش", "هوشبری", "تزریقات", "فوریت", "اورژانس", "داروساز", "داروخانه", "نسخه", "سالمند",
        "علوم-آزمایشگاهی", "آزمایشگاه", "مسئول-فنی", "مامو", "تغذیه", "روانشناس", "اپتیک",
    };

    // Veterinary + obvious non-staffing categories medboom also carries (equipment sale, real estate).
    private static readonly string[] ExcludeSlugs =
    {
        "دامپزشک", "دام-پزشک", "دامپزشکی", "فروش", "اجاره", "املاک", "دستگاه", "تجهیزات", "ملک",
    };

    // City names that, when present in the slug, drop the ad before it is ever fetched.
    private static readonly string[] OtherCitySlugs =
    {
        "شیراز", "اصفهان", "مشهد", "تبریز", "کرج", "قم", "یزد", "رشت", "کرمان", "اراک", "اردبیل",
        "همدان", "کرمانشاه", "زنجان", "قزوین", "ساری", "گرگان", "بندرعباس", "بوشهر", "سنندج",
        "بیرجند", "سمنان", "شهرکرد", "ایلام", "یاسوج", "زاهدان", "ارومیه", "البرز", "اهواز", "کاشان",
    };

    /// <summary>
    /// Fetches up to <c>s.MedboomMaxAds</c> (clamped to 1..500) ads whose text mentions Tehran.
    /// </summary>
    /// <param name="s">Settings: enable flag, ad cap, and whether to route through the proxy.</param>
    /// <param name="ct">Cancellation token passed to every HTTP request.</param>
    /// <returns>
    /// The scraped ads; an empty list when the source is disabled, no post sitemaps are found,
    /// or the fetch fails as a whole (the failure is logged, not thrown).
    /// </returns>
    // NOTE(review): return type reconstructed as Task<IReadOnlyList<ScrapedItem>> — confirm
    // against IListingSource.
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        if (!s.MedboomEnabled)
        {
            return Array.Empty<ScrapedItem>();
        }

        var max = Math.Clamp(s.MedboomMaxAds, 1, 500);
        var client = _clients.For(s, s.MedboomUseProxy);

        try
        {
            // 1. WP sitemap index → the ad-post sitemaps. Process newest first (highest-numbered).
            //    Order by the parsed number: a plain string sort would put "post-10" below "post-2".
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var postMaps = Locs(index)
                .Where(u => u.Contains("posts-post-"))
                .OrderByDescending(u => SitemapNumber(u))
                .ThenByDescending(u => u, StringComparer.Ordinal)
                .ToList();

            if (postMaps.Count == 0)
            {
                _log.LogWarning("medboom: no ad-post sitemaps found");
                return Array.Empty<ScrapedItem>();
            }

            // 2. pool clinical candidate URLs (newest first within each map), pre-dropping other cities.
            //    Over-collect (6x) because many candidates fail the Tehran check in step 3.
            var pool = new List<string>();
            var seen = new HashSet<string>(StringComparer.Ordinal); // O(1) dedupe; `pool` keeps order
            var budget = max * 6;

            foreach (var sm in postMaps)
            {
                if (pool.Count >= budget)
                {
                    break;
                }

                try
                {
                    // newest ads last → take from end
                    var urls = Locs(await client.GetStringAsync(sm, ct)).Reverse();
                    foreach (var u in urls)
                    {
                        if (IsClinicalSlug(u) && !IsOtherCitySlug(u) && seen.Add(u))
                        {
                            pool.Add(u);
                        }

                        if (pool.Count >= budget)
                        {
                            break;
                        }
                    }
                }
                // One bad sitemap must not sink the run, but caller cancellation must stop it:
                // let that fall through to the outer handler instead of retrying every sitemap.
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "medboom: sitemap {Sm} failed", sm);
                }
            }

            // 3. fetch each ad → keep only Tehran ones, up to `max`.
            var items = new List<ScrapedItem>();
            foreach (var url in pool)
            {
                if (items.Count >= max)
                {
                    break;
                }

                ct.ThrowIfCancellationRequested();

                try
                {
                    var html = await client.GetStringAsync(url, ct);
                    var text = ExtractAd(html);
                    if (text.Length < 25 || !text.Contains(Tehran))
                    {
                        continue; // Tehran-only launch filter
                    }

                    items.Add(new ScrapedItem("مدبوم", text, url));
                }
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "medboom: ad {Url} failed", url);
                }
            }

            _log.LogInformation("medboom: fetched {Count} Tehran clinical ads (from {Pool} pooled)", items.Count, pool.Count);
            return items;
        }
        catch (Exception ex)
        {
            // Best-effort source: any failure (including cancellation) yields an empty result.
            _log.LogWarning(ex, "medboom fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>
    /// Returns N from a ".../wp-sitemap-posts-post-N.xml" URL, or 0 when no number can be parsed.
    /// </summary>
    private static int SitemapNumber(string url)
    {
        var m = SitemapNumberRegex.Match(url);
        return m.Success
            && int.TryParse(
                m.Groups[1].Value,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out var n)
            ? n
            : 0;
    }

    /// <summary>
    /// True when the percent-decoded URL contains a clinical role marker and no excluded marker.
    /// </summary>
    private static bool IsClinicalSlug(string url)
    {
        var slug = Uri.UnescapeDataString(url);
        if (ExcludeSlugs.Any(slug.Contains))
        {
            return false;
        }

        return RoleSlugs.Any(slug.Contains);
    }

    /// <summary>True when the percent-decoded URL names one of the non-Tehran cities.</summary>
    private static bool IsOtherCitySlug(string url)
    {
        var slug = Uri.UnescapeDataString(url);
        return OtherCitySlugs.Any(slug.Contains);
    }

    /// <summary>Yields the trimmed text of every <c>&lt;loc&gt;</c> element in a sitemap document.</summary>
    private static IEnumerable<string> Locs(string xml) =>
        LocRegex.Matches(xml).Select(m => m.Groups[1].Value.Trim());

    /// <summary>
    /// Builds the ad text from a post page: the og:title (cut at a '|' found past index 10) plus
    /// the longer of the entry-content text and og:description, capped at 1800 characters, with
    /// any phone numbers found in the body appended when none of them already appears in the text.
    /// </summary>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        if (title is not null)
        {
            // Cut at '|' only when it sits past index 10, so a bar near the start is left alone.
            var bar = title.IndexOf('|');
            if (bar > 10)
            {
                title = title[..bar].Trim();
            }
        }

        var ogBody = Meta(html, "og:description");
        var entry = BetweenClass(html, "entry-content");
        var entryText = entry is null ? null : HtmlUtil.ToPlainText(entry);

        // Prefer whichever description carries more text.
        var body = (entryText?.Length ?? 0) > (ogBody?.Length ?? 0) ? entryText : ogBody;

        var text = HtmlUtil.ToPlainText(
            string.Join("\n", new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p))));
        if (text.Length > 1800)
        {
            text = text[..1800];
        }

        // Phones are harvested from the full body, so one lost to truncation is still appended.
        var phones = HtmlUtil.HarvestPhones(body ?? "");
        if (phones.Count > 0 && !phones.Any(text.Contains))
        {
            text += "\nشماره تماس: " + string.Join("، ", phones);
        }

        return text;
    }

    /// <summary>
    /// Returns the HTML-decoded <c>content</c> of the first meta tag whose <c>property</c> equals
    /// <paramref name="prop"/> (property must precede content), or null when none matches.
    /// </summary>
    private static string? Meta(string html, string prop)
    {
        var m = Regex.Match(
            html,
            $"<meta[^>]+property=[\"']{Regex.Escape(prop)}[\"'][^>]+content=[\"']([^\"']*)[\"']");
        return m.Success ? System.Net.WebUtility.HtmlDecode(m.Groups[1].Value) : null;
    }

    /// <summary>
    /// Returns the inner HTML of the first div/article/section whose class attribute contains
    /// <paramref name="cls"/>, up to the first following closing tag; null when none matches.
    /// </summary>
    private static string? BetweenClass(string html, string cls)
    {
        // NOTE(review): the closing-tag part of this pattern was reconstructed — confirm.
        // The lazy match stops at the FIRST closing tag, so nested containers truncate the capture.
        var m = Regex.Match(
            html,
            $"<(?:div|article|section)[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>(.*?)</(?:div|article|section)>",
            RegexOptions.Singleline);
        return m.Success ? m.Groups[1].Value : null;
    }
}