using System.Text.RegularExpressions;
using JobsMedical.Web.Models;

namespace JobsMedical.Web.Services.Scraping;

/// <summary>
/// Scrapes job ads from medjobs.ir (a WordPress "ad_listing" classifieds site). It reads the
/// site's own sitemaps (sitemap_index.xml → ad_listing-sitemapN.xml) to enumerate ad URLs
/// (up to the configured cap), then fetches each ad page and extracts its title + description.
/// The engine's content-hash dedupe means each ad is only ever ingested once, so repeated runs
/// pick up only new ads. Published items become job pages on hamkadr.ir (the SEO goal).
/// </summary>
public class MedjobsListingSource : IListingSource
{
    private const string SitemapIndex = "https://medjobs.ir/sitemap_index.xml";

    /// <summary>Hard upper bound on ads fetched per run, regardless of the configured value.</summary>
    private const int MaxAdsCap = 500;

    /// <summary>Extracted ads shorter than this many characters are discarded.</summary>
    private const int MinAdTextLength = 25;

    /// <summary>Title + body text is truncated to this many characters before phones are appended.</summary>
    private const int MaxAdTextLength = 1800;

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<MedjobsListingSource> _log;

    public MedjobsListingSource(ScrapeHttpClients clients, ILogger<MedjobsListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    /// <summary>Display name of this source.</summary>
    public string Name => "مدجابز (medjobs.ir)";

    /// <summary>
    /// Fetches ads from medjobs.ir: sitemap index → ad_listing sitemaps → individual ad pages.
    /// Failures of a single sitemap or ad page are logged and skipped; a failure of the sitemap
    /// index (or any other unexpected error) is logged and yields an empty list.
    /// </summary>
    /// <param name="s">Settings; reads <c>MedjobsEnabled</c>, <c>MedjobsMaxAds</c> and <c>MedjobsUseProxy</c>.</param>
    /// <param name="ct">Cancellation token passed to every HTTP request.</param>
    /// <returns>One <see cref="ScrapedItem"/> per ad whose extracted text is long enough; empty when disabled or on failure.</returns>
    /// <exception cref="OperationCanceledException">Thrown when <paramref name="ct"/> is cancelled.</exception>
    // NOTE(review): the return type must match IListingSource.FetchAsync, which is declared elsewhere — confirm.
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        if (!s.MedjobsEnabled)
        {
            return Array.Empty<ScrapedItem>();
        }

        var max = Math.Clamp(s.MedjobsMaxAds, 1, MaxAdsCap);
        var client = _clients.For(s, s.MedjobsUseProxy);

        try
        {
            // 1. sitemap index → the ad_listing sitemaps
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var adSitemaps = Locs(index).Where(u => u.Contains("ad_listing-sitemap")).ToList();
            if (adSitemaps.Count == 0)
            {
                _log.LogWarning("medjobs: no ad_listing sitemaps found");
                return Array.Empty<ScrapedItem>();
            }

            // 2. collect ad URLs (skip the bare /ads/ archive)
            // NOTE(review): URLs are taken in sitemap order; if the site lists oldest ads first,
            // the cap could keep selecting the same ads on every run — confirm against the live sitemaps.
            var adUrls = new List<string>();
            foreach (var sm in adSitemaps)
            {
                if (adUrls.Count >= max)
                {
                    break;
                }

                try
                {
                    var body = await client.GetStringAsync(sm, ct);
                    adUrls.AddRange(Locs(body).Where(u =>
                        u.Contains("/ads/") && !u.TrimEnd('/').EndsWith("/ads", StringComparison.Ordinal)));
                }
                catch (Exception ex) when (!IsCancellation(ex, ct))
                {
                    _log.LogWarning(ex, "medjobs: sitemap {Sm} failed", sm);
                }
            }

            adUrls = adUrls.Distinct().Take(max).ToList();

            // 3. fetch each ad page → title + description
            var items = new List<ScrapedItem>();
            foreach (var url in adUrls)
            {
                ct.ThrowIfCancellationRequested();
                try
                {
                    var html = await client.GetStringAsync(url, ct);
                    var text = ExtractAd(html);
                    if (text.Length >= MinAdTextLength)
                    {
                        items.Add(new ScrapedItem("مدجابز", text, url));
                    }
                }
                catch (Exception ex) when (!IsCancellation(ex, ct))
                {
                    _log.LogWarning(ex, "medjobs: ad {Url} failed", url);
                }
            }

            _log.LogInformation("medjobs: fetched {Count} ads", items.Count);
            return items;
        }
        catch (Exception ex) when (!IsCancellation(ex, ct))
        {
            _log.LogWarning(ex, "medjobs fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>
    /// True when <paramref name="ex"/> reflects a cancellation requested through <paramref name="ct"/>.
    /// An <see cref="OperationCanceledException"/> raised while the token is NOT cancelled (e.g. an
    /// HTTP timeout) is deliberately not treated as cancellation, so it stays a logged best-effort failure.
    /// </summary>
    private static bool IsCancellation(Exception ex, CancellationToken ct) =>
        ex is OperationCanceledException && ct.IsCancellationRequested;

    /// <summary>Yields the trimmed text of every <c>&lt;loc&gt;</c> element in a sitemap document.</summary>
    private static IEnumerable<string> Locs(string xml) =>
        Regex.Matches(xml, "<loc>([^<]+)</loc>").Select(m => m.Groups[1].Value.Trim());

    /// <summary>
    /// Title (og:title, site suffix stripped) + body (entry/description content or og:description),
    /// converted to plain text, truncated, with any harvested phone numbers appended.
    /// </summary>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        if (title is not null)
        {
            // Drop everything from the '|' onward, but only when the bar sits past index 10,
            // so a bar near the start of the title does not wipe it out.
            var bar = title.IndexOf('|');
            if (bar > 10)
            {
                title = title[..bar].Trim();
            }
        }

        string? body = BetweenClass(html, "rtcl-description")
            ?? BetweenClass(html, "entry-content")
            ?? Meta(html, "og:description");

        var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));
        var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
        if (text.Length > MaxAdTextLength)
        {
            text = text[..MaxAdTextLength];
        }

        // The contact number is often outside the description (in a tel: link / data attribute the
        // page reveals on click). Harvest it from the full HTML and append so the parser/AI see it.
        var phones = HtmlUtil.HarvestPhones(html);
        if (phones.Count > 0 && !phones.Any(text.Contains))
        {
            text += "\nشماره تماس: " + string.Join("، ", phones);
        }

        return text;
    }

    /// <summary>
    /// Returns the HTML-decoded <c>content</c> of the first <c>&lt;meta&gt;</c> tag whose
    /// <c>property</c> equals <paramref name="prop"/>, or null when there is no match.
    /// Only matches when <c>property</c> precedes <c>content</c> inside the tag.
    /// </summary>
    private static string? Meta(string html, string prop)
    {
        var m = Regex.Match(
            html,
            $"<meta[^>]+property=[\"']{Regex.Escape(prop)}[\"'][^>]+content=[\"']([^\"']*)[\"']");
        return m.Success ? System.Net.WebUtility.HtmlDecode(m.Groups[1].Value) : null;
    }

    /// <summary>
    /// Grab the inner HTML of the first &lt;div class="...name..."&gt; (best-effort): the lazy match
    /// stops at the first closing div, so content after a nested div is not captured.
    /// </summary>
    private static string? BetweenClass(string html, string cls)
    {
        var m = Regex.Match(
            html,
            $"<div[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>(.*?)</div>",
            RegexOptions.Singleline);
        return m.Success ? m.Groups[1].Value : null;
    }
}