diff --git a/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs b/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs index 592b8a0..7435622 100644 --- a/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs +++ b/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs @@ -60,6 +60,10 @@ public class MedjobsListingSource : IListingSource { var html = await client.GetStringAsync(url, ct); var text = ExtractAd(html); + // The contact number is hidden until clicked; replay the site's reveal call. + var phones = await RevealPhonesAsync(client, html, url, ct); + if (phones.Count > 0 && !phones.Any(text.Contains)) + text += "\nشماره تماس: " + string.Join("، ", phones); if (text.Length >= 25) items.Add(new ScrapedItem("مدجابز", text, url)); } catch (Exception ex) { _log.LogWarning(ex, "medjobs: ad {Url} failed", url); } @@ -74,6 +78,58 @@ public class MedjobsListingSource : IListingSource } } + private const string AjaxUrl = "https://medjobs.ir/wp-admin/admin-ajax.php"; + + /// + /// medjobs hides the contact number behind a click that POSTs to admin-ajax + /// (action=isatis_protect_contact, id=<listingId>) and returns an HTML table of tel: links. + /// We replay that request server-side and harvest the numbers. No nonce required. 
/// <summary>
/// Replays the site's contact-reveal request for a single ad: POSTs
/// <c>action=isatis_protect_contact</c> plus the listing id to <see cref="AjaxUrl"/>
/// and harvests phone numbers from the response body.
/// </summary>
/// <param name="client">HTTP client used to send the reveal request.</param>
/// <param name="pageHtml">HTML of the ad page; the listing id is extracted from it.</param>
/// <param name="adUrl">URL of the ad page; sent as the Referer and used in log messages.</param>
/// <param name="ct">Cancellation token for the request.</param>
/// <returns>
/// Whatever <c>HtmlUtil.HarvestPhones</c> finds in the response, or an empty list when no
/// listing id can be found, the endpoint answers with a non-success status, or the request fails.
/// </returns>
/// <remarks>
/// Best-effort: failures are logged as warnings and reported as an empty list.
/// Cancellation requested through <paramref name="ct"/> is the one exception and propagates.
/// </remarks>
// NOTE(review): the generic arguments were missing from the reviewed text ("Task>");
// List<string> is inferred from `return new()`, `.Count` and `string.Join` usage — confirm
// against HtmlUtil.HarvestPhones.
private async Task<List<string>> RevealPhonesAsync(HttpClient client, string pageHtml, string adUrl, CancellationToken ct)
{
    var id = ExtractListingId(pageHtml);
    if (id is null) return new();

    try
    {
        using var req = new HttpRequestMessage(HttpMethod.Post, AjaxUrl)
        {
            Content = new FormUrlEncodedContent(new Dictionary<string, string>
            {
                ["action"] = "isatis_protect_contact",
                ["id"] = id,
            }),
        };

        // Dress the request like the in-page XHR it replays, with the ad page as referer.
        req.Headers.TryAddWithoutValidation("X-Requested-With", "XMLHttpRequest");
        req.Headers.TryAddWithoutValidation("Referer", adUrl);
        req.Headers.TryAddWithoutValidation("Accept", "text/html, */*; q=0.01");

        using var resp = await client.SendAsync(req, ct);
        if (!resp.IsSuccessStatusCode) return new();

        var body = await resp.Content.ReadAsStringAsync(ct);
        return HtmlUtil.HarvestPhones(body);
    }
    catch (OperationCanceledException) when (ct.IsCancellationRequested)
    {
        // The caller asked to stop. Do not disguise that as "no phones found";
        // HttpClient timeouts (ct not cancelled) still fall through to the handler below.
        throw;
    }
    catch (Exception ex)
    {
        _log.LogWarning(ex, "medjobs: phone reveal failed for {Url}", adUrl);
        return new();
    }
}

/// <summary>
/// Finds the numeric listing id that the reveal call expects (the ad code shown on the page).
/// </summary>
/// <param name="html">HTML of the ad page.</param>
/// <returns>The id as a string of ASCII digits, or <c>null</c> when no pattern matches.</returns>
private static string? ExtractListingId(string html)
{
    // [0-9] instead of \d throughout: in .NET, \d also matches non-ASCII decimal digits,
    // and the captured value is posted verbatim as the id.

    // Most reliable: the WP shortlink (?p=ID) and the body's postid-ID class.
    var m = Regex.Match(html, @"[?&]p=([0-9]{2,})");
    if (m.Success) return m.Groups[1].Value;

    m = Regex.Match(html, @"postid-([0-9]{2,})");
    if (m.Success) return m.Groups[1].Value;

    // Next: a data-id / data-listing-id style attribute.
    m = Regex.Match(html, @"data-(?:id|listing[-_]?id)=[""']([0-9]{2,})[""']", RegexOptions.IgnoreCase);
    if (m.Success) return m.Groups[1].Value;

    // Fallback: the visible "ad code : 392305" label (Persian text); digits are
    // converted with HtmlUtil.ToLatinDigits before matching.
    m = Regex.Match(HtmlUtil.ToLatinDigits(html), @"کد[\s\S]{0,8}?آگهی[\s\S]{0,12}?([0-9]{3,})");
    return m.Success ? m.Groups[1].Value : null;
}

// Returns the trimmed first capture group of every match of the pattern in the given XML.
// NOTE(review): this line is unchanged context; the reviewed text appears to have lost
// angle-bracketed parts here (presumably IEnumerable<string> and a <loc>…</loc> pattern) —
// confirm against the repository before relying on it.
private static IEnumerable Locs(string xml) =>
    Regex.Matches(xml, "([^<]+)").Select(m => m.Groups[1].Value.Trim());