From d238888710de07aee2f06757d7652219cc7fc957 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Mon, 8 Jun 2026 08:21:24 +0330 Subject: [PATCH] Medjobs: reveal hidden contact number via admin-ajax during crawl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The contact phone on medjobs.ir is loaded by JS only after clicking «تماس با این آگهی» — it isn't in the page HTML, so scanning the markup found nothing. We now replay that exact reveal request server-side: - POST https://medjobs.ir/wp-admin/admin-ajax.php with action=isatis_protect_contact & id=<listingId> (no nonce needed), then harvest the tel: numbers from the returned HTML table. - Listing id is pulled from the page via the WP shortlink (?p=ID), postid-/data-id, or the visible «کد آگهی» as a fallback. - Numbers are appended to the ad text so the parser/AI capture them and they reach the published listing. Wrapped in try/catch so a failed reveal never breaks ingestion; uses the same (proxy-aware, brotli-decompressing) client as the page fetch. Co-Authored-By: Claude Opus 4.8 --- .../Services/Scraping/MedjobsListingSource.cs | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs b/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs index 592b8a0..7435622 100644 --- a/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs +++ b/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs @@ -60,6 +60,10 @@ public class MedjobsListingSource : IListingSource { var html = await client.GetStringAsync(url, ct); var text = ExtractAd(html); + // The contact number is hidden until clicked; replay the site's reveal call. 
+ var phones = await RevealPhonesAsync(client, html, url, ct); + if (phones.Count > 0 && !phones.Any(text.Contains)) + text += "\nشماره تماس: " + string.Join("، ", phones); if (text.Length >= 25) items.Add(new ScrapedItem("مدجابز", text, url)); } catch (Exception ex) { _log.LogWarning(ex, "medjobs: ad {Url} failed", url); } @@ -74,6 +78,58 @@ public class MedjobsListingSource : IListingSource } } + private const string AjaxUrl = "https://medjobs.ir/wp-admin/admin-ajax.php"; + + /// + /// medjobs hides the contact number behind a click that POSTs to admin-ajax + /// (action=isatis_protect_contact, id=<listingId>) and returns an HTML table of tel: links. + /// We replay that request server-side and harvest the numbers. No nonce required. + /// + private async Task> RevealPhonesAsync(HttpClient client, string pageHtml, string adUrl, CancellationToken ct) + { + var id = ExtractListingId(pageHtml); + if (id is null) return new(); + try + { + using var req = new HttpRequestMessage(HttpMethod.Post, AjaxUrl) + { + Content = new FormUrlEncodedContent(new Dictionary + { + ["action"] = "isatis_protect_contact", + ["id"] = id, + }), + }; + req.Headers.TryAddWithoutValidation("X-Requested-With", "XMLHttpRequest"); + req.Headers.TryAddWithoutValidation("Referer", adUrl); + req.Headers.TryAddWithoutValidation("Accept", "text/html, */*; q=0.01"); + + using var resp = await client.SendAsync(req, ct); + if (!resp.IsSuccessStatusCode) return new(); + var body = await resp.Content.ReadAsStringAsync(ct); + return HtmlUtil.HarvestPhones(body); + } + catch (Exception ex) + { + _log.LogWarning(ex, "medjobs: phone reveal failed for {Url}", adUrl); + return new(); + } + } + + /// The numeric listing id used by the reveal call (= «کد آگهی»). + private static string? ExtractListingId(string html) + { + // Most reliable: the WP shortlink (?p=ID) and the body's postid-ID class. 
+ var m = Regex.Match(html, @"[?&]p=(\d{2,})"); + if (m.Success) return m.Groups[1].Value; + m = Regex.Match(html, @"postid-(\d{2,})"); + if (m.Success) return m.Groups[1].Value; + m = Regex.Match(html, @"data-(?:id|listing[-_]?id)=[""'](\d{2,})[""']", RegexOptions.IgnoreCase); + if (m.Success) return m.Groups[1].Value; + // Fallback: the visible «کد آگهی : ۳۹۲۳۰۵». + m = Regex.Match(HtmlUtil.ToLatinDigits(html), @"کد[\s\S]{0,8}?آگهی[\s\S]{0,12}?(\d{3,})"); + return m.Success ? m.Groups[1].Value : null; + } + private static IEnumerable Locs(string xml) => Regex.Matches(xml, "([^<]+)").Select(m => m.Groups[1].Value.Trim());