Medjobs: reveal hidden contact number via admin-ajax during crawl
CI/CD / CI · dotnet build (push) Successful in 1m16s
CI/CD / Deploy · hamkadr (push) Successful in 2m14s

The contact phone on medjobs.ir is loaded by JS only after clicking
«تماس با این آگهی» — it isn't in the page HTML, so scanning the markup
found nothing. We now replay that exact reveal request server-side:

- POST https://medjobs.ir/wp-admin/admin-ajax.php with
  action=isatis_protect_contact & id=<listingId> (no nonce needed),
  then harvest the tel: numbers from the returned HTML table.
- Listing id is pulled from the page via the WP shortlink (?p=ID), the
  postid-ID body class, a data-id / data-listing-id attribute, or the
  visible «کد آگهی» as a fallback.
- Numbers are appended to the ad text so the parser/AI capture them and
  they reach the published listing. Wrapped in try/catch so a failed
  reveal never breaks ingestion; uses the same (proxy-aware, brotli-
  decompressing) client as the page fetch.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-08 08:21:24 +03:30
parent 213af9db48
commit d238888710
@@ -60,6 +60,10 @@ public class MedjobsListingSource : IListingSource
{
var html = await client.GetStringAsync(url, ct);
var text = ExtractAd(html);
// The contact number is hidden until clicked; replay the site's reveal call.
var phones = await RevealPhonesAsync(client, html, url, ct);
if (phones.Count > 0 && !phones.Any(text.Contains))
text += "\nشماره تماس: " + string.Join("، ", phones);
if (text.Length >= 25) items.Add(new ScrapedItem("مدجابز", text, url));
}
catch (Exception ex) { _log.LogWarning(ex, "medjobs: ad {Url} failed", url); }
@@ -74,6 +78,58 @@ public class MedjobsListingSource : IListingSource
}
}
// Endpoint the phone-reveal request is POSTed to (the site's WordPress admin-ajax handler).
private const string AjaxUrl = "https://medjobs.ir/wp-admin/admin-ajax.php";
/// <summary>
/// medjobs hides the contact number behind a click that POSTs to admin-ajax
/// (action=isatis_protect_contact, id=&lt;listingId&gt;) and returns an HTML table of tel: links.
/// We replay that request server-side and harvest the numbers. No nonce required.
/// </summary>
/// <param name="client">HTTP client used to send the reveal request.</param>
/// <param name="pageHtml">HTML of the ad page; the listing id is extracted from it.</param>
/// <param name="adUrl">URL of the ad page; sent as the Referer and used in log messages.</param>
/// <param name="ct">Token that cancels the request.</param>
/// <returns>
/// The phone numbers harvested from the response, or an empty list when no listing id is found,
/// the server answers with a non-success status, or the request fails.
/// </returns>
/// <exception cref="OperationCanceledException">
/// Thrown only when <paramref name="ct"/> itself was cancelled; every other failure is logged and swallowed.
/// </exception>
private async Task<List<string>> RevealPhonesAsync(HttpClient client, string pageHtml, string adUrl, CancellationToken ct)
{
    var id = ExtractListingId(pageHtml);
    if (id is null) return new();

    try
    {
        using var req = new HttpRequestMessage(HttpMethod.Post, AjaxUrl)
        {
            Content = new FormUrlEncodedContent(new Dictionary<string, string>
            {
                ["action"] = "isatis_protect_contact",
                ["id"] = id,
            }),
        };

        // Mimic the browser's XHR so the endpoint treats this like the real click.
        req.Headers.TryAddWithoutValidation("X-Requested-With", "XMLHttpRequest");
        req.Headers.TryAddWithoutValidation("Referer", adUrl);
        req.Headers.TryAddWithoutValidation("Accept", "text/html, */*; q=0.01");

        using var resp = await client.SendAsync(req, ct);
        if (!resp.IsSuccessStatusCode) return new();

        var body = await resp.Content.ReadAsStringAsync(ct);
        return HtmlUtil.HarvestPhones(body);
    }
    // Best-effort: a failed reveal must not break ingestion. The one exception is cancellation
    // requested through ct, which has to propagate so the crawl actually stops instead of being
    // logged as a reveal failure. HttpClient timeouts (a TaskCanceledException while ct is NOT
    // cancelled) are still treated as ordinary failures.
    catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
    {
        _log.LogWarning(ex, "medjobs: phone reveal failed for {Url}", adUrl);
        return new();
    }
}
/// <summary>
/// Finds the numeric listing id that the reveal call expects (the same number shown as «کد آگهی»).
/// </summary>
/// <param name="html">Raw HTML of the ad page.</param>
/// <returns>The id digits, or <c>null</c> when none of the known markers is present.</returns>
private static string? ExtractListingId(string html)
{
    // Markup markers, tried in order: the WP shortlink (?p=ID), the body's
    // postid-ID class, then a data-id / data-listing-id attribute.
    (string Pattern, RegexOptions Options)[] markers =
    {
        (@"[?&]p=(\d{2,})", RegexOptions.None),
        (@"postid-(\d{2,})", RegexOptions.None),
        (@"data-(?:id|listing[-_]?id)=[""'](\d{2,})[""']", RegexOptions.IgnoreCase),
    };

    foreach (var (pattern, options) in markers)
    {
        var hit = Regex.Match(html, pattern, options);
        if (hit.Success)
        {
            return hit.Groups[1].Value;
        }
    }

    // Last resort: the code printed on the page, e.g. «کد آگهی : ۳۹۲۳۰۵».
    // The HTML goes through HtmlUtil.ToLatinDigits before matching.
    var visible = Regex.Match(HtmlUtil.ToLatinDigits(html), @"کد[\s\S]{0,8}?آگهی[\s\S]{0,12}?(\d{3,})");
    if (!visible.Success)
    {
        return null;
    }

    return visible.Groups[1].Value;
}
/// <summary>Yields the trimmed text of every <c>&lt;loc&gt;</c> element found in the given XML.</summary>
/// <param name="xml">XML text to scan.</param>
/// <returns>A lazily evaluated sequence of the captured values, in document order.</returns>
private static IEnumerable<string> Locs(string xml)
{
    var hits = Regex.Matches(xml, "<loc>([^<]+)</loc>");
    return from Match hit in hits
           select hit.Groups[1].Value.Trim();
}