Medjobs: reveal hidden contact number via admin-ajax during crawl
The contact phone on medjobs.ir is loaded by JS only after clicking «تماس با این آگهی»; it isn't in the page HTML, so scanning the markup found nothing. We now replay that exact reveal request server-side:

- POST https://medjobs.ir/wp-admin/admin-ajax.php with action=isatis_protect_contact and id=<listingId> (no nonce needed), then harvest the tel: numbers from the returned HTML table.
- The listing id is pulled from the page via the WP shortlink (?p=ID), postid-/data-id, or the visible «کد آگهی» as a fallback.
- Numbers are appended to the ad text so the parser/AI capture them and they reach the published listing.

The reveal is wrapped in try/catch so a failed reveal never breaks ingestion; it uses the same (proxy-aware, brotli-decompressing) client as the page fetch.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -60,6 +60,10 @@ public class MedjobsListingSource : IListingSource
|
||||
{
|
||||
var html = await client.GetStringAsync(url, ct);
|
||||
var text = ExtractAd(html);
|
||||
// The contact number is hidden until clicked; replay the site's reveal call.
|
||||
var phones = await RevealPhonesAsync(client, html, url, ct);
|
||||
if (phones.Count > 0 && !phones.Any(text.Contains))
|
||||
text += "\nشماره تماس: " + string.Join("، ", phones);
|
||||
if (text.Length >= 25) items.Add(new ScrapedItem("مدجابز", text, url));
|
||||
}
|
||||
catch (Exception ex) { _log.LogWarning(ex, "medjobs: ad {Url} failed", url); }
|
||||
@@ -74,6 +78,58 @@ public class MedjobsListingSource : IListingSource
|
||||
}
|
||||
}
|
||||
|
||||
// WordPress admin-ajax endpoint that the reveal request is POSTed to.
private const string AjaxUrl = "https://medjobs.ir/wp-admin/admin-ajax.php";

/// <summary>
/// Replays the site's click-to-reveal contact request server-side: a form POST to
/// <see cref="AjaxUrl"/> with <c>action=isatis_protect_contact</c> and <c>id=&lt;listingId&gt;</c>.
/// The response body is handed to <c>HtmlUtil.HarvestPhones</c> and its result is returned.
/// </summary>
/// <param name="client">HTTP client used to send the reveal request.</param>
/// <param name="pageHtml">HTML of the ad page; the listing id is extracted from it.</param>
/// <param name="adUrl">URL of the ad page; sent as the Referer header and used in the warning log.</param>
/// <param name="ct">Token passed to the send and to the body read.</param>
/// <returns>
/// The harvested numbers, or an empty list when no listing id is found, the response status is
/// not a success code, or any exception is thrown (every exception, cancellation included, is
/// logged as a warning and swallowed so the caller can continue).
/// </returns>
private async Task<List<string>> RevealPhonesAsync(HttpClient client, string pageHtml, string adUrl, CancellationToken ct)
{
    var listingId = ExtractListingId(pageHtml);
    if (listingId is null)
    {
        return new List<string>();
    }

    try
    {
        var form = new Dictionary<string, string>
        {
            ["action"] = "isatis_protect_contact",
            ["id"] = listingId,
        };

        using var request = new HttpRequestMessage(HttpMethod.Post, AjaxUrl);
        request.Content = new FormUrlEncodedContent(form);

        // Same headers, in the same order, as the original reveal call sets them.
        request.Headers.TryAddWithoutValidation("X-Requested-With", "XMLHttpRequest");
        request.Headers.TryAddWithoutValidation("Referer", adUrl);
        request.Headers.TryAddWithoutValidation("Accept", "text/html, */*; q=0.01");

        using var response = await client.SendAsync(request, ct);
        if (response.IsSuccessStatusCode)
        {
            var markup = await response.Content.ReadAsStringAsync(ct);
            return HtmlUtil.HarvestPhones(markup);
        }

        // Non-success status: treated as "no numbers", without logging.
        return new List<string>();
    }
    catch (Exception ex)
    {
        // Best-effort by design: a failed reveal must not abort processing of the ad.
        _log.LogWarning(ex, "medjobs: phone reveal failed for {Url}", adUrl);
        return new List<string>();
    }
}
|
||||
|
||||
/// <summary>
/// Finds the numeric listing id used by the reveal call (the value shown on the page as «کد آگهی»).
/// </summary>
/// <param name="html">Raw HTML of the ad page.</param>
/// <returns>The id as a digit string, or <c>null</c> when none of the probes match.</returns>
/// <remarks>
/// Probes run from most to least page-specific, so an id belonging to a different post linked
/// from the page (for example a related-ads link containing <c>?p=</c>) is only picked up when
/// nothing better is available:
/// 1. the <c>&lt;link rel="shortlink"&gt;</c> tag's <c>?p=ID</c>;
/// 2. the <c>postid-ID</c> class;
/// 3. any <c>?p=ID</c> / <c>&amp;p=ID</c> in the markup (the original first probe, kept as a fallback);
/// 4. a <c>data-id</c> / <c>data-listing-id</c> attribute;
/// 5. the visible «کد آگهی : NNN» label, after converting digits to Latin.
/// </remarks>
private static string? ExtractListingId(string html)
{
    if (string.IsNullOrEmpty(html)) return null;

    // 1. Shortlink tag only: isolate the tag first so attribute order does not matter,
    //    then read ?p=ID from inside that tag rather than from anywhere in the page.
    var shortlinkTag = Regex.Match(html, @"<link\b[^>]*\brel\s*=\s*[""']shortlink[""'][^>]*>", RegexOptions.IgnoreCase);
    if (shortlinkTag.Success)
    {
        var inTag = Regex.Match(shortlinkTag.Value, @"[?&]p=(\d{2,})");
        if (inTag.Success) return inTag.Groups[1].Value;
    }

    // 2. The postid-ID class describes the page itself, not a linked post.
    var m = Regex.Match(html, @"postid-(\d{2,})");
    if (m.Success) return m.Groups[1].Value;

    // 3. Unanchored ?p=ID anywhere in the markup; may belong to another post, hence after 1 and 2.
    m = Regex.Match(html, @"[?&]p=(\d{2,})");
    if (m.Success) return m.Groups[1].Value;

    // 4. data-id / data-listing-id / data-listing_id attributes.
    m = Regex.Match(html, @"data-(?:id|listing[-_]?id)=[""'](\d{2,})[""']", RegexOptions.IgnoreCase);
    if (m.Success) return m.Groups[1].Value;

    // 5. Fallback: the visible «کد آگهی : ۳۹۲۳۰۵» label; digits are normalised to Latin first.
    m = Regex.Match(HtmlUtil.ToLatinDigits(html), @"کد[\s\S]{0,8}?آگهی[\s\S]{0,12}?(\d{3,})");
    return m.Success ? m.Groups[1].Value : null;
}
|
||||
|
||||
/// <summary>
/// Extracts the contents of every <c>&lt;loc&gt;…&lt;/loc&gt;</c> element from sitemap XML.
/// </summary>
/// <param name="xml">Sitemap XML text.</param>
/// <returns>
/// Each location with XML entities decoded (e.g. <c>&amp;amp;</c> becomes <c>&amp;</c>) and
/// surrounding whitespace trimmed; entries that are empty after trimming are omitted.
/// </returns>
private static IEnumerable<string> Locs(string xml)
    => Regex.Matches(xml, "<loc>([^<]+)</loc>")
        // Sitemap URLs are entity-escaped, so decode before the value is used as a URL.
        .Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim())
        .Where(loc => loc.Length > 0);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user