[Ingest] Fix Divar: use POST search API (GET was anti-bot blocked)
Divar's GET /v8/web-search endpoint returns a BLOCKING_VIEW (anti-bot) response, so the old source pulled nothing useful and could end up scraping the block message itself. Switch to the working POST /v8/postlist/w/search endpoint with a browser User-Agent and a city-id map (numeric ids pass through; the default is tehran=1). Skip responses that are non-2xx or contain BLOCKING_VIEW so the block page is never ingested. Verified locally: fetched 25 real Tehran job posts into the review queue, 0 block messages.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -11,7 +11,10 @@ namespace JobsMedical.Web.Services.Scraping;
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public class DivarListingSource : IListingSource
|
public class DivarListingSource : IListingSource
|
||||||
{
|
{
|
||||||
private const string BaseUrl = "https://api.divar.ir/v8/web-search";
|
// Divar's web-search GET is anti-bot protected (returns a BLOCKING_VIEW). Their real search
|
||||||
|
// is this POST endpoint, which returns POST_ROW widgets we can harvest.
|
||||||
|
private const string SearchUrl = "https://api.divar.ir/v8/postlist/w/search";
|
||||||
|
private const string Ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36";
|
||||||
private readonly ScrapeHttpClients _clients;
|
private readonly ScrapeHttpClients _clients;
|
||||||
private readonly ILogger<DivarListingSource> _log;
|
private readonly ILogger<DivarListingSource> _log;
|
||||||
|
|
||||||
@@ -27,7 +30,7 @@ public class DivarListingSource : IListingSource
|
|||||||
{
|
{
|
||||||
var queries = AppSetting.SplitList(s.DivarQueries);
|
var queries = AppSetting.SplitList(s.DivarQueries);
|
||||||
if (!s.DivarEnabled || queries.Count == 0) return Array.Empty<ScrapedItem>();
|
if (!s.DivarEnabled || queries.Count == 0) return Array.Empty<ScrapedItem>();
|
||||||
var city = string.IsNullOrWhiteSpace(s.DivarCity) ? "tehran" : s.DivarCity.Trim();
|
var cityId = CityId(s.DivarCity);
|
||||||
|
|
||||||
var client = _clients.For(s, s.DivarUseProxy);
|
var client = _clients.For(s, s.DivarUseProxy);
|
||||||
var items = new List<ScrapedItem>();
|
var items = new List<ScrapedItem>();
|
||||||
@@ -35,8 +38,26 @@ public class DivarListingSource : IListingSource
|
|||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
var url = $"{BaseUrl}/{city}/jobs?q={Uri.EscapeDataString(q)}";
|
var payload = JsonSerializer.Serialize(new
|
||||||
var body = await client.GetStringAsync(url, ct);
|
{
|
||||||
|
city_ids = new[] { cityId },
|
||||||
|
search_data = new
|
||||||
|
{
|
||||||
|
form_data = new { data = new { category = new { str = new { value = "jobs" } } } },
|
||||||
|
query = q
|
||||||
|
}
|
||||||
|
});
|
||||||
|
using var req = new HttpRequestMessage(HttpMethod.Post, SearchUrl)
|
||||||
|
{ Content = new StringContent(payload, Encoding.UTF8, "application/json") };
|
||||||
|
req.Headers.TryAddWithoutValidation("User-Agent", Ua);
|
||||||
|
|
||||||
|
using var resp = await client.SendAsync(req, ct);
|
||||||
|
var body = await resp.Content.ReadAsStringAsync(ct);
|
||||||
|
if (!resp.IsSuccessStatusCode || body.Contains("BLOCKING_VIEW"))
|
||||||
|
{
|
||||||
|
_log.LogWarning("Divar blocked/failed for query {Query} (HTTP {Status})", q, (int)resp.StatusCode);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
using var doc = JsonDocument.Parse(body);
|
using var doc = JsonDocument.Parse(body);
|
||||||
foreach (var text in Harvest(doc.RootElement).Take(25))
|
foreach (var text in Harvest(doc.RootElement).Take(25))
|
||||||
items.Add(new ScrapedItem("دیوار", text, "https://divar.ir"));
|
items.Add(new ScrapedItem("دیوار", text, "https://divar.ir"));
|
||||||
@@ -46,6 +67,24 @@ public class DivarListingSource : IListingSource
|
|||||||
return items;
|
return items;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Resolves the configured Divar city setting to the numeric city id string that is sent
/// in the search request. Divar uses numeric city IDs in its API, so an admin may enter
/// the id directly in settings; a handful of common slugs (Latin or Persian) are mapped too.
/// </summary>
/// <param name="city">
/// Raw setting value: an unsigned numeric id, a known city slug, or null/blank.
/// Surrounding whitespace and letter case are ignored.
/// </param>
/// <returns>
/// The id in canonical decimal form when <paramref name="city"/> is an unsigned integer,
/// the mapped id when it is a known slug, and "1" (Tehran) for anything else.
/// </returns>
private static string CityId(string? city)
{
    var key = (city ?? "").Trim().ToLowerInvariant();

    // Digits only, invariant culture: NumberStyles.None rejects signs and embedded
    // whitespace, so values like "-5" or "+3" are not forwarded as city ids, and the
    // result does not depend on the server's current culture. Re-formatting the parsed
    // value normalizes zero-padded input such as "007" to "7".
    if (int.TryParse(key, System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture, out var id))
    {
        return id.ToString(System.Globalization.CultureInfo.InvariantCulture);
    }

    return key switch
    {
        "tehran" or "تهران" => "1",
        "isfahan" or "esfahan" or "اصفهان" => "3",
        "mashhad" or "مشهد" => "4",
        "shiraz" or "شیراز" => "5",
        "tabriz" or "تبریز" => "6",
        "karaj" or "کرج" => "1745",
        // Unknown, blank, or malformed values fall back to Tehran.
        _ => "1",
    };
}
|
||||||
|
|
||||||
private static readonly string[] DescKeys =
|
private static readonly string[] DescKeys =
|
||||||
{ "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };
|
{ "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user