From 2485173aadc13cf4436ee96297c0a6f8520c6bc4 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sun, 7 Jun 2026 21:23:36 +0330 Subject: [PATCH] [Ingest] Fix Divar: use POST search API (GET was anti-bot blocked) Divar's /v8/web-search GET returns a BLOCKING_VIEW (anti-bot), so the old source pulled nothing useful and could scrape the block message. Switch to the working POST /v8/postlist/w/search with a browser User-Agent and a city-id map (numeric id passthrough; tehran=1 default). Skip responses that are non-2xx or contain BLOCKING_VIEW so the block page is never ingested. Verified locally: fetched 25 real Tehran job posts into the review queue, 0 block messages. Co-Authored-By: Claude Opus 4.8 --- .../Services/Scraping/DivarListingSource.cs | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs b/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs index 58474d1..ae83213 100644 --- a/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs +++ b/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs @@ -11,7 +11,10 @@ namespace JobsMedical.Web.Services.Scraping; /// public class DivarListingSource : IListingSource { - private const string BaseUrl = "https://api.divar.ir/v8/web-search"; + // Divar's web-search GET is anti-bot protected (returns a BLOCKING_VIEW). Their real search + // is this POST endpoint, which returns POST_ROW widgets we can harvest. 
+ private const string SearchUrl = "https://api.divar.ir/v8/postlist/w/search"; + private const string Ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"; private readonly ScrapeHttpClients _clients; private readonly ILogger _log; @@ -27,7 +30,7 @@ public class DivarListingSource : IListingSource { var queries = AppSetting.SplitList(s.DivarQueries); if (!s.DivarEnabled || queries.Count == 0) return Array.Empty(); - var city = string.IsNullOrWhiteSpace(s.DivarCity) ? "tehran" : s.DivarCity.Trim(); + var cityId = CityId(s.DivarCity); var client = _clients.For(s, s.DivarUseProxy); var items = new List(); @@ -35,8 +38,26 @@ public class DivarListingSource : IListingSource { try { - var url = $"{BaseUrl}/{city}/jobs?q={Uri.EscapeDataString(q)}"; - var body = await client.GetStringAsync(url, ct); + var payload = JsonSerializer.Serialize(new + { + city_ids = new[] { cityId }, + search_data = new + { + form_data = new { data = new { category = new { str = new { value = "jobs" } } } }, + query = q + } + }); + using var req = new HttpRequestMessage(HttpMethod.Post, SearchUrl) + { Content = new StringContent(payload, Encoding.UTF8, "application/json") }; + req.Headers.TryAddWithoutValidation("User-Agent", Ua); + + using var resp = await client.SendAsync(req, ct); + var body = await resp.Content.ReadAsStringAsync(ct); + if (!resp.IsSuccessStatusCode || body.Contains("BLOCKING_VIEW")) + { + _log.LogWarning("Divar blocked/failed for query {Query} (HTTP {Status})", q, (int)resp.StatusCode); + continue; + } using var doc = JsonDocument.Parse(body); foreach (var text in Harvest(doc.RootElement).Take(25)) items.Add(new ScrapedItem("دیوار", text, "https://divar.ir")); @@ -46,6 +67,24 @@ public class DivarListingSource : IListingSource return items; } + /// Divar uses numeric city IDs in its API. Pass a number through; map common slugs; + /// default to Tehran (1). Admin can enter the numeric id directly in settings. 
+    private static string CityId(string? city)
+    {
+        city = (city ?? "").Trim().ToLowerInvariant(); // normalise so Latin slugs match regardless of case/padding
+        if (int.TryParse(city, System.Globalization.NumberStyles.None, System.Globalization.CultureInfo.InvariantCulture, out _)) return city; // unsigned ASCII digits only: "-5"/"+5" are not ids and must not be passed through
+        return city switch
+        {
+            "tehran" or "تهران" => "1",
+            "isfahan" or "esfahan" or "اصفهان" => "3",
+            "mashhad" or "مشهد" => "4",
+            "shiraz" or "شیراز" => "5",
+            "tabriz" or "تبریز" => "6",
+            "karaj" or "کرج" => "1745",
+            _ => "1", // unknown, empty, or malformed input falls back to Tehran
+        };
+    }
+
     private static readonly string[] DescKeys = { "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };