using System.Text;
using System.Text.Json;
using JobsMedical.Web.Models;

namespace JobsMedical.Web.Services.Scraping;

/// <summary>
/// Best-effort Divar fetch: posts each configured search term to Divar's search endpoint and
/// harvests post titles + descriptions. Enabled flag, city and queries come from admin settings
/// (DB). Divar's private API shifts shape, so the JSON is walked tolerantly and every failure is
/// soft (logged, then skipped).
/// </summary>
public class DivarListingSource : IListingSource
{
    // Divar's web-search GET is anti-bot protected (returns a BLOCKING_VIEW). Their real search
    // is this POST endpoint, which returns POST_ROW widgets we can harvest.
    private const string SearchUrl = "https://api.divar.ir/v8/postlist/w/search";

    private const string Ua =
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36";

    // The post detail endpoint returns the FULL description — many Divar job ads write the phone
    // straight into the body, so we can harvest it without Divar's (login-gated) contact reveal.
    private const string PostDetailUrl = "https://api.divar.ir/v8/posts-v2/web/";

    // Marker Divar puts in the response body when the anti-bot layer rejects the request.
    private const string BlockedMarker = "BLOCKING_VIEW";

    // Upper bound on harvested rows per query (each row with a token costs one detail request).
    private const int MaxItemsPerQuery = 25;

    // Iran's bounding box — guards against picking up an unrelated number pair (timestamps, ids…).
    private const double MinLat = 24, MaxLat = 40, MinLng = 44, MaxLng = 64;

    // Text fields of a search row that are appended to the title, in this order.
    private static readonly string[] DescKeys =
    {
        "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text",
    };

    private readonly ScrapeHttpClients _clients;

    // NOTE(review): the logger's generic argument was not visible in the reviewed text;
    // ILogger<DivarListingSource> is assumed — confirm against the DI registration.
    private readonly ILogger<DivarListingSource> _log;

    /// <summary>Creates the source with the shared scraping HTTP clients and a logger.</summary>
    public DivarListingSource(ScrapeHttpClients clients, ILogger<DivarListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    /// <summary>Display name of this source (also used as the source tag of every scraped item).</summary>
    public string Name => "دیوار";

    /// <summary>
    /// Runs every configured query against Divar and returns the harvested posts.
    /// </summary>
    /// <param name="s">Admin settings: enabled flag, city, query list and proxy preference.</param>
    /// <param name="ct">
    /// Cancellation token. When it is cancelled the method stops issuing requests and returns
    /// whatever was collected so far; it does not throw and does not log the cancellation.
    /// </param>
    /// <returns>
    /// The scraped items (possibly empty). Per-query failures are logged as warnings and skipped.
    /// </returns>
    // NOTE(review): the generic arguments of the return type were not visible in the reviewed
    // text; Task<IReadOnlyList<ScrapedItem>> is assumed — confirm against IListingSource.
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        var queries = AppSetting.SplitList(s.DivarQueries);
        if (!s.DivarEnabled || queries.Count == 0)
        {
            return Array.Empty<ScrapedItem>();
        }

        var cityId = CityId(s.DivarCity);
        // Every result is from the city we searched, so the label is resolved once, from the
        // same mapping that produced cityId.
        var cityLabel = CityLabel(s.DivarCity);
        var client = _clients.For(s, s.DivarUseProxy);
        var items = new List<ScrapedItem>();

        foreach (var q in queries)
        {
            if (ct.IsCancellationRequested)
            {
                break;
            }

            try
            {
                using var req = new HttpRequestMessage(HttpMethod.Post, SearchUrl)
                {
                    Content = new StringContent(BuildSearchPayload(cityId, q), Encoding.UTF8, "application/json"),
                };
                req.Headers.TryAddWithoutValidation("User-Agent", Ua);

                using var resp = await client.SendAsync(req, ct);
                var body = await resp.Content.ReadAsStringAsync(ct);
                if (!resp.IsSuccessStatusCode || body.Contains(BlockedMarker, StringComparison.Ordinal))
                {
                    _log.LogWarning("Divar blocked/failed for query {Query} (HTTP {Status})", q, (int)resp.StatusCode);
                    continue;
                }

                using var doc = JsonDocument.Parse(body);
                foreach (var (text, token) in Harvest(doc.RootElement).Take(MaxItemsPerQuery))
                {
                    if (ct.IsCancellationRequested)
                    {
                        break;
                    }

                    var url = token is not null ? $"https://divar.ir/v/{token}" : "https://divar.ir";
                    var itemText = text;

                    // Stamp the city so the parser/AI always resolve a location (Divar's own location
                    // line isn't always in the search row; the searched city is authoritative).
                    if (!string.IsNullOrWhiteSpace(cityLabel) && !text.Contains(cityLabel, StringComparison.Ordinal))
                    {
                        itemText += $"\n📍 {cityLabel}";
                    }

                    double? lat = null, lng = null;
                    if (token is not null)
                    {
                        // One detail fetch yields BOTH the phone and the map coordinates.
                        var (phones, gLat, gLng) = await FetchDetailAsync(client, token, ct);
                        if (phones.Count > 0 && !phones.Any(p => itemText.Contains(p, StringComparison.Ordinal)))
                        {
                            itemText += "\nشماره تماس: " + string.Join("، ", phones);
                        }

                        lat = gLat;
                        lng = gLng;
                    }

                    items.Add(new ScrapedItem("دیوار", itemText, url, lat, lng));
                }
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                // The caller cancelled: stop quietly instead of failing (and logging) every
                // remaining query. HTTP timeouts do not match this filter and stay soft failures.
                break;
            }
            catch (Exception ex)
            {
                _log.LogWarning(ex, "Divar fetch failed for query {Query}", q);
            }
        }

        return items;
    }

    /// <summary>Builds the JSON body of a "jobs" category search for one city and one query.</summary>
    private static string BuildSearchPayload(string cityId, string query) =>
        JsonSerializer.Serialize(new
        {
            city_ids = new[] { cityId },
            search_data = new
            {
                form_data = new { data = new { category = new { str = new { value = "jobs" } } } },
                query,
            },
        });

    /// <summary>
    /// Resolves the admin's city setting to the numeric city id sent to Divar. A plain
    /// non-negative number is passed through (normalised, e.g. "01" → "1"), common slugs and
    /// Persian names are mapped, and anything else — including an empty setting — defaults to
    /// Tehran ("1").
    /// </summary>
    private static string CityId(string? city)
    {
        var key = (city ?? "").Trim().ToLowerInvariant();

        // Digits only, invariant culture: a signed value such as "-5" is not a city id.
        if (int.TryParse(
                key,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out var id))
        {
            return id.ToString(System.Globalization.CultureInfo.InvariantCulture);
        }

        return key switch
        {
            "tehran" or "تهران" => "1",
            "isfahan" or "esfahan" or "اصفهان" => "3",
            "mashhad" or "مشهد" => "4",
            "shiraz" or "شیراز" => "5",
            "tabriz" or "تبریز" => "6",
            "karaj" or "کرج" => "1745",
            _ => "1",
        };
    }

    /// <summary>
    /// Persian display name of the city that is ACTUALLY searched, used to stamp every result
    /// with its location. It is derived from <see cref="CityId"/> so the stamp can never disagree
    /// with the search: an unknown slug falls back to Tehran in both places. Returns an empty
    /// string (no stamp) for a numeric id whose name is not in the table.
    /// </summary>
    private static string CityLabel(string? city) => CityId(city) switch
    {
        "1" => "تهران",
        "3" => "اصفهان",
        "4" => "مشهد",
        "5" => "شیراز",
        "6" => "تبریز",
        "1745" => "کرج",
        _ => "",
    };

    /// <summary>
    /// Fetches a post's detail JSON ONCE and harvests both (a) any contact number it contains
    /// (mostly numbers the poster wrote into the description; Divar's true "show number" reveal is
    /// auth-gated) and (b) the post's APPROXIMATE map coordinates (the privacy-fuzzed center Divar
    /// shows as a circle). Fails soft — returns whatever it could extract, and an empty result on
    /// a blocked/failed request or on cancellation.
    /// </summary>
    /// <param name="client">HTTP client to use (already proxy-configured by the caller).</param>
    /// <param name="token">Post token, appended to the detail endpoint URL.</param>
    /// <param name="ct">Cancellation token; cancellation yields an empty result without logging.</param>
    // NOTE(review): List<string> is reconstructed (the element type was not visible in the
    // reviewed text) — confirm against HtmlUtil.HarvestPhones.
    private async Task<(List<string> phones, double? lat, double? lng)> FetchDetailAsync(
        HttpClient client, string token, CancellationToken ct)
    {
        try
        {
            using var req = new HttpRequestMessage(HttpMethod.Get, PostDetailUrl + token);
            req.Headers.TryAddWithoutValidation("User-Agent", Ua);
            req.Headers.TryAddWithoutValidation("Accept", "application/json");

            using var resp = await client.SendAsync(req, ct);
            if (!resp.IsSuccessStatusCode)
            {
                return (new List<string>(), null, null);
            }

            var body = await resp.Content.ReadAsStringAsync(ct);
            if (body.Contains(BlockedMarker, StringComparison.Ordinal))
            {
                return (new List<string>(), null, null);
            }

            // Phones are harvested from the raw text, so they survive even if the JSON parse fails.
            var phones = HtmlUtil.HarvestPhones(body);

            double? lat = null, lng = null;
            try
            {
                using var doc = JsonDocument.Parse(body);
                if (FindLatLng(doc.RootElement) is { } g)
                {
                    lat = g.lat;
                    lng = g.lng;
                }
            }
            catch (JsonException)
            {
                /* detail wasn't JSON — phones still harvested from text */
            }

            return (phones, lat, lng);
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Caller cancelled — not a Divar failure, so no warning.
            return (new List<string>(), null, null);
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token);
            return (new List<string>(), null, null);
        }
    }

    /// <summary>
    /// Tolerantly finds an approximate (lat, lng) anywhere in Divar's detail JSON. Divar's shape
    /// shifts (sometimes <c>latitude</c>/<c>longitude</c>, sometimes nested under
    /// <c>location</c>/<c>coordinates</c>), so the tree is walked depth-first and the first OBJECT
    /// that holds BOTH a latitude-like and a longitude-like numeric property whose values fall
    /// inside Iran's bounding box wins. Pairing within one object avoids matching a stray lat to
    /// an unrelated lng.
    /// </summary>
    /// <returns>The coordinate pair, or <c>null</c> if nothing plausible is found.</returns>
    private static (double lat, double lng)? FindLatLng(JsonElement el)
    {
        if (el.ValueKind == JsonValueKind.Object)
        {
            double? lat = null, lng = null;
            foreach (var p in el.EnumerateObject())
            {
                if (lat is null && IsLatKey(p.Name) && TryNum(p.Value, out var la))
                {
                    lat = la;
                }
                else if (lng is null && IsLngKey(p.Name) && TryNum(p.Value, out var lo))
                {
                    lng = lo;
                }
            }

            if (lat is double latitude && lng is double longitude
                && latitude is >= MinLat and <= MaxLat
                && longitude is >= MinLng and <= MaxLng)
            {
                return (latitude, longitude);
            }

            // No plausible pair on this object itself — look inside its children.
            foreach (var p in el.EnumerateObject())
            {
                if (FindLatLng(p.Value) is { } found)
                {
                    return found;
                }
            }
        }
        else if (el.ValueKind == JsonValueKind.Array)
        {
            foreach (var item in el.EnumerateArray())
            {
                if (FindLatLng(item) is { } found)
                {
                    return found;
                }
            }
        }

        return null;
    }

    /// <summary>True for property names that denote a latitude (<c>latitude</c>, <c>lat</c>; case-insensitive).</summary>
    private static bool IsLatKey(string k) =>
        k.Equals("latitude", StringComparison.OrdinalIgnoreCase)
        || k.Equals("lat", StringComparison.OrdinalIgnoreCase);

    /// <summary>
    /// True for property names that denote a longitude (<c>longitude</c>, <c>lng</c>, <c>lon</c>,
    /// <c>long</c>; case-insensitive).
    /// </summary>
    private static bool IsLngKey(string k) =>
        k.Equals("longitude", StringComparison.OrdinalIgnoreCase)
        || k.Equals("lng", StringComparison.OrdinalIgnoreCase)
        || k.Equals("lon", StringComparison.OrdinalIgnoreCase)
        || k.Equals("long", StringComparison.OrdinalIgnoreCase);

    /// <summary>
    /// Reads a coordinate that may be a JSON number or a numeric string ("35.7"), parsed with the
    /// invariant culture. Returns false (and 0) for any other JSON kind or an unparsable value.
    /// </summary>
    private static bool TryNum(JsonElement v, out double d)
    {
        if (v.ValueKind == JsonValueKind.Number)
        {
            return v.TryGetDouble(out d);
        }

        if (v.ValueKind == JsonValueKind.String)
        {
            return double.TryParse(
                v.GetString(),
                System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture,
                out d);
        }

        d = 0;
        return false;
    }

    /// <summary>
    /// Walks the search response depth-first and yields one (text, token) pair for every object
    /// that has a string <c>title</c> and whose combined text is at least 15 characters long.
    /// The text is the title followed by every non-empty field listed in <see cref="DescKeys"/>,
    /// joined with " — ". The token is the first valid post token found inside that object, or
    /// null. Nested objects are visited too, so a child with its own title yields separately.
    /// </summary>
    private static IEnumerable<(string text, string? token)> Harvest(JsonElement el)
    {
        if (el.ValueKind == JsonValueKind.Object)
        {
            if (el.TryGetProperty("title", out var t) && t.ValueKind == JsonValueKind.String)
            {
                var sb = new StringBuilder(t.GetString());

                // Append ALL present description fields — the location/time line (e.g. "… in
                // Tehran, Jannatabad") is usually in bottom_description_text, so don't stop at
                // the first match.
                foreach (var k in DescKeys)
                {
                    if (el.TryGetProperty(k, out var d)
                        && d.ValueKind == JsonValueKind.String
                        && d.GetString() is { Length: > 0 } v)
                    {
                        sb.Append(" — ").Append(v);
                    }
                }

                var text = sb.ToString().Trim();
                if (text.Length >= 15)
                {
                    yield return (text, FindToken(el));
                }
            }

            foreach (var p in el.EnumerateObject())
            {
                foreach (var x in Harvest(p.Value))
                {
                    yield return x;
                }
            }
        }
        else if (el.ValueKind == JsonValueKind.Array)
        {
            foreach (var item in el.EnumerateArray())
            {
                foreach (var x in Harvest(item))
                {
                    yield return x;
                }
            }
        }
    }

    /// <summary>
    /// Finds the post token within a widget object: the first string property named
    /// <c>token</c> that is 6–16 letters/digits long. Direct properties of an object are checked
    /// before its children are searched. Returns null when no such property exists.
    /// </summary>
    private static string? FindToken(JsonElement el)
    {
        if (el.ValueKind == JsonValueKind.Object)
        {
            foreach (var p in el.EnumerateObject())
            {
                if (p.NameEquals("token") && p.Value.ValueKind == JsonValueKind.String)
                {
                    var v = p.Value.GetString();
                    if (v is not null && v.Length is >= 6 and <= 16 && v.All(char.IsLetterOrDigit))
                    {
                        return v;
                    }
                }
            }

            foreach (var p in el.EnumerateObject())
            {
                var r = FindToken(p.Value);
                if (r is not null)
                {
                    return r;
                }
            }
        }
        else if (el.ValueKind == JsonValueKind.Array)
        {
            foreach (var item in el.EnumerateArray())
            {
                var r = FindToken(item);
                if (r is not null)
                {
                    return r;
                }
            }
        }

        return null;
    }
}