using System.Text;
using System.Text.Json;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Best-effort Divar fetch: queries Divar's search endpoint for each configured term and harvests
/// post titles and descriptions. The enabled flag, city and queries come from admin settings (DB).
/// Divar's private API shifts shape, so we walk the JSON tolerantly and fail soft.
/// </summary>
public class DivarListingSource : IListingSource
{
// Divar's web-search GET is anti-bot protected (it returns a BLOCKING_VIEW). Their real search
// is this POST endpoint, which returns POST_ROW widgets we can harvest.
private const string SearchUrl = "https://api.divar.ir/v8/postlist/w/search";
// Browser-like User-Agent header value sent with both the search and the post-detail requests.
private const string Ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36";
// Collaborators handed in through the constructor: the HTTP client provider and the logger.
private readonly ScrapeHttpClients _clients;
private readonly ILogger _log;

/// <summary>Stores the HTTP client provider and the logger used by the fetch methods.</summary>
/// <param name="clients">Provider from which the per-settings <c>HttpClient</c> is obtained.</param>
/// <param name="log">Logger that receives the warning for every soft failure.</param>
public DivarListingSource(ScrapeHttpClients clients, ILogger log) =>
    (_clients, _log) = (clients, log);

/// <summary>Name of this source; the same literal is written into every scraped item.</summary>
public string Name => "دیوار";
/// <summary>
/// Runs one Divar search per configured query and converts the harvested rows into scraped items.
/// </summary>
/// <remarks>
/// At most 25 rows are taken per query, and every row that carries a post token costs one extra
/// detail request (full description, phone numbers, approximate coordinates). A query that is
/// blocked or throws is logged and skipped so the remaining queries still run.
/// </remarks>
/// <param name="s">Admin settings; reads DivarEnabled, DivarQueries, DivarCity and DivarUseProxy.</param>
/// <param name="ct">Token that aborts the whole fetch.</param>
/// <returns>The collected items; empty when Divar is disabled or no queries are configured.</returns>
/// <exception cref="OperationCanceledException">
/// Thrown when <paramref name="ct"/> is cancelled. HTTP timeouts (cancellation that did not come
/// from <paramref name="ct"/>) are still treated as a soft failure of that one query.
/// </exception>
// NOTE(review): the generic arguments on this signature were lost in the copy under review;
// IReadOnlyList<ScrapedItem> is inferred from the two return statements below. Confirm that it
// matches the IListingSource declaration.
public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
{
    var queries = AppSetting.SplitList(s.DivarQueries);
    if (!s.DivarEnabled || queries.Count == 0) return Array.Empty<ScrapedItem>();

    var cityId = CityId(s.DivarCity);
    // Loop-invariant: every result is from the one city we search, so resolve its label once.
    var cityLabel = CityLabel(s.DivarCity);
    var client = _clients.For(s, s.DivarUseProxy);
    var items = new List<ScrapedItem>();

    foreach (var q in queries)
    {
        try
        {
            var payload = JsonSerializer.Serialize(new
            {
                city_ids = new[] { cityId },
                search_data = new
                {
                    form_data = new { data = new { category = new { str = new { value = "jobs" } } } },
                    query = q
                }
            });
            using var req = new HttpRequestMessage(HttpMethod.Post, SearchUrl)
            {
                Content = new StringContent(payload, Encoding.UTF8, "application/json")
            };
            req.Headers.TryAddWithoutValidation("User-Agent", Ua);
            using var resp = await client.SendAsync(req, ct);
            var body = await resp.Content.ReadAsStringAsync(ct);
            if (!resp.IsSuccessStatusCode || body.Contains("BLOCKING_VIEW"))
            {
                _log.LogWarning("Divar blocked/failed for query {Query} (HTTP {Status})", q, (int)resp.StatusCode);
                continue;
            }

            using var doc = JsonDocument.Parse(body);
            foreach (var (text, token) in Harvest(doc.RootElement).Take(25))
            {
                // The detail fetch fails soft, so check here or a cancelled run would still
                // walk through every remaining row.
                ct.ThrowIfCancellationRequested();

                var url = token is not null ? $"https://divar.ir/v/{token}" : "https://divar.ir";
                var itemText = text;
                // Stamp the city so the parser/AI always resolve a location (Divar's own location
                // line isn't always in the search row; the searched city is authoritative).
                if (!string.IsNullOrWhiteSpace(cityLabel) && !text.Contains(cityLabel))
                    itemText += $"\n📍 {cityLabel}";

                double? lat = null, lng = null;
                if (token is not null)
                {
                    // One detail fetch yields the FULL description, the phone, AND the map center.
                    // The search row only carries a short one-line summary; the rich ad body
                    // lives on the post detail.
                    var (phones, gLat, gLng, fullDesc) = await FetchDetailAsync(client, token, ct);
                    if (!string.IsNullOrWhiteSpace(fullDesc) && !itemText.Contains(fullDesc))
                        itemText += "\n" + fullDesc;
                    // Only append the numbers when none of them is already part of the text.
                    if (phones.Count > 0 && !phones.Any(itemText.Contains))
                        itemText += "\nشماره تماس: " + string.Join("، ", phones);
                    lat = gLat;
                    lng = gLng;
                }

                items.Add(new ScrapedItem("دیوار", itemText, url, lat, lng));
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The caller asked us to stop: propagate instead of logging it as a failed query
            // and then trying (and failing) every remaining query as well.
            throw;
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "Divar fetch failed for query {Query}", q);
        }
    }

    return items;
}
/// <summary>
/// Resolves the configured city to the numeric city id sent in the search payload.
/// </summary>
/// <remarks>
/// The input is trimmed and lower-cased first. Anything that parses as an integer is passed
/// through unchanged (so an admin can enter the numeric id directly), known English/Persian
/// names are mapped, and everything else falls back to Tehran ("1").
/// </remarks>
private static string CityId(string? city)
{
    var key = city?.Trim().ToLowerInvariant() ?? string.Empty;

    // Already numeric: hand it back as typed (after trimming).
    if (int.TryParse(key, out _))
    {
        return key;
    }

    switch (key)
    {
        case "isfahan":
        case "esfahan":
        case "اصفهان":
            return "3";
        case "mashhad":
        case "مشهد":
            return "4";
        case "shiraz":
        case "شیراز":
            return "5";
        case "tabriz":
        case "تبریز":
            return "6";
        case "karaj":
        case "کرج":
            return "1745";
        default:
            // "tehran" / "تهران" and every unrecognised value resolve to Tehran.
            return "1";
    }
}
/// <summary>
/// Persian display name of the city the search actually targets, used to stamp every Divar
/// result with its location. Returns an empty string when no name is known for that city.
/// </summary>
/// <remarks>
/// The label is derived from <see cref="CityId"/> instead of from the raw setting so the stamp
/// can never disagree with the city id that was sent to Divar:
/// an unknown or empty setting is searched as Tehran and is therefore labelled Tehran, and a
/// numeric id we have no name for yields an empty label (the caller skips the stamp) rather
/// than writing the bare number into the ad text as if it were a place name.
/// </remarks>
private static string CityLabel(string? city) => CityId(city) switch
{
    "1" => "تهران",
    "3" => "اصفهان",
    "4" => "مشهد",
    "5" => "شیراز",
    "6" => "تبریز",
    "1745" => "کرج",
    _ => "",
};
// The post detail endpoint returns the FULL description. Many Divar job ads write the phone
// straight into the body, so we can harvest it without Divar's (login-gated) contact reveal.
private const string PostDetailUrl = "https://api.divar.ir/v8/posts-v2/web/";

/// <summary>
/// Fetches a post's detail JSON ONCE and harvests (a) any contact numbers found in the response
/// text, (b) the post's approximate map coordinates and (c) the full ad description.
/// Fails soft: on a non-success status, a BLOCKING_VIEW response or any error it returns an
/// empty phone list and nulls; if the body is not valid JSON the phones are still returned.
/// </summary>
/// <param name="client">HTTP client used to send the request.</param>
/// <param name="token">Divar post token; appended to the detail endpoint URL.</param>
/// <param name="ct">Token that aborts the request.</param>
/// <returns>Harvested phones, coordinates (or nulls) and description (or null).</returns>
/// <exception cref="OperationCanceledException">
/// Thrown when <paramref name="ct"/> is cancelled; a timeout that did not originate from
/// <paramref name="ct"/> is still handled as a soft failure.
/// </exception>
private async Task<(List<string> phones, double? lat, double? lng, string? description)> FetchDetailAsync(
    HttpClient client, string token, CancellationToken ct)
{
    try
    {
        using var req = new HttpRequestMessage(HttpMethod.Get, PostDetailUrl + token);
        req.Headers.TryAddWithoutValidation("User-Agent", Ua);
        req.Headers.TryAddWithoutValidation("Accept", "application/json");
        using var resp = await client.SendAsync(req, ct);
        if (!resp.IsSuccessStatusCode) return (new(), null, null, null);

        var body = await resp.Content.ReadAsStringAsync(ct);
        if (body.Contains("BLOCKING_VIEW")) return (new(), null, null, null);

        // Phones are harvested from the raw text, so they survive even if JSON parsing fails.
        var phones = HtmlUtil.HarvestPhones(body);
        double? lat = null, lng = null;
        string? desc = null;
        try
        {
            using var doc = JsonDocument.Parse(body);
            if (FindLatLng(doc.RootElement) is { } g)
            {
                lat = g.lat;
                lng = g.lng;
            }
            desc = FindLongestText(doc.RootElement); // the full ad body
        }
        catch (JsonException)
        {
            // Detail wasn't JSON: keep the phones already harvested from the text.
        }

        return (phones, lat, lng, desc);
    }
    catch (OperationCanceledException) when (ct.IsCancellationRequested)
    {
        // Caller-requested cancellation is not a Divar failure: let it propagate.
        throw;
    }
    catch (Exception ex)
    {
        _log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token);
        return (new(), null, null, null);
    }
}
/// <summary>
/// Picks the full ad description out of Divar's detail JSON, taken to be the longest free-text
/// string anywhere in the document.
/// </summary>
/// <remarks>
/// A string qualifies only if it is 40 to 4000 characters long, contains a space, and does not
/// mention «دیوار» (which filters Divar's own safety/boilerplate notices). The tree is walked
/// with an explicit stack instead of recursion.
/// </remarks>
/// <param name="root">Root element of the detail response.</param>
/// <returns>The longest qualifying string, trimmed, or <c>null</c> when none qualifies.</returns>
private static string? FindLongestText(JsonElement root)
{
    string? best = null;
    var stack = new Stack<JsonElement>();
    stack.Push(root);

    while (stack.Count > 0)
    {
        var e = stack.Pop();
        switch (e.ValueKind)
        {
            case JsonValueKind.Object:
                foreach (var p in e.EnumerateObject()) stack.Push(p.Value);
                break;

            case JsonValueKind.Array:
                foreach (var it in e.EnumerateArray()) stack.Push(it);
                break;

            case JsonValueKind.String:
            {
                var s = e.GetString();
                // Strictly longer wins, so on a tie the string visited first is kept.
                if (s is { Length: >= 40 and <= 4000 } && s.Contains(' ') && !s.Contains("دیوار")
                    && (best is null || s.Length > best.Length))
                {
                    best = s;
                }
                break;
            }
        }
    }

    return best?.Trim();
}
// Iran's bounding box. Guards against picking up an unrelated number pair (timestamps, ids, ...).
private const double MinLat = 24, MaxLat = 40, MinLng = 44, MaxLng = 64;

/// <summary>
/// Tolerantly finds an approximate (lat, lng) anywhere in Divar's detail JSON.
/// </summary>
/// <remarks>
/// Divar's shape shifts, so the tree is walked depth-first and the first OBJECT is accepted that
/// holds BOTH a latitude-like and a longitude-like property (number or numeric string) whose
/// values fall inside the bounding box above. Pairing inside one object avoids matching a stray
/// latitude to an unrelated longitude. An object's own properties are checked before its
/// children are searched.
/// </remarks>
/// <param name="el">Element to search (object, array or scalar).</param>
/// <returns>The coordinate pair, or <c>null</c> if nothing plausible is found.</returns>
private static (double lat, double lng)? FindLatLng(JsonElement el)
{
    switch (el.ValueKind)
    {
        case JsonValueKind.Object:
        {
            double? latitude = null, longitude = null;
            foreach (var prop in el.EnumerateObject())
            {
                // Only the first readable value per axis counts; later duplicates are ignored.
                if (latitude is null && IsLatKey(prop.Name) && TryNum(prop.Value, out var parsedLat))
                {
                    latitude = parsedLat;
                }
                else if (longitude is null && IsLngKey(prop.Name) && TryNum(prop.Value, out var parsedLng))
                {
                    longitude = parsedLng;
                }
            }

            if (latitude.HasValue && longitude.HasValue)
            {
                var (lat, lng) = (latitude.Value, longitude.Value);
                var insideBox = lat >= MinLat && lat <= MaxLat && lng >= MinLng && lng <= MaxLng;
                if (insideBox)
                {
                    return (lat, lng);
                }
            }

            // No usable pair on this object itself: descend into its property values.
            foreach (var prop in el.EnumerateObject())
            {
                var nested = FindLatLng(prop.Value);
                if (nested.HasValue)
                {
                    return nested;
                }
            }

            return null;
        }

        case JsonValueKind.Array:
            foreach (var item in el.EnumerateArray())
            {
                var nested = FindLatLng(item);
                if (nested.HasValue)
                {
                    return nested;
                }
            }

            return null;

        default:
            return null;
    }
}
/// <summary>True when the property name is "latitude" or "lat", ignoring case.</summary>
private static bool IsLatKey(string k)
{
    const StringComparison IgnoreCase = StringComparison.OrdinalIgnoreCase;
    return k.Equals("latitude", IgnoreCase) || k.Equals("lat", IgnoreCase);
}
/// <summary>True when the property name is "longitude", "lng", "lon" or "long", ignoring case.</summary>
private static bool IsLngKey(string k)
{
    const StringComparison IgnoreCase = StringComparison.OrdinalIgnoreCase;
    return k.Equals("longitude", IgnoreCase)
        || k.Equals("lng", IgnoreCase)
        || k.Equals("lon", IgnoreCase)
        || k.Equals("long", IgnoreCase);
}
/// <summary>
/// Reads a coordinate that may be encoded as a JSON number or as a numeric string ("35.7").
/// Strings are parsed with the invariant culture.
/// </summary>
/// <param name="v">JSON value to read.</param>
/// <param name="d">The parsed value; 0 when the value is neither a number nor a parsable string.</param>
/// <returns><c>true</c> when a number could be read.</returns>
private static bool TryNum(JsonElement v, out double d)
{
    switch (v.ValueKind)
    {
        case JsonValueKind.Number:
            return v.TryGetDouble(out d);

        case JsonValueKind.String:
            return double.TryParse(
                v.GetString(),
                System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture,
                out d);

        default:
            d = 0;
            return false;
    }
}
// Property names that may carry descriptive text on a row object, in the order they are appended.
private static readonly string[] DescKeys =
    { "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };

/// <summary>
/// Walks the search response depth-first and yields one (text, token) pair for every object
/// that has a string "title".
/// </summary>
/// <remarks>
/// The text is the title followed by every non-empty description field listed in
/// <see cref="DescKeys"/>, joined with " — ". ALL present fields are appended, not just the
/// first, because the location/time line is usually in <c>bottom_description_text</c>. Rows whose
/// trimmed text is shorter than 15 characters are dropped. An object is yielded before anything
/// found inside its children.
/// </remarks>
/// <param name="el">Element to search (object, array or scalar).</param>
/// <returns>Lazily produced pairs of row text and post token (token may be <c>null</c>).</returns>
private static IEnumerable<(string text, string? token)> Harvest(JsonElement el)
{
    switch (el.ValueKind)
    {
        case JsonValueKind.Object:
            if (el.TryGetProperty("title", out var title) && title.ValueKind == JsonValueKind.String)
            {
                var rowText = title.GetString() ?? string.Empty;
                foreach (var key in DescKeys)
                {
                    if (!el.TryGetProperty(key, out var field) || field.ValueKind != JsonValueKind.String)
                    {
                        continue;
                    }

                    var value = field.GetString();
                    if (!string.IsNullOrEmpty(value))
                    {
                        rowText += " — " + value;
                    }
                }

                rowText = rowText.Trim();
                if (rowText.Length >= 15)
                {
                    yield return (rowText, FindToken(el));
                }
            }

            foreach (var prop in el.EnumerateObject())
            {
                foreach (var hit in Harvest(prop.Value))
                {
                    yield return hit;
                }
            }

            break;

        case JsonValueKind.Array:
            foreach (var element in el.EnumerateArray())
            {
                foreach (var hit in Harvest(element))
                {
                    yield return hit;
                }
            }

            break;
    }
}
/// <summary>
/// Finds the post token inside a widget object: the first string property named "token" whose
/// value is 6 to 16 letters/digits.
/// </summary>
/// <remarks>
/// For an object, its own "token" properties are checked before any child is searched; children
/// and array items are then searched depth-first in document order.
/// </remarks>
/// <param name="el">Element to search (object, array or scalar).</param>
/// <returns>The token, or <c>null</c> when no valid one exists.</returns>
private static string? FindToken(JsonElement el)
{
    switch (el.ValueKind)
    {
        case JsonValueKind.Object:
            // Pass 1: a valid token directly on this object wins over anything nested.
            foreach (var prop in el.EnumerateObject())
            {
                if (!prop.NameEquals("token") || prop.Value.ValueKind != JsonValueKind.String)
                {
                    continue;
                }

                var candidate = prop.Value.GetString();
                if (candidate is null)
                {
                    continue;
                }

                var lengthOk = candidate.Length >= 6 && candidate.Length <= 16;
                if (lengthOk && candidate.All(char.IsLetterOrDigit))
                {
                    return candidate;
                }
            }

            // Pass 2: search inside the property values.
            foreach (var prop in el.EnumerateObject())
            {
                if (FindToken(prop.Value) is { } nested)
                {
                    return nested;
                }
            }

            return null;

        case JsonValueKind.Array:
            foreach (var element in el.EnumerateArray())
            {
                if (FindToken(element) is { } nested)
                {
                    return nested;
                }
            }

            return null;

        default:
            return null;
    }
}
}