Files
hamkadr/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs
T
soroush.asadi 380243b669
CI/CD / CI · dotnet build (push) Successful in 2m6s
CI/CD / Deploy · hamkadr (push) Successful in 2m3s
Divar geo-coords to facility map + medical gate + RawListing FK/geo migrations
2026-06-09 21:38:55 +03:30

248 lines
12 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Text;
using System.Text.Json;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Best-effort Divar fetch: queries Divar's web-search JSON for each term and harvests post
/// titles + descriptions. Enabled + city + queries come from admin settings (DB). Divar's
/// private API shifts shape, so we walk JSON tolerantly and fail soft.
/// </summary>
public class DivarListingSource : IListingSource
{
// Divar's web-search GET is anti-bot protected (returns a BLOCKING_VIEW). Their real search
// is this POST endpoint, which returns POST_ROW widgets we can harvest.
private const string SearchUrl = "https://api.divar.ir/v8/postlist/w/search";
private const string Ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36";
private readonly ScrapeHttpClients _clients;
private readonly ILogger<DivarListingSource> _log;
/// <summary>
/// Stores the HTTP-client provider and the logger that the fetch methods of this source use.
/// </summary>
/// <param name="clients">Provider queried for an <see cref="HttpClient"/> on every fetch.</param>
/// <param name="log">Logger that receives the fail-soft warnings.</param>
public DivarListingSource(ScrapeHttpClients clients, ILogger<DivarListingSource> log)
    => (_clients, _log) = (clients, log);
/// <summary>Display name of this source: "Divar", written in Persian.</summary>
public string Name
{
    get { return "دیوار"; }
}
/// <summary>
/// Runs every configured Divar search query and converts the harvested post rows into
/// <see cref="ScrapedItem"/>s. For each row that carries a post token, one extra detail request
/// is made to pick up phone numbers and approximate map coordinates.
/// </summary>
/// <param name="s">Admin settings: enabled flag, query list, city and proxy choice.</param>
/// <param name="ct">
/// Cancels the run. When cancellation is requested the method stops issuing requests and returns
/// the items gathered so far; it does not throw.
/// </param>
/// <returns>
/// The collected items; empty when Divar is disabled or no queries are configured. A failure of
/// one query is logged and does not prevent the remaining queries from running.
/// </returns>
public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
{
    var queries = AppSetting.SplitList(s.DivarQueries);
    if (!s.DivarEnabled || queries.Count == 0) return Array.Empty<ScrapedItem>();

    var cityId = CityId(s.DivarCity);
    // Every result is from the city we searched, so the label is the same for all queries.
    var cityLabel = CityLabel(s.DivarCity);
    var client = _clients.For(s, s.DivarUseProxy);
    var items = new List<ScrapedItem>();

    foreach (var q in queries)
    {
        // Once the caller has cancelled, every further request would fail immediately and only
        // produce a warning per query; stop instead.
        if (ct.IsCancellationRequested) break;

        try
        {
            var payload = JsonSerializer.Serialize(new
            {
                city_ids = new[] { cityId },
                search_data = new
                {
                    form_data = new { data = new { category = new { str = new { value = "jobs" } } } },
                    query = q
                }
            });

            using var req = new HttpRequestMessage(HttpMethod.Post, SearchUrl)
            { Content = new StringContent(payload, Encoding.UTF8, "application/json") };
            req.Headers.TryAddWithoutValidation("User-Agent", Ua);

            using var resp = await client.SendAsync(req, ct);
            var body = await resp.Content.ReadAsStringAsync(ct);
            if (!resp.IsSuccessStatusCode || body.Contains("BLOCKING_VIEW"))
            {
                _log.LogWarning("Divar blocked/failed for query {Query} (HTTP {Status})", q, (int)resp.StatusCode);
                continue;
            }

            using var doc = JsonDocument.Parse(body);
            foreach (var (text, token) in Harvest(doc.RootElement).Take(25))
            {
                // Avoid firing up to 25 doomed detail requests after cancellation.
                if (ct.IsCancellationRequested) break;

                var url = token is not null ? $"https://divar.ir/v/{token}" : "https://divar.ir";
                var itemText = text;

                // Stamp the city so the parser/AI always resolve a location (Divar's own location
                // line isn't always in the search row; the searched city is authoritative).
                if (!string.IsNullOrWhiteSpace(cityLabel) && !text.Contains(cityLabel))
                    itemText += $"\n📍 {cityLabel}";

                double? lat = null, lng = null;
                if (token is not null)
                {
                    // One detail fetch yields BOTH the phone and the map coordinates.
                    var (phones, gLat, gLng) = await FetchDetailAsync(client, token, ct);
                    if (phones.Count > 0 && !phones.Any(itemText.Contains))
                        itemText += "\nشماره تماس: " + string.Join("، ", phones);
                    lat = gLat;
                    lng = gLng;
                }

                items.Add(new ScrapedItem("دیوار", itemText, url, lat, lng));
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Caller-requested cancellation is not a Divar failure: no warning, no further queries.
            // (An HttpClient timeout also surfaces as a cancellation exception, but then `ct` is
            // not cancelled, so it still takes the fail-soft path below.)
            break;
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "Divar fetch failed for query {Query}", q);
        }
    }

    return items;
}
/// <summary>
/// Resolves the admin-configured city to the numeric city id sent in the search payload.
/// A purely numeric value (ASCII, Persian or Arabic-Indic digits) is passed through in canonical
/// ASCII form; the known English/Persian slugs are mapped; anything else falls back to Tehran (1).
/// </summary>
/// <param name="city">Raw setting value: a numeric id, a slug, a Persian name, or null/empty.</param>
/// <returns>The city id as an ASCII digit string.</returns>
/// <remarks>NOTE(review): the id values themselves cannot be verified from this file — confirm
/// them against Divar.</remarks>
private static string CityId(string? city)
{
    var key = NormalizeCity(city);

    // NumberStyles.None accepts digits only, so signed input such as "-5" is not mistaken for an
    // id; the invariant culture keeps the check independent of the server's locale.
    if (int.TryParse(key, System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture, out var id))
        return id.ToString(System.Globalization.CultureInfo.InvariantCulture);

    return key switch
    {
        "tehran" or "تهران" => "1",
        "isfahan" or "esfahan" or "اصفهان" => "3",
        "mashhad" or "مشهد" => "4",
        "shiraz" or "شیراز" => "5",
        "tabriz" or "تبریز" => "6",
        "karaj" or "کرج" => "1745",
        _ => "1",
    };
}

/// <summary>
/// Persian display name of the city that is actually searched, used to stamp every Divar result
/// with its location. The label is derived from <see cref="CityId"/> so the stamp always agrees
/// with the search: an unrecognised slug (or an empty setting) searches Tehran and is therefore
/// labelled Tehran.
/// </summary>
/// <param name="city">Raw setting value, as passed to <see cref="CityId"/>.</param>
/// <returns>
/// The Persian city name, or an empty string when the id has no known name (so callers skip the
/// stamp instead of writing a bare number).
/// </returns>
private static string CityLabel(string? city) => CityId(city) switch
{
    "1" => "تهران",
    "3" => "اصفهان",
    "4" => "مشهد",
    "5" => "شیراز",
    "6" => "تبریز",
    "1745" => "کرج",
    _ => "",
};

/// <summary>
/// Trims and lower-cases the setting value and rewrites any non-ASCII decimal digit (for example
/// Persian "۱۷۴۵") to its ASCII equivalent, so ids typed on a Persian keyboard are recognised.
/// </summary>
private static string NormalizeCity(string? city)
{
    var key = (city ?? "").Trim().ToLowerInvariant();
    if (!key.Any(c => c > '9' && char.IsDigit(c))) return key;

    return new string(key
        .Select(c => char.IsDigit(c)
            ? (char)('0' + System.Globalization.CharUnicodeInfo.GetDecimalDigitValue(c))
            : c)
        .ToArray());
}
// Base URL of the post detail endpoint; the post token is appended to it. The detail response
// carries the FULL description, and many Divar job ads write the phone straight into that body,
// so it can be harvested without Divar's (login-gated) contact reveal.
private const string PostDetailUrl = "https://api.divar.ir/v8/posts-v2/web/";

/// <summary>
/// Requests a post's detail document once and extracts two things from it: phone numbers found
/// in the response text, and the post's approximate map coordinates (Divar shows a
/// privacy-fuzzed centre, so the position is not exact).
/// </summary>
/// <param name="client">HTTP client to send the request with.</param>
/// <param name="token">Post token; appended to <see cref="PostDetailUrl"/>.</param>
/// <param name="ct">Cancellation token passed to the HTTP calls.</param>
/// <returns>
/// Harvested phones plus latitude/longitude (null when none were found). A non-success status,
/// a blocking view, or any exception yields an empty phone list and null coordinates; this
/// method never throws.
/// </returns>
private async Task<(List<string> phones, double? lat, double? lng)> FetchDetailAsync(
    HttpClient client, string token, CancellationToken ct)
{
    try
    {
        using var request = new HttpRequestMessage(HttpMethod.Get, PostDetailUrl + token);
        request.Headers.TryAddWithoutValidation("User-Agent", Ua);
        request.Headers.TryAddWithoutValidation("Accept", "application/json");

        using var response = await client.SendAsync(request, ct);
        if (!response.IsSuccessStatusCode)
        {
            return (new List<string>(), null, null);
        }

        var raw = await response.Content.ReadAsStringAsync(ct);
        if (raw.Contains("BLOCKING_VIEW"))
        {
            return (new List<string>(), null, null);
        }

        // Phones are taken from the raw text, so they survive even if the body is not valid JSON.
        var numbers = HtmlUtil.HarvestPhones(raw);

        (double lat, double lng)? geo = null;
        try
        {
            using var parsed = JsonDocument.Parse(raw);
            geo = FindLatLng(parsed.RootElement);
        }
        catch (JsonException)
        {
            // Detail wasn't JSON — keep the phones, leave the coordinates unset.
        }

        return (numbers, geo?.lat, geo?.lng);
    }
    catch (Exception ex)
    {
        _log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token);
        return (new List<string>(), null, null);
    }
}
// Rough bounding box of Iran. A candidate pair outside it is rejected, which keeps unrelated
// numbers (timestamps, ids…) from being taken for coordinates.
private const double MinLat = 24;
private const double MaxLat = 40;
private const double MinLng = 44;
private const double MaxLng = 64;

/// <summary>
/// Depth-first search of a JSON tree for an approximate (lat, lng) pair. An object qualifies
/// when it directly holds both a latitude-like and a longitude-like property (see
/// <see cref="IsLatKey"/> / <see cref="IsLngKey"/>) whose numeric values lie inside the bounding
/// box above. Requiring both in the same object avoids pairing a latitude with an unrelated
/// longitude from elsewhere in the document.
/// </summary>
/// <param name="el">Element to search; objects and arrays are descended into.</param>
/// <returns>The first qualifying pair in document order, or null when there is none.</returns>
private static (double lat, double lng)? FindLatLng(JsonElement el)
{
    switch (el.ValueKind)
    {
        case JsonValueKind.Object:
        {
            double? latitude = null;
            double? longitude = null;

            // Only the first usable value per axis is kept.
            foreach (var prop in el.EnumerateObject())
            {
                if (latitude is null && IsLatKey(prop.Name) && TryNum(prop.Value, out var latValue))
                {
                    latitude = latValue;
                    continue;
                }

                if (longitude is null && IsLngKey(prop.Name) && TryNum(prop.Value, out var lngValue))
                {
                    longitude = lngValue;
                }
            }

            if (latitude.HasValue && longitude.HasValue
                && latitude.Value >= MinLat && latitude.Value <= MaxLat
                && longitude.Value >= MinLng && longitude.Value <= MaxLng)
            {
                return (latitude.Value, longitude.Value);
            }

            // This object did not qualify; look inside its property values.
            foreach (var prop in el.EnumerateObject())
            {
                var nested = FindLatLng(prop.Value);
                if (nested is not null) return nested;
            }

            return null;
        }

        case JsonValueKind.Array:
        {
            foreach (var entry in el.EnumerateArray())
            {
                var nested = FindLatLng(entry);
                if (nested is not null) return nested;
            }

            return null;
        }

        default:
            return null;
    }
}
/// <summary>True when <paramref name="k"/> is "lat" or "latitude", ignoring case.</summary>
private static bool IsLatKey(string k)
{
    const StringComparison ignoreCase = StringComparison.OrdinalIgnoreCase;
    return k.Equals("lat", ignoreCase) || k.Equals("latitude", ignoreCase);
}
/// <summary>
/// True when <paramref name="k"/> is "lng", "lon", "long" or "longitude", ignoring case.
/// </summary>
private static bool IsLngKey(string k)
{
    const StringComparison ignoreCase = StringComparison.OrdinalIgnoreCase;
    if (k.Equals("lng", ignoreCase)) return true;
    if (k.Equals("lon", ignoreCase)) return true;
    if (k.Equals("long", ignoreCase)) return true;
    return k.Equals("longitude", ignoreCase);
}
/// <summary>
/// Reads a coordinate that is either a JSON number or a numeric JSON string such as "35.7".
/// Strings are parsed with the invariant culture so '.' is always the decimal separator.
/// </summary>
/// <param name="v">Element to read.</param>
/// <param name="d">The parsed value on success; 0 for any other JSON kind.</param>
/// <returns>True when <paramref name="v"/> held a readable number.</returns>
private static bool TryNum(JsonElement v, out double d)
{
    switch (v.ValueKind)
    {
        case JsonValueKind.Number:
            return v.TryGetDouble(out d);

        case JsonValueKind.String:
            return double.TryParse(
                v.GetString(),
                System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture,
                out d);

        default:
            d = 0;
            return false;
    }
}
// Property names whose string values are appended to a row's title, in this order.
private static readonly string[] DescKeys =
    { "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };

/// <summary>
/// Lazily walks a JSON tree and yields one (text, token) pair for every object that has a string
/// "title". The text is the title followed by each non-empty <see cref="DescKeys"/> value, joined
/// with " — "; pairs whose trimmed text is shorter than 15 characters are dropped. The token is
/// whatever <see cref="FindToken"/> finds inside that same object (possibly null).
/// </summary>
/// <param name="el">Element to walk; objects and arrays are descended into.</param>
/// <returns>Pairs in document order, an object before anything nested inside it.</returns>
private static IEnumerable<(string text, string? token)> Harvest(JsonElement el)
{
    switch (el.ValueKind)
    {
        case JsonValueKind.Object:
            if (el.TryGetProperty("title", out var title) && title.ValueKind == JsonValueKind.String)
            {
                var parts = new List<string> { title.GetString() ?? "" };

                // Collect ALL description fields that are present rather than stopping at the
                // first: the location/time line usually sits in bottom_description_text.
                foreach (var key in DescKeys)
                {
                    if (!el.TryGetProperty(key, out var field) || field.ValueKind != JsonValueKind.String)
                    {
                        continue;
                    }

                    var value = field.GetString();
                    if (!string.IsNullOrEmpty(value))
                    {
                        parts.Add(value);
                    }
                }

                var combined = string.Join(" — ", parts).Trim();
                if (combined.Length >= 15)
                {
                    yield return (combined, FindToken(el));
                }
            }

            foreach (var prop in el.EnumerateObject())
            {
                foreach (var nested in Harvest(prop.Value))
                {
                    yield return nested;
                }
            }

            break;

        case JsonValueKind.Array:
            foreach (var entry in el.EnumerateArray())
            {
                foreach (var nested in Harvest(entry))
                {
                    yield return nested;
                }
            }

            break;
    }
}
/// <summary>
/// Finds the post token inside a widget object. A token is a string property named "token" that
/// is 6–16 characters long and consists only of letters and digits. An object's own "token"
/// properties are checked before its nested values; arrays are searched element by element.
/// </summary>
/// <param name="el">Element to search.</param>
/// <returns>The first matching token, or null when none is found.</returns>
private static string? FindToken(JsonElement el)
{
    switch (el.ValueKind)
    {
        case JsonValueKind.Object:
        {
            // Pass 1: a valid token directly on this object wins.
            foreach (var prop in el.EnumerateObject())
            {
                if (!prop.NameEquals("token") || prop.Value.ValueKind != JsonValueKind.String)
                {
                    continue;
                }

                var candidate = prop.Value.GetString();
                if (candidate is null || candidate.Length < 6 || candidate.Length > 16)
                {
                    continue;
                }

                if (candidate.All(char.IsLetterOrDigit))
                {
                    return candidate;
                }
            }

            // Pass 2: otherwise look inside each property value, in order.
            foreach (var prop in el.EnumerateObject())
            {
                var nested = FindToken(prop.Value);
                if (nested is not null) return nested;
            }

            return null;
        }

        case JsonValueKind.Array:
        {
            foreach (var entry in el.EnumerateArray())
            {
                var nested = FindToken(entry);
                if (nested is not null) return nested;
            }

            return null;
        }

        default:
            return null;
    }
}
}