Capture the full Divar ad description, not just the search-row summary
CI/CD / CI · dotnet build (push) Successful in 1m28s
CI/CD / Deploy · hamkadr (push) Successful in 2m22s

Divar listings showed only a one-line summary («پرستار کودک ۳ روز … — پرداخت توافقی — … در
شادمان») because the scraper stored the search-result row text and only pulled phone + coords from
the post detail. Now FetchDetailAsync also extracts the full ad body (the longest free-text string
in the detail JSON, skipping Divar safety boilerplate that mentions «دیوار») and appends it, so the
listing carries the rich description users see on Divar. Applies to new crawls; existing rows keep
their short text until re-ingested.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 19:04:30 +03:30
parent b1d0d0d4fd
commit c778b87e79
@@ -71,8 +71,12 @@ public class DivarListingSource : IListingSource
double? lat = null, lng = null; double? lat = null, lng = null;
if (token is not null) if (token is not null)
{ {
// One detail fetch yields BOTH the phone and the map coordinates. // One detail fetch yields the FULL description, the phone, AND the map center.
var (phones, gLat, gLng) = await FetchDetailAsync(client, token, ct); // (The search row only carries a short one-line summary — the rich ad body lives
// on the post detail, so without this the listing looked "censored".)
var (phones, gLat, gLng, fullDesc) = await FetchDetailAsync(client, token, ct);
if (!string.IsNullOrWhiteSpace(fullDesc) && !itemText.Contains(fullDesc))
itemText += "\n" + fullDesc;
if (phones.Count > 0 && !phones.Any(itemText.Contains)) if (phones.Count > 0 && !phones.Any(itemText.Contains))
itemText += "\nشماره تماس: " + string.Join("، ", phones); itemText += "\nشماره تماس: " + string.Join("، ", phones);
lat = gLat; lng = gLng; lat = gLat; lng = gLng;
@@ -126,7 +130,7 @@ public class DivarListingSource : IListingSource
/// and (b) the post's APPROXIMATE map coordinates (the privacy-fuzzed center Divar shows as a /// and (b) the post's APPROXIMATE map coordinates (the privacy-fuzzed center Divar shows as a
/// circle). Fails soft — returns whatever it could extract. /// circle). Fails soft — returns whatever it could extract.
/// </summary> /// </summary>
private async Task<(List<string> phones, double? lat, double? lng)> FetchDetailAsync( private async Task<(List<string> phones, double? lat, double? lng, string? description)> FetchDetailAsync(
HttpClient client, string token, CancellationToken ct) HttpClient client, string token, CancellationToken ct)
{ {
try try
@@ -135,22 +139,55 @@ public class DivarListingSource : IListingSource
req.Headers.TryAddWithoutValidation("User-Agent", Ua); req.Headers.TryAddWithoutValidation("User-Agent", Ua);
req.Headers.TryAddWithoutValidation("Accept", "application/json"); req.Headers.TryAddWithoutValidation("Accept", "application/json");
using var resp = await client.SendAsync(req, ct); using var resp = await client.SendAsync(req, ct);
if (!resp.IsSuccessStatusCode) return (new(), null, null); if (!resp.IsSuccessStatusCode) return (new(), null, null, null);
var body = await resp.Content.ReadAsStringAsync(ct); var body = await resp.Content.ReadAsStringAsync(ct);
if (body.Contains("BLOCKING_VIEW")) return (new(), null, null); if (body.Contains("BLOCKING_VIEW")) return (new(), null, null, null);
var phones = HtmlUtil.HarvestPhones(body); var phones = HtmlUtil.HarvestPhones(body);
double? lat = null, lng = null; double? lat = null, lng = null; string? desc = null;
try { using var doc = JsonDocument.Parse(body); if (FindLatLng(doc.RootElement) is { } g) { lat = g.lat; lng = g.lng; } } try
{
using var doc = JsonDocument.Parse(body);
if (FindLatLng(doc.RootElement) is { } g) { lat = g.lat; lng = g.lng; }
desc = FindLongestText(doc.RootElement); // the full ad body
}
catch (JsonException) { /* detail wasn't JSON — phones still harvested from text */ } catch (JsonException) { /* detail wasn't JSON — phones still harvested from text */ }
return (phones, lat, lng); return (phones, lat, lng, desc);
} }
catch (Exception ex) catch (Exception ex)
{ {
_log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token); _log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token);
return (new(), null, null); return (new(), null, null, null);
} }
} }
/// <summary>
/// Heuristically picks the full ad description out of Divar's detail JSON: the longest
/// free-text string found anywhere in the document.
/// </summary>
/// <remarks>
/// A string qualifies when, after trimming, it is between 40 and 4000 characters long, contains
/// at least one space (so it reads as prose, not an id/URL/token), and does not contain «دیوار»
/// (used here to skip Divar's own safety/boilerplate notices). Candidates are trimmed BEFORE they
/// are measured, so whitespace-only or padded strings cannot outrank the real ad body.
/// NOTE(review): the «دیوار» filter also rejects genuine ads that happen to use that word —
/// confirm this trade-off is acceptable.
/// </remarks>
/// <param name="root">Root element of the parsed detail JSON; every nested value is visited.</param>
/// <returns>The trimmed longest qualifying string, or <c>null</c> when nothing qualifies.</returns>
private static string? FindLongestText(JsonElement root)
{
    // Bounds for a plausible description: shorter strings are labels/titles, longer ones are blobs.
    const int MinLength = 40;
    const int MaxLength = 4000;

    string? best = null;

    // Explicit stack instead of recursion, so deeply nested JSON cannot overflow the call stack.
    var pending = new Stack<JsonElement>();
    pending.Push(root);
    while (pending.Count > 0)
    {
        var element = pending.Pop();
        switch (element.ValueKind)
        {
            case JsonValueKind.Object:
                foreach (var property in element.EnumerateObject())
                {
                    pending.Push(property.Value);
                }
                break;

            case JsonValueKind.Array:
                foreach (var item in element.EnumerateArray())
                {
                    pending.Push(item);
                }
                break;

            case JsonValueKind.String:
                // Trim first: the length limits and the "longest" comparison must apply to the
                // text that is actually returned, not to its surrounding whitespace.
                var text = element.GetString()?.Trim();
                if (text is { Length: >= MinLength and <= MaxLength }
                    && text.Contains(' ')
                    && !text.Contains("دیوار")
                    && (best is null || text.Length > best.Length))
                {
                    best = text;
                }
                break;
        }
    }

    return best;
}
// Iran's bounding box — guards against picking up an unrelated number pair (timestamps, ids…). // Iran's bounding box — guards against picking up an unrelated number pair (timestamps, ids…).
private const double MinLat = 24, MaxLat = 40, MinLng = 44, MaxLng = 64; private const double MinLat = 24, MaxLat = 40, MinLng = 44, MaxLng = 64;