Capture the full Divar ad description, not just the search-row summary
Divar listings showed only a one-line summary («پرستار کودک ۳ روز … — پرداخت توافقی — … در شادمان») because the scraper stored the search-result row text and only pulled phone + coords from the post detail. Now FetchDetailAsync also extracts the full ad body (the longest free-text string in the detail JSON, skipping Divar safety boilerplate that mentions «دیوار») and appends it, so the listing carries the rich description users see on Divar. Applies to new crawls; existing rows keep their short text until re-ingested. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -71,8 +71,12 @@ public class DivarListingSource : IListingSource
|
|||||||
double? lat = null, lng = null;
|
double? lat = null, lng = null;
|
||||||
if (token is not null)
|
if (token is not null)
|
||||||
{
|
{
|
||||||
// One detail fetch yields BOTH the phone and the map coordinates.
|
// One detail fetch yields the FULL description, the phone, AND the map center.
|
||||||
var (phones, gLat, gLng) = await FetchDetailAsync(client, token, ct);
|
// (The search row only carries a short one-line summary — the rich ad body lives
|
||||||
|
// on the post detail, so without this the listing looked "censored".)
|
||||||
|
var (phones, gLat, gLng, fullDesc) = await FetchDetailAsync(client, token, ct);
|
||||||
|
if (!string.IsNullOrWhiteSpace(fullDesc) && !itemText.Contains(fullDesc))
|
||||||
|
itemText += "\n" + fullDesc;
|
||||||
if (phones.Count > 0 && !phones.Any(itemText.Contains))
|
if (phones.Count > 0 && !phones.Any(itemText.Contains))
|
||||||
itemText += "\nشماره تماس: " + string.Join("، ", phones);
|
itemText += "\nشماره تماس: " + string.Join("، ", phones);
|
||||||
lat = gLat; lng = gLng;
|
lat = gLat; lng = gLng;
|
||||||
@@ -126,7 +130,7 @@ public class DivarListingSource : IListingSource
|
|||||||
/// and (b) the post's APPROXIMATE map coordinates (the privacy-fuzzed center Divar shows as a
|
/// and (b) the post's APPROXIMATE map coordinates (the privacy-fuzzed center Divar shows as a
|
||||||
/// circle). Fails soft — returns whatever it could extract.
|
/// circle). Fails soft — returns whatever it could extract.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private async Task<(List<string> phones, double? lat, double? lng)> FetchDetailAsync(
|
private async Task<(List<string> phones, double? lat, double? lng, string? description)> FetchDetailAsync(
|
||||||
HttpClient client, string token, CancellationToken ct)
|
HttpClient client, string token, CancellationToken ct)
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
@@ -135,22 +139,55 @@ public class DivarListingSource : IListingSource
|
|||||||
req.Headers.TryAddWithoutValidation("User-Agent", Ua);
|
req.Headers.TryAddWithoutValidation("User-Agent", Ua);
|
||||||
req.Headers.TryAddWithoutValidation("Accept", "application/json");
|
req.Headers.TryAddWithoutValidation("Accept", "application/json");
|
||||||
using var resp = await client.SendAsync(req, ct);
|
using var resp = await client.SendAsync(req, ct);
|
||||||
if (!resp.IsSuccessStatusCode) return (new(), null, null);
|
if (!resp.IsSuccessStatusCode) return (new(), null, null, null);
|
||||||
var body = await resp.Content.ReadAsStringAsync(ct);
|
var body = await resp.Content.ReadAsStringAsync(ct);
|
||||||
if (body.Contains("BLOCKING_VIEW")) return (new(), null, null);
|
if (body.Contains("BLOCKING_VIEW")) return (new(), null, null, null);
|
||||||
var phones = HtmlUtil.HarvestPhones(body);
|
var phones = HtmlUtil.HarvestPhones(body);
|
||||||
double? lat = null, lng = null;
|
double? lat = null, lng = null; string? desc = null;
|
||||||
try { using var doc = JsonDocument.Parse(body); if (FindLatLng(doc.RootElement) is { } g) { lat = g.lat; lng = g.lng; } }
|
try
|
||||||
|
{
|
||||||
|
using var doc = JsonDocument.Parse(body);
|
||||||
|
if (FindLatLng(doc.RootElement) is { } g) { lat = g.lat; lng = g.lng; }
|
||||||
|
desc = FindLongestText(doc.RootElement); // the full ad body
|
||||||
|
}
|
||||||
catch (JsonException) { /* detail wasn't JSON — phones still harvested from text */ }
|
catch (JsonException) { /* detail wasn't JSON — phones still harvested from text */ }
|
||||||
return (phones, lat, lng);
|
return (phones, lat, lng, desc);
|
||||||
}
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
_log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token);
|
_log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token);
|
||||||
return (new(), null, null);
|
return (new(), null, null, null);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Picks the full ad description out of Divar's detail JSON, taken to be the longest
/// free-text string found anywhere in the document.
/// </summary>
/// <remarks>
/// A string qualifies only if, after trimming, it is 40–4000 characters long, contains at least
/// one space, and does not mention «دیوار» (Divar's own safety/boilerplate notices). Length is
/// measured on the trimmed text so that whitespace padding can neither qualify a short value nor
/// outrank the real description. On equal length the first candidate visited is kept.
/// </remarks>
/// <param name="root">Root element of the parsed detail response.</param>
/// <returns>The trimmed description, or <c>null</c> when no string qualifies.</returns>
private static string? FindLongestText(JsonElement root)
{
    const int MinLength = 40, MaxLength = 4000;
    const string DivarMarker = "دیوار";

    string? best = null;

    // Explicit stack instead of recursion: the detail payload can nest arbitrarily deep.
    var pending = new Stack<JsonElement>();
    pending.Push(root);
    while (pending.Count > 0)
    {
        var current = pending.Pop();
        switch (current.ValueKind)
        {
            case JsonValueKind.Object:
                foreach (var property in current.EnumerateObject())
                {
                    pending.Push(property.Value);
                }
                break;

            case JsonValueKind.Array:
                foreach (var item in current.EnumerateArray())
                {
                    pending.Push(item);
                }
                break;

            case JsonValueKind.String:
                // Trim BEFORE filtering so the bounds and the "longest" comparison apply to
                // the text that is actually returned.
                var text = current.GetString()?.Trim();
                if (text is { Length: >= MinLength and <= MaxLength }
                    && text.Contains(' ')
                    && !text.Contains(DivarMarker)
                    && (best is null || text.Length > best.Length))
                {
                    best = text;
                }
                break;
        }
    }

    return best;
}
|
||||||
|
|
||||||
// Iran's bounding box — guards against picking up an unrelated number pair (timestamps, ids…).
|
// Iran's bounding box — guards against picking up an unrelated number pair (timestamps, ids…).
|
||||||
private const double MinLat = 24, MaxLat = 40, MinLng = 44, MaxLng = 64;
|
private const double MinLat = 24, MaxLat = 40, MinLng = 44, MaxLng = 64;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user