From c778b87e79f523c30f83f9b6c1ee0238b7b6d109 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sun, 21 Jun 2026 19:04:30 +0330 Subject: [PATCH] Capture the full Divar ad description, not just the search-row summary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Divar listings showed only a one-line summary («پرستار کودک ۳ روز … — پرداخت توافقی — … در شادمان») because the scraper stored the search-result row text and only pulled phone + coords from the post detail. Now FetchDetailAsync also extracts the full ad body (the longest free-text string in the detail JSON, skipping Divar safety boilerplate that mentions «دیوار») and appends it, so the listing carries the rich description users see on Divar. Applies to new crawls; existing rows keep their short text until re-ingested. Co-Authored-By: Claude Opus 4.8 --- .../Services/Scraping/DivarListingSource.cs | 55 ++++++++++++++++--- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs b/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs index a6e0a1d..a4dcde6 100644 --- a/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs +++ b/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs @@ -71,8 +71,12 @@ public class DivarListingSource : IListingSource double? lat = null, lng = null; if (token is not null) { - // One detail fetch yields BOTH the phone and the map coordinates. - var (phones, gLat, gLng) = await FetchDetailAsync(client, token, ct); + // One detail fetch yields the FULL description, the phone, AND the map center. + // (The search row only carries a short one-line summary — the rich ad body lives + // on the post detail, so without this the listing looked "censored".) 
+ var (phones, gLat, gLng, fullDesc) = await FetchDetailAsync(client, token, ct); + if (!string.IsNullOrWhiteSpace(fullDesc) && !itemText.Contains(fullDesc)) + itemText += "\n" + fullDesc; if (phones.Count > 0 && !phones.Any(itemText.Contains)) itemText += "\nشماره تماس: " + string.Join("، ", phones); lat = gLat; lng = gLng; @@ -126,7 +130,7 @@ public class DivarListingSource : IListingSource /// and (b) the post's APPROXIMATE map coordinates (the privacy-fuzzed center Divar shows as a /// circle). Fails soft — returns whatever it could extract. /// - private async Task<(List<string> phones, double? lat, double? lng)> FetchDetailAsync( + private async Task<(List<string> phones, double? lat, double? lng, string? description)> FetchDetailAsync( HttpClient client, string token, CancellationToken ct) { try @@ -135,22 +139,55 @@ public class DivarListingSource : IListingSource req.Headers.TryAddWithoutValidation("User-Agent", Ua); req.Headers.TryAddWithoutValidation("Accept", "application/json"); using var resp = await client.SendAsync(req, ct); - if (!resp.IsSuccessStatusCode) return (new(), null, null); + if (!resp.IsSuccessStatusCode) return (new(), null, null, null); var body = await resp.Content.ReadAsStringAsync(ct); - if (body.Contains("BLOCKING_VIEW")) return (new(), null, null); + if (body.Contains("BLOCKING_VIEW")) return (new(), null, null, null); var phones = HtmlUtil.HarvestPhones(body); - double? lat = null, lng = null; - try { using var doc = JsonDocument.Parse(body); if (FindLatLng(doc.RootElement) is { } g) { lat = g.lat; lng = g.lng; } } + double? lat = null, lng = null; string? 
desc = null; + try + { + using var doc = JsonDocument.Parse(body); + if (FindLatLng(doc.RootElement) is { } g) { lat = g.lat; lng = g.lng; } + desc = FindLongestText(doc.RootElement); // the full ad body + } catch (JsonException) { /* detail wasn't JSON — phones still harvested from text */ } - return (phones, lat, lng); + return (phones, lat, lng, desc); } catch (Exception ex) { _log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token); - return (new(), null, null); + return (new(), null, null, null); } } + /// <summary>The full ad description in Divar's detail JSON = the longest free-text string (40–4000 chars, containing a space), returned trimmed; null when no string qualifies. We skip + /// Divar's own safety/boilerplate notices (which mention «دیوار») and absurdly long blobs (over 4000 chars).</summary> + private static string? FindLongestText(JsonElement root) + { + string? best = null; + var stack = new Stack<JsonElement>(); + stack.Push(root); + while (stack.Count > 0) + { + var e = stack.Pop(); + switch (e.ValueKind) + { + case JsonValueKind.Object: + foreach (var p in e.EnumerateObject()) stack.Push(p.Value); + break; + case JsonValueKind.Array: + foreach (var it in e.EnumerateArray()) stack.Push(it); + break; + case JsonValueKind.String: + var s = e.GetString(); + if (s is { Length: >= 40 and <= 4000 } && s.Contains(' ') && !s.Contains("دیوار") + && (best is null || s.Length > best.Length)) best = s; + break; + } + } + return best?.Trim(); + } + // Iran's bounding box — guards against picking up an unrelated number pair (timestamps, ids…). private const double MinLat = 24, MaxLat = 40, MinLng = 44, MaxLng = 64;