From a5d6e212e2bd9550061087f41299b357a6c996da Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Mon, 8 Jun 2026 08:28:37 +0330 Subject: [PATCH] Divar: capture post token + harvest phone from full ad detail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Harvest now keeps each post's token, so we build a real post URL (divar.ir/v/{token}) instead of a generic link. - For each post we fetch the detail JSON (posts-v2/web/{token}) and harvest any contact number from it — covering the very common case where the poster writes the phone into the ad description. Divar's click-to-reveal is login-gated, so this gets the in-text numbers without auth; fails soft (blocking/errors → skip). - HarvestPhones hardened with digit-boundary guards so it can't grab a slice of a longer numeric id/timestamp inside JSON. Co-Authored-By: Claude Opus 4.8 --- .../Services/Scraping/DivarListingSource.cs | 67 +++++++++++++++++-- .../Scraping/TelegramListingSource.cs | 7 +- 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs b/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs index ae83213..1583d1d 100644 --- a/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs +++ b/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs @@ -59,8 +59,18 @@ public class DivarListingSource : IListingSource continue; } using var doc = JsonDocument.Parse(body); - foreach (var text in Harvest(doc.RootElement).Take(25)) - items.Add(new ScrapedItem("دیوار", text, "https://divar.ir")); + foreach (var (text, token) in Harvest(doc.RootElement).Take(25)) + { + var url = token is not null ? 
$"https://divar.ir/v/{token}" : "https://divar.ir"; + var withPhone = text; + if (token is not null) + { + var phones = await RevealPhonesAsync(client, token, s, ct); + if (phones.Count > 0 && !phones.Any(text.Contains)) + withPhone = text + "\nشماره تماس: " + string.Join("، ", phones); + } + items.Add(new ScrapedItem("دیوار", withPhone, url)); + } } catch (Exception ex) { _log.LogWarning(ex, "Divar fetch failed for query {Query}", q); } } @@ -85,10 +95,39 @@ public class DivarListingSource : IListingSource }; } + // The post detail endpoint returns the FULL description — many Divar job ads write the phone + // straight into the body, so we can harvest it without Divar's (login-gated) contact reveal. + private const string PostDetailUrl = "https://api.divar.ir/v8/posts-v2/web/"; + + /// <summary> + /// Fetch a post's detail JSON and harvest any contact number it contains (mostly numbers the + /// poster wrote into the description). Divar's true "نمایش شماره" reveal is auth-gated; this + /// covers the common case where the number is in the ad text. Fails soft. 
+ /// </summary> + private async Task<List<string>> RevealPhonesAsync(HttpClient client, string token, AppSetting s, CancellationToken ct) + { + try + { + using var req = new HttpRequestMessage(HttpMethod.Get, PostDetailUrl + token); + req.Headers.TryAddWithoutValidation("User-Agent", Ua); + req.Headers.TryAddWithoutValidation("Accept", "application/json"); + using var resp = await client.SendAsync(req, ct); + if (!resp.IsSuccessStatusCode) return new(); + var body = await resp.Content.ReadAsStringAsync(ct); + if (body.Contains("BLOCKING_VIEW")) return new(); + return HtmlUtil.HarvestPhones(body); + } + catch (Exception ex) + { + _log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token); + return new(); + } + } + private static readonly string[] DescKeys = { "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" }; - private static IEnumerable<string> Harvest(JsonElement el) + private static IEnumerable<(string text, string? token)> Harvest(JsonElement el) { if (el.ValueKind == JsonValueKind.Object) { @@ -99,7 +138,7 @@ public class DivarListingSource : IListingSource if (el.TryGetProperty(k, out var d) && d.ValueKind == JsonValueKind.String) { sb.Append(" — ").Append(d.GetString()); break; } var text = sb.ToString().Trim(); - if (text.Length >= 15) yield return text; + if (text.Length >= 15) yield return (text, FindToken(el)); } foreach (var p in el.EnumerateObject()) foreach (var x in Harvest(p.Value)) yield return x; @@ -110,4 +149,24 @@ public class DivarListingSource : IListingSource foreach (var x in Harvest(item)) yield return x; } } + + /// <summary>Find the post token within a widget object (Divar tokens: 6–16 alphanumerics).</summary> + private static string? 
FindToken(JsonElement el) + { + if (el.ValueKind == JsonValueKind.Object) + { + foreach (var p in el.EnumerateObject()) + if (p.NameEquals("token") && p.Value.ValueKind == JsonValueKind.String) + { + var v = p.Value.GetString(); + if (v is not null && v.Length is >= 6 and <= 16 && v.All(char.IsLetterOrDigit)) return v; + } + foreach (var p in el.EnumerateObject()) + { var r = FindToken(p.Value); if (r is not null) return r; } + } + else if (el.ValueKind == JsonValueKind.Array) + foreach (var item in el.EnumerateArray()) + { var r = FindToken(item); if (r is not null) return r; } + return null; + } } diff --git a/src/JobsMedical.Web/Services/Scraping/TelegramListingSource.cs b/src/JobsMedical.Web/Services/Scraping/TelegramListingSource.cs index 91b2a6b..9d1bccc 100644 --- a/src/JobsMedical.Web/Services/Scraping/TelegramListingSource.cs +++ b/src/JobsMedical.Web/Services/Scraping/TelegramListingSource.cs @@ -101,9 +101,10 @@ internal static class HtmlUtil foreach (Match m in Regex.Matches(latin, @"tel:\+?([\d\s\-]{7,})")) Add(m.Groups[1].Value); foreach (Match m in Regex.Matches(latin, "\"telephone\"\\s*:\\s*\"([^\"]+)\"")) Add(m.Groups[1].Value); foreach (Match m in Regex.Matches(latin, "data-[\\w-]*phone[\\w-]*=[\"']([^\"']+)[\"']", RegexOptions.IgnoreCase)) Add(m.Groups[1].Value); - // Then bare numbers anywhere in the markup — mobiles, then landlines. - foreach (Match m in Regex.Matches(latin, @"(?:\+?98|0)?9\d{9}")) Add(m.Value); - foreach (Match m in Regex.Matches(latin, @"0\d{2,3}[\s-]?\d{7,8}")) Add(m.Value); + // Then bare numbers anywhere in the text — mobiles, then landlines. The (?