Divar: capture post token + harvest phone from full ad detail
- Harvest now keeps each post's token, so we build a real post URL
(divar.ir/v/{token}) instead of a generic link.
- For each post we fetch the detail JSON (posts-v2/web/{token}) and
harvest any contact number from it. This covers the very common case
where the poster writes the phone number into the ad description.
Divar's click-to-reveal is login-gated, so this recovers the in-text
numbers without authentication. The lookup fails soft: a blocked
response or any error simply skips the phone lookup for that post.
- HarvestPhones hardened with digit-boundary guards so it can't grab a
slice of a longer numeric id/timestamp inside JSON.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -59,8 +59,18 @@ public class DivarListingSource : IListingSource
|
||||
continue;
|
||||
}
|
||||
using var doc = JsonDocument.Parse(body);
|
||||
foreach (var text in Harvest(doc.RootElement).Take(25))
|
||||
items.Add(new ScrapedItem("دیوار", text, "https://divar.ir"));
|
||||
foreach (var (text, token) in Harvest(doc.RootElement).Take(25))
|
||||
{
|
||||
var url = token is not null ? $"https://divar.ir/v/{token}" : "https://divar.ir";
|
||||
var withPhone = text;
|
||||
if (token is not null)
|
||||
{
|
||||
var phones = await RevealPhonesAsync(client, token, s, ct);
|
||||
if (phones.Count > 0 && !phones.Any(text.Contains))
|
||||
withPhone = text + "\nشماره تماس: " + string.Join("، ", phones);
|
||||
}
|
||||
items.Add(new ScrapedItem("دیوار", withPhone, url));
|
||||
}
|
||||
}
|
||||
catch (Exception ex) { _log.LogWarning(ex, "Divar fetch failed for query {Query}", q); }
|
||||
}
|
||||
@@ -85,10 +95,39 @@ public class DivarListingSource : IListingSource
|
||||
};
|
||||
}
|
||||
|
||||
// The post detail endpoint returns the FULL description — many Divar job ads write the phone
|
||||
// straight into the body, so we can harvest it without Divar's (login-gated) contact reveal.
|
||||
private const string PostDetailUrl = "https://api.divar.ir/v8/posts-v2/web/";
|
||||
|
||||
/// <summary>
/// Fetches the detail JSON for one post (GET <c>PostDetailUrl + token</c>) and returns the
/// phone numbers that <c>HtmlUtil.HarvestPhones</c> extracts from the raw response body.
/// This does not perform Divar's login-gated "show number" reveal; it only picks up numbers
/// that appear in the returned detail payload (typically ones the poster typed into the ad).
/// </summary>
/// <param name="client">HTTP client used to send the request.</param>
/// <param name="token">Post token; appended verbatim to <c>PostDetailUrl</c>.</param>
/// <param name="s">Application settings. Not read by this method; kept for signature compatibility.</param>
/// <param name="ct">Token that cancels the request and the body read.</param>
/// <returns>
/// The harvested numbers, or an empty list when the response is non-success, contains the
/// <c>BLOCKING_VIEW</c> marker, or any non-cancellation error occurs (the error is logged).
/// </returns>
/// <exception cref="OperationCanceledException">
/// Thrown only when <paramref name="ct"/> itself was cancelled, so the caller stops promptly
/// instead of issuing (and logging) one doomed request per remaining post.
/// </exception>
private async Task<List<string>> RevealPhonesAsync(HttpClient client, string token, AppSetting s, CancellationToken ct)
{
    try
    {
        using var req = new HttpRequestMessage(HttpMethod.Get, PostDetailUrl + token);
        req.Headers.TryAddWithoutValidation("User-Agent", Ua);
        req.Headers.TryAddWithoutValidation("Accept", "application/json");

        using var resp = await client.SendAsync(req, ct);
        if (!resp.IsSuccessStatusCode) return new();

        var body = await resp.Content.ReadAsStringAsync(ct);

        // A body carrying this marker is treated as "blocked": skip harvesting rather than
        // scanning a page that is not the real post detail.
        if (body.Contains("BLOCKING_VIEW", StringComparison.Ordinal)) return new();

        return HtmlUtil.HarvestPhones(body);
    }
    catch (OperationCanceledException) when (ct.IsCancellationRequested)
    {
        // Caller-requested cancellation is not a soft failure: propagate it. A cancellation
        // exception raised while ct is NOT cancelled (e.g. an HttpClient timeout) falls
        // through to the generic handler below and still fails soft.
        throw;
    }
    catch (Exception ex)
    {
        _log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token);
        return new();
    }
}
|
||||
|
||||
private static readonly string[] DescKeys =
|
||||
{ "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };
|
||||
|
||||
private static IEnumerable<string> Harvest(JsonElement el)
|
||||
private static IEnumerable<(string text, string? token)> Harvest(JsonElement el)
|
||||
{
|
||||
if (el.ValueKind == JsonValueKind.Object)
|
||||
{
|
||||
@@ -99,7 +138,7 @@ public class DivarListingSource : IListingSource
|
||||
if (el.TryGetProperty(k, out var d) && d.ValueKind == JsonValueKind.String)
|
||||
{ sb.Append(" — ").Append(d.GetString()); break; }
|
||||
var text = sb.ToString().Trim();
|
||||
if (text.Length >= 15) yield return text;
|
||||
if (text.Length >= 15) yield return (text, FindToken(el));
|
||||
}
|
||||
foreach (var p in el.EnumerateObject())
|
||||
foreach (var x in Harvest(p.Value)) yield return x;
|
||||
@@ -110,4 +149,24 @@ public class DivarListingSource : IListingSource
|
||||
foreach (var x in Harvest(item)) yield return x;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Searches a JSON element for a post token: a string property named <c>token</c> whose value
/// is 6 to 16 characters long and consists only of letters and digits.
/// </summary>
/// <param name="el">Element to search; objects and arrays are walked recursively.</param>
/// <returns>
/// The first matching token, or <c>null</c> when none is found. For an object, its own
/// <c>token</c> properties are checked before descending into any property value.
/// </returns>
private static string? FindToken(JsonElement el)
{
    // Shape check for a candidate token value.
    static bool LooksLikeToken(string? s) =>
        s is { Length: >= 6 and <= 16 } && s.All(char.IsLetterOrDigit);

    switch (el.ValueKind)
    {
        case JsonValueKind.Object:
            // Pass 1: a well-formed "token" directly on this object wins over nested ones.
            foreach (var prop in el.EnumerateObject())
            {
                if (prop.Value.ValueKind != JsonValueKind.String || !prop.NameEquals("token"))
                {
                    continue;
                }

                var candidate = prop.Value.GetString();
                if (LooksLikeToken(candidate))
                {
                    return candidate;
                }
            }

            // Pass 2: depth-first through the property values, in document order.
            foreach (var prop in el.EnumerateObject())
            {
                if (FindToken(prop.Value) is { } fromProperty)
                {
                    return fromProperty;
                }
            }

            return null;

        case JsonValueKind.Array:
            foreach (var child in el.EnumerateArray())
            {
                if (FindToken(child) is { } fromItem)
                {
                    return fromItem;
                }
            }

            return null;

        default:
            // Scalars (string, number, bool, null) cannot contain a token property.
            return null;
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user