Locs(string xml)
=> Regex.Matches(xml, "([^<]+)").Select(m => m.Groups[1].Value.Trim());
/// <summary>
/// Builds the plain-text form of an ad page: the og:title (site suffix stripped) followed by the
/// body, taken from the first available of the "rtcl-description" block, the "entry-content" block,
/// or the og:description meta tag.
/// </summary>
/// <param name="html">Full HTML of the ad page.</param>
/// <returns>
/// Title and body as plain text, capped at 1800 UTF-16 chars, with the harvested phone numbers
/// appended on a final line when none of them already appears in that text.
/// </returns>
private static string ExtractAd(string html)
{
    const int MaxTextLength = 1800;

    var title = Meta(html, "og:title");
    if (title is not null)
    {
        // Drop the part from '|' onward. The bar must sit past index 10, so a '|' near the start
        // of the title is left alone.
        var bar = title.IndexOf('|');
        if (bar > 10)
        {
            title = title[..bar].Trim();
        }
    }

    string? body = BetweenClass(html, "rtcl-description")
        ?? BetweenClass(html, "entry-content")
        ?? Meta(html, "og:description");

    // Null/blank pieces are skipped so a missing title or body leaves no empty line behind.
    var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));
    var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
    if (text.Length > MaxTextLength)
    {
        // Back off by one char when the cut would fall between the two halves of a surrogate
        // pair, so the result never ends in a lone high surrogate.
        var cut = char.IsHighSurrogate(text[MaxTextLength - 1]) ? MaxTextLength - 1 : MaxTextLength;
        text = text[..cut];
    }

    // The contact number is often outside the description (in a tel: link / data attribute the
    // page reveals on click). Harvest it from the full HTML and append so the parser/AI see it.
    // Nothing is appended when at least one harvested number is already present in the text.
    var phones = HtmlUtil.HarvestPhones(html);
    if (phones.Count > 0 && !phones.Any(text.Contains))
    {
        text += "\nشماره تماس: " + string.Join("، ", phones);
    }

    return text;
}
/// <summary>
/// Reads the <c>content</c> attribute of the first <c>&lt;meta&gt;</c> tag whose <c>property</c>
/// attribute equals <paramref name="prop"/> (for example "og:title").
/// </summary>
/// <param name="html">HTML to search.</param>
/// <param name="prop">Property name to look for; it is regex-escaped before use.</param>
/// <returns>The HTML-decoded attribute value, or <c>null</c> when no matching tag is found.</returns>
private static string? Meta(string html, string prop)
{
    // Best-effort: only tags that list property before content are matched.
    // The value is read up to the same quote character that opened it, so an apostrophe inside a
    // double-quoted value (or a double quote inside a single-quoted one) does not truncate it.
    var m = Regex.Match(
        html,
        $"<meta[^>]+property=[\"']{Regex.Escape(prop)}[\"'][^>]+content=(?:\"(?<dq>[^\"]*)\"|'(?<sq>[^']*)')");
    if (!m.Success)
    {
        return null;
    }

    var raw = m.Groups["dq"].Success ? m.Groups["dq"].Value : m.Groups["sq"].Value;
    return System.Net.WebUtility.HtmlDecode(raw);
}
/// <summary>
/// Grabs the inner HTML of the first <c>&lt;div&gt;</c> whose class attribute contains
/// <paramref name="cls"/> (best-effort).
/// </summary>
/// <param name="html">HTML to search.</param>
/// <param name="cls">Class name to look for; it is regex-escaped before use.</param>
/// <returns>The text between the opening tag and the next <c>&lt;/div&gt;</c>, or <c>null</c> when nothing matches.</returns>
private static string? BetweenClass(string html, string cls)
{
    // The class name is matched as a substring of the attribute value, and the lazy capture stops
    // at the first closing div, so content after a nested div's end tag is not included.
    // Singleline lets '.' span line breaks inside the block.
    var m = Regex.Match(
        html,
        $"<div[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>(.*?)</div>",
        RegexOptions.Singleline);
    return m.Success ? m.Groups[1].Value : null;
}
}