Fix: site-wide phone on every Medjobs ad + phone mistaken for price
CI/CD / CI · dotnet build (push) Successful in 2m7s
CI/CD / Deploy · hamkadr (push) Successful in 1m59s

- HarvestPhones was run over the whole page, so Medjobs' own header/footer
  number (09101016110) was appended to every ad. Now harvest only the ad's
  description region in Medjobs + Website sources; the protected number
  still comes from the reveal call. No more duplicate number across ads.
- The amount extractor read phone digits as a Toman price
  (۹,۱۰۱,۰۱۶,۱۱۰ تومان). The parser now strips «شماره تماس…» lines and
  mobile/landline numbers before extracting money, and only accepts numbers
  of 6–10 digits with no leading zero (phones/IDs start with 0 or have 11+ digits).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-08 08:42:21 +03:30
parent b092a5cfe5
commit 0622270cd2
3 changed files with 23 additions and 10 deletions
+16 -5
View File
@@ -119,11 +119,11 @@ public class HeuristicListingParser : IListingParser
else if (ContainsAny(text, "درصدی", "سهم درآمد", "شراکت", "پورسانت")) else if (ContainsAny(text, "درصدی", "سهم درآمد", "شراکت", "پورسانت"))
{ p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)"); } { p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)"); }
// --- Fixed pay --- // --- Fixed pay (strip phone numbers first so they're never read as money) ---
if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); } if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); }
else else
{ {
var amount = ExtractAmount(text); var amount = ExtractAmount(StripPhones(text));
if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); } if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); }
else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد"); else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
} }
@@ -244,6 +244,16 @@ public class HeuristicListingParser : IListingParser
return null; return null;
} }
/// <summary>
/// Blanks out contact numbers so a later money extraction cannot read them as a price.
/// </summary>
/// <param name="text">Free-form listing text to scrub.</param>
/// <returns>
/// The text after digit normalisation via <c>ToLatinDigits</c>, with each matched
/// phone span replaced by a single space.
/// </returns>
private static string StripPhones(string text)
{
    // A «شماره تماس/موبایل/همراه/ثابت/تلفن» label plus everything up to the end of that line.
    const string labelledLinePattern = @"شماره\s*(?:تماس|موبایل|همراه|ثابت|تلفن)[^\n]*";
    // Mobile: optional +98 / 98 / 0 prefix, then 9 followed by nine more digits.
    const string mobilePattern = @"(?<!\d)(?:\+?98|0)?9\d{9}(?!\d)";
    // Landline: 0 + 2-3 digit area code, optional whitespace or hyphen, then 7-8 digits.
    const string landlinePattern = @"(?<!\d)0\d{2,3}[\s-]?\d{7,8}(?!\d)";

    // The labelled line is dropped first, on the raw text.
    var withoutLabelledLines = Regex.Replace(text, labelledLinePattern, " ");

    // Digits are normalised before the numeric patterns run
    // (presumably Persian -> ASCII digits; ToLatinDigits is defined elsewhere).
    var scrubbed = ToLatinDigits(withoutLabelledLines);

    // Order matters: mobile numbers are removed before the landline pattern is tried.
    foreach (var phonePattern in new[] { mobilePattern, landlinePattern })
    {
        scrubbed = Regex.Replace(scrubbed, phonePattern, " ");
    }

    return scrubbed;
}
/// <summary>Pull a Toman figure out of free text, handling «میلیون» and Persian digits.</summary> /// <summary>Pull a Toman figure out of free text, handling «میلیون» and Persian digits.</summary>
private static long? ExtractAmount(string text) private static long? ExtractAmount(string text)
{ {
@@ -254,12 +264,13 @@ public class HeuristicListingParser : IListingParser
System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var m)) System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var m))
return (long)(m * 1_000_000); return (long)(m * 1_000_000);
// Otherwise the largest plain number that looks like money (>= 6 digits after removing separators). // Otherwise the largest plain number that looks like money (6–10 digits, no leading zero —
// a leading zero or 11+ digits means it's a phone/id, not a price).
long best = 0; long best = 0;
foreach (Match num in Regex.Matches(latin, @"[\d٬,،.]{6,}")) foreach (Match num in Regex.Matches(latin, @"(?<!\d)[1-9][\d٬,،.]{4,}"))
{ {
var digits = Regex.Replace(num.Value, @"[^\d]", ""); var digits = Regex.Replace(num.Value, @"[^\d]", "");
if (digits.Length >= 6 && long.TryParse(digits, out var v) && v > best) best = v; if (digits.Length is >= 6 and <= 10 && long.TryParse(digits, out var v) && v > best) best = v;
} }
return best > 0 ? best : null; return best > 0 ? best : null;
} }
@@ -151,9 +151,10 @@ public class MedjobsListingSource : IListingSource
var text = HtmlUtil.ToPlainText(string.Join("\n", parts)); var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
if (text.Length > 1800) text = text[..1800]; if (text.Length > 1800) text = text[..1800];
// The contact number is often outside the description (in a tel: link / data attribute the // Only harvest a number written inside the ad's own DESCRIPTION — never the full page,
// page reveals on click). Harvest it from the full HTML and append so the parser/AI see it. // which carries the site's own header/footer number on every ad. The real protected
var phones = HtmlUtil.HarvestPhones(html); // number comes from RevealPhonesAsync (the admin-ajax reveal).
var phones = HtmlUtil.HarvestPhones(body ?? "");
if (phones.Count > 0 && !phones.Any(text.Contains)) if (phones.Count > 0 && !phones.Any(text.Contains))
text += "\nشماره تماس: " + string.Join("، ", phones); text += "\nشماره تماس: " + string.Join("، ", phones);
return text; return text;
@@ -53,8 +53,9 @@ public class WebsiteListingSource : IListingSource
var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(x => !string.IsNullOrWhiteSpace(x)))); var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(x => !string.IsNullOrWhiteSpace(x))));
if (text.Length > 1800) text = text[..1800]; if (text.Length > 1800) text = text[..1800];
// Append any contact number found in the full markup (tel:/data-phone/JSON-LD/inline). // Harvest a number from the ad's own content region only (not the whole page, which would
var phones = HtmlUtil.HarvestPhones(html); // pick up the site's header/footer number on every listing).
var phones = HtmlUtil.HarvestPhones(body ?? "");
if (phones.Count > 0 && !phones.Any(text.Contains)) if (phones.Count > 0 && !phones.Any(text.Contains))
text += "\nشماره تماس: " + string.Join("، ", phones); text += "\nشماره تماس: " + string.Join("، ", phones);
return text; return text;