From 0622270cd2690dfa2480222934ca5c467d2314f6 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Mon, 8 Jun 2026 08:42:21 +0330 Subject: [PATCH] Fix: site-wide phone on every Medjobs ad + phone mistaken for price MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - HarvestPhones was run over the whole page, so Medjobs' own header/footer number (09101016110) was appended to every ad. Now harvest only the ad's description region in Medjobs + Website sources; the protected number still comes from the reveal call. No more duplicate number across ads. - The amount extractor read phone digits as a Toman price (۹,۱۰۱,۰۱۶,۱۱۰ تومان). The parser now strips «شماره تماس…» lines and mobile/landline numbers before extracting money, and only accepts 6–10 digit numbers with no leading zero (phones/ids start with 0 or are 11+). Co-Authored-By: Claude Opus 4.8 --- src/JobsMedical.Web/Services/ListingParser.cs | 21 ++++++++++++++----- .../Services/Scraping/MedjobsListingSource.cs | 7 ++++--- .../Services/Scraping/WebsiteListingSource.cs | 5 +++-- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/JobsMedical.Web/Services/ListingParser.cs b/src/JobsMedical.Web/Services/ListingParser.cs index 98096af..c45005b 100644 --- a/src/JobsMedical.Web/Services/ListingParser.cs +++ b/src/JobsMedical.Web/Services/ListingParser.cs @@ -119,11 +119,11 @@ public class HeuristicListingParser : IListingParser else if (ContainsAny(text, "درصدی", "سهم درآمد", "شراکت", "پورسانت")) { p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)"); } - // --- Fixed pay --- + // --- Fixed pay (strip phone numbers first so they're never read as money) --- if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); } else { - var amount = ExtractAmount(text); + var amount = ExtractAmount(StripPhones(text)); if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); } else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد"); } @@ -244,6 +244,16 @@ public class HeuristicListingParser : IListingParser return null; } + /// Remove phone numbers (and «شماره تماس…» lines) so they're not mistaken for money. + private static string StripPhones(string text) + { + var t = Regex.Replace(text, @"شماره\s*(?:تماس|موبایل|همراه|ثابت|تلفن)[^\n]*", " "); + t = ToLatinDigits(t); + t = Regex.Replace(t, @"(?Pull a Toman figure out of free text, handling «میلیون» and Persian digits. private static long? ExtractAmount(string text) { @@ -254,12 +264,13 @@ public class HeuristicListingParser : IListingParser System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var m)) return (long)(m * 1_000_000); - // Otherwise the largest plain number that looks like money (>= 6 digits after removing separators). + // Otherwise the largest plain number that looks like money (6–10 digits, no leading zero — + // a leading zero or 11+ digits means it's a phone/id, not a price). long best = 0; - foreach (Match num in Regex.Matches(latin, @"[\d٬,،.]{6,}")) + foreach (Match num in Regex.Matches(latin, @"(?= 6 && long.TryParse(digits, out var v) && v > best) best = v; + if (digits.Length is >= 6 and <= 10 && long.TryParse(digits, out var v) && v > best) best = v; } return best > 0 ? best : null; } diff --git a/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs b/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs index 7435622..9febf49 100644 --- a/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs +++ b/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs @@ -151,9 +151,10 @@ public class MedjobsListingSource : IListingSource var text = HtmlUtil.ToPlainText(string.Join("\n", parts)); if (text.Length > 1800) text = text[..1800]; - // The contact number is often outside the description (in a tel: link / data attribute the - // page reveals on click). Harvest it from the full HTML and append so the parser/AI see it. - var phones = HtmlUtil.HarvestPhones(html); + // Only harvest a number written inside the ad's own DESCRIPTION — never the full page, + // which carries the site's own header/footer number on every ad. The real protected + // number comes from RevealPhonesAsync (the admin-ajax reveal). + var phones = HtmlUtil.HarvestPhones(body ?? ""); if (phones.Count > 0 && !phones.Any(text.Contains)) text += "\nشماره تماس: " + string.Join("، ", phones); return text; diff --git a/src/JobsMedical.Web/Services/Scraping/WebsiteListingSource.cs b/src/JobsMedical.Web/Services/Scraping/WebsiteListingSource.cs index c50e8df..3710870 100644 --- a/src/JobsMedical.Web/Services/Scraping/WebsiteListingSource.cs +++ b/src/JobsMedical.Web/Services/Scraping/WebsiteListingSource.cs @@ -53,8 +53,9 @@ public class WebsiteListingSource : IListingSource var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(x => !string.IsNullOrWhiteSpace(x)))); if (text.Length > 1800) text = text[..1800]; - // Append any contact number found in the full markup (tel:/data-phone/JSON-LD/inline). - var phones = HtmlUtil.HarvestPhones(html); + // Harvest a number from the ad's own content region only (not the whole page, which would + // pick up the site's header/footer number on every listing). + var phones = HtmlUtil.HarvestPhones(body ?? ""); if (phones.Count > 0 && !phones.Any(text.Contains)) text += "\nشماره تماس: " + string.Join("، ", phones); return text;