Normalize ریال→تومان pricing; stop exposing crawl source (medjobs/telegram)
CI/CD / CI · dotnet build (push) Successful in 29s
CI/CD / Deploy · hamkadr (push) Successful in 42s

- Parser now reads the currency: ریال amounts (incl. «میلیون ریال» and
  numbers with no تومان unit but ≥200M) are converted to تومان (÷10), so
  «۴۰۰٬۰۰۰٬۰۰۰ ریال» shows as ۴۰٬۰۰۰٬۰۰۰ تومان instead of 400M.
- Aggregated facility fallback name no longer embeds the source
  («مرکز درمانی (از مدجابز)» → «مرکز درمانی (نامشخص)»).
- Talent details only ever names Divar as a fallback source (when the
  number couldn't be extracted); medjobs/telegram are never shown publicly.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-08 09:05:34 +03:30
parent 490821a637
commit 2bb8771ade
3 changed files with 38 additions and 23 deletions
+28 -11
View File
@@ -254,24 +254,41 @@ public class HeuristicListingParser : IListingParser
return t;
}
/// <summary>Pull a Toman figure out of free text, handling «میلیون» and Persian digits.</summary>
/// <summary>Pull a figure out of free text and normalize to TOMAN (ریال → تومان = ÷۱۰),
/// handling «میلیون» and Persian digits.</summary>
private static long? ExtractAmount(string text)
{
var latin = ToLatinDigits(text);
// e.g. "۲ میلیون" / "2.5 میلیون"
var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون");
bool hasToman = latin.Contains("تومان") || latin.Contains("تومن");
bool hasRial = (latin.Contains("ریال") || latin.Contains("ريال")) && !hasToman;
// e.g. "۲ میلیون" / "2.5 میلیون [ریال]"
var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون\s*(ریال|ريال)?");
if (million.Success && double.TryParse(million.Groups[1].Value.Replace(",", "."),
System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var m))
return (long)(m * 1_000_000);
// Otherwise the largest plain number that looks like money (6–10 digits, no leading zero —
// a leading zero or 11+ digits means it's a phone/id, not a price).
long best = 0;
foreach (Match num in Regex.Matches(latin, @"(?<!\d)[1-9][\d٬,،.]{4,}"))
{
var digits = Regex.Replace(num.Value, @"[^\d]", "");
if (digits.Length is >= 6 and <= 10 && long.TryParse(digits, out var v) && v > best) best = v;
var val = (long)(m * 1_000_000);
if (million.Groups[2].Success) val /= 10; // «میلیون ریال»
return val;
}
// Largest plain number that looks like money (6–10 digits, no leading zero — a leading
// zero or 11+ digits means it's a phone/id). Convert ریال→تومان by the unit next to the
// number, else by the ad's overall currency.
long best = 0;
foreach (Match num in Regex.Matches(latin, @"(?<!\d)([1-9][\d٬,،.]{4,})\s*(ریال|ريال|تومان|تومن)?"))
{
var digits = Regex.Replace(num.Groups[1].Value, @"[^\d]", "");
if (digits.Length is < 6 or > 10 || !long.TryParse(digits, out var v)) continue;
var unit = num.Groups[2].Value;
bool isRial = unit is "ریال" or "ريال" || (unit.Length == 0 && hasRial);
if (isRial) v /= 10;
if (v > best) best = v;
}
// Sanity: a monthly figure of 200M+ تومان is implausible in Iran — if the ad never said
// «تومان», it was almost certainly ریال, so normalize.
if (best >= 200_000_000 && !hasToman) best /= 10;
return best > 0 ? best : null;
}
@@ -198,9 +198,10 @@ public class IngestionService
return;
}
// Never surface the crawl source (e.g. «مدجابز») in a public facility name.
var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
: !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
: $"مرکز درمانی (از {raw.SourceChannel})";
: "مرکز درمانی (نامشخص)";
// Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
if (facility is null)