Normalize ریال→تومان pricing; stop exposing crawl source (medjobs/telegram)
CI/CD / CI · dotnet build (push) Successful in 29s
CI/CD / Deploy · hamkadr (push) Successful in 42s

- Parser now reads the currency: ریال amounts (incl. «میلیون ریال» and
  numbers with no تومان unit but ≥200M) are converted to تومان (÷10), so
  «۴۰۰٬۰۰۰٬۰۰۰ ریال» shows as ۴۰٬۰۰۰٬۰۰۰ تومان instead of 400M.
- Aggregated facility fallback name no longer embeds the source
  («مرکز درمانی (از مدجابز)» → «مرکز درمانی (نامشخص)»).
- Talent details only ever names Divar as a fallback source (when the
  number couldn't be extracted); medjobs/telegram are never shown publicly.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-08 09:05:34 +03:30
parent 490821a637
commit 2bb8771ade
3 changed files with 38 additions and 23 deletions
@@ -19,13 +19,10 @@
var digits = new string(t.Phone.Where(char.IsDigit).ToArray());
if (digits.Length >= 7) telHref = "tel:" + digits;
}
// Friendly source name (used to point users to the original ad when no number was extracted).
string? sourceName = null;
if (!string.IsNullOrWhiteSpace(t.SourceUrl))
{
var host = System.Uri.TryCreate(t.SourceUrl, UriKind.Absolute, out var su) ? su.Host : t.SourceUrl!;
sourceName = host.Contains("divar") ? "دیوار" : host.Contains("medjobs") ? "مدجابز" : host;
}
// Only Divar is surfaced as a fallback source (and only when no number was extracted).
// We never name other crawl sources (medjobs/telegram/…) publicly.
bool isDivar = !string.IsNullOrWhiteSpace(t.SourceUrl)
&& System.Uri.TryCreate(t.SourceUrl, UriKind.Absolute, out var su) && su.Host.Contains("divar");
}
<div class="page-head">
@@ -75,12 +72,12 @@
<a href="@telHref" class="btn btn-accent btn-block btn-lg" dir="ltr">📞 @t.Phone</a>
<p class="muted" style="font-size:12px; margin:10px 0 0;">با این فرد مستقیم تماس بگیرید.</p>
}
else if (!string.IsNullOrWhiteSpace(t.SourceUrl))
else if (isDivar)
{
@* Number wasn't extractable (e.g. behind a login-gated reveal) — point to the source. *@
@* Divar hides the number behind a login-gated reveal — point to the original ad. *@
<p class="muted" style="margin-top:0;">شماره مستقیم استخراج نشد.</p>
<a href="@t.SourceUrl" target="_blank" rel="nofollow noopener" class="btn btn-accent btn-block btn-lg">مشاهده شماره در @sourceName ↗</a>
<p class="muted" style="font-size:12px; margin:10px 0 0;">این آگهی از @sourceName جمع‌آوری شده؛ برای دریافت شماره به آگهی اصلی مراجعه کن.</p>
<a href="@t.SourceUrl" target="_blank" rel="nofollow noopener" class="btn btn-accent btn-block btn-lg">مشاهده شماره در دیوار ↗</a>
<p class="muted" style="font-size:12px; margin:10px 0 0;">برای دریافت شماره به آگهی اصلی در دیوار مراجعه کن.</p>
}
else
{
+28 -11
View File
@@ -254,24 +254,41 @@ public class HeuristicListingParser : IListingParser
return t;
}
/// <summary>Pull a Toman figure out of free text, handling «میلیون» and Persian digits.</summary>
/// <summary>Pull a figure out of free text and normalize to TOMAN (ریال → تومان = ÷۱۰),
/// handling «میلیون» and Persian digits.</summary>
private static long? ExtractAmount(string text)
{
var latin = ToLatinDigits(text);
// e.g. "۲ میلیون" / "2.5 میلیون"
var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون");
bool hasToman = latin.Contains("تومان") || latin.Contains("تومن");
bool hasRial = (latin.Contains("ریال") || latin.Contains("ريال")) && !hasToman;
// e.g. "۲ میلیون" / "2.5 میلیون [ریال]"
var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون\s*(ریال|ريال)?");
if (million.Success && double.TryParse(million.Groups[1].Value.Replace(",", "."),
System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var m))
return (long)(m * 1_000_000);
// Otherwise the largest plain number that looks like money (6–10 digits, no leading zero —
// a leading zero or 11+ digits means it's a phone/id, not a price).
long best = 0;
foreach (Match num in Regex.Matches(latin, @"(?<!\d)[1-9][\d٬,،.]{4,}"))
{
var digits = Regex.Replace(num.Value, @"[^\d]", "");
if (digits.Length is >= 6 and <= 10 && long.TryParse(digits, out var v) && v > best) best = v;
var val = (long)(m * 1_000_000);
if (million.Groups[2].Success) val /= 10; // «میلیون ریال»
return val;
}
// Largest plain number that looks like money (6–10 digits, no leading zero — a leading
// zero or 11+ digits means it's a phone/id). Convert ریال→تومان by the unit next to the
// number, else by the ad's overall currency.
long best = 0;
foreach (Match num in Regex.Matches(latin, @"(?<!\d)([1-9][\d٬,،.]{4,})\s*(ریال|ريال|تومان|تومن)?"))
{
var digits = Regex.Replace(num.Groups[1].Value, @"[^\d]", "");
if (digits.Length is < 6 or > 10 || !long.TryParse(digits, out var v)) continue;
var unit = num.Groups[2].Value;
bool isRial = unit is "ریال" or "ريال" || (unit.Length == 0 && hasRial);
if (isRial) v /= 10;
if (v > best) best = v;
}
// Sanity: a monthly figure of 200M+ تومان is implausible in Iran — if the ad never said
// «تومان», it was almost certainly ریال, so normalize.
if (best >= 200_000_000 && !hasToman) best /= 10;
return best > 0 ? best : null;
}
@@ -198,9 +198,10 @@ public class IngestionService
return;
}
// Never surface the crawl source (e.g. «مدجابز») in a public facility name.
var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
: !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
: $"مرکز درمانی (از {raw.SourceChannel})";
: "مرکز درمانی (نامشخص)";
// Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
if (facility is null)