Normalize ریال→تومان pricing; stop exposing crawl source (medjobs/telegram)
- Parser now reads the currency: ریال amounts (incl. «میلیون ریال», and unit-less numbers ≥200M in ads that never mention تومان) are converted to تومان (÷10), so «۴۰۰٬۰۰۰٬۰۰۰ ریال» shows as ۴۰٬۰۰۰٬۰۰۰ تومان instead of 400M.
- Aggregated facility fallback name no longer embeds the source («مرکز درمانی (از مدجابز)» → «مرکز درمانی (نامشخص)»).
- Talent details only ever names Divar as a fallback source (when the number couldn't be extracted); medjobs/telegram are never shown publicly.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -19,13 +19,10 @@
|
||||
var digits = new string(t.Phone.Where(char.IsDigit).ToArray());
|
||||
if (digits.Length >= 7) telHref = "tel:" + digits;
|
||||
}
|
||||
// Friendly source name (used to point users to the original ad when no number was extracted).
|
||||
string? sourceName = null;
|
||||
if (!string.IsNullOrWhiteSpace(t.SourceUrl))
|
||||
{
|
||||
var host = System.Uri.TryCreate(t.SourceUrl, UriKind.Absolute, out var su) ? su.Host : t.SourceUrl!;
|
||||
sourceName = host.Contains("divar") ? "دیوار" : host.Contains("medjobs") ? "مدجابز" : host;
|
||||
}
|
||||
// Only Divar is surfaced as a fallback source (and only when no number was extracted).
|
||||
// We never name other crawl sources (medjobs/telegram/…) publicly.
|
||||
bool isDivar = !string.IsNullOrWhiteSpace(t.SourceUrl)
|
||||
&& System.Uri.TryCreate(t.SourceUrl, UriKind.Absolute, out var su) && su.Host.Contains("divar");
|
||||
}
|
||||
|
||||
<div class="page-head">
|
||||
@@ -75,12 +72,12 @@
|
||||
<a href="@telHref" class="btn btn-accent btn-block btn-lg" dir="ltr">📞 @t.Phone</a>
|
||||
<p class="muted" style="font-size:12px; margin:10px 0 0;">با این فرد مستقیم تماس بگیرید.</p>
|
||||
}
|
||||
else if (!string.IsNullOrWhiteSpace(t.SourceUrl))
|
||||
else if (isDivar)
|
||||
{
|
||||
@* Number wasn't extractable (e.g. behind a login-gated reveal) — point to the source. *@
|
||||
@* Divar hides the number behind a login-gated reveal — point to the original ad. *@
|
||||
<p class="muted" style="margin-top:0;">شماره مستقیم استخراج نشد.</p>
|
||||
<a href="@t.SourceUrl" target="_blank" rel="nofollow noopener" class="btn btn-accent btn-block btn-lg">مشاهده شماره در @sourceName ↗</a>
|
||||
<p class="muted" style="font-size:12px; margin:10px 0 0;">این آگهی از @sourceName جمعآوری شده؛ برای دریافت شماره به آگهی اصلی مراجعه کن.</p>
|
||||
<a href="@t.SourceUrl" target="_blank" rel="nofollow noopener" class="btn btn-accent btn-block btn-lg">مشاهده شماره در دیوار ↗</a>
|
||||
<p class="muted" style="font-size:12px; margin:10px 0 0;">برای دریافت شماره به آگهی اصلی در دیوار مراجعه کن.</p>
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -254,24 +254,41 @@ public class HeuristicListingParser : IListingParser
|
||||
return t;
|
||||
}
|
||||
|
||||
/// <summary>
/// Pulls a money figure out of free text and normalizes it to toman (rial ÷ 10),
/// handling the «میلیون» (million) multiplier and Persian digits.
/// </summary>
/// <param name="text">Free-form listing text to scan for an amount.</param>
/// <returns>The amount in toman, or <c>null</c> when no plausible figure is found.</returns>
private static long? ExtractAmount(string text)
{
    var latin = ToLatinDigits(text);
    bool hasToman = latin.Contains("تومان") || latin.Contains("تومن");
    // The ad as a whole counts as rial-denominated only when it names rial
    // (spelled with either Persian or Arabic yeh) and never mentions toman.
    bool hasRial = (latin.Contains("ریال") || latin.Contains("ريال")) && !hasToman;

    // e.g. "۲ میلیون" / "2.5 میلیون [ریال]"
    var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون\s*(ریال|ريال)?");
    if (million.Success && double.TryParse(million.Groups[1].Value.Replace(",", "."),
        System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var m))
    {
        // Round instead of truncating so binary floating-point error in the product
        // cannot shave one unit off the result.
        var val = (long)System.Math.Round(m * 1_000_000);
        if (million.Groups[2].Success) val /= 10; // «میلیون ریال»
        return val;
    }

    // Largest plain number that looks like money (6–10 digits, no leading zero — a leading
    // zero or 11+ digits means it's a phone/id). Convert rial→toman by the unit next to the
    // number, else by the ad's overall currency.
    long best = 0;
    bool bestWasRial = false; // true when the winning candidate has already been divided by 10
    foreach (Match num in Regex.Matches(latin, @"(?<!\d)([1-9][\d٬,،.]{4,})\s*(ریال|ريال|تومان|تومن)?"))
    {
        var digits = Regex.Replace(num.Groups[1].Value, @"[^\d]", "");
        if (digits.Length is < 6 or > 10 || !long.TryParse(digits, out var v)) continue;

        var unit = num.Groups[2].Value;
        bool isRial = unit is "ریال" or "ريال" || (unit.Length == 0 && hasRial);
        if (isRial) v /= 10;

        if (v > best)
        {
            best = v;
            bestWasRial = isRial;
        }
    }

    // Sanity: a monthly figure of 200M+ toman is treated as implausible — if the ad never said
    // «تومان», assume the number was rial and normalize. Skip this when the winning number was
    // already converted above; otherwise a large rial amount would be divided by 10 twice.
    if (best >= 200_000_000 && !hasToman && !bestWasRial) best /= 10;

    return best > 0 ? best : null;
}
|
||||
|
||||
|
||||
@@ -198,9 +198,10 @@ public class IngestionService
|
||||
return;
|
||||
}
|
||||
|
||||
// Never surface the crawl source (e.g. «مدجابز») in a public facility name.
|
||||
var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
|
||||
: !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
|
||||
: $"مرکز درمانی (از {raw.SourceChannel})";
|
||||
: "مرکز درمانی (نامشخص)";
|
||||
// Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
|
||||
var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
|
||||
if (facility is null)
|
||||
|
||||
Reference in New Issue
Block a user