Extract Iranian salary shorthand (X تومان = millions) + pay backfill
Parser: most jobs read «توافقی» because the amount extractor only saw 6–10 digit numbers, missing the way Iranian ads actually state pay — «۱۵ تومان»، «۴۰ تا ۵۰ تومان»، «۲۰ میلیون»، «۲۰م» all mean MILLIONS of toman. Add colloquial detection (1–3 digit number + تومان/م/میلیون → ×1,000,000, lower bound of a range), guarded so it never matches dates/hours or a long literal-toman figure. Also: a stated amount now wins over «توافقی» (ads often say a number AND «… بقیه توافقی»). Backfill: BackfillPayAsync re-parses existing aggregated jobs/talent that have no salary and fills it in place (no AI, no ID/URL change) — wired into the post-ingest auto-cleanup and exposed as an admin button. Existing «توافقی» listings with a stated number get their salary; genuinely-negotiable ads stay توافقی. Also improves the baseSalary in JobPosting rich results. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -137,13 +137,12 @@ public class HeuristicListingParser : IListingParser
|
||||
{ p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)"); }
|
||||
|
||||
// --- Fixed pay (strip phone numbers first so they're never read as money) ---
|
||||
if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); }
|
||||
else
|
||||
{
|
||||
var amount = ExtractAmount(StripPhones(text));
|
||||
if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); }
|
||||
else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
|
||||
}
|
||||
// A STATED amount wins over «توافقی»: ads often say a number AND «… بقیه توافقی»; showing the
|
||||
// figure is far more useful than «توافقی». Fall back to negotiable only when no amount is found.
|
||||
var amount = ExtractAmount(StripPhones(text));
|
||||
if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); }
|
||||
else if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); }
|
||||
else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
|
||||
|
||||
// --- Talent extras (only meaningful for «آماده به کار») ---
|
||||
if (p.Kind == ListingKind.Talent)
|
||||
@@ -291,6 +290,14 @@ public class HeuristicListingParser : IListingParser
|
||||
bool hasToman = latin.Contains("تومان") || latin.Contains("تومن");
|
||||
bool hasRial = (latin.Contains("ریال") || latin.Contains("ريال")) && !hasToman;
|
||||
|
||||
// Iranian salary shorthand: a 1–3 digit number means MILLIONS of toman — «۱۵ تومان»،
|
||||
// «۴۰ تا ۵۰ تومان»، «۲۰ میلیون»، «۲۰م». Take the LOWER bound of a range. The lookarounds keep
|
||||
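// this from matching the tail of an unbroken longer digit run (no digit may precede the number,
// the digits must end at the unit, and no digits may follow the unit).
// NOTE(review): a separator-grouped figure such as «15,000,250 تومان» could still match on its
// last group («250 تومان») unless thousands separators are stripped before this point — confirm.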
|
||||
var collo = Regex.Match(latin,
|
||||
@"(?<!\d)(\d{1,3})(?:\s*تا\s*(\d{1,3}))?\s*(?:میلیون|م(?![ا-یA-Za-z])|تومان|تومن)(?!\s*\d)");
|
||||
if (collo.Success && int.TryParse(collo.Groups[1].Value, out var lo) && lo is > 0 and <= 500)
|
||||
return (long)lo * 1_000_000;
|
||||
|
||||
// e.g. "۲ میلیون" / "2.5 میلیون [ریال]"
|
||||
var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون\s*(ریال|ريال)?");
|
||||
if (million.Success && double.TryParse(million.Groups[1].Value.Replace(",", "."),
|
||||
|
||||
Reference in New Issue
Block a user