Extract Iranian salary shorthand (X تومان = millions) + pay backfill
CI/CD / CI · dotnet build (push) Successful in 2m15s
CI/CD / Deploy · hamkadr (push) Successful in 1m58s

Parser: most jobs read «توافقی» because the amount extractor only saw 6–10 digit numbers, missing
the way Iranian ads actually state pay — «۱۵ تومان»، «۴۰ تا ۵۰ تومان»، «۲۰ میلیون»، «۲۰م» all mean
MILLIONS of toman. Add colloquial detection (1–3 digit number + تومان/م/میلیون → ×1,000,000, lower
bound of a range), guarded so it never matches dates/hours or a long literal-toman figure. Also: a
stated amount now wins over «توافقی» (ads often say a number AND «… بقیه توافقی»).

Backfill: BackfillPayAsync re-parses existing aggregated jobs/talent that have no salary and fills
it in place (no AI, no ID/URL change) — wired into the post-ingest auto-cleanup and exposed as an
admin button. Existing «توافقی» listings with a stated number get their salary; genuinely-negotiable
ads stay توافقی. Also improves the baseSalary in JobPosting rich results.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-22 17:21:32 +03:30
parent 219207ad68
commit b3e7123d74
4 changed files with 67 additions and 9 deletions
+14 -7
View File
@@ -137,13 +137,12 @@ public class HeuristicListingParser : IListingParser
{ p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)"); }
// --- Fixed pay (strip phone numbers first so they're never read as money) ---
if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); }
else
{
var amount = ExtractAmount(StripPhones(text));
if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); }
else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
}
// A STATED amount wins over «توافقی»: ads often say a number AND «… بقیه توافقی»; showing the
// figure is far more useful than «توافقی». Fall back to negotiable only when no amount is found.
var amount = ExtractAmount(StripPhones(text));
if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); }
else if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); }
else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
// --- Talent extras (only meaningful for «آماده به کار») ---
if (p.Kind == ListingKind.Talent)
@@ -291,6 +290,14 @@ public class HeuristicListingParser : IListingParser
bool hasToman = latin.Contains("تومان") || latin.Contains("تومن");
bool hasRial = (latin.Contains("ریال") || latin.Contains("ريال")) && !hasToman;
// Iranian salary shorthand: a 13 digit number means MILLIONS of toman — «۱۵ تومان»،
// «۴۰ تا ۵۰ تومان»، «۲۰ میلیون»، «۲۰م». Take the LOWER bound of a range. The lookarounds keep
// this from ever matching part of a long literal-toman number (the digits must end at the unit).
var collo = Regex.Match(latin,
@"(?<!\d)(\d{1,3})(?:\s*تا\s*(\d{1,3}))?\s*(?:میلیون|م(?![ا-یA-Za-z])|تومان|تومن)(?!\s*\d)");
if (collo.Success && int.TryParse(collo.Groups[1].Value, out var lo) && lo is > 0 and <= 500)
return (long)lo * 1_000_000;
// e.g. "۲ میلیون" / "2.5 میلیون [ریال]"
var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون\s*(ریال|ريال)?");
if (million.Success && double.TryParse(million.Groups[1].Value.Replace(",", "."),