Extract Iranian salary shorthand (X تومان = millions) + pay backfill
Parser: most jobs read «توافقی» because the amount extractor only saw 6–10 digit numbers, missing the way Iranian ads actually state pay — «۱۵ تومان»، «۴۰ تا ۵۰ تومان»، «۲۰ میلیون»، «۲۰م» all mean MILLIONS of toman. Add colloquial detection (1–3 digit number + تومان/م/میلیون → ×1,000,000, lower bound of a range), guarded so it never matches dates/hours or a long literal-toman figure. Also: a stated amount now wins over «توافقی» (ads often say a number AND «… بقیه توافقی»). Backfill: BackfillPayAsync re-parses existing aggregated jobs/talent that have no salary and fills it in place (no AI, no ID/URL change) — wired into the post-ingest auto-cleanup and exposed as an admin button. Existing «توافقی» listings with a stated number get their salary; genuinely-negotiable ads stay توافقی. Also improves the baseSalary in JobPosting rich results. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -67,6 +67,15 @@
|
||||
شیفت/استخدام/آمادهبهکارِ جمعآوریشدهای که مختصات ندارند، از روی محلهٔ ذکرشده در متنِ آگهی روی نقشه قرار میگیرند (محدودهٔ تقریبی). فقط مختصاتِ خالی پر میشود؛ موقعیتِ واقعیِ مراکز دستنخورده میماند.
|
||||
</p>
|
||||
|
||||
<form method="post">
|
||||
<button type="submit" asp-page-handler="BackfillPay" class="btn btn-primary btn-block" style="margin-top:10px;">
|
||||
💰 استخراجِ حقوق برای آگهیهای «توافقی»
|
||||
</button>
|
||||
</form>
|
||||
<p class="muted" style="font-size:11px; margin:6px 0 0;">
|
||||
آگهیهایی که حقوقشان «توافقی» است ولی در متن مبلغ دارند (مثل «۴۰ تا ۵۰ تومان» = میلیون)، مبلغشان استخراج و ثبت میشود (درجا، بدون تغییر شناسه/آدرس).
|
||||
</p>
|
||||
|
||||
<form method="post" onsubmit="return confirm('آگهیهای جمعآوریشدهٔ شیفت/استخدام که اکنون خارج از حوزهاند (خدمات منزل/نظافت، تبلیغاتی/آموزشی، اسپم) و استخدامهای تکراری «بایگانی» میشوند: از سایت پنهان میشوند ولی ردیفشان نگه داشته میشود (قابل بازگشت). آگهیهای معتبر و شناسه/آدرسشان دستنخورده میماند. ادامه؟');">
|
||||
<button type="submit" asp-page-handler="PurgeInvalid" class="btn btn-outline btn-block" style="margin-top:10px; color:var(--danger); border-color:var(--danger);">
|
||||
🧽 بایگانیِ درجای آگهیهای خارج از حوزه و تکراری (شیفت/استخدام)
|
||||
|
||||
@@ -139,6 +139,15 @@ public class IndexModel : PageModel
|
||||
return RedirectToPage();
|
||||
}
|
||||
|
||||
/// <summary>
/// POST handler for the "BackfillPay" page action. Asks the ingestion service to fill in missing
/// salaries on already-stored listings (in place), records a status message that includes how many
/// listings were updated, and then redirects back to this page.
/// </summary>
/// <returns>A redirect to the current page.</returns>
public async Task<IActionResult> OnPostBackfillPayAsync()
{
    // The service reports how many listings received a salary.
    var filledCount = await _ingest.BackfillPayAsync();

    var message = $"حقوق برای {filledCount} آگهیِ «توافقی» که در متن مبلغ داشت (مثل «۴۰ تا ۵۰ تومان») استخراج و ثبت شد. بدون تغییر شناسه/آدرس.";
    IngestMessage = message;

    return RedirectToPage();
}
|
||||
|
||||
/// <summary>
|
||||
/// In-place cleanup of existing aggregated jobs/shifts: ARCHIVE (hide, keep the row) only the
|
||||
/// out-of-scope ones (domestic-helper / promotional / spam) per the current validator, plus
|
||||
|
||||
@@ -137,13 +137,12 @@ public class HeuristicListingParser : IListingParser
|
||||
{ p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)"); }
|
||||
|
||||
// --- Fixed pay (strip phone numbers first so they're never read as money) ---
|
||||
if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); }
|
||||
else
|
||||
{
|
||||
var amount = ExtractAmount(StripPhones(text));
|
||||
if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); }
|
||||
else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
|
||||
}
|
||||
// A STATED amount wins over «توافقی»: ads often say a number AND «… بقیه توافقی»; showing the
|
||||
// figure is far more useful than «توافقی». Fall back to negotiable only when no amount is found.
|
||||
var amount = ExtractAmount(StripPhones(text));
|
||||
if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); }
|
||||
else if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); }
|
||||
else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
|
||||
|
||||
// --- Talent extras (only meaningful for «آماده به کار») ---
|
||||
if (p.Kind == ListingKind.Talent)
|
||||
@@ -291,6 +290,14 @@ public class HeuristicListingParser : IListingParser
|
||||
bool hasToman = latin.Contains("تومان") || latin.Contains("تومن");
|
||||
bool hasRial = (latin.Contains("ریال") || latin.Contains("ريال")) && !hasToman;
|
||||
|
||||
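// Iranian salary shorthand: a 1–3 digit number followed by تومان/تومن/میلیون/م is read as MILLIONS
// of toman — «۱۵ تومان»، «۴۰ تا ۵۰ تومان»، «۲۰ میلیون»، «۲۰م». For a range, the LOWER bound is used.
// The lookarounds stop the match from starting inside a contiguous digit run or from being followed
// by more digits, so an unseparated literal figure such as «15000000 تومان» is not misread.
// NOTE(review): the lookbehind only rejects a preceding DIGIT, so a separator still lets the tail
// match — «2.5 میلیون» would be read as 5 million and a grouped figure ending in «,500 تومان» as
// 500 million — and «… میلیون ریال» returns here before the rial-aware pattern below is tried.
// Confirm whether `latin` can contain such input at this point.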
|
||||
var collo = Regex.Match(latin,
|
||||
@"(?<!\d)(\d{1,3})(?:\s*تا\s*(\d{1,3}))?\s*(?:میلیون|م(?![ا-یA-Za-z])|تومان|تومن)(?!\s*\d)");
|
||||
if (collo.Success && int.TryParse(collo.Groups[1].Value, out var lo) && lo is > 0 and <= 500)
|
||||
return (long)lo * 1_000_000;
|
||||
|
||||
// e.g. "۲ میلیون" / "2.5 میلیون [ریال]"
|
||||
var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون\s*(ریال|ريال)?");
|
||||
if (million.Success && double.TryParse(million.Groups[1].Value.Replace(",", "."),
|
||||
|
||||
@@ -341,6 +341,38 @@ public class IngestionService
|
||||
return filled;
|
||||
}
|
||||
|
||||
/// <summary>
/// In-place pay backfill for listings that are already stored. Loads open, aggregated job openings
/// whose <c>SalaryMin</c> is null and open, aggregated talent listings whose <c>PayAmount</c> is
/// null, runs each stored description through the current parser, and writes the parsed amount onto
/// the existing row when the parser finds one. Rows for which the parser returns no amount are left
/// untouched. Nothing is deleted or recreated here.
/// </summary>
/// <param name="ct">Cancellation token forwarded to every database call.</param>
/// <returns>The number of listings (jobs plus talent) that received a pay value.</returns>
public async Task<int> BackfillPayAsync(CancellationToken ct = default)
{
    // Name lists the parser takes alongside the ad text.
    var roles = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
    var cities = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
    var districts = await _db.Districts.Select(d => d.Name).ToListAsync(ct);

    // Parses one description and returns only the pay figure (null when none was found).
    long? ParsePay(string? description)
    {
        var parsed = _parser.Parse(description ?? "", roles, cities, districts);
        return parsed.PayAmount;
    }

    var updated = 0;

    // Job openings: only open, aggregated rows that have no minimum salary yet.
    var openJobs = await _db.JobOpenings
        .Where(job => job.SalaryMin == null && job.Status == ShiftStatus.Open && job.Source == ShiftSource.Aggregated)
        .ToListAsync(ct);
    foreach (var job in openJobs)
    {
        if (ParsePay(job.Description) is not long amount)
        {
            continue;
        }

        job.SalaryMin = amount;
        updated++;
    }

    // Talent listings: same filter, keyed on PayAmount; a filled row is also marked PerShift.
    var openTalent = await _db.TalentListings
        .Where(listing => listing.PayAmount == null && listing.Status == ShiftStatus.Open && listing.Source == ShiftSource.Aggregated)
        .ToListAsync(ct);
    foreach (var listing in openTalent)
    {
        if (ParsePay(listing.Description) is not long amount)
        {
            continue;
        }

        listing.PayAmount = amount;
        listing.PayType = PayType.PerShift;
        updated++;
    }

    // Skip the save round-trip entirely when nothing changed.
    if (updated > 0)
    {
        await _db.SaveChangesAsync(ct);
    }

    _log.LogInformation("Pay backfill set a salary on {N} aggregated listings.", updated);
    return updated;
}
|
||||
|
||||
/// <summary>
|
||||
/// The self-cleaning pass run automatically at the end of every crawl (and available on demand):
|
||||
/// archive out-of-scope/duplicate listings, merge duplicate + fold junk facilities, and backfill
|
||||
@@ -354,8 +386,9 @@ public class IngestionService
|
||||
var (archived, dedupedJobs) = await PurgeInvalidAggregatedAsync(ct);
|
||||
var (mergedFac, cleanedFac) = await MergeAndCleanFacilitiesAsync(ct);
|
||||
var coords = await BackfillCoordsAsync(ct);
|
||||
_log.LogInformation("Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C}",
|
||||
archived, dedupedJobs, mergedFac, cleanedFac, coords);
|
||||
var pay = await BackfillPayAsync(ct);
|
||||
_log.LogInformation("Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C} pay={P}",
|
||||
archived, dedupedJobs, mergedFac, cleanedFac, coords, pay);
|
||||
return (archived, dedupedJobs, mergedFac, cleanedFac, coords);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user