Extract Iranian salary shorthand (X تومان = millions) + pay backfill
Parser: most jobs read «توافقی» because the amount extractor only saw 6–10 digit numbers, missing the way Iranian ads actually state pay — «۱۵ تومان»، «۴۰ تا ۵۰ تومان»، «۲۰ میلیون»، «۲۰م» all mean MILLIONS of toman. Add colloquial detection (1–3 digit number + تومان/م/میلیون → ×1,000,000, lower bound of a range), guarded so it never matches dates/hours or a long literal-toman figure. Also: a stated amount now wins over «توافقی» (ads often say a number AND «… بقیه توافقی»). Backfill: BackfillPayAsync re-parses existing aggregated jobs/talent that have no salary and fills it in place (no AI, no ID/URL change) — wired into the post-ingest auto-cleanup and exposed as an admin button. Existing «توافقی» listings with a stated number get their salary; genuinely-negotiable ads stay توافقی. Also improves the baseSalary in JobPosting rich results. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -341,6 +341,38 @@ public class IngestionService
|
||||
return filled;
|
||||
}
|
||||
|
||||
/// <summary>
/// Fills in missing pay on already-stored aggregated listings by running each listing's saved
/// description through the current parser (which understands Iranian shorthand such as
/// «۴۰ تا ۵۰ تومان» meaning millions). Entities are updated in place — nothing is re-fetched,
/// deleted or recreated, so IDs/URLs stay the same, and no AI is involved.
/// Only open, aggregated rows whose salary/pay is currently empty are touched; if the parser
/// finds no amount the row is left alone, so a genuinely-«توافقی» ad stays توافقی.
/// </summary>
/// <param name="ct">Token used to cancel the database reads and the final save.</param>
/// <returns>The number of job openings plus talent listings that received a pay figure.</returns>
public async Task<int> BackfillPayAsync(CancellationToken ct = default)
{
    // Lookup lists the parser needs as context for every Parse call.
    var roleNames = await _db.Roles.Select(role => role.Name).ToListAsync(ct);
    var cityNames = await _db.Cities.Select(city => city.Name).ToListAsync(ct);
    var districtNames = await _db.Districts.Select(district => district.Name).ToListAsync(ct);

    // Parses one ad text and yields just the detected pay amount (null when none is found).
    long? Pay(string? text)
    {
        var parsed = _parser.Parse(text ?? "", roleNames, cityNames, districtNames);
        return parsed.PayAmount;
    }

    var filled = 0;

    // Pass 1: open aggregated job openings that have no minimum salary yet.
    var jobsMissingSalary = await _db.JobOpenings
        .Where(job => job.SalaryMin == null && job.Status == ShiftStatus.Open && job.Source == ShiftSource.Aggregated)
        .ToListAsync(ct);
    foreach (var job in jobsMissingSalary)
    {
        if (Pay(job.Description) is not long amount)
        {
            continue;
        }

        job.SalaryMin = amount;
        filled++;
    }

    // Pass 2: open aggregated talent listings that have no pay amount yet.
    var talentMissingPay = await _db.TalentListings
        .Where(listing => listing.PayAmount == null && listing.Status == ShiftStatus.Open && listing.Source == ShiftSource.Aggregated)
        .ToListAsync(ct);
    foreach (var listing in talentMissingPay)
    {
        if (Pay(listing.Description) is not long amount)
        {
            continue;
        }

        listing.PayAmount = amount;
        listing.PayType = PayType.PerShift;
        filled++;
    }

    // Skip the database round-trip entirely when nothing was changed.
    if (filled > 0)
    {
        await _db.SaveChangesAsync(ct);
    }

    _log.LogInformation("Pay backfill set a salary on {N} aggregated listings.", filled);
    return filled;
}
|
||||
|
||||
/// <summary>
|
||||
/// The self-cleaning pass run automatically at the end of every crawl (and available on demand):
|
||||
/// archive out-of-scope/duplicate listings, merge duplicate + fold junk facilities, and backfill
|
||||
@@ -354,8 +386,9 @@ public class IngestionService
|
||||
var (archived, dedupedJobs) = await PurgeInvalidAggregatedAsync(ct);
|
||||
var (mergedFac, cleanedFac) = await MergeAndCleanFacilitiesAsync(ct);
|
||||
var coords = await BackfillCoordsAsync(ct);
|
||||
_log.LogInformation("Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C}",
|
||||
archived, dedupedJobs, mergedFac, cleanedFac, coords);
|
||||
var pay = await BackfillPayAsync(ct);
|
||||
_log.LogInformation("Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C} pay={P}",
|
||||
archived, dedupedJobs, mergedFac, cleanedFac, coords, pay);
|
||||
return (archived, dedupedJobs, mergedFac, cleanedFac, coords);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user