Ingestion data-quality + map fixes: AI salary, geocode coverage, in-place backfill & purge
CI/CD / CI · dotnet build (push) Successful in 30s
CI/CD / Deploy · hamkadr (push) Successful in 1m11s

- Jobs now keep the AI-extracted salary (d?.PayAmount ?? parsed.PayAmount); they
  previously used only the parser figure, so every aggregated opening showed «توافقی» ("negotiable").
- Geocoder also scans the ad body, so Tehran ads that name a neighbourhood only in
  free text («… در سهروردی») get an approximate map point.
- New BackfillCoordsAsync (+ admin button): fills missing coords on existing aggregated
  listings from their stored text, in place — no ID/URL churn, SEO-safe.
- New PurgeInvalidAggregatedAsync + DedupeJobsAsync (+ admin button): in-place removal of
  out-of-scope (domestic/promo/spam) aggregated jobs/shifts and duplicate job reposts,
  keeping valid listings' IDs.
- Jobs detail page always renders the location card (matches Shifts) instead of hiding it
  when coords are missing.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 05:09:39 +03:30
parent a16a805869
commit e2011d335e
4 changed files with 173 additions and 10 deletions
@@ -299,6 +299,117 @@ public class IngestionService
return removed;
}
/// <summary>
/// Fills in missing map coordinates, in place, for aggregated Tehran listings (job openings,
/// shifts and talent listings). Each candidate's stored text is handed to <c>TehranGeo.Locate</c>;
/// when that yields a point, it is written to the row's Lat/Lng.
/// Unlike <see cref="ReprocessAsync"/>, rows are only updated — nothing is deleted or recreated —
/// so listing IDs (and therefore the URLs built from them) stay as they are.
/// Only rows whose <c>Lat</c> is currently null are selected, so an existing latitude is never replaced.
/// </summary>
/// <param name="ct">Token used to cancel the database calls.</param>
/// <returns>The number of listings that received coordinates; 0 when the Tehran city row is absent.</returns>
public async Task<int> BackfillCoordsAsync(CancellationToken ct = default)
{
    var tehranCity = await _db.Cities.FirstOrDefaultAsync(c => c.Name == "تهران", ct);
    if (tehranCity is null)
    {
        return 0;
    }

    var placed = 0;

    // Job openings: the ad text lives in Description.
    var jobsWithoutCoords = await _db.JobOpenings
        .Where(j => j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehranCity.Id)
        .ToListAsync(ct);
    foreach (var job in jobsWithoutCoords)
    {
        if (TehranGeo.Locate(job.Description) is not { } center)
        {
            continue;
        }

        job.Lat = center.lat;
        job.Lng = center.lng;
        placed++;
    }

    // Shifts: same rule as job openings.
    var shiftsWithoutCoords = await _db.Shifts
        .Where(s => s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehranCity.Id)
        .ToListAsync(ct);
    foreach (var shift in shiftsWithoutCoords)
    {
        if (TehranGeo.Locate(shift.Description) is not { } center)
        {
            continue;
        }

        shift.Lat = center.lat;
        shift.Lng = center.lng;
        placed++;
    }

    // Talent listings carry their city directly and offer AreaNote as an extra candidate before Description.
    var talentWithoutCoords = await _db.TalentListings
        .Where(t => t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehranCity.Id)
        .ToListAsync(ct);
    foreach (var listing in talentWithoutCoords)
    {
        if (TehranGeo.Locate(listing.AreaNote, listing.Description) is not { } center)
        {
            continue;
        }

        listing.Lat = center.lat;
        listing.Lng = center.lng;
        placed++;
    }

    // Skip the round-trip entirely when nothing was changed.
    if (placed > 0)
    {
        await _db.SaveChangesAsync(ct);
    }

    _log.LogInformation("Coordinate backfill placed {N} aggregated listings on the map.", placed);
    return placed;
}
/// <summary>
/// In-place cleanup of the aggregated job/shift board. Every aggregated job opening and shift has
/// its stored description re-parsed and re-validated with the current parser/validator; rows whose
/// validation result reports <see cref="ValidationResult.IsSpam"/> are deleted. Rows that do not
/// trip that flag are left untouched, so their IDs do not change. Afterwards near-duplicate job
/// reposts are collapsed via <see cref="DedupeJobsAsync"/>.
/// </summary>
/// <param name="ct">Token used to cancel the database calls.</param>
/// <returns>
/// <c>removed</c>: rows deleted because they were flagged; <c>deduped</c>: rows deleted as duplicates.
/// </returns>
public async Task<(int removed, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
{
    // Vocabulary the parser needs; loaded once and reused for every listing.
    var roleNames = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
    var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
    var districtNames = await _db.Districts.Select(d => d.Name).ToListAsync(ct);

    // True when the current validator flags the text (spam | promo | domestic-helper).
    bool IsOutOfScope(string? text)
    {
        var body = text ?? string.Empty;
        var reparsed = _parser.Parse(body, roleNames, cityNames, districtNames);
        var verdict = _validator.Validate(body, reparsed);
        return verdict.IsSpam;
    }

    var removed = 0;

    // Jobs: pull only Id + Description, screen in memory, then delete the flagged IDs in one statement.
    var jobRows = await _db.JobOpenings
        .Where(j => j.Source == ShiftSource.Aggregated)
        .Select(j => new { j.Id, j.Description })
        .ToListAsync(ct);
    var flaggedJobIds = jobRows
        .Where(row => IsOutOfScope(row.Description))
        .Select(row => row.Id)
        .ToList();
    if (flaggedJobIds.Count > 0)
    {
        var deletedJobs = await _db.JobOpenings
            .Where(j => flaggedJobIds.Contains(j.Id))
            .ExecuteDeleteAsync(ct);
        removed += deletedJobs;
    }

    // Shifts: same screening and deletion as for jobs.
    var shiftRows = await _db.Shifts
        .Where(s => s.Source == ShiftSource.Aggregated)
        .Select(s => new { s.Id, s.Description })
        .ToListAsync(ct);
    var flaggedShiftIds = shiftRows
        .Where(row => IsOutOfScope(row.Description))
        .Select(row => row.Id)
        .ToList();
    if (flaggedShiftIds.Count > 0)
    {
        var deletedShifts = await _db.Shifts
            .Where(s => flaggedShiftIds.Contains(s.Id))
            .ExecuteDeleteAsync(ct);
        removed += deletedShifts;
    }

    var deduped = await DedupeJobsAsync(ct);

    _log.LogInformation("Purge removed {R} out-of-scope aggregated listings; deduped {D} jobs.", removed, deduped);
    return (removed, deduped);
}
/// <summary>
/// Collapses near-duplicate aggregated JOB reposts that differ only in incidental text (same ad
/// re-crawled with a different relative-time stamp or counter). Only open, aggregated job openings
/// are considered.
/// Signature = role id + facility id + the first 120 characters of the normalized description
/// after digits and «… پیش»-style relative-time phrases have been blanked out. Within each
/// signature group the row with the latest <c>CreatedAt</c> is kept (ties broken by the highest
/// <c>Id</c>, so the survivor is the same on every run) and the rest are deleted.
/// Descriptions whose normalized core is shorter than 15 characters are never treated as duplicates.
/// Because the role id is part of the signature, listings for different roles never collapse together.
/// </summary>
/// <param name="ct">Token used to cancel the database calls.</param>
/// <returns>The number of job openings deleted as duplicates.</returns>
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
{
    var rows = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
        .Select(j => new { j.Id, j.RoleId, j.FacilityId, j.Description, j.CreatedAt })
        .ToListAsync(ct);

    // Builds the grouping key for one row, or null when there is too little text to compare safely.
    string? Sig(int roleId, int facId, string? desc)
    {
        // Blank out digits in all three scripts seen in Persian ads — ASCII, Persian (U+06F0–U+06F9)
        // and Arabic-Indic (U+0660–U+0669) — plus relative-time phrases, so "3 days ago" vs
        // "5 days ago" variants of the same ad produce the same core.
        var core = NormalizeFa(Regex.Replace(desc ?? "",
            @"[0-9۰-۹\u0660-\u0669]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim();
        if (core.Length < 15)
        {
            return null; // too little to call it a dup safely
        }

        var head = core.Length > 120 ? core[..120] : core;
        return $"j:{roleId}:{facId}:{head}";
    }

    var toRemove = rows
        .Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) })
        .Where(x => x.Key is not null)
        .GroupBy(x => x.Key)
        // Newest first; the Id tie-break makes the kept row deterministic when timestamps are equal,
        // since the source query has no ORDER BY and its row order is not guaranteed.
        .SelectMany(g => g
            .OrderByDescending(x => x.CreatedAt)
            .ThenByDescending(x => x.Id)
            .Skip(1)
            .Select(x => x.Id))
        .ToList();
    if (toRemove.Count == 0)
    {
        return 0;
    }

    var removed = await _db.JobOpenings.Where(j => toRemove.Contains(j.Id)).ExecuteDeleteAsync(ct);
    _log.LogInformation("Deduped {N} near-duplicate aggregated jobs.", removed);
    return removed;
}
/// <summary>
/// Returns only the digit characters of <paramref name="s"/>, after passing it through
/// <c>HtmlUtil.ToLatinDigits</c>; every non-digit character is dropped.
/// </summary>
private static string DigitsOnly(string s)
{
    var latinized = HtmlUtil.ToLatinDigits(s);
    var digits = latinized.Where(char.IsDigit);
    return string.Concat(digits);
}
private static (RawListingStatus status, string? reason, int confidence) Decide(
@@ -366,8 +477,11 @@ public class IngestionService
// Tehran ad that only NAMES a neighborhood (Medjobs/Telegram), geocode that name to a rough
// center. Shown as a «محدودهٔ تقریبی» circle, never a precise pin.
double? appLat = raw.Lat, appLng = raw.Lng;
// Geocode from the structured location fields first, then fall back to scanning the ad body
// itself — many Tehran ads name the neighbourhood only in free text («… نیم ساعت پیش در سهروردی»)
// and never populate a district/area field, which is why most aggregated listings had no map.
if (appLat is null && city.Name == "تهران"
&& TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote) is { } g)
&& TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote, raw.RawText) is { } g)
{ appLat = g.lat; appLng = g.lng; }
// Last resort — the AI model's inferred coords, but ONLY when they fall inside greater Tehran
// (rejects a hallucinated point elsewhere). Uses the registered model where the rules can't decide.
@@ -446,7 +560,10 @@ public class IngestionService
Facility = facility, Role = role,
Title = !string.IsNullOrWhiteSpace(d?.Title) && pubRoles.Count == 1 ? d!.Title!.Trim() : $"استخدام {role.Name}",
EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
SalaryMin = parsed.PayAmount,
// Prefer the AI-extracted salary, falling back to the parser's — matching the talent
// path. (Jobs previously used only parsed.PayAmount, silently dropping the AI figure,
// so every aggregated opening showed «توافقی» even when the ad stated a number.)
SalaryMin = d?.PayAmount ?? parsed.PayAmount,
Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
SourceUrl = raw.SourceUrl,
Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center