From e2011d335ee1d96eb4b90078303d3d5e20bbcca3 Mon Sep 17 00:00:00 2001
From: "soroush.asadi"
Date: Sun, 21 Jun 2026 05:09:39 +0330
Subject: [PATCH] Ingestion data-quality + map fixes: AI salary, geocode
coverage, in-place backfill & purge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Jobs now keep the AI-extracted salary (d.PayAmount ?? parsed.PayAmount); they
previously used only the parser figure, so every aggregated opening showed «توافقی».
- Geocoder also scans the ad body, so Tehran ads that name a neighbourhood only in
free text («… در سهروردی») get an approximate map point.
- New BackfillCoordsAsync (+ admin button): fills missing coords on existing aggregated
listings from their stored text, in place — no ID/URL churn, SEO-safe.
- New PurgeInvalidAggregatedAsync + DedupeJobsAsync (+ admin button): in-place removal of
out-of-scope (domestic/promo/spam) aggregated jobs/shifts and duplicate job reposts,
keeping valid listings' IDs.
- Jobs detail page always renders the location card (matches Shifts) instead of hiding it
when coords are missing.
Co-Authored-By: Claude Opus 4.8
---
src/JobsMedical.Web/Pages/Admin/Index.cshtml | 18 +++
.../Pages/Admin/Index.cshtml.cs | 24 ++++
src/JobsMedical.Web/Pages/Jobs/Details.cshtml | 20 +--
.../Services/Scraping/IngestionService.cs | 121 +++++++++++++++++-
4 files changed, 173 insertions(+), 10 deletions(-)
diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml b/src/JobsMedical.Web/Pages/Admin/Index.cshtml
index f31663f..b86dc1c 100644
--- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml
+++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml
@@ -58,6 +58,24 @@
توصیهشده برای پاکسازیِ آمادهبهکارها: متنِ خام نگه داشته میشود و فقط با منطقِ جدید (یکنفر=یکآگهی، نقش پایه، گروه ثابت، تگ تمیز، موقعیت تقریبی) بازساخته میشوند. صفحاتِ «آماده به کار» ایندکس نمیشوند، پس آدرسِ ایندکسشدهای تغییر نمیکند؛ شیفت/استخدام بهمرور با ایمیجستِ تازه پاک میشوند.
+
+
+ شیفت/استخدام/آمادهبهکارِ جمعآوریشدهای که مختصات ندارند، از روی محلهٔ ذکرشده در متنِ آگهی روی نقشه قرار میگیرند (محدودهٔ تقریبی). فقط مختصاتِ خالی پر میشود؛ موقعیتِ واقعیِ مراکز دستنخورده میماند.
+
+
+
+
+ فقط آگهیهایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده میشوند (نه صرفاً ناقص) و استخدامهای تکراری پاک میشوند. آگهیهای معتبر دستنخوردهاند، پس آدرسِ ایندکسشدهشان تغییر نمیکند؛ فقط صفحاتِ بد ۴۰۴ میشوند.
+
+
افزودن دستی
diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs
index e4d49c2..b4be44d 100644
--- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs
+++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs
@@ -120,6 +120,30 @@ public class IndexModel : PageModel
return RedirectToPage();
}
+    /// <summary>
+    /// Admin POST handler: fill missing map coordinates on existing aggregated Tehran listings from
+    /// their stored ad text (TehranGeo). In place — no AI calls, no re-fetch, and crucially no
+    /// delete/recreate, so indexed shift/job URLs keep their IDs. Fast (pure DB + string matching), so it runs inline.
+    /// </summary><returns>A redirect back to this page; the number of placed listings is reported via IngestMessage.</returns>
+    public async Task<IActionResult> OnPostBackfillCoordsAsync()
+    {
+        var n = await _ingest.BackfillCoordsAsync();
+        IngestMessage = $"مختصات تقریبی برای {n} آگهی جمعآوریشده از روی متن آگهی تکمیل شد (بدون تغییر شناسه یا آدرس صفحه).";
+        return RedirectToPage();
+    }
+
+    /// <summary>
+    /// Admin POST handler: in-place cleanup of existing aggregated jobs/shifts. Deletes only the
+    /// out-of-scope ones (domestic-helper / promotional / spam) per the current validator, plus
+    /// near-duplicate job reposts. Valid listings keep their IDs/URLs. No re-fetch, no AI — runs inline.
+    /// </summary><returns>A redirect back to this page; the removed/deduped counts are reported via IngestMessage.</returns>
+    public async Task<IActionResult> OnPostPurgeInvalidAsync()
+    {
+        var (removed, deduped) = await _ingest.PurgeInvalidAggregatedAsync();
+        IngestMessage = $"پاکسازیِ درجا: {removed} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {deduped} استخدامِ تکراری حذف شد. سایر آگهیها و شناسه/آدرسشان دستنخورده ماند.";
+        return RedirectToPage();
+    }
+
private async Task LoadAsync()
{
Queue = await _db.RawListings
diff --git a/src/JobsMedical.Web/Pages/Jobs/Details.cshtml b/src/JobsMedical.Web/Pages/Jobs/Details.cshtml
index d91807b..0b4164a 100644
--- a/src/JobsMedical.Web/Pages/Jobs/Details.cshtml
+++ b/src/JobsMedical.Web/Pages/Jobs/Details.cshtml
@@ -161,12 +161,12 @@
}
- @if (mapLat is not null && mapLng is not null)
- {
- var latS = mapLat.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
- var lngS = mapLng.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
-
-
موقعیت مکانی
+
+
موقعیت مکانی
+ @if (mapLat is not null && mapLng is not null)
+ {
+ var latS = mapLat.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
+ var lngS = mapLng.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
@if (!string.IsNullOrEmpty(Model.MapKey))
{
@@ -183,8 +183,12 @@
}
مسیریابی در نشان
-
- }
+ }
+ else
+ {
+
مختصات این آگهی ثبت نشده است.
+ }
+
diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
index 732d5d4..6778052 100644
--- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
+++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
@@ -299,6 +299,117 @@ public class IngestionService
return removed;
}
+    /// <summary>
+    /// In-place geocoding backfill: for existing AGGREGATED listings in Tehran (jobs, shifts, talent)
+    /// whose Lat is still null, derive an APPROXIMATE point from the stored ad text via
+    /// TehranGeo.Locate and fill Lat/Lng. It never deletes or recreates rows, so listing IDs — and the
+    /// indexed shift/job URLs — are untouched. Rows that already have a Lat are not loaded, so an existing point is never overwritten.
+    /// </summary>
+    /// <param name="ct">Cancellation token forwarded to every database call.</param>
+    /// <returns>How many listings were newly placed on the map; 0 when the «تهران» city row does not exist.</returns>
+    public async Task<int> BackfillCoordsAsync(CancellationToken ct = default)
+    {
+        var tehran = await _db.Cities.FirstOrDefaultAsync(c => c.Name == "تهران", ct);
+        if (tehran is null) return 0;
+        int filled = 0;
+
+        var jobs = await _db.JobOpenings
+            .Where(j => j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id)
+            .ToListAsync(ct);
+        foreach (var j in jobs)
+            if (TehranGeo.Locate(j.Description) is { } g) { j.Lat = g.lat; j.Lng = g.lng; filled++; }
+
+        var shifts = await _db.Shifts
+            .Where(s => s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id)
+            .ToListAsync(ct);
+        foreach (var s in shifts)
+            if (TehranGeo.Locate(s.Description) is { } g) { s.Lat = g.lat; s.Lng = g.lng; filled++; }
+
+        var talent = await _db.TalentListings
+            .Where(t => t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id)
+            .ToListAsync(ct);
+        foreach (var t in talent)
+            if (TehranGeo.Locate(t.AreaNote, t.Description) is { } g) { t.Lat = g.lat; t.Lng = g.lng; filled++; }
+
+        if (filled > 0) await _db.SaveChangesAsync(ct); // tracked entities were mutated above; skip the round-trip when nothing matched
+        _log.LogInformation("Coordinate backfill placed {N} aggregated listings on the map.", filled);
+        return filled;
+    }
+
+    /// <summary>
+    /// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-parse each listing's
+    /// stored text, run it through the CURRENT validator, and delete only the rows it flags IsSpam
+    /// (spam, promotional, or domestic-helper «امور منزل»). Rows with no stored text are KEPT — there is
+    /// nothing to judge. Then collapse near-duplicate job reposts via DedupeJobsAsync. Rows that pass are never modified, so their IDs stay stable.
+    /// </summary>
+    /// <param name="ct">Cancellation token forwarded to every database call.</param>
+    /// <returns>(removed, deduped): out-of-scope jobs+shifts deleted, and duplicate jobs deleted.</returns>
+    public async Task<(int removed, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
+    {
+        var roleNames = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
+        var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
+        var districtNames = await _db.Districts.Select(d => d.Name).ToListAsync(ct);
+
+        bool IsOutOfScope(string? text)
+        {
+            if (string.IsNullOrWhiteSpace(text)) return false; // no text to re-screen: a destructive purge must not delete on missing evidence
+            var parsed = _parser.Parse(text, roleNames, cityNames, districtNames);
+            return _validator.Validate(text, parsed).IsSpam; // spam | promo | domestic-helper
+        }
+
+        int removed = 0;
+
+        var jobIds = (await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated)
+                .Select(j => new { j.Id, j.Description }).ToListAsync(ct))
+            .Where(j => IsOutOfScope(j.Description)).Select(j => j.Id).ToList();
+        if (jobIds.Count > 0)
+            removed += await _db.JobOpenings.Where(j => jobIds.Contains(j.Id)).ExecuteDeleteAsync(ct);
+
+        var shiftIds = (await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated)
+                .Select(s => new { s.Id, s.Description }).ToListAsync(ct))
+            .Where(s => IsOutOfScope(s.Description)).Select(s => s.Id).ToList();
+        if (shiftIds.Count > 0)
+            removed += await _db.Shifts.Where(s => shiftIds.Contains(s.Id)).ExecuteDeleteAsync(ct);
+
+        var deduped = await DedupeJobsAsync(ct);
+        _log.LogInformation("Purge removed {R} out-of-scope aggregated listings; deduped {D} jobs.", removed, deduped);
+        return (removed, deduped);
+    }
+
+    /// <summary>
+    /// Collapse near-duplicate Open aggregated JOB reposts whose text differs only slightly. Signature =
+    /// role + facility + first 120 chars of the normalized description with digit runs and «… پیش» time-phrases
+    /// stripped; a core under 15 chars gets no signature. Keeps the NEWEST of each group (highest Id on a CreatedAt tie).
+    /// Per-role fan-out of one ad is preserved (different RoleId → different signature).
+    /// </summary><param name="ct">Cancellation token forwarded to the database calls.</param><returns>Number of duplicate rows deleted.</returns>
+    public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
+    {
+        var rows = await _db.JobOpenings
+            .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
+            .Select(j => new { j.Id, j.RoleId, j.FacilityId, j.Description, j.CreatedAt })
+            .ToListAsync(ct);
+
+        string? Sig(int roleId, int facId, string? desc)
+        {
+            var core = NormalizeFa(Regex.Replace(desc ?? "", // \d = any Unicode decimal digit: ASCII, Persian ۰-۹ and Arabic-Indic ٠-٩
+                @"\d+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim();
+            if (core.Length < 15) return null; // too little to call it a dup safely
+            return $"j:{roleId}:{facId}:{(core.Length > 120 ? core[..120] : core)}";
+        }
+
+        var toRemove = rows
+            .Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) })
+            .Where(x => x.Key is not null)
+            .GroupBy(x => x.Key)
+            .SelectMany(g => g.OrderByDescending(x => x.CreatedAt).ThenByDescending(x => x.Id).Skip(1).Select(x => x.Id)) // Id tie-break keeps the survivor deterministic
+            .ToList();
+
+        if (toRemove.Count == 0) return 0;
+        var removed = await _db.JobOpenings.Where(j => toRemove.Contains(j.Id)).ExecuteDeleteAsync(ct);
+        _log.LogInformation("Deduped {N} near-duplicate aggregated jobs.", removed);
+        return removed;
+    }
+
private static string DigitsOnly(string s) => new(HtmlUtil.ToLatinDigits(s).Where(char.IsDigit).ToArray());
private static (RawListingStatus status, string? reason, int confidence) Decide(
@@ -366,8 +477,11 @@ public class IngestionService
// Tehran ad that only NAMES a neighborhood (Medjobs/Telegram), geocode that name to a rough
// center. Shown as a «محدودهٔ تقریبی» circle, never a precise pin.
double? appLat = raw.Lat, appLng = raw.Lng;
+ // Geocode from the structured location fields first, then fall back to scanning the ad body
+ // itself — many Tehran ads name the neighbourhood only in free text («… نیم ساعت پیش در سهروردی»)
+ // and never populate a district/area field, which is why most aggregated listings had no map.
if (appLat is null && city.Name == "تهران"
- && TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote) is { } g)
+ && TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote, raw.RawText) is { } g)
{ appLat = g.lat; appLng = g.lng; }
// Last resort — the AI model's inferred coords, but ONLY when they fall inside greater Tehran
// (rejects a hallucinated point elsewhere). Uses the registered model where the rules can't decide.
@@ -446,7 +560,10 @@ public class IngestionService
Facility = facility, Role = role,
Title = !string.IsNullOrWhiteSpace(d?.Title) && pubRoles.Count == 1 ? d!.Title!.Trim() : $"استخدام {role.Name}",
EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
- SalaryMin = parsed.PayAmount,
+ // Prefer the AI-extracted salary, falling back to the parser's — matching the talent
+ // path. (Jobs previously used only parsed.PayAmount, silently dropping the AI figure,
+ // so every aggregated opening showed «توافقی» even when the ad stated a number.)
+ SalaryMin = d?.PayAmount ?? parsed.PayAmount,
Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
SourceUrl = raw.SourceUrl,
Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center