From e2011d335ee1d96eb4b90078303d3d5e20bbcca3 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sun, 21 Jun 2026 05:09:39 +0330 Subject: [PATCH] Ingestion data-quality + map fixes: AI salary, geocode coverage, in-place backfill & purge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Jobs now keep the AI-extracted salary (d.PayAmount ?? parsed.PayAmount); they previously used only the parser figure, so every aggregated opening showed «توافقی». - Geocoder also scans the ad body, so Tehran ads that name a neighbourhood only in free text («… در سهروردی») get an approximate map point. - New BackfillCoordsAsync (+ admin button): fills missing coords on existing aggregated listings from their stored text, in place — no ID/URL churn, SEO-safe. - New PurgeInvalidAggregatedAsync + DedupeJobsAsync (+ admin button): in-place removal of out-of-scope (domestic/promo/spam) aggregated jobs/shifts and duplicate job reposts, keeping valid listings' IDs. - Jobs detail page always renders the location card (matches Shifts) instead of hiding it when coords are missing. Co-Authored-By: Claude Opus 4.8 --- src/JobsMedical.Web/Pages/Admin/Index.cshtml | 18 +++ .../Pages/Admin/Index.cshtml.cs | 24 ++++ src/JobsMedical.Web/Pages/Jobs/Details.cshtml | 20 +-- .../Services/Scraping/IngestionService.cs | 121 +++++++++++++++++- 4 files changed, 173 insertions(+), 10 deletions(-) diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml b/src/JobsMedical.Web/Pages/Admin/Index.cshtml index f31663f..b86dc1c 100644 --- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml +++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml @@ -58,6 +58,24 @@ توصیه‌شده برای پاک‌سازیِ آماده‌به‌کارها: متنِ خام نگه داشته می‌شود و فقط با منطقِ جدید (یک‌نفر=یک‌آگهی، نقش پایه، گروه ثابت، تگ تمیز، موقعیت تقریبی) بازساخته می‌شوند. صفحاتِ «آماده به کار» ایندکس نمی‌شوند، پس آدرسِ ایندکس‌شده‌ای تغییر نمی‌کند؛ شیفت/استخدام به‌مرور با ایمیجستِ تازه پاک می‌شوند.

+
+ +
+

+ شیفت/استخدام/آماده‌به‌کارِ جمع‌آوری‌شده‌ای که مختصات ندارند، از روی محلهٔ ذکرشده در متنِ آگهی روی نقشه قرار می‌گیرند (محدودهٔ تقریبی). فقط مختصاتِ خالی پر می‌شود؛ موقعیتِ واقعیِ مراکز دست‌نخورده می‌ماند. +

+ +
+ +
+

+ فقط آگهی‌هایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده می‌شوند (نه صرفاً ناقص) و استخدام‌های تکراری پاک می‌شوند. آگهی‌های معتبر دست‌نخورده‌اند، پس آدرسِ ایندکس‌شده‌شان تغییر نمی‌کند؛ فقط صفحاتِ بد ۴۰۴ می‌شوند. +

+

افزودن دستی

diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs index e4d49c2..b4be44d 100644 --- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs +++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs @@ -120,6 +120,30 @@ public class IndexModel : PageModel return RedirectToPage(); } + /// <summary> + /// Fill missing map coordinates on existing aggregated Tehran listings from their stored ad text + /// (TehranGeo). In place — no AI calls, no re-fetch, and crucially no delete/recreate, so indexed + /// shift/job URLs keep their IDs. Fast (pure DB + string matching), so it runs inline. + /// </summary> + public async Task<IActionResult> OnPostBackfillCoordsAsync() + { + var n = await _ingest.BackfillCoordsAsync(); + IngestMessage = $"مختصات تقریبی برای {n} آگهی جمع‌آوری‌شده از روی متن آگهی تکمیل شد (بدون تغییر شناسه یا آدرس صفحه)."; + return RedirectToPage(); + } + + /// <summary> + /// In-place cleanup of existing aggregated jobs/shifts: delete only the out-of-scope ones + /// (domestic-helper / promotional / spam) per the current validator, plus near-duplicate job + /// reposts. Valid listings keep their IDs/URLs. No re-fetch, no AI — runs inline. + /// </summary> + public async Task<IActionResult> OnPostPurgeInvalidAsync() + { + var (removed, deduped) = await _ingest.PurgeInvalidAggregatedAsync(); + IngestMessage = $"پاک‌سازیِ درجا: {removed} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {deduped} استخدامِ تکراری حذف شد. سایر آگهی‌ها و شناسه/آدرسشان دست‌نخورده ماند.";
+ return RedirectToPage(); + } + private async Task LoadAsync() { Queue = await _db.RawListings diff --git a/src/JobsMedical.Web/Pages/Jobs/Details.cshtml b/src/JobsMedical.Web/Pages/Jobs/Details.cshtml index d91807b..0b4164a 100644 --- a/src/JobsMedical.Web/Pages/Jobs/Details.cshtml +++ b/src/JobsMedical.Web/Pages/Jobs/Details.cshtml @@ -161,12 +161,12 @@ } - @if (mapLat is not null && mapLng is not null) - { - var latS = mapLat.Value.ToString(System.Globalization.CultureInfo.InvariantCulture); - var lngS = mapLng.Value.ToString(System.Globalization.CultureInfo.InvariantCulture); -
-

موقعیت مکانی

+
+

موقعیت مکانی

+ @if (mapLat is not null && mapLng is not null) + { + var latS = mapLat.Value.ToString(System.Globalization.CultureInfo.InvariantCulture); + var lngS = mapLng.Value.ToString(System.Globalization.CultureInfo.InvariantCulture); @if (!string.IsNullOrEmpty(Model.MapKey)) {
@@ -183,8 +183,12 @@ } مسیریابی در نشان -
- } + } + else + { +

مختصات این آگهی ثبت نشده است.

+ } +
diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs index 732d5d4..6778052 100644 --- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs +++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs @@ -299,6 +299,117 @@ public class IngestionService return removed; } + /// <summary> + /// In-place geocoding backfill: for existing AGGREGATED listings in Tehran that still have no map + /// coords, derive an APPROXIMATE neighbourhood center from the stored ad text (TehranGeo) and fill + /// Lat/Lng. Unlike a delete-and-recreate rebuild it never deletes or recreates rows, so listing IDs — + /// and the indexed shift/job URLs in the sitemap — are untouched; safe to run on the live board. + /// Only ever FILLS a null coordinate; a real point (Divar/employer/AI) is never overwritten. + /// Returns how many listings were newly placed on the map. + /// </summary> + public async Task<int> BackfillCoordsAsync(CancellationToken ct = default) + { + var tehran = await _db.Cities.FirstOrDefaultAsync(c => c.Name == "تهران", ct); + if (tehran is null) return 0; + int filled = 0; + + var jobs = await _db.JobOpenings + .Where(j => j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id) + .ToListAsync(ct); + foreach (var j in jobs) + if (TehranGeo.Locate(j.Description) is { } g) { j.Lat = g.lat; j.Lng = g.lng; filled++; } + + var shifts = await _db.Shifts + .Where(s => s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id) + .ToListAsync(ct); + foreach (var s in shifts) + if (TehranGeo.Locate(s.Description) is { } g) { s.Lat = g.lat; s.Lng = g.lng; filled++; } + + var talent = await _db.TalentListings + .Where(t => t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id) + .ToListAsync(ct); + foreach (var t in talent) + if (TehranGeo.Locate(t.AreaNote, t.Description) is { } g) { t.Lat = g.lat; t.Lng = g.lng; filled++; } + + if (filled > 0) await
_db.SaveChangesAsync(ct); + _log.LogInformation("Coordinate backfill placed {N} aggregated listings on the map.", filled); + return filled; + } + + /// <summary> + /// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each listing's + /// stored text through the CURRENT validator and delete only the ones that are now clearly + /// out-of-scope — domestic-helper («امور منزل»), promotional/training, or spam (i.e. + /// the validator's IsSpam verdict). Merely-incomplete-but-legit ads are KEPT. Then collapse + /// near-duplicate job reposts. Valid listings are never touched, so their IDs — and indexed URLs — + /// stay stable; only the bad pages 404 (which is the desired outcome). Returns (removed, deduped). + /// </summary> + public async Task<(int removed, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default) + { + var roleNames = await _db.Roles.Select(r => r.Name).ToListAsync(ct); + var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct); + var districtNames = await _db.Districts.Select(d => d.Name).ToListAsync(ct); + + bool IsOutOfScope(string? text) + { + var t = text ??
""; + var parsed = _parser.Parse(t, roleNames, cityNames, districtNames); + return _validator.Validate(t, parsed).IsSpam; // spam | promo | domestic-helper + } + + int removed = 0; + + var jobIds = (await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated) + .Select(j => new { j.Id, j.Description }).ToListAsync(ct)) + .Where(j => IsOutOfScope(j.Description)).Select(j => j.Id).ToList(); + if (jobIds.Count > 0) + removed += await _db.JobOpenings.Where(j => jobIds.Contains(j.Id)).ExecuteDeleteAsync(ct); + + var shiftIds = (await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated) + .Select(s => new { s.Id, s.Description }).ToListAsync(ct)) + .Where(s => IsOutOfScope(s.Description)).Select(s => s.Id).ToList(); + if (shiftIds.Count > 0) + removed += await _db.Shifts.Where(s => shiftIds.Contains(s.Id)).ExecuteDeleteAsync(ct); + + var deduped = await DedupeJobsAsync(ct); + _log.LogInformation("Purge removed {R} out-of-scope aggregated listings; deduped {D} jobs.", removed, deduped); + return (removed, deduped); + } + + /// <summary> + /// Collapse near-duplicate aggregated JOB reposts the exact-hash dedupe missed (same ad re-crawled + /// with slightly different surrounding text → different ContentHash). Signature = role + facility + + /// normalized description core with digits/«… پیش» time-phrases stripped. Keeps the NEWEST of each + /// group (CreatedAt ties broken by Id). Per-role fan-out of one ad is preserved (different RoleId → different signature). + /// </summary> + public async Task<int> DedupeJobsAsync(CancellationToken ct = default) + { + var rows = await _db.JobOpenings + .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated) + .Select(j => new { j.Id, j.RoleId, j.FacilityId, j.Description, j.CreatedAt }) + .ToListAsync(ct); + + string? Sig(int roleId, int facId, string? desc) + { + var core = NormalizeFa(Regex.Replace(desc ??
"", + @"\d+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim(); + if (core.Length < 15) return null; // too little to call it a dup safely + return $"j:{roleId}:{facId}:{(core.Length > 120 ? core[..120] : core)}"; + } + + var toRemove = rows + .Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) }) + .Where(x => x.Key is not null) + .GroupBy(x => x.Key) + .SelectMany(g => g.OrderByDescending(x => x.CreatedAt).ThenByDescending(x => x.Id).Skip(1).Select(x => x.Id)) + .ToList(); + + if (toRemove.Count == 0) return 0; + var removed = await _db.JobOpenings.Where(j => toRemove.Contains(j.Id)).ExecuteDeleteAsync(ct); + _log.LogInformation("Deduped {N} near-duplicate aggregated jobs.", removed); + return removed; + } + private static string DigitsOnly(string s) => new(HtmlUtil.ToLatinDigits(s).Where(char.IsDigit).ToArray()); private static (RawListingStatus status, string? reason, int confidence) Decide( @@ -366,8 +477,11 @@ public class IngestionService // Tehran ad that only NAMES a neighborhood (Medjobs/Telegram), geocode that name to a rough // center. Shown as a «محدودهٔ تقریبی» circle, never a precise pin. double? appLat = raw.Lat, appLng = raw.Lng; + // Geocode from the structured location fields first, then fall back to scanning the ad body + // itself — many Tehran ads name the neighbourhood only in free text («… نیم ساعت پیش در سهروردی») + // and never populate a district/area field, which is why most aggregated listings had no map. if (appLat is null && city.Name == "تهران" - && TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote) is { } g) + && TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote, raw.RawText) is { } g) { appLat = g.lat; appLng = g.lng; } // Last resort — the AI model's inferred coords, but ONLY when they fall inside greater Tehran // (rejects a hallucinated point elsewhere). Uses the registered model where the rules can't decide.
@@ -446,7 +560,10 @@ public class IngestionService Facility = facility, Role = role, Title = !string.IsNullOrWhiteSpace(d?.Title) && pubRoles.Count == 1 ? d!.Title!.Trim() : $"استخدام {role.Name}", EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType), - SalaryMin = parsed.PayAmount, + // Prefer the AI-extracted salary, falling back to the parser's — matching the talent + // path. (Jobs previously used only parsed.PayAmount, silently dropping the AI figure, + // so every aggregated opening showed «توافقی» even when the ad stated a number.) + SalaryMin = d?.PayAmount ?? parsed.PayAmount, Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl, Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center