From 8be275596b6a2a69f8e80cc60876aa97caad8529 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sun, 21 Jun 2026 05:25:51 +0330 Subject: [PATCH] Make the listing purge SEO-standard: archive (not delete) + 410 Gone Per the project archive-not-delete convention, the in-place purge now sets out-of-scope and duplicate aggregated jobs/shifts to ShiftStatus.Archived instead of hard-deleting: - The row is retained for analysis and the change is reversible. - The listing drops out of every public screen and the sitemap (which filter Status == Open). - Its detail page now returns 410 Gone (the standard permanent-removal signal) so search engines deindex it cleanly, instead of leaving the off-topic page live at 200 or hard-404ing. Dedupe of job reposts archives the older copies the same way. Coordinate backfill now also skips non-Open rows. Valid listings are untouched, so IDs/URLs stay stable. Co-Authored-By: Claude Opus 4.8 --- src/JobsMedical.Web/Pages/Admin/Index.cshtml | 6 +-- .../Pages/Admin/Index.cshtml.cs | 11 ++-- .../Pages/Jobs/Details.cshtml.cs | 3 ++ .../Pages/Shifts/Details.cshtml.cs | 3 ++ .../Services/Scraping/IngestionService.cs | 54 +++++++++++-------- 5 files changed, 46 insertions(+), 31 deletions(-) diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml b/src/JobsMedical.Web/Pages/Admin/Index.cshtml index b86dc1c..c52986e 100644 --- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml +++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml @@ -67,13 +67,13 @@ شیفت/استخدام/آماده‌به‌کارِ جمع‌آوری‌شده‌ای که مختصات ندارند، از روی محلهٔ ذکرشده در متنِ آگهی روی نقشه قرار می‌گیرند (محدودهٔ تقریبی). فقط مختصاتِ خالی پر می‌شود؛ موقعیتِ واقعیِ مراکز دست‌نخورده می‌ماند.

-
+

- فقط آگهی‌هایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده می‌شوند (نه صرفاً ناقص) و استخدام‌های تکراری پاک می‌شوند. آگهی‌های معتبر دست‌نخورده‌اند، پس آدرسِ ایندکس‌شده‌شان تغییر نمی‌کند؛ فقط صفحاتِ بد ۴۰۴ می‌شوند. + فقط آگهی‌هایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده می‌شوند (نه صرفاً ناقص) و استخدام‌های تکراری بایگانی می‌شوند (وضعیت «بایگانی»، نه حذف). آگهی‌های معتبر دست‌نخورده‌اند، پس آدرسِ ایندکس‌شده‌شان تغییر نمی‌کند؛ صفحهٔ موارد بایگانی‌شده ۴۱۰ Gone می‌دهد تا گوگل تمیز حذفشان کند.


diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs index b4be44d..35c8c8f 100644 --- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs +++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs @@ -133,14 +133,15 @@ public class IndexModel : PageModel } /// - /// In-place cleanup of existing aggregated jobs/shifts: delete only the out-of-scope ones - /// (domestic-helper / promotional / spam) per the current validator, plus near-duplicate job - /// reposts. Valid listings keep their IDs/URLs. No re-fetch, no AI — runs inline. + /// In-place cleanup of existing aggregated jobs/shifts: ARCHIVE (hide, keep the row) only the + /// out-of-scope ones (domestic-helper / promotional / spam) per the current validator, plus + /// near-duplicate job reposts. Archived pages drop from lists + sitemap and return 410 Gone. + /// Valid listings keep their IDs/URLs. Reversible, no re-fetch, no AI — runs inline. /// public async Task OnPostPurgeInvalidAsync() { - var (removed, deduped) = await _ingest.PurgeInvalidAggregatedAsync(); - IngestMessage = $"پاک‌سازیِ درجا: {removed} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {deduped} استخدامِ تکراری حذف شد. سایر آگهی‌ها و شناسه/آدرسشان دست‌نخورده ماند."; + var (archived, deduped) = await _ingest.PurgeInvalidAggregatedAsync(); + IngestMessage = $"بایگانیِ درجا: {archived} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {deduped} استخدامِ تکراری از سایت پنهان شد (وضعیت «بایگانی»؛ ردیف نگه داشته شد و قابل بازگشت است؛ صفحه‌شان ۴۱۰ Gone می‌دهد). آگهی‌های معتبر و شناسه/آدرسشان دست‌نخورده ماند."; return RedirectToPage(); } diff --git a/src/JobsMedical.Web/Pages/Jobs/Details.cshtml.cs b/src/JobsMedical.Web/Pages/Jobs/Details.cshtml.cs index 9f99884..e5d97ad 100644 --- a/src/JobsMedical.Web/Pages/Jobs/Details.cshtml.cs +++ b/src/JobsMedical.Web/Pages/Jobs/Details.cshtml.cs @@ -31,6 +31,9 @@ public class DetailsModel : PageModel { await LoadAsync(id); if (Job is null) return NotFound(); + // Intentionally removed (admin-archived out-of-scope/duplicate ad): 410 Gone is the standard + // signal for permanent removal, so search engines deindex it cleanly (we keep the row for audit). + if (Job.Status == ShiftStatus.Archived) return StatusCode(StatusCodes.Status410Gone); MapKey = (await _settings.GetAsync()).NeshanMapKey; Reported = Request.Query["reported"] == "1"; await _interest.LogJobAsync(InterestEventType.View, id); diff --git a/src/JobsMedical.Web/Pages/Shifts/Details.cshtml.cs b/src/JobsMedical.Web/Pages/Shifts/Details.cshtml.cs index a3d7065..887fc22 100644 --- a/src/JobsMedical.Web/Pages/Shifts/Details.cshtml.cs +++ b/src/JobsMedical.Web/Pages/Shifts/Details.cshtml.cs @@ -34,6 +34,9 @@ public class DetailsModel : PageModel { await LoadAsync(id); if (Shift is null) return NotFound(); + // Intentionally removed (admin-archived out-of-scope/duplicate ad): 410 Gone is the standard + // signal for permanent removal, so search engines deindex it cleanly (we keep the row for audit). + if (Shift.Status == ShiftStatus.Archived) return StatusCode(StatusCodes.Status410Gone); MapKey = (await _settings.GetAsync()).NeshanMapKey; Reported = Request.Query["reported"] == "1"; await _interest.LogAsync(InterestEventType.View, id); // behavioral signal for recommendations diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs index 6778052..21a7742 100644 --- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs +++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs @@ -314,19 +314,19 @@ public class IngestionService int filled = 0; var jobs = await _db.JobOpenings - .Where(j => j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id) + .Where(j => j.Status == ShiftStatus.Open && j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id) .ToListAsync(ct); foreach (var j in jobs) if (TehranGeo.Locate(j.Description) is { } g) { j.Lat = g.lat; j.Lng = g.lng; filled++; } var shifts = await _db.Shifts - .Where(s => s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id) + .Where(s => s.Status == ShiftStatus.Open && s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id) .ToListAsync(ct); foreach (var s in shifts) if (TehranGeo.Locate(s.Description) is { } g) { s.Lat = g.lat; s.Lng = g.lng; filled++; } var talent = await _db.TalentListings - .Where(t => t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id) + .Where(t => t.Status == ShiftStatus.Open && t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id) .ToListAsync(ct); foreach (var t in talent) if (TehranGeo.Locate(t.AreaNote, t.Description) is { } g) { t.Lat = g.lat; t.Lng = g.lng; filled++; } @@ -337,14 +337,18 @@ public class IngestionService } /// - /// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each listing's - /// stored text through the CURRENT validator and delete only the ones that are now clearly - /// out-of-scope — domestic-helper («امور منزل»), promotional/training, or spam (i.e. - /// ). Merely-incomplete-but-legit ads are KEPT. Then collapse - /// near-duplicate job reposts. Valid listings are never touched, so their IDs — and indexed URLs — - /// stay stable; only the bad pages 404 (which is the desired outcome). Returns (removed, deduped). + /// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each Open + /// listing's stored text through the CURRENT validator and ARCHIVE (Status → Archived, not delete) + /// only the ones that are now clearly out-of-scope — domestic-helper («امور منزل»), + /// promotional/training, or spam (i.e. ). Merely-incomplete- + /// but-legit ads are KEPT. Then collapse near-duplicate job reposts the same way. Archiving (vs + /// hard delete) is the project convention: the row is retained for analysis and the change is + /// reversible, the listing drops out of every public screen + the sitemap (which filter Status == + /// Open), and its detail page returns 410 Gone (the standard "permanently removed" signal Google + /// uses to deindex). Valid listings are never touched, so their IDs/URLs stay stable. + /// Returns (archived, deduped). /// - public async Task<(int removed, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default) + public async Task<(int archived, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default) { var roleNames = await _db.Roles.Select(r => r.Name).ToListAsync(ct); var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct); @@ -357,30 +361,33 @@ public class IngestionService return _validator.Validate(t, parsed).IsSpam; // spam | promo | domestic-helper } - int removed = 0; + int archived = 0; - var jobIds = (await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated) + var jobIds = (await _db.JobOpenings.Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated) .Select(j => new { j.Id, j.Description }).ToListAsync(ct)) .Where(j => IsOutOfScope(j.Description)).Select(j => j.Id).ToList(); if (jobIds.Count > 0) - removed += await _db.JobOpenings.Where(j => jobIds.Contains(j.Id)).ExecuteDeleteAsync(ct); + archived += await _db.JobOpenings.Where(j => jobIds.Contains(j.Id)) + .ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct); - var shiftIds = (await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated) + var shiftIds = (await _db.Shifts.Where(s => s.Status == ShiftStatus.Open && s.Source == ShiftSource.Aggregated) .Select(s => new { s.Id, s.Description }).ToListAsync(ct)) .Where(s => IsOutOfScope(s.Description)).Select(s => s.Id).ToList(); if (shiftIds.Count > 0) - removed += await _db.Shifts.Where(s => shiftIds.Contains(s.Id)).ExecuteDeleteAsync(ct); + archived += await _db.Shifts.Where(s => shiftIds.Contains(s.Id)) + .ExecuteUpdateAsync(u => u.SetProperty(s => s.Status, ShiftStatus.Archived), ct); var deduped = await DedupeJobsAsync(ct); - _log.LogInformation("Purge removed {R} out-of-scope aggregated listings; deduped {D} jobs.", removed, deduped); - return (removed, deduped); + _log.LogInformation("Purge archived {R} out-of-scope aggregated listings; deduped {D} jobs.", archived, deduped); + return (archived, deduped); } /// /// Collapse near-duplicate aggregated JOB reposts the exact-hash dedupe missed (same ad re-crawled /// with slightly different surrounding text → different ContentHash). Signature = role + facility + /// normalized description core with digits/«… پیش» time-phrases stripped. Keeps the NEWEST of each - /// group. Per-role fan-out of one ad is preserved (different RoleId → different signature). + /// group and ARCHIVES the rest (Status → Archived, reversible — same rationale as the purge). + /// Per-role fan-out of one ad is preserved (different RoleId → different signature). /// public async Task DedupeJobsAsync(CancellationToken ct = default) { @@ -397,17 +404,18 @@ public class IngestionService return $"j:{roleId}:{facId}:{(core.Length > 120 ? core[..120] : core)}"; } - var toRemove = rows + var toArchive = rows .Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) }) .Where(x => x.Key is not null) .GroupBy(x => x.Key) .SelectMany(g => g.OrderByDescending(x => x.CreatedAt).Skip(1).Select(x => x.Id)) .ToList(); - if (toRemove.Count == 0) return 0; - var removed = await _db.JobOpenings.Where(j => toRemove.Contains(j.Id)).ExecuteDeleteAsync(ct); - _log.LogInformation("Deduped {N} near-duplicate aggregated jobs.", removed); - return removed; + if (toArchive.Count == 0) return 0; + var archived = await _db.JobOpenings.Where(j => toArchive.Contains(j.Id)) + .ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct); + _log.LogInformation("Archived {N} near-duplicate aggregated jobs.", archived); + return archived; } private static string DigitsOnly(string s) => new(HtmlUtil.ToLatinDigits(s).Where(char.IsDigit).ToArray());