Make the listing purge SEO-standard: archive (not delete) + 410 Gone
CI/CD / CI · dotnet build (push) Successful in 49s
CI/CD / Deploy · hamkadr (push) Successful in 2m13s

Per the project archive-not-delete convention, the in-place purge now sets out-of-scope
aggregated jobs/shifts and near-duplicate job reposts to ShiftStatus.Archived instead of hard-deleting them:
- The row is retained for analysis and the change is reversible.
- The listing drops out of every public screen and the sitemap (both filter on Status == Open).
- Its detail page now returns 410 Gone (the standard permanent-removal signal) so search
  engines deindex it cleanly, instead of leaving the off-topic page live at 200 or hard-404ing.
Dedupe of job reposts archives the older copies the same way. Coordinate backfill now also
skips non-Open rows. Valid listings are untouched, so IDs/URLs stay stable.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 05:25:51 +03:30
parent e2011d335e
commit 8be275596b
5 changed files with 46 additions and 31 deletions
+3 -3
View File
@@ -67,13 +67,13 @@
شیفت/استخدام/آماده‌به‌کارِ جمع‌آوری‌شده‌ای که مختصات ندارند، از روی محلهٔ ذکرشده در متنِ آگهی روی نقشه قرار می‌گیرند (محدودهٔ تقریبی). فقط مختصاتِ خالی پر می‌شود؛ موقعیتِ واقعیِ مراکز دست‌نخورده می‌ماند.
</p>
<form method="post" onsubmit="return confirm('آگهی‌های جمع‌آوری‌شدهٔ شیفت/استخدام که اکنون خارج از حوزه‌اند (خدمات منزل/نظافت، تبلیغاتی/آموزشی، اسپم) و استخدام‌های تکراری حذف می‌شوند. آگهی‌های معتبر و شناسه/آدرسشان دست‌نخورده می‌ماند. این کار بازگشت‌ناپذیر است. ادامه؟');">
<form method="post" onsubmit="return confirm('آگهی‌های جمع‌آوری‌شدهٔ شیفت/استخدام که اکنون خارج از حوزه‌اند (خدمات منزل/نظافت، تبلیغاتی/آموزشی، اسپم) و استخدام‌های تکراری «بایگانی» می‌شوند: از سایت پنهان می‌شوند ولی ردیفشان نگه داشته می‌شود (قابل بازگشت). آگهی‌های معتبر و شناسه/آدرسشان دست‌نخورده می‌ماند. ادامه؟');">
<button type="submit" asp-page-handler="PurgeInvalid" class="btn btn-outline btn-block" style="margin-top:10px; color:var(--danger); border-color:var(--danger);">
🧽 حذفِ درجای آگهی‌های خارج از حوزه و تکراری (شیفت/استخدام)
🧽 بایگانیِ درجای آگهی‌های خارج از حوزه و تکراری (شیفت/استخدام)
</button>
</form>
<p class="muted" style="font-size:11px; margin:6px 0 0;">
فقط آگهی‌هایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده می‌شوند (نه صرفاً ناقص) و استخدام‌های تکراری پاک می‌شوند. آگهی‌های معتبر دست‌نخورده‌اند، پس آدرسِ ایندکس‌شده‌شان تغییر نمی‌کند؛ فقط صفحاتِ بد ۴۰۴ می‌شوند.
فقط آگهی‌هایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده می‌شوند (نه صرفاً ناقص) و استخدام‌های تکراری بایگانی می‌شوند (وضعیت «بایگانی»، نه حذف). آگهی‌های معتبر دست‌نخورده‌اند، پس آدرسِ ایندکس‌شده‌شان تغییر نمی‌کند؛ صفحهٔ موارد بایگانی‌شده ۴۱۰ Gone می‌دهد تا گوگل تمیز حذفشان کند.
</p>
<hr style="border:none; border-top:1px solid var(--line); margin:16px 0;" />
@@ -133,14 +133,15 @@ public class IndexModel : PageModel
}
/// <summary>
/// In-place cleanup of existing aggregated jobs/shifts: delete only the out-of-scope ones
/// (domestic-helper / promotional / spam) per the current validator, plus near-duplicate job
/// reposts. Valid listings keep their IDs/URLs. No re-fetch, no AI — runs inline.
/// In-place cleanup of existing aggregated jobs/shifts: ARCHIVE (hide, keep the row) only the
/// out-of-scope ones (domestic-helper / promotional / spam) per the current validator, plus
/// near-duplicate job reposts. Archived pages drop from lists + sitemap and return 410 Gone.
/// Valid listings keep their IDs/URLs. Reversible, no re-fetch, no AI — runs inline.
/// </summary>
public async Task<IActionResult> OnPostPurgeInvalidAsync()
{
var (removed, deduped) = await _ingest.PurgeInvalidAggregatedAsync();
IngestMessage = $"پاک‌سازیِ درجا: {removed} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {deduped} استخدامِ تکراری حذف شد. سایر آگهی‌ها و شناسه/آدرسشان دست‌نخورده ماند.";
var (archived, deduped) = await _ingest.PurgeInvalidAggregatedAsync();
IngestMessage = $"بایگانیِ درجا: {archived} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {deduped} استخدامِ تکراری از سایت پنهان شد (وضعیت «بایگانی»؛ ردیف نگه داشته شد و قابل بازگشت است؛ صفحه‌شان ۴۱۰ Gone می‌دهد). آگهی‌های معتبر و شناسه/آدرسشان دست‌نخورده ماند.";
return RedirectToPage();
}
@@ -31,6 +31,9 @@ public class DetailsModel : PageModel
{
await LoadAsync(id);
if (Job is null) return NotFound();
// Intentionally removed (admin-archived out-of-scope/duplicate ad): 410 Gone is the standard
// signal for permanent removal, so search engines deindex it cleanly (we keep the row for audit).
if (Job.Status == ShiftStatus.Archived) return StatusCode(StatusCodes.Status410Gone);
MapKey = (await _settings.GetAsync()).NeshanMapKey;
Reported = Request.Query["reported"] == "1";
await _interest.LogJobAsync(InterestEventType.View, id);
@@ -34,6 +34,9 @@ public class DetailsModel : PageModel
{
await LoadAsync(id);
if (Shift is null) return NotFound();
// Intentionally removed (admin-archived out-of-scope/duplicate ad): 410 Gone is the standard
// signal for permanent removal, so search engines deindex it cleanly (we keep the row for audit).
if (Shift.Status == ShiftStatus.Archived) return StatusCode(StatusCodes.Status410Gone);
MapKey = (await _settings.GetAsync()).NeshanMapKey;
Reported = Request.Query["reported"] == "1";
await _interest.LogAsync(InterestEventType.View, id); // behavioral signal for recommendations
@@ -314,19 +314,19 @@ public class IngestionService
int filled = 0;
var jobs = await _db.JobOpenings
.Where(j => j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id)
.Where(j => j.Status == ShiftStatus.Open && j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id)
.ToListAsync(ct);
foreach (var j in jobs)
if (TehranGeo.Locate(j.Description) is { } g) { j.Lat = g.lat; j.Lng = g.lng; filled++; }
var shifts = await _db.Shifts
.Where(s => s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id)
.Where(s => s.Status == ShiftStatus.Open && s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id)
.ToListAsync(ct);
foreach (var s in shifts)
if (TehranGeo.Locate(s.Description) is { } g) { s.Lat = g.lat; s.Lng = g.lng; filled++; }
var talent = await _db.TalentListings
.Where(t => t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id)
.Where(t => t.Status == ShiftStatus.Open && t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id)
.ToListAsync(ct);
foreach (var t in talent)
if (TehranGeo.Locate(t.AreaNote, t.Description) is { } g) { t.Lat = g.lat; t.Lng = g.lng; filled++; }
@@ -337,14 +337,18 @@ public class IngestionService
}
/// <summary>
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each listing's
/// stored text through the CURRENT validator and delete only the ones that are now clearly
/// out-of-scope — domestic-helper («امور منزل»), promotional/training, or spam (i.e.
/// <see cref="ValidationResult.IsSpam"/>). Merely-incomplete-but-legit ads are KEPT. Then collapse
/// near-duplicate job reposts. Valid listings are never touched, so their IDs — and indexed URLs —
/// stay stable; only the bad pages 404 (which is the desired outcome). Returns (removed, deduped).
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each Open
/// listing's stored text through the CURRENT validator and ARCHIVE (Status → Archived, not delete)
/// only the ones that are now clearly out-of-scope — domestic-helper («امور منزل»),
/// promotional/training, or spam (i.e. <see cref="ValidationResult.IsSpam"/>). Merely-incomplete-
/// but-legit ads are KEPT. Then collapse near-duplicate job reposts the same way. Archiving (vs
/// hard delete) is the project convention: the row is retained for analysis and the change is
/// reversible, the listing drops out of every public screen + the sitemap (which filter Status ==
/// Open), and its detail page returns 410 Gone (the standard "permanently removed" signal Google
/// uses to deindex). Valid listings are never touched, so their IDs/URLs stay stable.
/// Returns (archived, deduped).
/// </summary>
public async Task<(int removed, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
public async Task<(int archived, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
{
var roleNames = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
@@ -357,30 +361,33 @@ public class IngestionService
return _validator.Validate(t, parsed).IsSpam; // spam | promo | domestic-helper
}
int removed = 0;
int archived = 0;
var jobIds = (await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated)
var jobIds = (await _db.JobOpenings.Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
.Select(j => new { j.Id, j.Description }).ToListAsync(ct))
.Where(j => IsOutOfScope(j.Description)).Select(j => j.Id).ToList();
if (jobIds.Count > 0)
removed += await _db.JobOpenings.Where(j => jobIds.Contains(j.Id)).ExecuteDeleteAsync(ct);
archived += await _db.JobOpenings.Where(j => jobIds.Contains(j.Id))
.ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
var shiftIds = (await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated)
var shiftIds = (await _db.Shifts.Where(s => s.Status == ShiftStatus.Open && s.Source == ShiftSource.Aggregated)
.Select(s => new { s.Id, s.Description }).ToListAsync(ct))
.Where(s => IsOutOfScope(s.Description)).Select(s => s.Id).ToList();
if (shiftIds.Count > 0)
removed += await _db.Shifts.Where(s => shiftIds.Contains(s.Id)).ExecuteDeleteAsync(ct);
archived += await _db.Shifts.Where(s => shiftIds.Contains(s.Id))
.ExecuteUpdateAsync(u => u.SetProperty(s => s.Status, ShiftStatus.Archived), ct);
var deduped = await DedupeJobsAsync(ct);
_log.LogInformation("Purge removed {R} out-of-scope aggregated listings; deduped {D} jobs.", removed, deduped);
return (removed, deduped);
_log.LogInformation("Purge archived {R} out-of-scope aggregated listings; deduped {D} jobs.", archived, deduped);
return (archived, deduped);
}
/// <summary>
/// Collapse near-duplicate aggregated JOB reposts the exact-hash dedupe missed (same ad re-crawled
/// with slightly different surrounding text → different ContentHash). Signature = role + facility +
/// normalized description core with digits/«… پیش» time-phrases stripped. Keeps the NEWEST of each
/// group. Per-role fan-out of one ad is preserved (different RoleId → different signature).
/// group and ARCHIVES the rest (Status → Archived, reversible — same rationale as the purge).
/// Per-role fan-out of one ad is preserved (different RoleId → different signature).
/// </summary>
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
{
@@ -397,17 +404,18 @@ public class IngestionService
return $"j:{roleId}:{facId}:{(core.Length > 120 ? core[..120] : core)}";
}
var toRemove = rows
var toArchive = rows
.Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) })
.Where(x => x.Key is not null)
.GroupBy(x => x.Key)
.SelectMany(g => g.OrderByDescending(x => x.CreatedAt).Skip(1).Select(x => x.Id))
.ToList();
if (toRemove.Count == 0) return 0;
var removed = await _db.JobOpenings.Where(j => toRemove.Contains(j.Id)).ExecuteDeleteAsync(ct);
_log.LogInformation("Deduped {N} near-duplicate aggregated jobs.", removed);
return removed;
if (toArchive.Count == 0) return 0;
var archived = await _db.JobOpenings.Where(j => toArchive.Contains(j.Id))
.ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
_log.LogInformation("Archived {N} near-duplicate aggregated jobs.", archived);
return archived;
}
/// <summary>
/// Returns only the digit characters of <paramref name="s"/>, in their original order.
/// The input is first passed through <c>HtmlUtil.ToLatinDigits</c> (presumably mapping
/// Persian/Arabic numerals to ASCII digits — confirm against that helper), then every
/// character for which <see cref="char.IsDigit(char)"/> is false is dropped.
/// </summary>
/// <param name="s">Text to extract digits from.</param>
/// <returns>A new string made up of the surviving digit characters.</returns>
private static string DigitsOnly(string s)
{
    var latinized = HtmlUtil.ToLatinDigits(s);
    char[] digits = latinized.Where(char.IsDigit).ToArray();
    return new string(digits);
}