Make the listing purge SEO-standard: archive (not delete) + 410 Gone
CI/CD / CI · dotnet build (push) Successful in 49s
CI/CD / Deploy · hamkadr (push) Successful in 2m13s

Per the project archive-not-delete convention, the in-place purge now sets out-of-scope
aggregated jobs/shifts and near-duplicate aggregated jobs to ShiftStatus.Archived instead of
hard-deleting them:
- The row is retained for analysis and the change is reversible.
- The listing drops out of every public screen and the sitemap (which filter Status == Open).
- Its detail page now returns 410 Gone (the standard permanent-removal signal) so search
  engines deindex it cleanly, instead of leaving the off-topic page live at 200 or hard-404ing.
Dedupe of job reposts archives the older copies the same way. Coordinate backfill now also
skips non-Open rows. Valid listings are untouched, so IDs/URLs stay stable.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 05:25:51 +03:30
parent e2011d335e
commit 8be275596b
5 changed files with 46 additions and 31 deletions
@@ -314,19 +314,19 @@ public class IngestionService
int filled = 0;
var jobs = await _db.JobOpenings
.Where(j => j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id)
.Where(j => j.Status == ShiftStatus.Open && j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id)
.ToListAsync(ct);
foreach (var j in jobs)
if (TehranGeo.Locate(j.Description) is { } g) { j.Lat = g.lat; j.Lng = g.lng; filled++; }
var shifts = await _db.Shifts
.Where(s => s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id)
.Where(s => s.Status == ShiftStatus.Open && s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id)
.ToListAsync(ct);
foreach (var s in shifts)
if (TehranGeo.Locate(s.Description) is { } g) { s.Lat = g.lat; s.Lng = g.lng; filled++; }
var talent = await _db.TalentListings
.Where(t => t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id)
.Where(t => t.Status == ShiftStatus.Open && t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id)
.ToListAsync(ct);
foreach (var t in talent)
if (TehranGeo.Locate(t.AreaNote, t.Description) is { } g) { t.Lat = g.lat; t.Lng = g.lng; filled++; }
@@ -337,14 +337,18 @@ public class IngestionService
}
/// <summary>
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each listing's
/// stored text through the CURRENT validator and delete only the ones that are now clearly
/// out-of-scope — domestic-helper («امور منزل»), promotional/training, or spam (i.e.
/// <see cref="ValidationResult.IsSpam"/>). Merely-incomplete-but-legit ads are KEPT. Then collapse
/// near-duplicate job reposts. Valid listings are never touched, so their IDs — and indexed URLs —
/// stay stable; only the bad pages 404 (which is the desired outcome). Returns (removed, deduped).
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each Open
/// listing's stored text through the CURRENT validator and ARCHIVE (Status → Archived, not delete)
/// only the ones that are now clearly out-of-scope — domestic-helper («امور منزل»),
/// promotional/training, or spam (i.e. <see cref="ValidationResult.IsSpam"/>). Ads that are merely
/// incomplete but legitimate are KEPT. Then collapse near-duplicate job reposts the same way. Archiving (vs
/// hard delete) is the project convention: the row is retained for analysis and the change is
/// reversible, the listing drops out of every public screen + the sitemap (which filter Status ==
/// Open), and its detail page returns 410 Gone (the standard "permanently removed" signal Google
/// uses to deindex). Valid listings are never touched, so their IDs/URLs stay stable.
/// Returns (archived, deduped).
/// </summary>
public async Task<(int removed, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
public async Task<(int archived, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
{
var roleNames = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
@@ -357,30 +361,33 @@ public class IngestionService
return _validator.Validate(t, parsed).IsSpam; // spam | promo | domestic-helper
}
int removed = 0;
int archived = 0;
var jobIds = (await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated)
var jobIds = (await _db.JobOpenings.Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
.Select(j => new { j.Id, j.Description }).ToListAsync(ct))
.Where(j => IsOutOfScope(j.Description)).Select(j => j.Id).ToList();
if (jobIds.Count > 0)
removed += await _db.JobOpenings.Where(j => jobIds.Contains(j.Id)).ExecuteDeleteAsync(ct);
archived += await _db.JobOpenings.Where(j => jobIds.Contains(j.Id))
.ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
var shiftIds = (await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated)
var shiftIds = (await _db.Shifts.Where(s => s.Status == ShiftStatus.Open && s.Source == ShiftSource.Aggregated)
.Select(s => new { s.Id, s.Description }).ToListAsync(ct))
.Where(s => IsOutOfScope(s.Description)).Select(s => s.Id).ToList();
if (shiftIds.Count > 0)
removed += await _db.Shifts.Where(s => shiftIds.Contains(s.Id)).ExecuteDeleteAsync(ct);
archived += await _db.Shifts.Where(s => shiftIds.Contains(s.Id))
.ExecuteUpdateAsync(u => u.SetProperty(s => s.Status, ShiftStatus.Archived), ct);
var deduped = await DedupeJobsAsync(ct);
_log.LogInformation("Purge removed {R} out-of-scope aggregated listings; deduped {D} jobs.", removed, deduped);
return (removed, deduped);
_log.LogInformation("Purge archived {R} out-of-scope aggregated listings; deduped {D} jobs.", archived, deduped);
return (archived, deduped);
}
/// <summary>
/// Collapse near-duplicate aggregated JOB reposts the exact-hash dedupe missed (same ad re-crawled
/// with slightly different surrounding text → different ContentHash). Signature = role + facility +
/// normalized description core with digits/«… پیش» time-phrases stripped. Keeps the NEWEST of each
/// group. Per-role fan-out of one ad is preserved (different RoleId → different signature).
/// group and ARCHIVES the rest (Status → Archived, reversible — same rationale as the purge).
/// Per-role fan-out of one ad is preserved (different RoleId → different signature).
/// </summary>
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
{
@@ -397,17 +404,18 @@ public class IngestionService
return $"j:{roleId}:{facId}:{(core.Length > 120 ? core[..120] : core)}";
}
var toRemove = rows
var toArchive = rows
.Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) })
.Where(x => x.Key is not null)
.GroupBy(x => x.Key)
.SelectMany(g => g.OrderByDescending(x => x.CreatedAt).Skip(1).Select(x => x.Id))
.ToList();
if (toRemove.Count == 0) return 0;
var removed = await _db.JobOpenings.Where(j => toRemove.Contains(j.Id)).ExecuteDeleteAsync(ct);
_log.LogInformation("Deduped {N} near-duplicate aggregated jobs.", removed);
return removed;
if (toArchive.Count == 0) return 0;
var archived = await _db.JobOpenings.Where(j => toArchive.Contains(j.Id))
.ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
_log.LogInformation("Archived {N} near-duplicate aggregated jobs.", archived);
return archived;
}
/// <summary>
/// Returns only the digit characters of <paramref name="s"/>, in their original order, taken from
/// the result of <c>HtmlUtil.ToLatinDigits</c> (presumably that helper maps Persian/Arabic numerals
/// to 0-9 — confirm against its definition).
/// </summary>
/// <param name="s">Text to extract digits from.</param>
/// <returns>A string made up solely of the characters for which <c>char.IsDigit</c> is true.</returns>
private static string DigitsOnly(string s)
{
    var latin = HtmlUtil.ToLatinDigits(s);
    var digits = latin.Where(char.IsDigit).ToArray();
    return new string(digits);
}