Make the listing purge SEO-standard: archive (not delete) + 410 Gone
Per the project archive-not-delete convention, the in-place purge now sets out-of-scope and duplicate aggregated jobs/shifts to ShiftStatus.Archived instead of hard-deleting: - The row is retained for analysis and the change is reversible. - The listing drops out of every public screen and the sitemap (which filter Status == Open). - Its detail page now returns 410 Gone (the standard permanent-removal signal) so search engines deindex it cleanly, instead of leaving the off-topic page live at 200 or hard-404ing. Dedupe of job reposts archives the older copies the same way. Coordinate backfill now also skips non-Open rows. Valid listings are untouched, so IDs/URLs stay stable. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -314,19 +314,19 @@ public class IngestionService
|
||||
int filled = 0;
|
||||
|
||||
var jobs = await _db.JobOpenings
|
||||
.Where(j => j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id)
|
||||
.Where(j => j.Status == ShiftStatus.Open && j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id)
|
||||
.ToListAsync(ct);
|
||||
foreach (var j in jobs)
|
||||
if (TehranGeo.Locate(j.Description) is { } g) { j.Lat = g.lat; j.Lng = g.lng; filled++; }
|
||||
|
||||
var shifts = await _db.Shifts
|
||||
.Where(s => s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id)
|
||||
.Where(s => s.Status == ShiftStatus.Open && s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id)
|
||||
.ToListAsync(ct);
|
||||
foreach (var s in shifts)
|
||||
if (TehranGeo.Locate(s.Description) is { } g) { s.Lat = g.lat; s.Lng = g.lng; filled++; }
|
||||
|
||||
var talent = await _db.TalentListings
|
||||
.Where(t => t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id)
|
||||
.Where(t => t.Status == ShiftStatus.Open && t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id)
|
||||
.ToListAsync(ct);
|
||||
foreach (var t in talent)
|
||||
if (TehranGeo.Locate(t.AreaNote, t.Description) is { } g) { t.Lat = g.lat; t.Lng = g.lng; filled++; }
|
||||
@@ -337,14 +337,18 @@ public class IngestionService
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each listing's
|
||||
/// stored text through the CURRENT validator and delete only the ones that are now clearly
|
||||
/// out-of-scope — domestic-helper («امور منزل»), promotional/training, or spam (i.e.
|
||||
/// <see cref="ValidationResult.IsSpam"/>). Merely-incomplete-but-legit ads are KEPT. Then collapse
|
||||
/// near-duplicate job reposts. Valid listings are never touched, so their IDs — and indexed URLs —
|
||||
/// stay stable; only the bad pages 404 (which is the desired outcome). Returns (removed, deduped).
|
||||
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each Open
|
||||
/// listing's stored text through the CURRENT validator and ARCHIVE (Status → Archived, not delete)
|
||||
/// only the ones that are now clearly out-of-scope — domestic-helper («امور منزل»),
|
||||
/// promotional/training, or spam (i.e. <see cref="ValidationResult.IsSpam"/>). Merely-incomplete-
|
||||
/// but-legit ads are KEPT. Then collapse near-duplicate job reposts the same way. Archiving (vs
|
||||
/// hard delete) is the project convention: the row is retained for analysis and the change is
|
||||
/// reversible, the listing drops out of every public screen + the sitemap (which filter Status ==
|
||||
/// Open), and its detail page returns 410 Gone (the standard "permanently removed" signal Google
|
||||
/// uses to deindex). Valid listings are never touched, so their IDs/URLs stay stable.
|
||||
/// Returns (archived, deduped).
|
||||
/// </summary>
|
||||
public async Task<(int removed, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
|
||||
public async Task<(int archived, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
|
||||
{
|
||||
var roleNames = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
|
||||
var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
|
||||
@@ -357,30 +361,33 @@ public class IngestionService
|
||||
return _validator.Validate(t, parsed).IsSpam; // spam | promo | domestic-helper
|
||||
}
|
||||
|
||||
int removed = 0;
|
||||
int archived = 0;
|
||||
|
||||
var jobIds = (await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated)
|
||||
var jobIds = (await _db.JobOpenings.Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
|
||||
.Select(j => new { j.Id, j.Description }).ToListAsync(ct))
|
||||
.Where(j => IsOutOfScope(j.Description)).Select(j => j.Id).ToList();
|
||||
if (jobIds.Count > 0)
|
||||
removed += await _db.JobOpenings.Where(j => jobIds.Contains(j.Id)).ExecuteDeleteAsync(ct);
|
||||
archived += await _db.JobOpenings.Where(j => jobIds.Contains(j.Id))
|
||||
.ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
|
||||
|
||||
var shiftIds = (await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated)
|
||||
var shiftIds = (await _db.Shifts.Where(s => s.Status == ShiftStatus.Open && s.Source == ShiftSource.Aggregated)
|
||||
.Select(s => new { s.Id, s.Description }).ToListAsync(ct))
|
||||
.Where(s => IsOutOfScope(s.Description)).Select(s => s.Id).ToList();
|
||||
if (shiftIds.Count > 0)
|
||||
removed += await _db.Shifts.Where(s => shiftIds.Contains(s.Id)).ExecuteDeleteAsync(ct);
|
||||
archived += await _db.Shifts.Where(s => shiftIds.Contains(s.Id))
|
||||
.ExecuteUpdateAsync(u => u.SetProperty(s => s.Status, ShiftStatus.Archived), ct);
|
||||
|
||||
var deduped = await DedupeJobsAsync(ct);
|
||||
_log.LogInformation("Purge removed {R} out-of-scope aggregated listings; deduped {D} jobs.", removed, deduped);
|
||||
return (removed, deduped);
|
||||
_log.LogInformation("Purge archived {R} out-of-scope aggregated listings; deduped {D} jobs.", archived, deduped);
|
||||
return (archived, deduped);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Collapse near-duplicate aggregated JOB reposts the exact-hash dedupe missed (same ad re-crawled
|
||||
/// with slightly different surrounding text → different ContentHash). Signature = role + facility +
|
||||
/// normalized description core with digits/«… پیش» time-phrases stripped. Keeps the NEWEST of each
|
||||
/// group. Per-role fan-out of one ad is preserved (different RoleId → different signature).
|
||||
/// group and ARCHIVES the rest (Status → Archived, reversible — same rationale as the purge).
|
||||
/// Per-role fan-out of one ad is preserved (different RoleId → different signature).
|
||||
/// </summary>
|
||||
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
|
||||
{
|
||||
@@ -397,17 +404,18 @@ public class IngestionService
|
||||
return $"j:{roleId}:{facId}:{(core.Length > 120 ? core[..120] : core)}";
|
||||
}
|
||||
|
||||
var toRemove = rows
|
||||
var toArchive = rows
|
||||
.Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) })
|
||||
.Where(x => x.Key is not null)
|
||||
.GroupBy(x => x.Key)
|
||||
.SelectMany(g => g.OrderByDescending(x => x.CreatedAt).Skip(1).Select(x => x.Id))
|
||||
.ToList();
|
||||
|
||||
if (toRemove.Count == 0) return 0;
|
||||
var removed = await _db.JobOpenings.Where(j => toRemove.Contains(j.Id)).ExecuteDeleteAsync(ct);
|
||||
_log.LogInformation("Deduped {N} near-duplicate aggregated jobs.", removed);
|
||||
return removed;
|
||||
if (toArchive.Count == 0) return 0;
|
||||
var archived = await _db.JobOpenings.Where(j => toArchive.Contains(j.Id))
|
||||
.ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
|
||||
_log.LogInformation("Archived {N} near-duplicate aggregated jobs.", archived);
|
||||
return archived;
|
||||
}
|
||||
|
||||
private static string DigitsOnly(string s) => new(HtmlUtil.ToLatinDigits(s).Where(char.IsDigit).ToArray());
|
||||
|
||||
Reference in New Issue
Block a user