Make the listing purge SEO-standard: archive (not delete) + 410 Gone
Per the project's archive-not-delete convention, the in-place purge now sets out-of-scope and duplicate aggregated jobs/shifts to ShiftStatus.Archived instead of hard-deleting them:
- The row is retained for analysis, and the change is reversible.
- The listing drops out of every public screen and the sitemap (all of which filter on Status == Open).
- Its detail page now returns 410 Gone (the standard permanent-removal signal), so search engines deindex it cleanly instead of the off-topic page staying live at 200 or hard-404ing.

Deduplication of job reposts archives the older copies the same way. The coordinate backfill now also skips non-Open rows. Valid listings are untouched, so their IDs/URLs stay stable.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -67,13 +67,13 @@
|
||||
شیفت/استخدام/آمادهبهکارِ جمعآوریشدهای که مختصات ندارند، از روی محلهٔ ذکرشده در متنِ آگهی روی نقشه قرار میگیرند (محدودهٔ تقریبی). فقط مختصاتِ خالی پر میشود؛ موقعیتِ واقعیِ مراکز دستنخورده میماند.
|
||||
</p>
|
||||
|
||||
<form method="post" onsubmit="return confirm('آگهیهای جمعآوریشدهٔ شیفت/استخدام که اکنون خارج از حوزهاند (خدمات منزل/نظافت، تبلیغاتی/آموزشی، اسپم) و استخدامهای تکراری حذف میشوند. آگهیهای معتبر و شناسه/آدرسشان دستنخورده میماند. این کار بازگشتناپذیر است. ادامه؟');">
|
||||
<form method="post" onsubmit="return confirm('آگهیهای جمعآوریشدهٔ شیفت/استخدام که اکنون خارج از حوزهاند (خدمات منزل/نظافت، تبلیغاتی/آموزشی، اسپم) و استخدامهای تکراری «بایگانی» میشوند: از سایت پنهان میشوند ولی ردیفشان نگه داشته میشود (قابل بازگشت). آگهیهای معتبر و شناسه/آدرسشان دستنخورده میماند. ادامه؟');">
|
||||
<button type="submit" asp-page-handler="PurgeInvalid" class="btn btn-outline btn-block" style="margin-top:10px; color:var(--danger); border-color:var(--danger);">
|
||||
🧽 حذفِ درجای آگهیهای خارج از حوزه و تکراری (شیفت/استخدام)
|
||||
🧽 بایگانیِ درجای آگهیهای خارج از حوزه و تکراری (شیفت/استخدام)
|
||||
</button>
|
||||
</form>
|
||||
<p class="muted" style="font-size:11px; margin:6px 0 0;">
|
||||
فقط آگهیهایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده میشوند (نه صرفاً ناقص) و استخدامهای تکراری پاک میشوند. آگهیهای معتبر دستنخوردهاند، پس آدرسِ ایندکسشدهشان تغییر نمیکند؛ فقط صفحاتِ بد ۴۰۴ میشوند.
|
||||
فقط آگهیهایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده میشوند (نه صرفاً ناقص) و استخدامهای تکراری بایگانی میشوند (وضعیت «بایگانی»، نه حذف). آگهیهای معتبر دستنخوردهاند، پس آدرسِ ایندکسشدهشان تغییر نمیکند؛ صفحهٔ موارد بایگانیشده ۴۱۰ Gone میدهد تا گوگل تمیز حذفشان کند.
|
||||
</p>
|
||||
|
||||
<hr style="border:none; border-top:1px solid var(--line); margin:16px 0;" />
|
||||
|
||||
@@ -133,14 +133,15 @@ public class IndexModel : PageModel
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// In-place cleanup of existing aggregated jobs/shifts: delete only the out-of-scope ones
|
||||
/// (domestic-helper / promotional / spam) per the current validator, plus near-duplicate job
|
||||
/// reposts. Valid listings keep their IDs/URLs. No re-fetch, no AI — runs inline.
|
||||
/// In-place cleanup of existing aggregated jobs/shifts: ARCHIVE (hide, keep the row) only the
|
||||
/// out-of-scope ones (domestic-helper / promotional / spam) per the current validator, plus
|
||||
/// near-duplicate job reposts. Archived pages drop from lists + sitemap and return 410 Gone.
|
||||
/// Valid listings keep their IDs/URLs. Reversible, no re-fetch, no AI — runs inline.
|
||||
/// </summary>
|
||||
public async Task<IActionResult> OnPostPurgeInvalidAsync()
|
||||
{
|
||||
var (removed, deduped) = await _ingest.PurgeInvalidAggregatedAsync();
|
||||
IngestMessage = $"پاکسازیِ درجا: {removed} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {deduped} استخدامِ تکراری حذف شد. سایر آگهیها و شناسه/آدرسشان دستنخورده ماند.";
|
||||
var (archived, deduped) = await _ingest.PurgeInvalidAggregatedAsync();
|
||||
IngestMessage = $"بایگانیِ درجا: {archived} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {deduped} استخدامِ تکراری از سایت پنهان شد (وضعیت «بایگانی»؛ ردیف نگه داشته شد و قابل بازگشت است؛ صفحهشان ۴۱۰ Gone میدهد). آگهیهای معتبر و شناسه/آدرسشان دستنخورده ماند.";
|
||||
return RedirectToPage();
|
||||
}
|
||||
|
||||
|
||||
@@ -31,6 +31,9 @@ public class DetailsModel : PageModel
|
||||
{
|
||||
await LoadAsync(id);
|
||||
if (Job is null) return NotFound();
|
||||
// Intentionally removed (admin-archived out-of-scope/duplicate ad): 410 Gone is the standard
|
||||
// signal for permanent removal, so search engines deindex it cleanly (we keep the row for audit).
|
||||
if (Job.Status == ShiftStatus.Archived) return StatusCode(StatusCodes.Status410Gone);
|
||||
MapKey = (await _settings.GetAsync()).NeshanMapKey;
|
||||
Reported = Request.Query["reported"] == "1";
|
||||
await _interest.LogJobAsync(InterestEventType.View, id);
|
||||
|
||||
@@ -34,6 +34,9 @@ public class DetailsModel : PageModel
|
||||
{
|
||||
await LoadAsync(id);
|
||||
if (Shift is null) return NotFound();
|
||||
// Intentionally removed (admin-archived out-of-scope/duplicate ad): 410 Gone is the standard
|
||||
// signal for permanent removal, so search engines deindex it cleanly (we keep the row for audit).
|
||||
if (Shift.Status == ShiftStatus.Archived) return StatusCode(StatusCodes.Status410Gone);
|
||||
MapKey = (await _settings.GetAsync()).NeshanMapKey;
|
||||
Reported = Request.Query["reported"] == "1";
|
||||
await _interest.LogAsync(InterestEventType.View, id); // behavioral signal for recommendations
|
||||
|
||||
@@ -314,19 +314,19 @@ public class IngestionService
|
||||
int filled = 0;
|
||||
|
||||
var jobs = await _db.JobOpenings
|
||||
.Where(j => j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id)
|
||||
.Where(j => j.Status == ShiftStatus.Open && j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id)
|
||||
.ToListAsync(ct);
|
||||
foreach (var j in jobs)
|
||||
if (TehranGeo.Locate(j.Description) is { } g) { j.Lat = g.lat; j.Lng = g.lng; filled++; }
|
||||
|
||||
var shifts = await _db.Shifts
|
||||
.Where(s => s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id)
|
||||
.Where(s => s.Status == ShiftStatus.Open && s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id)
|
||||
.ToListAsync(ct);
|
||||
foreach (var s in shifts)
|
||||
if (TehranGeo.Locate(s.Description) is { } g) { s.Lat = g.lat; s.Lng = g.lng; filled++; }
|
||||
|
||||
var talent = await _db.TalentListings
|
||||
.Where(t => t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id)
|
||||
.Where(t => t.Status == ShiftStatus.Open && t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id)
|
||||
.ToListAsync(ct);
|
||||
foreach (var t in talent)
|
||||
if (TehranGeo.Locate(t.AreaNote, t.Description) is { } g) { t.Lat = g.lat; t.Lng = g.lng; filled++; }
|
||||
@@ -337,14 +337,18 @@ public class IngestionService
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each listing's
|
||||
/// stored text through the CURRENT validator and delete only the ones that are now clearly
|
||||
/// out-of-scope — domestic-helper («امور منزل»), promotional/training, or spam (i.e.
|
||||
/// <see cref="ValidationResult.IsSpam"/>). Merely-incomplete-but-legit ads are KEPT. Then collapse
|
||||
/// near-duplicate job reposts. Valid listings are never touched, so their IDs — and indexed URLs —
|
||||
/// stay stable; only the bad pages 404 (which is the desired outcome). Returns (removed, deduped).
|
||||
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each Open
|
||||
/// listing's stored text through the CURRENT validator and ARCHIVE (Status → Archived, not delete)
|
||||
/// only the ones that are now clearly out-of-scope — domestic-helper («امور منزل»),
|
||||
/// promotional/training, or spam (i.e. <see cref="ValidationResult.IsSpam"/>).
|
||||
/// Merely-incomplete-but-legit ads are KEPT. Then collapse near-duplicate job reposts the same way. Archiving (vs
|
||||
/// hard delete) is the project convention: the row is retained for analysis and the change is
|
||||
/// reversible, the listing drops out of every public screen + the sitemap (which filter Status ==
|
||||
/// Open), and its detail page returns 410 Gone (the standard "permanently removed" signal Google
|
||||
/// uses to deindex). Valid listings are never touched, so their IDs/URLs stay stable.
|
||||
/// Returns (archived, deduped).
|
||||
/// </summary>
|
||||
public async Task<(int removed, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
|
||||
public async Task<(int archived, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
|
||||
{
|
||||
var roleNames = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
|
||||
var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
|
||||
@@ -357,30 +361,33 @@ public class IngestionService
|
||||
return _validator.Validate(t, parsed).IsSpam; // spam | promo | domestic-helper
|
||||
}
|
||||
|
||||
int removed = 0;
|
||||
int archived = 0;
|
||||
|
||||
var jobIds = (await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated)
|
||||
var jobIds = (await _db.JobOpenings.Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
|
||||
.Select(j => new { j.Id, j.Description }).ToListAsync(ct))
|
||||
.Where(j => IsOutOfScope(j.Description)).Select(j => j.Id).ToList();
|
||||
if (jobIds.Count > 0)
|
||||
removed += await _db.JobOpenings.Where(j => jobIds.Contains(j.Id)).ExecuteDeleteAsync(ct);
|
||||
archived += await _db.JobOpenings.Where(j => jobIds.Contains(j.Id))
|
||||
.ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
|
||||
|
||||
var shiftIds = (await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated)
|
||||
var shiftIds = (await _db.Shifts.Where(s => s.Status == ShiftStatus.Open && s.Source == ShiftSource.Aggregated)
|
||||
.Select(s => new { s.Id, s.Description }).ToListAsync(ct))
|
||||
.Where(s => IsOutOfScope(s.Description)).Select(s => s.Id).ToList();
|
||||
if (shiftIds.Count > 0)
|
||||
removed += await _db.Shifts.Where(s => shiftIds.Contains(s.Id)).ExecuteDeleteAsync(ct);
|
||||
archived += await _db.Shifts.Where(s => shiftIds.Contains(s.Id))
|
||||
.ExecuteUpdateAsync(u => u.SetProperty(s => s.Status, ShiftStatus.Archived), ct);
|
||||
|
||||
var deduped = await DedupeJobsAsync(ct);
|
||||
_log.LogInformation("Purge removed {R} out-of-scope aggregated listings; deduped {D} jobs.", removed, deduped);
|
||||
return (removed, deduped);
|
||||
_log.LogInformation("Purge archived {R} out-of-scope aggregated listings; deduped {D} jobs.", archived, deduped);
|
||||
return (archived, deduped);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Collapse near-duplicate aggregated JOB reposts the exact-hash dedupe missed (same ad re-crawled
|
||||
/// with slightly different surrounding text → different ContentHash). Signature = role + facility +
|
||||
/// normalized description core with digits/«… پیش» time-phrases stripped. Keeps the NEWEST of each
|
||||
/// group. Per-role fan-out of one ad is preserved (different RoleId → different signature).
|
||||
/// group and ARCHIVES the rest (Status → Archived, reversible — same rationale as the purge).
|
||||
/// Per-role fan-out of one ad is preserved (different RoleId → different signature).
|
||||
/// </summary>
|
||||
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
|
||||
{
|
||||
@@ -397,17 +404,18 @@ public class IngestionService
|
||||
return $"j:{roleId}:{facId}:{(core.Length > 120 ? core[..120] : core)}";
|
||||
}
|
||||
|
||||
var toRemove = rows
|
||||
var toArchive = rows
|
||||
.Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) })
|
||||
.Where(x => x.Key is not null)
|
||||
.GroupBy(x => x.Key)
|
||||
.SelectMany(g => g.OrderByDescending(x => x.CreatedAt).Skip(1).Select(x => x.Id))
|
||||
.ToList();
|
||||
|
||||
if (toRemove.Count == 0) return 0;
|
||||
var removed = await _db.JobOpenings.Where(j => toRemove.Contains(j.Id)).ExecuteDeleteAsync(ct);
|
||||
_log.LogInformation("Deduped {N} near-duplicate aggregated jobs.", removed);
|
||||
return removed;
|
||||
if (toArchive.Count == 0) return 0;
|
||||
var archived = await _db.JobOpenings.Where(j => toArchive.Contains(j.Id))
|
||||
.ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
|
||||
_log.LogInformation("Archived {N} near-duplicate aggregated jobs.", archived);
|
||||
return archived;
|
||||
}
|
||||
|
||||
private static string DigitsOnly(string s) => new(HtmlUtil.ToLatinDigits(s).Where(char.IsDigit).ToArray());
|
||||
|
||||
Reference in New Issue
Block a user