From fb7bfad9ceb53d2fbc94916a9f30c05ff1eb2275 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sat, 20 Jun 2026 16:08:20 +0330 Subject: [PATCH] Reprocess: SEO-safe applicants-only default (don't churn indexed shift/job URLs) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reprocess deletes+rebuilds aggregated listings, which changes their IDs. Shift/Job detail pages are indexed and in the sitemap, so churning them would 404 ranked URLs. «آماده به کار» pages are NoIndex + Disallow, so rebuilding them has zero SEO impact — and that's where all the duplicate/sprawl problems were. ReprocessAsync(talentOnly: true) now only deletes/rebuilds TalentListings and skips non-talent raws (leaving shift/job listings + their RawListing links untouched). Admin button relabelled «پردازش مجددِ آماده به کارها (امن برای SEO)». Shifts/jobs self-clean via normal ingestion turnover. Co-Authored-By: Claude Opus 4.8 --- src/JobsMedical.Web/Pages/Admin/Index.cshtml | 6 ++--- .../Pages/Admin/Index.cshtml.cs | 4 ++- .../Services/Scraping/IngestionService.cs | 25 ++++++++++++++----- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml b/src/JobsMedical.Web/Pages/Admin/Index.cshtml index 9a4123d..f79d58e 100644 --- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml +++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml @@ -49,13 +49,13 @@ کش حذف تکراری و آگهی‌های جمع‌آوری‌شده پاک و از نو با AI پردازش می‌شوند. (آگهی‌های مراکز حذف نمی‌شوند.)

-
+

- توصیه‌شده برای پاک‌سازیِ داده‌های فعلی: متنِ خام نگه داشته می‌شود و فقط آگهی‌ها با منطقِ جدید (یک‌نفر=یک‌آگهی، نقش پایه، گروه ثابت، تگ تمیز) بازساخته می‌شوند. + توصیه‌شده برای پاک‌سازیِ آماده‌به‌کارها: متنِ خام نگه داشته می‌شود و فقط با منطقِ جدید (یک‌نفر=یک‌آگهی، نقش پایه، گروه ثابت، تگ تمیز، موقعیت تقریبی) بازساخته می‌شوند. صفحاتِ «آماده به کار» ایندکس نمی‌شوند، پس آدرسِ ایندکس‌شده‌ای تغییر نمی‌کند؛ شیفت/استخدام به‌مرور با اینجستِ تازه پاک می‌شوند.


diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs index bff4ae9..e4d49c2 100644 --- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs +++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs @@ -111,7 +111,9 @@ public class IndexModel : PageModel using var scope = _scopes.CreateScope(); var svc = scope.ServiceProvider.GetRequiredService(); var log = scope.ServiceProvider.GetRequiredService>(); - try { await svc.ReprocessAsync(); } + // talentOnly: «آماده به کار» is NoIndex/Disallow → rebuilding it doesn't churn any indexed + // URL. Shift/Job detail pages ARE indexed, so they're left to self-clean via turnover. + try { await svc.ReprocessAsync(talentOnly: true); } catch (Exception ex) { log.LogError(ex, "Background reprocess failed"); } }); IngestMessage = "پردازش مجدد آیتم‌های ذخیره‌شده در پس‌زمینه آغاز شد. نتیجه پس از اتمام در «تاریخچهٔ اجرا» نمایش داده می‌شود (بسته به تعداد آیتم‌ها و سرعت هوش مصنوعی، چند دقیقه طول می‌کشد)."; diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs index 490d800..8984b3f 100644 --- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs +++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs @@ -176,7 +176,11 @@ public class IngestionService /// Deletes the old aggregated posts, then republishes from the stored raw text. Long-running /// (one AI call per item) — call it on a background scope, not inside a request. /// - public async Task ReprocessAsync(CancellationToken ct = default) + /// SEO-safe default: only «آماده به کار» (which is NoIndex/Disallow) is + /// deleted & rebuilt, so no INDEXED url changes. Shift/Job detail pages are indexed + in the + /// sitemap, so churning their IDs would 404 ranked pages — instead they self-clean via turnover. + /// Pass false only when you accept that SEO hit. 
+ public async Task ReprocessAsync(bool talentOnly = true, CancellationToken ct = default) { var settings = await _settings.GetAsync(); var roles = await _db.Roles.ToListAsync(ct); @@ -189,19 +193,28 @@ public class IngestionService // Drop previously-published aggregated content; it's regenerated below from the raw text. // DB cascade clears their ContactMethods/Applications/InterestEvents; RawListing back-refs SetNull. - await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct); - await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct); await _db.TalentListings.Where(t => t.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct); + if (!talentOnly) + { + await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct); + await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct); + } int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0; var raws = await _db.RawListings.OrderBy(r => r.Id).ToListAsync(ct); foreach (var raw in raws) { ct.ThrowIfCancellationRequested(); - fetched++; - raw.LinkedShiftId = null; raw.LinkedTalentId = null; // old links were just deleted - var parsed = _parser.Parse(raw.RawText, roleNames, cityNames, districtNames); + + // SEO-safe scope: in talent-only mode, leave indexed shift/job listings (and their + // RawListing links/status) completely untouched — only applicants are rebuilt. + if (talentOnly && parsed.Kind != ListingKind.Talent) continue; + + fetched++; + raw.LinkedTalentId = null; // talent rows were just deleted + if (!talentOnly) raw.LinkedShiftId = null; + var val = _validator.Validate(raw.RawText, parsed); // Stale-applicant filter — age from the Persian "time ago" phrase in the text (Divar).