8be275596b
Per the project archive-not-delete convention, the in-place purge now sets out-of-scope and duplicate aggregated jobs/shifts to ShiftStatus.Archived instead of hard-deleting: - The row is retained for analysis and the change is reversible. - The listing drops out of every public screen and the sitemap (which filter Status == Open). - Its detail page now returns 410 Gone (the standard permanent-removal signal) so search engines deindex it cleanly, instead of leaving the off-topic page live at 200 or hard-404ing. Dedupe of job reposts archives the older copies the same way. Coordinate backfill now also skips non-Open rows. Valid listings are untouched, so IDs/URLs stay stable. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
162 lines
8.6 KiB
C#
162 lines
8.6 KiB
C#
using JobsMedical.Web.Data;
|
|
using JobsMedical.Web.Models;
|
|
using JobsMedical.Web.Services.Scraping;
|
|
using Microsoft.AspNetCore.Authorization;
|
|
using Microsoft.AspNetCore.Mvc;
|
|
using Microsoft.AspNetCore.Mvc.RazorPages;
|
|
using Microsoft.EntityFrameworkCore;
|
|
|
|
namespace JobsMedical.Web.Pages.Admin;
|
|
|
|
[Authorize(Roles = "Admin")]
|
|
public class IndexModel : PageModel
|
|
{
|
|
// EF Core context used by the handlers for queue queries, inserts and the purge deletes.
private readonly AppDbContext _db;

// Ingestion pipeline: fetch runs, reprocessing, coordinate backfill and in-place archiving.
private readonly IngestionService _ingest;

// Factory for DI scopes created outside the current request (background reprocess).
private readonly IServiceScopeFactory _scopes;

// Logger categorised under this page model.
private readonly ILogger<IndexModel> _log;

/// <summary>Stores the injected dependencies; does no other work.</summary>
public IndexModel(AppDbContext db, IngestionService ingest, IServiceScopeFactory scopes, ILogger<IndexModel> log)
    => (_db, _ingest, _scopes, _log) = (db, ingest, scopes, log);
|
|
|
|
/// <summary>Raw listings with status <c>New</c>, ordered by confidence (descending) then fetch time (descending).</summary>
public List<RawListing> Queue { get; private set; } = new List<RawListing>();

/// <summary>Raw listings with status <c>Flagged</c>, most recently fetched first.</summary>
public List<RawListing> Flagged { get; private set; } = new List<RawListing>();

/// <summary>Source names as reported by the ingestion service.</summary>
public IReadOnlyList<string> SourceNames { get; private set; } = new List<string>();

/// <summary>Number of shifts whose source is not <c>Direct</c>.</summary>
public int PublishedShifts { get; private set; }

/// <summary>Number of job openings (all rows, regardless of source).</summary>
public int PublishedJobs { get; private set; }

/// <summary>The 15 most recent ingestion runs, newest first.</summary>
public List<IngestionRun> Runs { get; private set; } = new List<IngestionRun>();

/// <summary>Posted source/channel label for a manually added ad; may be blank.</summary>
[BindProperty]
public string? SourceChannel { get; set; }

/// <summary>Posted raw ad text for a manually added ad.</summary>
[BindProperty]
public string? RawText { get; set; }

/// <summary>Status message set by the POST handlers before they redirect; carried via TempData.</summary>
[TempData]
public string? IngestMessage { get; set; }
|
|
|
|
/// <summary>GET handler: populates the dashboard lists and counters.</summary>
public async Task OnGetAsync()
{
    await LoadAsync();
}
|
|
|
|
/// <summary>
/// Manual add: when <see cref="RawText"/> is non-blank, stores it (trimmed) as a raw listing with
/// status <c>New</c>. A blank <see cref="SourceChannel"/> falls back to the default manual-entry
/// label. Always redirects back to this page.
/// </summary>
/// <returns>A redirect to this page.</returns>
public async Task<IActionResult> OnPostAddAsync()
{
    // Empty submission: nothing to store, just reload.
    if (string.IsNullOrWhiteSpace(RawText))
    {
        return RedirectToPage();
    }

    var channel = string.IsNullOrWhiteSpace(SourceChannel) ? "ورود دستی" : SourceChannel.Trim();
    var listing = new RawListing
    {
        SourceChannel = channel,
        RawText = RawText.Trim(),
        Status = RawListingStatus.New,
    };

    _db.RawListings.Add(listing);
    await _db.SaveChangesAsync();

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// Fast triage: marks a queued/flagged raw listing as discarded without opening the review page.
/// An id that matches no row is ignored. Always redirects back to this page.
/// </summary>
/// <param name="id">Id of the raw listing to discard.</param>
/// <returns>A redirect to this page.</returns>
public async Task<IActionResult> OnPostQuickDiscardAsync(int id)
{
    var listing = await _db.RawListings.FirstOrDefaultAsync(r => r.Id == id);
    if (listing is null)
    {
        return RedirectToPage();
    }

    listing.Status = RawListingStatus.Discarded;
    await _db.SaveChangesAsync();

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// Runs one ingestion pass inline and reports the queued / flagged / spam / duplicate totals
/// through <see cref="IngestMessage"/> before redirecting back to this page.
/// </summary>
/// <returns>A redirect to this page.</returns>
public async Task<IActionResult> OnPostRunIngestionAsync()
{
    var stats = await _ingest.RunAsync();

    var message = $"جمعآوری انجام شد — {stats.TotalQueued} در صف، {stats.TotalFlagged} پرچمخورده، " +
                  $"{stats.TotalSpam} اسپم، {stats.TotalDuplicates} تکراری.";
    IngestMessage = message;

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// DESTRUCTIVE rebuild. Two kinds of rows are hard-deleted inside one transaction:
///   1. The dedupe cache: every RawListing, including manually added ones. These are the
///      crawl/staging rows whose ContentHash blocks re-ingesting the same ad; wiping them lets
///      everything be re-fetched and re-judged by the AI.
///   2. Aggregated listings only: Shifts / JobOpenings / TalentListings with Source == Aggregated,
///      i.e. produced by ingestion. Listings with Source == Direct are left untouched.
/// After the commit, a full ingestion run re-fetches and re-processes everything, and the counts
/// are reported through <see cref="IngestMessage"/>.
/// </summary>
/// <returns>A redirect to this page.</returns>
public async Task<IActionResult> OnPostPurgeAndReingestAsync()
{
    var (rawCount, shifts, jobs, talent) = await DeleteCacheAndAggregatedAsync();

    var s = await _ingest.RunAsync(); // fresh fetch → AI audit → publish/queue

    IngestMessage = $"پاکسازی شد (حذف: {rawCount} آیتم کش، {shifts} شیفت، {jobs} استخدام، {talent} آمادهبهکارِ جمعآوریشده). " +
                    $"جمعآوری مجدد: {s.TotalPublished} منتشر، {s.TotalQueued} در صف، {s.TotalFlagged} پرچم، {s.TotalSpam} اسپم، {s.TotalDuplicates} تکراری.";
    return RedirectToPage();
}

/// <summary>
/// Deletes all raw listings, then the aggregated shifts, job openings and talent listings, in a
/// single transaction that is committed and disposed before this method returns.
/// RawListings go first so their LinkedShift/LinkedTalent FKs (SetNull) don't dangle; the DB
/// cascade is expected to clear ContactMethods / Applications / InterestEvents with the posts
/// (FK configuration is not visible in this file).
/// </summary>
/// <returns>The number of rows deleted from each table.</returns>
private async Task<(int RawCount, int Shifts, int Jobs, int Talent)> DeleteCacheAndAggregatedAsync()
{
    await using var tx = await _db.Database.BeginTransactionAsync();

    var rawCount = await _db.RawListings.ExecuteDeleteAsync(); // clear dedupe cache
    var shifts = await _db.Shifts
        .Where(x => x.Source == ShiftSource.Aggregated)
        .ExecuteDeleteAsync();
    var jobs = await _db.JobOpenings
        .Where(x => x.Source == ShiftSource.Aggregated)
        .ExecuteDeleteAsync();
    var talent = await _db.TalentListings
        .Where(x => x.Source == ShiftSource.Aggregated)
        .ExecuteDeleteAsync();

    await tx.CommitAsync();
    return (rawCount, shifts, jobs, talent);
}
|
|
|
|
/// <summary>
/// Cleans up EXISTING aggregated content by re-running the current pipeline over the stored raw
/// text. There is no re-fetch, so nothing is lost to sources that only expose recent posts.
/// Long-running (one AI call per item), so the work is started on a background task with its own
/// DI scope and the handler returns immediately; per the status message, the result shows up in
/// the run-history log when it finishes.
/// </summary>
/// <remarks>
/// The background task is fire-and-forget, so every failure (creating the scope, resolving the
/// service, or the reprocess itself) is caught and logged rather than left as an unobserved
/// task exception.
/// NOTE(review): the task is not tracked by the host, so an app shutdown presumably aborts it
/// mid-run. Consider a hosted background queue if that matters.
/// </remarks>
/// <returns>A redirect to this page.</returns>
public IActionResult OnPostReprocessStored()
{
    // Copy the dependencies into locals so the closure does not capture this page model.
    // The logger must not come from the new scope: it has to be usable even when creating
    // that scope fails.
    var scopes = _scopes;
    var log = _log;

    _ = Task.Run(async () =>
    {
        try
        {
            // Async scope: scoped services are disposed via DisposeAsync when available.
            await using var scope = scopes.CreateAsyncScope();
            var svc = scope.ServiceProvider.GetRequiredService<IngestionService>();

            // talentOnly: the "ready to work" (talent) section is NoIndex/Disallow, so rebuilding
            // it doesn't churn any indexed URL. Shift/Job detail pages ARE indexed, so they're
            // left to self-clean via turnover.
            await svc.ReprocessAsync(talentOnly: true);
        }
        catch (Exception ex)
        {
            log.LogError(ex, "Background reprocess failed");
        }
    });

    IngestMessage = "پردازش مجدد آیتمهای ذخیرهشده در پسزمینه آغاز شد. نتیجه پس از اتمام در «تاریخچهٔ اجرا» نمایش داده میشود (بسته به تعداد آیتمها و سرعت هوش مصنوعی، چند دقیقه طول میکشد).";
    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// Fills in missing map coordinates on existing aggregated Tehran listings from their stored ad
/// text (TehranGeo). Done in place: no AI calls, no re-fetch and, crucially, no delete/recreate,
/// so indexed shift/job URLs keep their IDs. Fast (pure DB + string matching), so it runs inline.
/// </summary>
/// <returns>A redirect to this page; the number of updated listings is reported via <see cref="IngestMessage"/>.</returns>
public async Task<IActionResult> OnPostBackfillCoordsAsync()
{
    var filled = await _ingest.BackfillCoordsAsync();

    var message = $"مختصات تقریبی برای {filled} آگهی جمعآوریشده از روی متن آگهی تکمیل شد (بدون تغییر شناسه یا آدرس صفحه).";
    IngestMessage = message;

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// In-place cleanup of existing aggregated jobs/shifts. ARCHIVES (hides, keeps the row) only the
/// out-of-scope ones (domestic-helper / promotional / spam) per the current validator, plus
/// near-duplicate job reposts. Archived pages drop from lists and the sitemap and return 410 Gone.
/// Valid listings keep their IDs/URLs. Reversible, no re-fetch, no AI; runs inline.
/// </summary>
/// <returns>A redirect to this page; both counts are reported via <see cref="IngestMessage"/>.</returns>
public async Task<IActionResult> OnPostPurgeInvalidAsync()
{
    var outcome = await _ingest.PurgeInvalidAggregatedAsync();
    var (archived, deduped) = outcome;

    IngestMessage = $"بایگانیِ درجا: {archived} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {deduped} استخدامِ تکراری از سایت پنهان شد (وضعیت «بایگانی»؛ ردیف نگه داشته شد و قابل بازگشت است؛ صفحهشان ۴۱۰ Gone میدهد). آگهیهای معتبر و شناسه/آدرسشان دستنخورده ماند.";

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// Loads everything the dashboard shows: the review queue, the flagged list, the source names,
/// the two counters and the latest ingestion runs. Queries run one after another on the same
/// context.
/// </summary>
private async Task LoadAsync()
{
    // Review queue: status New, most confident first, newest fetch as tie-breaker.
    var pending = _db.RawListings.Where(r => r.Status == RawListingStatus.New);
    Queue = await pending
        .OrderByDescending(r => r.Confidence)
        .ThenByDescending(r => r.FetchedAt)
        .ToListAsync();

    // Flagged items, newest fetch first.
    var flagged = _db.RawListings.Where(r => r.Status == RawListingStatus.Flagged);
    Flagged = await flagged
        .OrderByDescending(r => r.FetchedAt)
        .ToListAsync();

    SourceNames = _ingest.SourceNames;

    // NOTE(review): the shift counter excludes Direct rows while the job counter counts every
    // row, and neither filters on listing status, so archived rows are presumably included.
    // Confirm this asymmetry is intended.
    PublishedShifts = await _db.Shifts
        .Where(s => s.Source != ShiftSource.Direct)
        .CountAsync();
    PublishedJobs = await _db.JobOpenings.CountAsync();

    // Run history: the 15 most recent runs.
    var recentRuns = _db.IngestionRuns
        .OrderByDescending(r => r.RunAt)
        .Take(15);
    Runs = await recentRuns.ToListAsync();
}
|
|
}
|