Files
hamkadr/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs
T
soroush.asadi 88eca92333
CI/CD / CI · dotnet build (push) Successful in 1m51s
CI/CD / Deploy · hamkadr (push) Successful in 2m17s
Facility data hygiene: merge duplicates, drop junk-named facilities
Cleans up the crawl-generated facility table that surfaced garbage on /Facilities
(«بیمارستان هستم», «... از مدجابز», bare «کلینیک», «سازمان برنامه جنوبی» x3):

- FacilityMatcher.IsJunkName: shared detector for non-names — bare type words, cores
  made only of filler/verb tokens, and leaked crawl-source/placeholder text. Added
  داروخانه/آسایشگاه to the generic type words so bare ones are caught and dedupe better.
- HeuristicListingParser.ExtractFacilityName now rejects junk candidates (and emoji), so
  new ingests fall back to the shared placeholder instead of forging a fake facility.
- IngestionService.MergeAndCleanFacilitiesAsync (+ admin button): folds junk facilities
  into the placeholder and merges Persian-fuzzy duplicates into one keeper, repointing
  their shifts/jobs first. Hard guard: only purely crawl-generated, unmanaged facilities
  are removed — employer-owned and verified facilities are never touched.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-21 05:40:29 +03:30

174 lines
9.5 KiB
C#

using JobsMedical.Web.Data;
using JobsMedical.Web.Models;
using JobsMedical.Web.Services.Scraping;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.Mvc.RazorPages;
using Microsoft.EntityFrameworkCore;
namespace JobsMedical.Web.Pages.Admin;
/// <summary>
/// Page model for the admin ingestion dashboard: shows the raw-listing review queue and recent
/// ingestion runs, accepts manually pasted ads, and exposes the maintenance actions (run ingestion,
/// purge + re-ingest, reprocess, coordinate backfill, archive invalid listings, facility cleanup).
/// Every POST handler redirects back to this page; result text travels via <see cref="IngestMessage"/>.
/// </summary>
[Authorize(Roles = "Admin")]
public class IndexModel : PageModel
{
    private readonly AppDbContext _db;
    private readonly IngestionService _ingest;
    private readonly IServiceScopeFactory _scopes;
    private readonly ILogger<IndexModel> _log;

    public IndexModel(AppDbContext db, IngestionService ingest, IServiceScopeFactory scopes, ILogger<IndexModel> log)
    {
        _db = db;
        _ingest = ingest;
        _scopes = scopes;
        _log = log;
    }

    /// <summary>Raw listings with status <c>New</c>, highest confidence first, then newest fetched.</summary>
    public List<RawListing> Queue { get; private set; } = new();

    /// <summary>Raw listings with status <c>Flagged</c>, newest fetched first.</summary>
    public List<RawListing> Flagged { get; private set; } = new();

    /// <summary>Source names as reported by the ingestion service.</summary>
    public IReadOnlyList<string> SourceNames { get; private set; } = new List<string>();

    /// <summary>Number of shifts whose source is anything other than <c>Direct</c>.</summary>
    public int PublishedShifts { get; private set; }

    /// <summary>Number of job openings (all sources — see the note in <see cref="LoadAsync"/>).</summary>
    public int PublishedJobs { get; private set; }

    /// <summary>The 15 most recent ingestion runs, newest first.</summary>
    public List<IngestionRun> Runs { get; private set; } = new();

    /// <summary>Optional source label for a manually added ad; a default label is used when blank.</summary>
    [BindProperty] public string? SourceChannel { get; set; }

    /// <summary>Raw ad text for a manually added listing.</summary>
    [BindProperty] public string? RawText { get; set; }

    /// <summary>One-shot status message carried across the post-redirect-get round trip.</summary>
    [TempData] public string? IngestMessage { get; set; }

    public async Task OnGetAsync() => await LoadAsync();

    /// <summary>
    /// Adds a manually pasted ad to the raw-listing queue with status <c>New</c>. A blank
    /// <see cref="RawText"/> is ignored; either way the handler redirects back to the page.
    /// </summary>
    public async Task<IActionResult> OnPostAddAsync()
    {
        if (string.IsNullOrWhiteSpace(RawText))
        {
            return RedirectToPage();
        }

        _db.RawListings.Add(new RawListing
        {
            SourceChannel = string.IsNullOrWhiteSpace(SourceChannel) ? "ورود دستی" : SourceChannel.Trim(),
            RawText = RawText.Trim(),
            Status = RawListingStatus.New,
        });
        await _db.SaveChangesAsync();

        return RedirectToPage();
    }

    /// <summary>Fast triage — reject (discard) a queued/flagged item without opening the review page.</summary>
    /// <param name="id">Id of the raw listing to discard; an unknown id is silently ignored.</param>
    public async Task<IActionResult> OnPostQuickDiscardAsync(int id)
    {
        var raw = await _db.RawListings.FirstOrDefaultAsync(r => r.Id == id);
        if (raw is not null)
        {
            raw.Status = RawListingStatus.Discarded;
            await _db.SaveChangesAsync();
        }

        return RedirectToPage();
    }

    /// <summary>Runs one ingestion pass inline and reports the queued/flagged/spam/duplicate totals.</summary>
    public async Task<IActionResult> OnPostRunIngestionAsync()
    {
        var stats = await _ingest.RunAsync();
        IngestMessage = $"جمع‌آوری انجام شد — {stats.TotalQueued} در صف، {stats.TotalFlagged} پرچم‌خورده، " +
                        $"{stats.TotalSpam} اسپم، {stats.TotalDuplicates} تکراری.";
        return RedirectToPage();
    }

    /// <summary>
    /// DESTRUCTIVE rebuild, in two distinct deletes:
    ///   1. The DEDUPE CACHE — ALL RawListings, including any added via «افزودن دستی». These are not
    ///      published content; they're the crawl/staging rows whose ContentHash blocks re-ingesting
    ///      the same ad. Wiping them lets everything be re-fetched and re-judged by the AI.
    ///   2. AGGREGATED listings only — Shifts/JobOpenings/TalentListings with Source==Aggregated, i.e.
    ///      produced by ingestion. Employer/admin-posted listings (Source==Direct) are left untouched.
    /// Then re-fetch everything and re-run it through the (now AI-enabled) pipeline.
    /// RawListings are deleted first so their LinkedShift/LinkedTalent FKs (SetNull) don't dangle;
    /// DB cascade clears ContactMethods / Applications / InterestEvents when the posts are deleted.
    /// </summary>
    public async Task<IActionResult> OnPostPurgeAndReingestAsync()
    {
        int rawCount, shifts, jobs, talent;

        // All four bulk deletes commit together or not at all.
        // NOTE(review): if the DbContext is configured with a retrying execution strategy, a
        // user-initiated transaction like this must run inside CreateExecutionStrategy() — confirm.
        await using (var tx = await _db.Database.BeginTransactionAsync())
        {
            rawCount = await _db.RawListings.ExecuteDeleteAsync();   // clear dedupe cache
            shifts = await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated).ExecuteDeleteAsync();
            jobs = await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated).ExecuteDeleteAsync();
            talent = await _db.TalentListings.Where(t => t.Source == ShiftSource.Aggregated).ExecuteDeleteAsync();
            await tx.CommitAsync();
        }

        // The re-ingest runs after the commit, so a failure here leaves the purge in effect.
        var stats = await _ingest.RunAsync();                        // fresh fetch → AI audit → publish/queue

        IngestMessage = $"پاک‌سازی شد (حذف: {rawCount} آیتم کش، {shifts} شیفت، {jobs} استخدام، {talent} آماده‌به‌کارِ جمع‌آوری‌شده). " +
                        $"جمع‌آوری مجدد: {stats.TotalPublished} منتشر، {stats.TotalQueued} در صف، {stats.TotalFlagged} پرچم، {stats.TotalSpam} اسپم، {stats.TotalDuplicates} تکراری.";
        return RedirectToPage();
    }

    /// <summary>
    /// Clean up EXISTING aggregated content by re-running the current pipeline over the stored raw
    /// text — no re-fetch, so nothing is lost to sources only exposing recent posts. Long-running
    /// (one AI call per item), so it runs on a background scope and returns immediately; the result
    /// shows up as a new row in the «تاریخچهٔ اجرا» log when it finishes.
    /// </summary>
    public IActionResult OnPostReprocessStored()
    {
        // Copy what the background work needs into locals so the closure does not hold this
        // request-bound page model (or its DbContext) alive after the response is sent.
        var scopes = _scopes;
        var log = _log;

        _ = Task.Run(async () =>
        {
            // Everything — including scope creation and service resolution — sits inside the try:
            // the task is fire-and-forget, so anything thrown outside it would go unobserved.
            try
            {
                // Async scope so scoped services are disposed asynchronously when the work finishes.
                await using var scope = scopes.CreateAsyncScope();
                var svc = scope.ServiceProvider.GetRequiredService<IngestionService>();

                // talentOnly: «آماده به کار» is NoIndex/Disallow → rebuilding it doesn't churn any indexed
                // URL. Shift/Job detail pages ARE indexed, so they're left to self-clean via turnover.
                await svc.ReprocessAsync(talentOnly: true);
            }
            catch (Exception ex)
            {
                log.LogError(ex, "Background reprocess failed");
            }
        });

        IngestMessage = "پردازش مجدد آیتم‌های ذخیره‌شده در پس‌زمینه آغاز شد. نتیجه پس از اتمام در «تاریخچهٔ اجرا» نمایش داده می‌شود (بسته به تعداد آیتم‌ها و سرعت هوش مصنوعی، چند دقیقه طول می‌کشد).";
        return RedirectToPage();
    }

    /// <summary>
    /// Fill missing map coordinates on existing aggregated Tehran listings from their stored ad text
    /// (TehranGeo). In place — no AI calls, no re-fetch, and crucially no delete/recreate, so indexed
    /// shift/job URLs keep their IDs. Fast (pure DB + string matching), so it runs inline.
    /// </summary>
    public async Task<IActionResult> OnPostBackfillCoordsAsync()
    {
        var n = await _ingest.BackfillCoordsAsync();
        IngestMessage = $"مختصات تقریبی برای {n} آگهی جمع‌آوری‌شده از روی متن آگهی تکمیل شد (بدون تغییر شناسه یا آدرس صفحه).";
        return RedirectToPage();
    }

    /// <summary>
    /// In-place cleanup of existing aggregated jobs/shifts: ARCHIVE (hide, keep the row) only the
    /// out-of-scope ones (domestic-helper / promotional / spam) per the current validator, plus
    /// near-duplicate job reposts. Archived pages drop from lists + sitemap and return 410 Gone.
    /// Valid listings keep their IDs/URLs. Reversible, no re-fetch, no AI — runs inline.
    /// </summary>
    public async Task<IActionResult> OnPostPurgeInvalidAsync()
    {
        var (archived, deduped) = await _ingest.PurgeInvalidAggregatedAsync();
        IngestMessage = $"بایگانیِ درجا: {archived} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {deduped} استخدامِ تکراری از سایت پنهان شد (وضعیت «بایگانی»؛ ردیف نگه داشته شد و قابل بازگشت است؛ صفحه‌شان ۴۱۰ Gone می‌دهد). آگهی‌های معتبر و شناسه/آدرسشان دست‌نخورده ماند.";
        return RedirectToPage();
    }

    /// <summary>
    /// Clean up the crawl-generated facility table: merge Persian-fuzzy duplicate facilities and fold
    /// junk-named ones («بیمارستان هستم»، «... از مدجابز»، bare «کلینیک») into the shared placeholder,
    /// repointing their listings first. Employer-owned / verified facilities are never touched.
    /// </summary>
    public async Task<IActionResult> OnPostCleanFacilitiesAsync()
    {
        var (merged, cleaned) = await _ingest.MergeAndCleanFacilitiesAsync();
        IngestMessage = $"پاک‌سازی مراکز: {merged} مرکزِ تکراری ادغام و {cleaned} مرکزِ بی‌نام/نامعتبر حذف شد (آگهی‌هایشان به مرکزِ معتبر یا «نامشخص» منتقل شد). مراکز ثبت‌شده توسط کارفرما/تأییدشده دست‌نخورده ماند.";
        return RedirectToPage();
    }

    /// <summary>
    /// Loads everything the dashboard renders. The entity lists are only displayed, never modified,
    /// so they are fetched without change tracking.
    /// </summary>
    private async Task LoadAsync()
    {
        Queue = await _db.RawListings
            .AsNoTracking()
            .Where(r => r.Status == RawListingStatus.New)
            .OrderByDescending(r => r.Confidence)
            .ThenByDescending(r => r.FetchedAt)
            .ToListAsync();

        Flagged = await _db.RawListings
            .AsNoTracking()
            .Where(r => r.Status == RawListingStatus.Flagged)
            .OrderByDescending(r => r.FetchedAt)
            .ToListAsync();

        SourceNames = _ingest.SourceNames;

        // NOTE(review): shifts exclude Direct-sourced rows while jobs count every row, even though
        // JobOpenings also carries a Source column — confirm whether this asymmetry is intended.
        PublishedShifts = await _db.Shifts.CountAsync(s => s.Source != ShiftSource.Direct);
        PublishedJobs = await _db.JobOpenings.CountAsync();

        Runs = await _db.IngestionRuns
            .AsNoTracking()
            .OrderByDescending(r => r.RunAt)
            .Take(15)
            .ToListAsync();
    }
}