Files
hamkadr/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
T
soroush.asadi b223d3af2d
CI/CD / CI · dotnet build (push) Successful in 2m46s
CI/CD / Deploy · hamkadr (push) Successful in 2m5s
Collapse the sprawling role taxonomy (dedupe/compound/typo merge)
The dynamic taxonomy minted ~150 roles incl. exact triplicates («پرستار کودک» x3), multi-role
compounds («پرستار و بهیار»، «ماما / پرستار»، «پزشک و پرستار و بهیار»), and typos («بیهیار»، «بیار»).

Creation hardening: ResolveOrCreateRole now collapses a compound to its FIRST base role when that
segment is a known role (so «پرستار و بهیار»→«پرستار», but specialty names like «قلب و عروق»/«پوست
و مو» are left whole), and new aliases fold typos/synonyms (بیهیار/بیار→بهیار، فیزیوتراپ→فیزیوتراپیست،
نسخه پیچ→تکنسین داروخانه، پرستار بچه/اطفال→پرستار کودک).

Cleanup: MergeDuplicateRolesAsync (+ admin button) maps every role to a canonical form and merges
same-canonical roles into one keeper, repointing all shifts/jobs/talent/preferences/alerts/profiles
first (mirrors the manual /Admin/Roles merge). Combined with the no-fan-out change this should cut
the dropdown to a clean base set.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-21 21:35:43 +03:30

1066 lines
63 KiB
C#
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Security.Cryptography;
using System.Text;
using System.Text.RegularExpressions;
using JobsMedical.Web.Data;
using JobsMedical.Web.Models;
using Microsoft.EntityFrameworkCore;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>Counters for what happened to the items of one listing source during a run.</summary>
/// <param name="Source">Name of the source the counters belong to.</param>
/// <param name="Fetched">Items returned by the source.</param>
/// <param name="Queued">Items stored for manual review.</param>
/// <param name="Published">Items auto-published.</param>
/// <param name="Flagged">Items flagged for attention.</param>
/// <param name="Spam">Items discarded (spam, rejected, or stale).</param>
/// <param name="Duplicates">Items skipped or removed as duplicates.</param>
public record SourceResult(
    string Source,
    int Fetched,
    int Queued,
    int Published,
    int Flagged,
    int Spam,
    int Duplicates);
/// <summary>Result of one ingestion run: the per-source rows plus totals summed over them.</summary>
/// <param name="Sources">One entry per source that produced a result.</param>
public record IngestionSummary(List<SourceResult> Sources)
{
    /// <summary>Sum of <see cref="SourceResult.Fetched"/> over all sources.</summary>
    public int TotalFetched => Sources.Select(row => row.Fetched).Sum();

    /// <summary>Sum of <see cref="SourceResult.Queued"/> over all sources.</summary>
    public int TotalQueued => Sources.Select(row => row.Queued).Sum();

    /// <summary>Sum of <see cref="SourceResult.Published"/> over all sources.</summary>
    public int TotalPublished => Sources.Select(row => row.Published).Sum();

    /// <summary>Sum of <see cref="SourceResult.Flagged"/> over all sources.</summary>
    public int TotalFlagged => Sources.Select(row => row.Flagged).Sum();

    /// <summary>Sum of <see cref="SourceResult.Spam"/> over all sources.</summary>
    public int TotalSpam => Sources.Select(row => row.Spam).Sum();

    /// <summary>Sum of <see cref="SourceResult.Duplicates"/> over all sources.</summary>
    public int TotalDuplicates => Sources.Select(row => row.Duplicates).Sum();
}
/// <summary>
/// The scrape engine. For every enabled source: dedupe by content hash → parse → rule-validate →
/// (optional) AI audit → decide. Decision depends on admin settings:
/// • spam → Discarded
/// • AI on: AI verdict drives approve/reject/review; approve + Automatic + AiAutoApprove → publish
/// • AI off: Automatic + confidence ≥ threshold → publish; else queue/flag
/// "Publish" resolves-or-creates an (unverified) facility and creates the Shift/JobOpening.
/// </summary>
public class IngestionService
{
/// <summary>Applicant posts older than this (by the source's date, or a Persian "time ago"
/// phrase in the text) are skipped at ingest — availability goes stale fast.</summary>
private const int TalentMaxAgeDays = 7;
// EF Core context used for every read and write in this service.
private readonly AppDbContext _db;
// All registered listing sources; RunAsync fetches from each one in turn.
private readonly IEnumerable<IListingSource> _sources;
// Turns raw ad text into a ParsedListing, given the known role/city/district names.
private readonly IListingParser _parser;
// Rule-based screen run on the raw text plus the parser output.
private readonly ListingValidator _validator;
// AI audit; only called when settings.AiEnabled is set and the rule validator did not flag spam.
private readonly IAiAuditor _ai;
// Supplies the admin settings read at the start of each run.
private readonly SettingsService _settings;
// Logger for run summaries and per-item failures.
private readonly ILogger<IngestionService> _log;
/// <summary>Stores the injected collaborators in their fields; does no other work.</summary>
public IngestionService(
    AppDbContext db,
    IEnumerable<IListingSource> sources,
    IListingParser parser,
    ListingValidator validator,
    IAiAuditor ai,
    SettingsService settings,
    ILogger<IngestionService> log)
{
    _db = db;
    _sources = sources;
    _parser = parser;
    _validator = validator;
    _ai = ai;
    _settings = settings;
    _log = log;
}
/// <summary>Names of all registered sources, in registration order. A new list is built on
/// every access.</summary>
public IReadOnlyList<string> SourceNames
{
    get
    {
        var names = new List<string>();
        foreach (var source in _sources)
        {
            names.Add(source.Name);
        }
        return names;
    }
}
/// <summary>Shared placeholder facility name for unnamed ads — kept identical to
/// Review.ResolveFacilityIdAsync so the auto-publish and manual-review flows reuse ONE record.
/// MergeAndCleanFacilitiesAsync also matches this exact name when locating the placeholder
/// facility.</summary>
private const string UnknownFacilityName = "نامشخص / ثبت نشده";
/// <summary>
/// Runs one crawl over every registered source. For each fetched item: skip it when its content
/// hash is already known (in the database, or added earlier in this run but not yet saved);
/// otherwise parse, rule-validate, optionally AI-audit, decide a status, store a RawListing, and
/// auto-publish when the decision is Normalized. A source whose fetch throws is logged and
/// skipped; a source that returns no items produces no result row. Changes are saved once per
/// source. After all sources, near-duplicate applicants are collapsed; the post-ingest cleanup
/// and the IngestionRun history row happen only when at least one source produced a result.
/// </summary>
/// <param name="ct">Cancellation token passed to source fetches, AI audits and database calls.</param>
/// <returns>The per-source counters for this run.</returns>
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
{
    var settings = await _settings.GetAsync();
    var roles = await _db.Roles.ToListAsync(ct);
    var cities = await _db.Cities.ToListAsync(ct);
    var districts = await _db.Districts.ToListAsync(ct);
    var facilities = await _db.Facilities.ToListAsync(ct); // fuzzy-matched + grown as we create
    var roleNames = roles.Select(r => r.Name).ToList();
    var cityNames = cities.Select(c => c.Name).ToList();
    var districtNames = districts.Select(d => d.Name).ToList();
    var results = new List<SourceResult>();

    // Rows added in the current batch that are not saved yet, keyed by content hash. The database
    // lookup below cannot see unsaved rows, so without this a source that returns the same text
    // twice in one fetch would insert (and audit/publish) two RawListings with the same hash.
    var pending = new Dictionary<string, RawListing>();

    foreach (var source in _sources)
    {
        int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0, dupes = 0;
        IReadOnlyList<ScrapedItem> items;
        try { items = await source.FetchAsync(settings, ct); }
        catch (Exception ex) { _log.LogError(ex, "Source {Source} failed", source.Name); continue; }
        if (items.Count == 0) continue; // disabled/unconfigured source

        foreach (var item in items)
        {
            fetched++;
            var hash = Hash(item.RawText);
            if (!pending.TryGetValue(hash, out var existing))
                existing = await _db.RawListings.FirstOrDefaultAsync(r => r.ContentHash == hash, ct);
            if (existing is not null)
            {
                // Best-effort geo retry: coords are normally captured only on first ingest, but a
                // re-fetch may now expose a map center the first fetch lacked (Divar can fail-soft to
                // null on a bad response / out-of-bbox). Backfill the cached row when this fetch has
                // coords and the row has none, so an item still sitting in the queue can be placed on
                // the map when an admin publishes it. (A full refresh is the purge-and-reingest flow.)
                if (existing.Lat is null && item.Lat is not null) { existing.Lat = item.Lat; existing.Lng = item.Lng; }
                dupes++;
                continue;
            }

            var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
            var val = _validator.Validate(item.RawText, parsed);

            // Drop STALE applicant («آماده به کار») posts — a person's availability goes cold fast.
            // Age = the source's real timestamp, else a Persian "time ago" phrase in the text
            // (Divar embeds «۲ هفته پیش»…). Recorded as Discarded (keeps the dedupe hash + audit
            // trail; no AI spend). Shifts/jobs are NOT aged out — their dates are in the future.
            if (parsed.Kind == ListingKind.Talent && PostAgeDays(item) is int age && age > TalentMaxAgeDays)
            {
                var stale = new RawListing
                {
                    SourceChannel = item.Source, SourceUrl = item.SourceUrl, RawText = item.RawText.Trim(),
                    ContentHash = hash, Confidence = 0, Status = RawListingStatus.Discarded,
                    ValidationNotes = $"آماده‌به‌کارِ قدیمی ({age} روز) — نادیده گرفته شد",
                    Lat = item.Lat, Lng = item.Lng,
                };
                _db.RawListings.Add(stale);
                pending[hash] = stale;
                spam++;
                continue;
            }

            AiAuditResult? ai = null;
            if (settings.AiEnabled && !val.IsSpam)
                ai = await _ai.AuditAsync(item.RawText, settings, ct);

            var (status, reason, confidence) = Decide(settings, val, ai);
            var raw = new RawListing
            {
                SourceChannel = item.Source,
                SourceUrl = item.SourceUrl,
                RawText = item.RawText.Trim(),
                ContentHash = hash,
                Confidence = confidence,
                ValidationNotes = reason,
                Status = status,
                Lat = item.Lat, Lng = item.Lng, // approx. map coords (Divar) → facility on publish
            };
            _db.RawListings.Add(raw);
            pending[hash] = raw;

            if (status == RawListingStatus.Normalized)
            {
                try { Publish(parsed, ai, raw, roles, cities, districts, facilities); published++; }
                catch (Exception ex) { _log.LogWarning(ex, "Auto-publish failed; queueing instead"); raw.Status = RawListingStatus.New; queued++; }
            }
            else if (status == RawListingStatus.New) queued++;
            else if (status == RawListingStatus.Flagged) flagged++;
            else spam++;
        }

        await _db.SaveChangesAsync(ct);
        pending.Clear(); // everything is in the database now, so the query above will find it

        results.Add(new SourceResult(source.Name, fetched, queued, published, flagged, spam, dupes));
        _log.LogInformation("Ingest {S}: fetched={F} queued={Q} published={P} flagged={Fl} spam={Sp} dupes={D}",
            source.Name, fetched, queued, published, flagged, spam, dupes);
    }

    var summary = new IngestionSummary(results);
    await DedupeTalentAsync(ct); // collapse same-ad reposts the exact-hash dedup can't catch

    // Self-clean after every crawl so the board stays tidy with no manual admin clicks: archive
    // out-of-scope/duplicate listings, merge duplicate + fold junk facilities, backfill coords.
    var cleanup = results.Count > 0 ? await RunPostIngestCleanupAsync(ct) : default;

    // Persist a run-log row so admins get a crawl history (with a per-source breakdown).
    if (results.Count > 0)
    {
        var detail = string.Join("؛ ", results.Select(r =>
            $"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}"))
            + $" || پاک‌سازیِ خودکار: {cleanup.archived} بایگانی، {cleanup.dedupedJobs} استخدامِ تکراری، {cleanup.mergedFac} مرکزِ ادغام، {cleanup.cleanedFac} مرکزِ حذف، {cleanup.coords} مختصات";
        _db.IngestionRuns.Add(new IngestionRun
        {
            Fetched = summary.TotalFetched,
            Queued = summary.TotalQueued,
            Published = summary.TotalPublished,
            Flagged = summary.TotalFlagged,
            Spam = summary.TotalSpam,
            Duplicates = summary.TotalDuplicates,
            Detail = detail.Length > 2000 ? detail[..2000] : detail, // cap the stored text at 2000 chars
        });
        await _db.SaveChangesAsync(ct);
    }
    return summary;
}
/// <summary>
/// Re-run the CURRENT parser/AI/publish pipeline over every already-crawled RawListing, WITHOUT
/// re-fetching from sources. Use this after improving the pipeline to clean up existing aggregated
/// content (de-dupe, fix roles/categories/tags) — unlike <see cref="RunAsync"/> + the purge-cache
/// flow, it keeps every raw text, so nothing is lost to sources only exposing recent posts.
/// Deletes the old aggregated posts, then republishes from the stored raw text. Long-running
/// (one AI call per item) — call it on a background scope, not inside a request.
/// </summary>
/// <param name="talentOnly">SEO-safe default: only «آماده به کار» (which is NoIndex/Disallow) is
/// deleted &amp; rebuilt, so no INDEXED url changes. Shift/Job detail pages are indexed + in the
/// sitemap, so churning their IDs would 404 ranked pages — instead they self-clean via turnover.
/// Pass false only when you accept that SEO hit.</param>
/// <param name="ct">Cancellation token; checked at the top of every item and passed to the
/// database and AI calls.</param>
/// <returns>A summary with a single synthetic source row holding this pass's counters; its
/// Duplicates value is the number of applicants removed by <see cref="DedupeTalentAsync"/>.</returns>
public async Task<IngestionSummary> ReprocessAsync(bool talentOnly = true, CancellationToken ct = default)
{
var settings = await _settings.GetAsync();
var roles = await _db.Roles.ToListAsync(ct);
var cities = await _db.Cities.ToListAsync(ct);
var districts = await _db.Districts.ToListAsync(ct);
var facilities = await _db.Facilities.ToListAsync(ct); // reused (not deleted) → no facility churn
var roleNames = roles.Select(r => r.Name).ToList();
var cityNames = cities.Select(c => c.Name).ToList();
var districtNames = districts.Select(d => d.Name).ToList();
// Drop previously-published aggregated content; it's regenerated below from the raw text.
// DB cascade clears their ContactMethods/Applications/InterestEvents; RawListing back-refs SetNull.
// Aggregated applicants are deleted in BOTH modes; shifts and jobs only when talentOnly is false.
await _db.TalentListings.Where(t => t.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
if (!talentOnly)
{
await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
}
// "fetched" counts the items actually reprocessed (not skipped by the talent-only scope).
int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0;
var raws = await _db.RawListings.OrderBy(r => r.Id).ToListAsync(ct);
foreach (var raw in raws)
{
ct.ThrowIfCancellationRequested();
var parsed = _parser.Parse(raw.RawText, roleNames, cityNames, districtNames);
// SEO-safe scope: in talent-only mode, leave indexed shift/job listings (and their
// RawListing links/status) completely untouched — only applicants are rebuilt.
if (talentOnly && parsed.Kind != ListingKind.Talent) continue;
fetched++;
raw.LinkedTalentId = null; // talent rows were just deleted
// NOTE(review): only LinkedShiftId is cleared here even though job openings are deleted too;
// whether RawListing carries a separate job link cannot be seen from this method — confirm.
if (!talentOnly) raw.LinkedShiftId = null;
var val = _validator.Validate(raw.RawText, parsed);
// Stale-applicant filter — age from the Persian "time ago" phrase in the text (Divar).
if (parsed.Kind == ListingKind.Talent
&& HtmlUtil.AgeDaysFromPersianText(raw.RawText) is int age && age > TalentMaxAgeDays)
{
raw.Status = RawListingStatus.Discarded; raw.Confidence = 0;
raw.ValidationNotes = $"آماده‌به‌کارِ قدیمی ({age} روز) — نادیده گرفته شد";
// This "continue" also skips the every-50-items save below; the change is still
// persisted by a later incremental save or the final save after the loop.
spam++; continue;
}
AiAuditResult? ai = null;
if (settings.AiEnabled && !val.IsSpam)
ai = await _ai.AuditAsync(raw.RawText, settings, ct);
var (status, reason, confidence) = Decide(settings, val, ai);
raw.Status = status; raw.ValidationNotes = reason; raw.Confidence = confidence;
if (status == RawListingStatus.Normalized)
{
// A publish failure is logged and the item falls back to the review queue.
try { Publish(parsed, ai, raw, roles, cities, districts, facilities); published++; }
catch (Exception ex) { _log.LogWarning(ex, "Reprocess publish failed; queueing"); raw.Status = RawListingStatus.New; queued++; }
}
else if (status == RawListingStatus.New) queued++;
else if (status == RawListingStatus.Flagged) flagged++;
else spam++;
if (fetched % 50 == 0) await _db.SaveChangesAsync(ct); // incremental progress on long runs
}
await _db.SaveChangesAsync(ct);
var deduped = await DedupeTalentAsync(ct); // collapse reposts the exact-hash dedup missed
// Record this pass in the run history, like a crawl.
_db.IngestionRuns.Add(new IngestionRun
{
Fetched = fetched, Queued = queued, Published = published, Flagged = flagged, Spam = spam, Duplicates = deduped,
Detail = $"پردازش مجدد آیتم‌های ذخیره‌شده — {fetched} آیتم: {published} منتشر، {queued} صف، {flagged} پرچم، {spam} ردشده/قدیمی، {deduped} تکراریِ حذف‌شده",
});
await _db.SaveChangesAsync(ct);
_log.LogInformation("Reprocess done: items={F} published={P} queued={Q} flagged={Fl} discarded={S} deduped={D}",
fetched, published, queued, flagged, spam, deduped);
return new IngestionSummary(new List<SourceResult>
{ new("پردازش مجدد", fetched, queued, published, flagged, spam, deduped) });
}
/// <summary>
/// Removes near-duplicate open, aggregated APPLICANT listings that exact content-hash dedup
/// cannot catch (a repost whose text differs slightly). Listings are grouped by a signature: the
/// phone's digits when there are at least 7 of them; otherwise role + city + the first 100
/// characters of the normalized description with digit runs and «… پیش»-style time phrases
/// blanked out. A description whose normalized core is shorter than 15 characters gets no
/// signature and is never treated as a duplicate. In each group the row with the latest
/// CreatedAt is kept and the others are deleted.
/// </summary>
/// <param name="ct">Cancellation token passed to the database calls.</param>
/// <returns>The number of rows deleted.</returns>
public async Task<int> DedupeTalentAsync(CancellationToken ct = default)
{
    const string noisePattern = @"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز";

    // Builds the grouping key, or null when there is too little text to compare safely.
    string? SignatureOf(string? phone, int roleId, int cityId, string? description)
    {
        var digits = DigitsOnly(phone ?? "");
        if (digits.Length >= 7)
        {
            return "p:" + digits; // same number = same person/repost
        }

        var core = NormalizeFa(Regex.Replace(description ?? "", noisePattern, " ")).Trim();
        if (core.Length < 15)
        {
            return null;
        }

        var head = core.Length > 100 ? core[..100] : core;
        return $"t:{roleId}:{cityId}:{head}";
    }

    var openAggregated = await _db.TalentListings
        .Where(t => t.Status == ShiftStatus.Open && t.Source == ShiftSource.Aggregated)
        .Select(t => new { t.Id, t.Phone, t.RoleId, t.CityId, t.Description, t.CreatedAt })
        .ToListAsync(ct);

    var keyed = openAggregated
        .Select(row => new
        {
            row.Id,
            row.CreatedAt,
            Key = SignatureOf(row.Phone, row.RoleId, row.CityId, row.Description),
        })
        .Where(entry => entry.Key is not null);

    // Newest first inside each group; everything after the first entry is surplus.
    var surplusIds = keyed
        .GroupBy(entry => entry.Key)
        .SelectMany(group => group.OrderByDescending(entry => entry.CreatedAt).Skip(1))
        .Select(entry => entry.Id)
        .ToList();

    if (surplusIds.Count == 0)
    {
        return 0;
    }

    var removed = await _db.TalentListings
        .Where(t => surplusIds.Contains(t.Id))
        .ExecuteDeleteAsync(ct);
    _log.LogInformation("Deduped {N} near-duplicate applicants.", removed);
    return removed;
}
/// <summary>
/// Fills missing map coordinates on open, aggregated listings in the city named «تهران». For
/// each job, shift and applicant whose Lat is null, <c>TehranGeo.Locate</c> is asked for a point
/// based on the stored text (Description; for applicants AreaNote plus Description) and, when it
/// returns one, Lat/Lng are set. Rows are updated in place — nothing is deleted or recreated —
/// and only rows selected with a null Lat are touched. Returns 0 without doing anything when no
/// such city exists.
/// </summary>
/// <param name="ct">Cancellation token passed to the database calls.</param>
/// <returns>How many listings received coordinates.</returns>
public async Task<int> BackfillCoordsAsync(CancellationToken ct = default)
{
    var tehran = await _db.Cities.FirstOrDefaultAsync(c => c.Name == "تهران", ct);
    if (tehran is null)
    {
        return 0;
    }

    var placed = 0;

    var openJobs = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id)
        .ToListAsync(ct);
    foreach (var job in openJobs)
    {
        if (TehranGeo.Locate(job.Description) is not { } point) continue;
        job.Lat = point.lat;
        job.Lng = point.lng;
        placed++;
    }

    var openShifts = await _db.Shifts
        .Where(s => s.Status == ShiftStatus.Open && s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id)
        .ToListAsync(ct);
    foreach (var shift in openShifts)
    {
        if (TehranGeo.Locate(shift.Description) is not { } point) continue;
        shift.Lat = point.lat;
        shift.Lng = point.lng;
        placed++;
    }

    var openTalent = await _db.TalentListings
        .Where(t => t.Status == ShiftStatus.Open && t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id)
        .ToListAsync(ct);
    foreach (var applicant in openTalent)
    {
        if (TehranGeo.Locate(applicant.AreaNote, applicant.Description) is not { } point) continue;
        applicant.Lat = point.lat;
        applicant.Lng = point.lng;
        placed++;
    }

    // Skip the round-trip when nothing changed.
    if (placed > 0)
    {
        await _db.SaveChangesAsync(ct);
    }

    _log.LogInformation("Coordinate backfill placed {N} aggregated listings on the map.", placed);
    return placed;
}
/// <summary>
/// The self-cleaning pass that <see cref="RunAsync"/> runs after a crawl (also callable on its
/// own). Runs three steps in order: <see cref="PurgeInvalidAggregatedAsync"/>,
/// <see cref="MergeAndCleanFacilitiesAsync"/>, then <see cref="BackfillCoordsAsync"/>, and logs
/// the combined counters.
/// </summary>
/// <param name="ct">Cancellation token passed to each step.</param>
/// <returns>The counters reported by the three steps, in the order named by the tuple.</returns>
public async Task<(int archived, int dedupedJobs, int mergedFac, int cleanedFac, int coords)>
    RunPostIngestCleanupAsync(CancellationToken ct = default)
{
    var purge = await PurgeInvalidAggregatedAsync(ct);
    var facilityPass = await MergeAndCleanFacilitiesAsync(ct);
    var placed = await BackfillCoordsAsync(ct);

    _log.LogInformation(
        "Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C}",
        purge.archived, purge.deduped, facilityPass.merged, facilityPass.cleaned, placed);

    return (purge.archived, purge.deduped, facilityPass.merged, facilityPass.cleaned, placed);
}
/// <summary>
/// In-place cleanup of the open, aggregated job and shift listings. Each listing's stored
/// Description is re-parsed and re-validated with the current parser and validator; listings the
/// validator marks as <see cref="ValidationResult.IsSpam"/> get Status = Archived (they are not
/// deleted). Listings that are not marked spam are left untouched. Afterwards
/// <see cref="DedupeJobsAsync"/> archives near-duplicate job reposts.
/// </summary>
/// <param name="ct">Cancellation token passed to the database calls.</param>
/// <returns>(archived, deduped): listings archived by the re-screen, and jobs archived as duplicates.</returns>
public async Task<(int archived, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
{
    var knownRoles = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
    var knownCities = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
    var knownDistricts = await _db.Districts.Select(d => d.Name).ToListAsync(ct);

    // True when the current validator classifies the text as spam (null is treated as empty text).
    bool FailsScreen(string? text)
    {
        var body = text ?? "";
        var parsed = _parser.Parse(body, knownRoles, knownCities, knownDistricts);
        var verdict = _validator.Validate(body, parsed);
        return verdict.IsSpam;
    }

    var archivedTotal = 0;

    // Jobs: load id + text, screen in memory, then archive the failures with one bulk update.
    var openJobs = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
        .Select(j => new { j.Id, j.Description })
        .ToListAsync(ct);
    var badJobIds = openJobs
        .Where(j => FailsScreen(j.Description))
        .Select(j => j.Id)
        .ToList();
    if (badJobIds.Count > 0)
    {
        archivedTotal += await _db.JobOpenings
            .Where(j => badJobIds.Contains(j.Id))
            .ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
    }

    // Shifts: same procedure.
    var openShifts = await _db.Shifts
        .Where(s => s.Status == ShiftStatus.Open && s.Source == ShiftSource.Aggregated)
        .Select(s => new { s.Id, s.Description })
        .ToListAsync(ct);
    var badShiftIds = openShifts
        .Where(s => FailsScreen(s.Description))
        .Select(s => s.Id)
        .ToList();
    if (badShiftIds.Count > 0)
    {
        archivedTotal += await _db.Shifts
            .Where(s => badShiftIds.Contains(s.Id))
            .ExecuteUpdateAsync(u => u.SetProperty(s => s.Status, ShiftStatus.Archived), ct);
    }

    var dedupedJobs = await DedupeJobsAsync(ct);
    _log.LogInformation("Purge archived {R} out-of-scope aggregated listings; deduped {D} jobs.", archivedTotal, dedupedJobs);
    return (archivedTotal, dedupedJobs);
}
/// <summary>
/// Archives near-duplicate open, aggregated JOB listings that exact content-hash dedup missed.
/// Jobs are grouped by facility + the first 120 characters of the normalized description with
/// digit runs and «… پیش»-style time phrases blanked out. The role is NOT part of the grouping
/// key, so copies of one ad that were published under several roles fall into the same group.
/// A description whose normalized core is shorter than 15 characters is never grouped. In each
/// group one row is kept — a row whose role is not «پزشک عمومی» is preferred, then the latest
/// CreatedAt — and the rest get Status = Archived (they are not deleted).
/// </summary>
/// <param name="ct">Cancellation token passed to the database calls.</param>
/// <returns>The number of jobs archived.</returns>
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
{
    const string noisePattern = @"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز";

    // Null when the «پزشک عمومی» role does not exist; then no row is deprioritised below.
    var generalPractitionerId = await _db.Roles
        .Where(r => r.Name == "پزشک عمومی")
        .Select(r => (int?)r.Id)
        .FirstOrDefaultAsync(ct);

    var openJobs = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
        .Select(j => new { j.Id, j.RoleId, j.FacilityId, j.Description, j.CreatedAt })
        .ToListAsync(ct);

    // Builds the grouping key, or null when there is too little text to compare safely.
    string? SignatureOf(int facilityId, string? description)
    {
        var core = NormalizeFa(Regex.Replace(description ?? "", noisePattern, " ")).Trim();
        if (core.Length < 15)
        {
            return null;
        }

        var head = core.Length > 120 ? core[..120] : core;
        return $"j:{facilityId}:{head}";
    }

    var keyed = openJobs
        .Select(job => new { job.Id, job.RoleId, job.CreatedAt, Key = SignatureOf(job.FacilityId, job.Description) })
        .Where(entry => entry.Key is not null);

    // Survivor of each group = first after sorting non-«پزشک عمومی» rows ahead, newest first.
    var surplusIds = keyed
        .GroupBy(entry => entry.Key)
        .SelectMany(group => group
            .OrderBy(entry => entry.RoleId == generalPractitionerId ? 1 : 0)
            .ThenByDescending(entry => entry.CreatedAt)
            .Skip(1))
        .Select(entry => entry.Id)
        .ToList();

    if (surplusIds.Count == 0)
    {
        return 0;
    }

    var archived = await _db.JobOpenings
        .Where(j => surplusIds.Contains(j.Id))
        .ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
    _log.LogInformation("Archived {N} near-duplicate aggregated jobs.", archived);
    return archived;
}
/// <summary>
/// Clean up the crawl-generated facility table: (1) fold listings of junk-named facilities
/// («بیمارستان هستم», «... از مدجابز», bare «کلینیک») into the shared placeholder and delete the
/// junk record; (2) merge Persian-fuzzy duplicates («سازمان برنامه جنوبی» ×3) into one keeper,
/// repointing their shifts/jobs. HARD GUARD: only ever removes facilities that are purely
/// crawl-generated (no owner, not verified, Unverified) and never the placeholder — employer- and
/// admin-managed facilities are untouched. Listings are always repointed first, so no ad is lost.
/// Returns (merged, cleaned).
/// </summary>
/// <param name="ct">Cancellation token passed to every database call.</param>
/// <returns>(merged, cleaned): duplicates absorbed into a keeper, and junk facilities folded into
/// the placeholder.</returns>
public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default)
{
var facilities = await _db.Facilities.ToListAsync(ct);
// Listing counts per facility, taken ONCE up front. They are not refreshed after absorbs below,
// so keeper selection in step 2 ranks by these initial counts.
var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId)
.Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
var shiftCounts = await _db.Shifts.GroupBy(s => s.FacilityId)
.Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id);
// The shared "unknown" placeholder is worded differently in older data
// («مرکز درمانی (نامشخص)») than the current constant, so an exact-name lookup found nothing and
// the junk-fold step silently no-op'd. Match by the «نامشخص» marker and pick the bucket actually
// used by the most listings — that's the real placeholder junk should fold into.
var placeholder = facilities
.Where(f => f.Name == UnknownFacilityName || FacilityMatcher.Normalize(f.Name).Contains("نامشخص"))
.OrderByDescending(f => Listings(f.Id)).FirstOrDefault();
// -1 stands in for "no placeholder" so the id filter in step 2 excludes nothing.
var placeholderId = placeholder?.Id ?? -1;
// Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a
// verified facility (those carry real employer data / verification).
bool Removable(Facility f) => f.OwnerUserId is null && !f.IsVerified
&& f.Verification == VerificationStatus.Unverified
&& (placeholder is null || f.Id != placeholder.Id);
// Moves every shift and job from one facility to another, then deletes the source facility.
// These are three separate bulk statements; no transaction is opened in this method.
async Task AbsorbAsync(int fromId, int toId)
{
await _db.Shifts.Where(s => s.FacilityId == fromId)
.ExecuteUpdateAsync(u => u.SetProperty(s => s.FacilityId, toId), ct);
await _db.JobOpenings.Where(j => j.FacilityId == fromId)
.ExecuteUpdateAsync(u => u.SetProperty(j => j.FacilityId, toId), ct);
await _db.Facilities.Where(f => f.Id == fromId).ExecuteDeleteAsync(ct); // cascades stray docs/reviews
}
int merged = 0, cleaned = 0;
// 1) Junk-named crawl facilities → fold into the shared placeholder.
// Skipped entirely when no placeholder facility was found.
if (placeholder is not null)
foreach (var f in facilities.Where(f => Removable(f) && FacilityMatcher.IsJunkName(f.Name)).ToList())
{
await AbsorbAsync(f.Id, placeholder.Id);
cleaned++;
}
// 2) Merge same-city Persian-fuzzy duplicates into the best keeper (never the placeholder).
// The facility list is re-queried here, after step 1's deletes.
var remaining = (await _db.Facilities.ToListAsync(ct)).Where(f => f.Id != placeholderId).ToList();
// Ids already used as a cluster seed or handled as a duplicate; they are not clustered again.
var done = new HashSet<int>();
foreach (var f in remaining)
{
if (done.Contains(f.Id)) continue;
done.Add(f.Id);
var cluster = remaining.Where(o => o.Id != f.Id && !done.Contains(o.Id)
&& o.CityId == f.CityId && FacilityMatcher.IsSame(o.Name, f.Name)).ToList();
if (cluster.Count == 0) continue;
cluster.Add(f);
// keeper: verified > owned > most listings > lowest id (oldest).
var keeper = cluster.OrderByDescending(x => x.IsVerified)
.ThenByDescending(x => x.OwnerUserId.HasValue)
.ThenByDescending(x => Listings(x.Id)).ThenBy(x => x.Id).First();
foreach (var dup in cluster.Where(x => x.Id != keeper.Id))
{
// Marked done even when it is not removable, so it is not re-clustered later.
done.Add(dup.Id);
if (!Removable(dup)) continue; // never delete an employer/verified facility
await AbsorbAsync(dup.Id, keeper.Id);
merged++;
}
}
_log.LogInformation("Facility cleanup: merged {M} duplicates, removed {C} junk facilities.", merged, cleaned);
return (merged, cleaned);
}
/// <summary>
/// In-place role correction for open, aggregated job and applicant listings currently labeled
/// «پزشک عمومی». The stored Description is re-parsed; the first parsed role name that does not
/// normalize to «پزشک عمومی» is used, and when there is none but <c>LooksSpecialist</c> accepts
/// the text, the role «پزشک متخصص» (category «پزشک») is used. The replacement role is obtained
/// through <c>ResolveOrCreateRole</c>. For jobs, a blank title or the generic
/// «استخدام پزشک عمومی» title is rewritten to name the new role. No AI call is made and no row
/// is deleted or recreated. Returns 0 immediately when the «پزشک عمومی» role does not exist.
/// </summary>
/// <param name="ct">Cancellation token passed to the database calls.</param>
/// <returns>The number of listings whose role was changed.</returns>
public async Task<int> RecorrectDoctorRolesAsync(CancellationToken ct = default)
{
    var roles = await _db.Roles.ToListAsync(ct);
    var roleNames = roles.Select(r => r.Name).ToList();
    var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
    var districtNames = await _db.Districts.Select(d => d.Name).ToListAsync(ct);

    var generalPractitioner = roles.FirstOrDefault(r => r.Name == "پزشک عمومی");
    if (generalPractitioner is null)
    {
        return 0;
    }

    // Returns the more specific role the text points to, or null when it offers nothing better.
    Role? BetterRoleFor(string? text)
    {
        var parsed = _parser.Parse(text ?? "", roleNames, cityNames, districtNames);
        var specific = parsed.RoleNames.FirstOrDefault(n => NormalizeFa(n) != NormalizeFa("پزشک عمومی"));
        if (specific is not null)
        {
            return ResolveOrCreateRole(roles, specific, null);
        }

        return LooksSpecialist(text) ? ResolveOrCreateRole(roles, "پزشک متخصص", "پزشک") : null;
    }

    var corrected = 0;

    var gpJobs = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated && j.RoleId == generalPractitioner.Id)
        .ToListAsync(ct);
    foreach (var job in gpJobs)
    {
        var replacement = BetterRoleFor(job.Description);
        if (replacement is null || replacement.Id == job.RoleId)
        {
            continue;
        }

        // Only a missing or generic title is rewritten; any other title is kept.
        var titleIsGeneric = string.IsNullOrWhiteSpace(job.Title) || job.Title == "استخدام پزشک عمومی";
        if (titleIsGeneric)
        {
            job.Title = $"استخدام {replacement.Name}";
        }

        // NOTE(review): the role is linked by Id; if ResolveOrCreateRole can return a role that
        // has not been saved yet, its Id may not be final at this point — confirm.
        job.RoleId = replacement.Id;
        corrected++;
    }

    var gpTalent = await _db.TalentListings
        .Where(t => t.Status == ShiftStatus.Open && t.Source == ShiftSource.Aggregated && t.RoleId == generalPractitioner.Id)
        .ToListAsync(ct);
    foreach (var applicant in gpTalent)
    {
        var replacement = BetterRoleFor(applicant.Description);
        if (replacement is null || replacement.Id == applicant.RoleId)
        {
            continue;
        }

        applicant.RoleId = replacement.Id;
        corrected++;
    }

    if (corrected > 0)
    {
        await _db.SaveChangesAsync(ct);
    }

    _log.LogInformation("Recorrected {N} «پزشک عمومی»-mislabeled aggregated listings.", corrected);
    return corrected;
}
/// <summary>
/// Collapse the role taxonomy that the dynamic ingestion let sprawl: exact duplicates («پرستار
/// کودک» ×3), multi-role compounds («پرستار و بهیار»، «ماما / پرستار»), and typos («بیهیار»→بهیار).
/// Each role is mapped to a canonical form (strip modifiers → collapse compound to first base role →
/// alias) and same-canonical roles merge into one keeper, repointing every shift/job/talent/
/// preference/alert/profile first (mirrors the manual /Admin/Roles merge). Roles whose canonical
/// form comes out empty are left alone: an empty key says nothing about what the role is, so
/// merging on it would fuse unrelated roles.
/// </summary>
/// <param name="ct">Cancellation token passed to every database call.</param>
/// <returns>The number of roles removed.</returns>
public async Task<int> MergeDuplicateRolesAsync(CancellationToken ct = default)
{
    var roles = await _db.Roles.ToListAsync(ct);

    // Canonical grouping key for a role name: modifiers stripped, compound collapsed to its base
    // role when CollapseCompound finds one, then normalized and passed through the alias table.
    string Canon(string rawName)
    {
        var name = StripRoleModifiers(rawName);
        if (CollapseCompound(roles, name) is { } b) name = b;
        var norm = NormalizeFa(name);
        return RoleAliases.TryGetValue(norm, out var c) ? NormalizeFa(c) : norm;
    }

    // Only real duplicate groups: more than one role, and a non-blank canonical name. The merge is
    // destructive (the duplicate role row is deleted), so a blank key is never treated as a match.
    var duplicateGroups = roles
        .GroupBy(r => Canon(r.Name))
        .Where(g => !string.IsNullOrWhiteSpace(g.Key) && g.Count() > 1)
        .ToList();

    int merged = 0;
    foreach (var g in duplicateGroups)
    {
        // Keeper: a role whose own name IS the canonical (a clean base role), then the lowest Id.
        var keeper = g.OrderBy(r => NormalizeFa(r.Name) == g.Key ? 0 : 1).ThenBy(r => r.Id).First();
        foreach (var dup in g.Where(r => r.Id != keeper.Id))
        {
            // Repoint every referencing table to the keeper first, then delete the duplicate role.
            await _db.Shifts.Where(x => x.RoleId == dup.Id).ExecuteUpdateAsync(u => u.SetProperty(x => x.RoleId, keeper.Id), ct);
            await _db.JobOpenings.Where(x => x.RoleId == dup.Id).ExecuteUpdateAsync(u => u.SetProperty(x => x.RoleId, keeper.Id), ct);
            await _db.TalentListings.Where(x => x.RoleId == dup.Id).ExecuteUpdateAsync(u => u.SetProperty(x => x.RoleId, keeper.Id), ct);
            await _db.UserPreferences.Where(x => x.RoleId == dup.Id).ExecuteUpdateAsync(u => u.SetProperty(x => x.RoleId, (int?)keeper.Id), ct);
            await _db.JobAlerts.Where(x => x.RoleId == dup.Id).ExecuteUpdateAsync(u => u.SetProperty(x => x.RoleId, (int?)keeper.Id), ct);
            await _db.DoctorProfiles.Where(x => x.RoleId == dup.Id).ExecuteUpdateAsync(u => u.SetProperty(x => x.RoleId, (int?)keeper.Id), ct);
            await _db.Roles.Where(r => r.Id == dup.Id).ExecuteDeleteAsync(ct);
            merged++;
        }
    }

    _log.LogInformation("Merged {N} duplicate/compound/typo roles.", merged);
    return merged;
}
/// <summary>Passes the text through <c>HtmlUtil.ToLatinDigits</c> and keeps only the characters
/// for which <see cref="char.IsDigit(char)"/> is true, in their original order.</summary>
private static string DigitsOnly(string s)
{
    var latin = HtmlUtil.ToLatinDigits(s);
    return string.Concat(latin.Where(char.IsDigit));
}
/// <summary>
/// Turns the rule-validation result (and the AI audit, when there is one) into the status,
/// note text and confidence to store on the RawListing.
/// <list type="bullet">
/// <item>Rule-validator spam → Discarded, regardless of the AI.</item>
/// <item>AI present: reject → Discarded; approve without a medical signal from the rule
/// validator → Flagged; approve with one → Normalized when the mode is Automatic and
/// AiAutoApprove is on, otherwise New; any other AI verdict → Flagged.</item>
/// <item>No AI: invalid → Flagged; Automatic mode with confidence at or above
/// AutoPublishMinConfidence → Normalized; otherwise New.</item>
/// </list>
/// The confidence returned is the AI's on every AI branch and the validator's otherwise.
/// </summary>
private static (RawListingStatus status, string? reason, int confidence) Decide(
    AppSetting s, ValidationResult val, AiAuditResult? ai)
{
    var ruleNotes = val.Issues.Count > 0 ? string.Join("؛ ", val.Issues) : null;

    if (val.IsSpam)
    {
        return (RawListingStatus.Discarded, Join("اسپم", ruleNotes), val.Confidence);
    }

    // Rule-only path (no AI audit was run).
    if (ai is null)
    {
        if (!val.IsValid)
        {
            return (RawListingStatus.Flagged, ruleNotes, val.Confidence);
        }

        var autoPublish = s.Mode == IngestionMode.Automatic && val.Confidence >= s.AutoPublishMinConfidence;
        return (autoPublish ? RawListingStatus.Normalized : RawListingStatus.New, ruleNotes, val.Confidence);
    }

    var aiNote = Join($"AI: {ai.Decision} ({ai.Confidence}٪)" + (ai.Reason is null ? "" : $" — {ai.Reason}"), ruleNotes);

    if (ai.Reject)
    {
        return (RawListingStatus.Discarded, aiNote, ai.Confidence);
    }

    if (!ai.Approve)
    {
        return (RawListingStatus.Flagged, aiNote, ai.Confidence); // review
    }

    // MEDICAL GATE: the rule-validator's medical signal vetoes an AI approval. When the
    // keyword/role check sees nothing clinical, the item is sent to review instead of being
    // auto-published.
    if (!val.LooksMedical)
    {
        return (RawListingStatus.Flagged, Join("هوش مصنوعی تأیید کرد ولی نشانهٔ کادر درمان یافت نشد — بررسی دستی", aiNote), ai.Confidence);
    }

    var approvedStatus = s.Mode == IngestionMode.Automatic && s.AiAutoApprove
        ? RawListingStatus.Normalized
        : RawListingStatus.New;
    return (approvedStatus, aiNote, ai.Confidence);
}
/// <summary>
/// Turns one accepted raw ad into exactly one public record — a <c>TalentListing</c>, a
/// <c>JobOpening</c> or a <c>Shift</c> — and marks <paramref name="raw"/> as Normalized (or
/// Discarded for an applicant with no way to be contacted). Entities are only staged on
/// <c>_db</c> via <c>Add</c>; this method does not call SaveChanges itself.
/// AI-extracted fields (<c>ai.Data</c>) win over the rule parser's wherever both exist — this now
/// includes pay amount / share percent on the Shift path, which previously used the parser only.
/// </summary>
/// <param name="parsed">Rule-parser output for the ad.</param>
/// <param name="ai">AI audit result; its <c>Data</c> may be null.</param>
/// <param name="raw">The raw listing being published; its Status is updated here.</param>
/// <param name="roles">Known roles; newly created roles are appended by ResolveOrCreateRole.</param>
/// <param name="cities">Known cities (must not be empty — the fallback uses <c>First()</c>).</param>
/// <param name="districts">Known districts, matched by name within the resolved city.</param>
/// <param name="facilities">Known facilities; a newly created facility is appended for reuse.</param>
private void Publish(ParsedListing parsed, AiAuditResult? ai, RawListing raw,
    List<Role> roles, List<City> cities, List<District> districts, List<Facility> facilities)
{
    var d = ai?.Data;
    var cityName = d?.City ?? parsed.CityName;
    var districtName = d?.District ?? parsed.DistrictName;

    // One ad can name several roles («پرستار سالمند و کودک و همراه بیمار») — resolve them all
    // (capped at 4). The AI's role (+ its category) is the trusted, possibly-new one; parser names
    // are already canonical matches. Unknown roles are CREATED (dynamic taxonomy), not dropped.
    var candidates = new List<(string name, string? category)>();
    if (!string.IsNullOrWhiteSpace(d?.Role)) candidates.Add((d!.Role!.Trim(), d.Category));
    foreach (var n in parsed.RoleNames) candidates.Add((n, null));
    if (parsed.RoleName is not null) candidates.Add((parsed.RoleName, null));

    var pubRoles = new List<Role>();
    foreach (var (name, category) in candidates)
    {
        if (string.IsNullOrWhiteSpace(name)) continue;
        var role = ResolveOrCreateRole(roles, name, category);
        if (!pubRoles.Contains(role)) pubRoles.Add(role);
        if (pubRoles.Count >= 4) break;
    }
    // No usable role name at all → fall back to the first known role.
    if (pubRoles.Count == 0) pubRoles.Add(roles.First());

    // Doctor-role guard. «پزشک عمومی» is the AI's fallback when it's unsure, so it mislabels
    // clearly-specific doctor ads — a dentist ad («دعوت به همکاری دندانپزشک») or an ENT/specialist
    // one published as «استخدام پزشک عمومی». Rather than patch role-by-role, trust the keyword
    // parser: if IT already found a more specific role in the same text, use that; otherwise fall
    // back to «پزشک متخصص» when the text says specialist. Only ever overrides the weak GP default.
    if (pubRoles[0].Name == "پزشک عمومی")
    {
        var specific = parsed.RoleNames.FirstOrDefault(n => NormalizeFa(n) != NormalizeFa("پزشک عمومی"));
        if (specific is not null)
            pubRoles[0] = ResolveOrCreateRole(roles, specific, null);
        else if (LooksSpecialist(raw.RawText))
            pubRoles[0] = ResolveOrCreateRole(roles, "پزشک متخصص", "پزشک");
    }

    var city = cities.FirstOrDefault(c => c.Name == cityName)
        ?? cities.FirstOrDefault(c => c.IsActive) ?? cities.First();
    var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id);

    // Approx. coords for the map: the source ad's point (Divar) when present; otherwise, for a
    // Tehran ad that only NAMES a neighborhood (Medjobs/Telegram), geocode that name to a rough
    // center. Shown as a «محدودهٔ تقریبی» circle, never a precise pin.
    double? appLat = raw.Lat, appLng = raw.Lng;
    // Geocode from the structured location fields first, then fall back to scanning the ad body
    // itself — many Tehran ads name the neighbourhood only in free text («… نیم ساعت پیش در سهروردی»)
    // and never populate a district/area field, which is why most aggregated listings had no map.
    if (appLat is null && city.Name == "تهران"
        && TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote, raw.RawText) is { } g)
    { appLat = g.lat; appLng = g.lng; }
    // Last resort — the AI model's inferred coords, but ONLY when they fall inside greater Tehran
    // (rejects a hallucinated point elsewhere). Uses the registered model where the rules can't decide.
    if (appLat is null && d?.Lat is double aLat && d?.Lng is double aLng && InTehran(aLat, aLng))
    { appLat = aLat; appLng = aLng; }

    var kindStr = (d?.Kind ?? parsed.Kind.ToString()).ToLowerInvariant();

    // Pay is resolved ONCE for every listing kind: prefer the AI-extracted value and fall back to
    // the parser's. (The Shift path used to read the parser only, so an AI-extracted amount or
    // share was dropped and the shift showed up as «negotiable».)
    var pay = d?.PayAmount ?? parsed.PayAmount;
    var share = d?.SharePercent ?? parsed.SharePercent;
    var payType = share is not null && pay is null ? PayType.Percentage
        : pay is null ? PayType.Negotiable : PayType.PerShift;

    // «آماده به کار» — a worker offering themselves. No facility involved.
    if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده"))
    {
        // Skip uncontactable applicants: no phone, no other contact channel, AND no source link to
        // click through to — such a card is useless to an employer. (A source URL is enough, since
        // the detail page now offers a «مشاهده آگهی در منبع» link for any source.)
        var reachable = !string.IsNullOrWhiteSpace(d?.Phone) || !string.IsNullOrWhiteSpace(parsed.Phone)
            || parsed.Contacts.Count > 0 || !string.IsNullOrWhiteSpace(raw.SourceUrl);
        if (!reachable) { raw.Status = RawListingStatus.Discarded; return; }

        // ONE person = ONE listing. Do NOT fan out across roles: an applicant has a single
        // profession, and «پرستار» + «پرستار کودک» from the same ad were producing duplicate
        // cards. Use the primary (AI) role; any secondary role names become searchable tags.
        var role = pubRoles[0];
        var extraRoleTags = pubRoles.Skip(1).Select(r => r.Name);
        _db.TalentListings.Add(new TalentListing
        {
            Role = role, City = city, DistrictId = district?.Id,
            PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
            YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
            IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
            AreaNote = parsed.AreaNote,
            Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
            Gender = parsed.Gender,
            PayType = payType,
            PayAmount = pay, SharePercent = share,
            Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
            Description = raw.RawText,
            Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
            Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
            Contacts = BuildContacts(d, parsed),
            Tags = BuildTags(parsed, d, role, city, extraRoleTags),
        });
        raw.Status = RawListingStatus.Normalized;
        return;
    }

    // Never surface the crawl source (e.g. «مدجابز») in a public facility name. An unnamed ad
    // falls back to ONE shared placeholder (same string as the manual-review flow, so both
    // pipelines reuse a single record). That placeholder is shared by every unnamed ad in a
    // city, so it must NEVER receive a single ad's fuzzy coords — that would mis-place dozens of
    // unrelated listings on the map and in «near me». Mirrors Review.ResolveFacilityIdAsync.
    bool unnamed = string.IsNullOrWhiteSpace(d?.FacilityName) && string.IsNullOrWhiteSpace(parsed.FacilityName);
    var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
        : !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
        : UnknownFacilityName;

    // Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
    var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
    if (facility is null)
    {
        facility = new Facility
        {
            Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
            Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false,
            Lat = unnamed ? null : raw.Lat, Lng = unnamed ? null : raw.Lng, // approx. Divar map center
        };
        _db.Facilities.Add(facility);
        facilities.Add(facility); // so later listings in this run match it too
    }
    else if (!unnamed && facility.Lat is null && facility.Lng is null && raw.Lat is not null)
    {
        // Backfill coords only when the matched (real, named) facility has none — never overwrite a
        // real (employer-set or verified) location with Divar's fuzzy point.
        facility.Lat = raw.Lat; facility.Lng = raw.Lng;
    }

    // ONE ad = ONE listing. Do NOT fan out across roles. A single ad naming a few role-ish words
    // («استخدام بهیار جهت دستیار پزشک و تزریقات») was exploding into 5 near-duplicate listings —
    // one per extracted/typo role. Publish only the primary (guard-corrected) role; the rest stay
    // findable via the full description text.
    var primaryRole = pubRoles[0];
    if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
    {
        _db.JobOpenings.Add(new JobOpening
        {
            Facility = facility, Role = primaryRole,
            Title = $"استخدام {primaryRole.Name}",
            EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
            SalaryMin = pay, // AI-extracted salary first, parser's as fallback
            Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
            SourceUrl = raw.SourceUrl,
            Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
            Contacts = BuildContacts(d, parsed), // the ad's OWN number(s)
        });
    }
    else
    {
        var st = MapShiftType(d?.ShiftType, parsed.ShiftType);
        var (start, end) = DefaultTimes(st);
        _db.Shifts.Add(new Shift
        {
            Facility = facility, Role = primaryRole,
            // The ad carries no concrete date; the shift is dated tomorrow (UTC) as a placeholder.
            Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1),
            StartTime = start, EndTime = end, ShiftType = st,
            SpecialtyRequired = primaryRole.Name, Description = raw.RawText,
            PayType = payType,
            PayAmount = pay, SharePercent = share,
            Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
            Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
            Contacts = BuildContacts(d, parsed), // the ad's OWN number(s)
        });
    }
    raw.Status = RawListingStatus.Normalized;
}
/// <summary>Builds the space-separated tag string for a listing. Sources, in order: the parser's
/// tags, the listing's role name, role category and city name, any secondary role names, then the
/// AI-detected tags. Blank entries and noise tags (see IsNoiseTag) are dropped, each tag is
/// trimmed, and exact duplicates are removed while keeping first-seen order.</summary>
private static string BuildTags(ParsedListing parsed, AiStructured? d, Role role, City city,
    IEnumerable<string>? extraRoles = null)
{
    var pool = new List<string>(parsed.Tags);
    pool.Add(role.Name);
    pool.Add(role.Category);
    pool.Add(city.Name);
    if (extraRoles is not null)
    {
        pool.AddRange(extraRoles);
    }
    if (d?.Tags is not null)
    {
        foreach (var aiTag in d.Tags)
        {
            if (!string.IsNullOrWhiteSpace(aiTag)) pool.Add(aiTag.Trim());
        }
    }

    var seen = new HashSet<string>();
    var kept = new List<string>();
    foreach (var candidate in pool)
    {
        if (string.IsNullOrWhiteSpace(candidate) || IsNoiseTag(candidate)) continue;
        var tag = candidate.Trim();
        if (seen.Add(tag)) kept.Add(tag); // first occurrence wins, order preserved
    }
    return string.Join(" ", kept);
}
// Words/phrases that are NOT clinical skills — pay, contact, generic verbs, sentence fragments —
// that were polluting the tag chips («پرداخت توافقی»، «مراقبت از»…).
// IsNoiseTag compares each entry after NormalizeFa, so spelling variants (with a zero-width
// non-joiner, with a space, or fused) are listed side by side on purpose.
private static readonly string[] TagStopWords =
{
"توافقی", "پرداخت", "پرداخت توافقی", "حقوق", "دستمزد", "تماس", "شماره", "شماره تماس",
"مراقبت از", "مراقبت", "همکاری", "آماده", "آماده به کار", "نیرو", "استخدام", "جذب",
// personality / filler — not clinical skills
"خوش‌اخلاق", "خوش اخلاق", "خوشاخلاق", "دلسوز", "منظم", "مسئولیت‌پذیر", "مسئولیت پذیر", "باتجربه", "مجرب",
};
/// <summary>True when <paramref name="tag"/> should not be shown as a tag chip: after
/// normalization it is shorter than two characters, ends in a dangling «از» (a cut-off sentence
/// fragment), or equals one of the <c>TagStopWords</c>.</summary>
private static bool IsNoiseTag(string tag)
{
    var t = NormalizeFa(tag);
    if (t.Length < 2) return true;
    // Ordinal on purpose: this is a token match, not linguistic text comparison, so the result
    // must not depend on the host culture / ICU collation rules.
    if (t.EndsWith(" از", StringComparison.Ordinal) || t.EndsWith("-از", StringComparison.Ordinal))
        return true; // dangling «… از»
    return TagStopWords.Any(w => NormalizeFa(w) == t);
}
// How role names are resolved (implemented by ResolveOrCreateRole): a name is mapped to an
// existing Role; if it's genuinely new, it is created (dynamic taxonomy). Matching is layered so a
// differently-worded-but-same-meaning role maps to the canonical one instead of forking:
// (1) exact normalized name, (2) synonym/abbreviation alias → canonical (دکتر→پزشک عمومی،
// نرس→پرستار…), (3) create. Only TRUE synonyms collapse — real sub-specialties («پرستار ICU»)
// stay distinct on purpose.

/// <summary>Separators that join SEVERAL roles in one ad («پرستار و بهیار»، «ماما / پرستار»،
/// «پزشک و پرستار و بهیار»): slash, Persian or Latin comma, plus sign, and the words «یا» / «و»
/// when surrounded by whitespace. A specialty name that legitimately contains «و» (قلب و عروق،
/// پوست و مو) is still split by this regex, but CollapseCompound leaves it whole because its
/// first segment isn't itself a known role.</summary>
private static readonly Regex RoleSeparators =
new(@"\s*/\s*|\s*،\s*|\s*,\s*|\s+یا\s+|\s+و\s+|\s*\+\s*", RegexOptions.Compiled);
/// <summary>Detects a multi-role compound and reduces it to its leading role. The name is split
/// on <c>RoleSeparators</c>; pieces of one character or less are ignored. When at least two pieces
/// remain and the FIRST one is an existing role (by normalized name) it is returned as written;
/// when it is an alias whose canonical role exists, the canonical name is returned. In every other
/// case the result is null, so «پرستار و بهیار» → «پرستار» while «قلب و عروق» stays whole.</summary>
private static string? CollapseCompound(List<Role> roles, string name)
{
    var parts = new List<string>();
    foreach (var piece in RoleSeparators.Split(name))
    {
        var trimmed = piece.Trim();
        if (trimmed.Length > 1) parts.Add(trimmed);
    }
    if (parts.Count < 2)
    {
        return null; // not a compound
    }

    bool IsKnownRole(string normalized) => roles.Any(r => NormalizeFa(r.Name) == normalized);

    var head = parts[0];
    var headNorm = NormalizeFa(head);
    if (IsKnownRole(headNorm))
    {
        return head;
    }
    if (RoleAliases.TryGetValue(headNorm, out var canon) && IsKnownRole(NormalizeFa(canon)))
    {
        return canon;
    }
    return null;
}
/// <summary>Maps a role name to a <c>Role</c>, creating one when nothing matches. Steps: strip
/// gender/seniority/availability modifiers, collapse a multi-role compound to its first base role,
/// then (1) match an existing role by normalized name, (2) follow <c>RoleAliases</c> to a canonical
/// role, (3) otherwise add a new active role to <c>_db.Roles</c> and to <paramref name="roles"/>
/// so later calls in the same run reuse it.</summary>
/// <param name="roles">In-memory role list for this run; appended to when a role is created.</param>
/// <param name="name">Role name as extracted by the AI or the parser.</param>
/// <param name="category">Suggested category; resolved to the closed set by ResolveCategory.</param>
private Role ResolveOrCreateRole(List<Role> roles, string name, string? category)
{
    // Drop gender/seniority modifiers baked into the role («پرستار آقا»→«پرستار»). The modifier
    // still lives on as a tag / the Gender field.
    name = StripRoleModifiers(name);

    // A compound such as «پرستار و بهیار» resolves to its first base role instead of being minted.
    var collapsed = CollapseCompound(roles, name);
    if (collapsed is not null)
    {
        name = collapsed;
    }

    Role? FindByNormalizedName(string key) => roles.FirstOrDefault(r => NormalizeFa(r.Name) == key);

    // (1) Already a known role (same word or spelling variant).
    var norm = NormalizeFa(name);
    if (FindByNormalizedName(norm) is { } known)
    {
        return known;
    }

    // (2) A synonym of a canonical role → use that role; don't create a duplicate.
    if (RoleAliases.TryGetValue(norm, out var canonical))
    {
        if (FindByNormalizedName(NormalizeFa(canonical)) is { } aliased)
        {
            return aliased;
        }
        name = canonical; // canonical not seeded yet → create under its proper name
    }

    // (3) Genuinely new role — create it under a canonical-resolved category.
    var nextSortOrder = roles.Count == 0 ? 1 : roles.Max(r => r.SortOrder) + 1;
    var created = new Role
    {
        Name = Clamp(name.Trim(), 100), // respect Role.Name MaxLength(100)
        Category = Clamp(ResolveCategory(category), 50), // closed set → respect MaxLength(50)
        IsActive = true,
        SortOrder = nextSortOrder,
    };
    _db.Roles.Add(created);
    roles.Add(created); // reuse within this run (saved with the batch at end of source)
    _log.LogInformation("Ingestion introduced new role «{Role}» (category «{Category}») from AI.",
        created.Name, created.Category);
    return created;
}
/// <summary>Resolves an AI-suggested category to one of the FIXED groups listed in
/// <c>CategoryAliases</c>. Categories are a closed taxonomy (they drive the filter chips), so they
/// are never invented: a known synonym or canonical name returns its canonical group; anything
/// else, including null, returns «سایر».</summary>
private static string ResolveCategory(string? category)
{
    var key = NormalizeFa(category);
    if (CategoryAliases.TryGetValue(key, out var canonical))
    {
        return canonical;
    }
    return "سایر";
}
// Synonyms/abbreviations → canonical ROLE name, so the AI naming a role differently maps onto an
// existing role instead of forking the taxonomy. Keys are matched after NormalizeFa. Add freely.
// Left side of each entry = canonical role name; right side = spellings that fold into it
// (synonyms, Latin abbreviations, and common typos). BuildAliasMap flattens this into a
// {normalized spelling → canonical} lookup; if the same spelling appears under two canonicals,
// the later entry overwrites the earlier one.
private static readonly Dictionary<string, string> RoleAliases = BuildAliasMap(new()
{
["پزشک عمومی"] = new[] { "دکتر", "طبیب", "پزشک", "جی پی", "gp", "general practitioner" },
["پزشک متخصص"] = new[] { "متخصص", "فوق تخصص", "اسپشالیست", "specialist" },
["پرستار"] = new[] { "نرس", "nurse", "پرستاری", "کارشناس پرستاری" },
["پرستار سالمندان"] = new[] { "مراقب سالمند", "مراقب سالمندان", "پرستار سالمند", "نگهدار سالمند", "مراقبت سالمند" },
["ماما"] = new[] { "مامایی", "کارشناس مامایی", "midwife" },
["تکنسین اتاق عمل"] = new[] { "اتاق عمل", "اسکراب", "scrub", "تکنولوژیست اتاق عمل" },
["تکنسین فوریت‌های پزشکی"] = new[] { "فوریت پزشکی", "تکنسین اورژانس", "پارامدیک", "paramedic", "emt", "اورژانس ۱۱۵" },
["کارشناس آزمایشگاه"] = new[] { "علوم آزمایشگاهی", "تکنسین آزمایشگاه", "آزمایشگاهی", "لابراتوار", "lab", "laboratory" },
["دندانپزشک"] = new[] { "دندان پزشک", "دندون پزشک", "dentist" },
["کمک بهیار"] = new[] { "کمک‌یار", "کمکیار", "کمک یار", "کمک‌بهیار", "کمک بیمار" },
["بهیار"] = new[] { "بیهیار", "بیار", "بیهی", "بهییار", "بهیار پرستار" },
["پرستار کودک"] = new[] { "پرستار بچه", "مراقب کودک", "پرستار مراقب کودک", "کودکیار", "مادر یار کودک", "پرستار اطفال" },
["فیزیوتراپیست"] = new[] { "فیزیوتراپ", "فیزیوتراپی" },
["تکنسین داروخانه"] = new[] { "نسخه پیچ", "تکنسین نسخه پیچ" },
});
// Synonyms → canonical CATEGORY (the role-group used for filters/chips).
// The five keys are the closed set of named groups; ResolveCategory returns «سایر» for anything
// that is not found here. Lookup keys are normalized by BuildAliasMap, so callers must normalize
// their input with NormalizeFa before TryGetValue.
private static readonly Dictionary<string, string> CategoryAliases = BuildAliasMap(new()
{
["پزشک"] = new[] { "دکتر", "طبیب", "doctor", "پزشکی" },
["پرستار"] = new[] { "پرستاری", "nurse", "nursing" },
["ماما"] = new[] { "مامایی", "midwifery" },
["تکنسین"] = new[] { "تکنیسین", "تکنولوژیست", "technician", "کاردان فنی" },
["دندانپزشک"] = new[] { "دندان پزشک", "دندانپزشکی", "dental" },
});
/// <summary>Inverts a {canonical → [synonyms]} table into a {normalized spelling → canonical}
/// lookup. Each canonical name is registered under its own normalized form first, followed by its
/// synonyms; a spelling that occurs more than once keeps the LAST canonical written for it.</summary>
private static Dictionary<string, string> BuildAliasMap(Dictionary<string, string[]> src)
{
    var lookup = new Dictionary<string, string>();
    foreach (var entry in src)
    {
        var canonical = entry.Key;
        // Canonical first, then its synonyms — same write order as listing them one by one.
        foreach (var spelling in entry.Value.Prepend(canonical))
        {
            lookup[NormalizeFa(spelling)] = canonical;
        }
    }
    return lookup;
}
/// <summary>Normalizes a Persian string for dedupe/matching: unifies Arabic yeh/kaf with their
/// Persian forms, turns the zero-width non-joiner into a plain space (so «کمک‌یار» and «کمک یار»
/// compare equal), trims, collapses every whitespace run to a single space, and lowercases (so
/// Latin tags like "ICU"/"icu" also match). Null is treated as the empty string.</summary>
/// <remarks>The characters are written as <c>\u</c> escapes on purpose: the raw literals are
/// invisible (ZWNJ) or visually indistinguishable (Arabic vs Persian yeh/kaf) in an editor.</remarks>
private static string NormalizeFa(string? s)
{
    var unified = (s ?? "")
        .Replace('\u064A', '\u06CC')  // Arabic yeh  → Persian yeh
        .Replace('\u0643', '\u06A9')  // Arabic kaf  → Persian keheh
        .Replace('\u200C', ' ')       // zero-width non-joiner → space
        .Trim();
    return Regex.Replace(unified, @"\s+", " ").ToLowerInvariant();
}
/// <summary>Returns <paramref name="s"/> unchanged when it has at most <paramref name="max"/>
/// characters; otherwise its first <paramref name="max"/> characters with surrounding whitespace
/// trimmed (so the cut never leaves a trailing space).</summary>
private static string Clamp(string s, int max)
{
    if (s.Length <= max)
    {
        return s;
    }
    return s.Substring(0, max).Trim();
}
/// <summary>Bounding-box test for greater Tehran: latitude 35.4–35.95 and longitude 51.0–51.8,
/// bounds inclusive. Used to reject out-of-area (hallucinated) AI coordinates.</summary>
private static bool InTehran(double lat, double lng)
{
    var latitudeInside = lat >= 35.4 && lat <= 35.95;
    var longitudeInside = lng >= 51.0 && lng <= 51.8;
    return latitudeInside && longitudeInside;
}
// Markers that mean a doctor role is a SPECIALIST, not a GP — used to correct a «پزشک عمومی»
// mislabel on a clearly-specialist ad (e.g. an ENT post showing as «استخدام پزشک عمومی»).
// LooksSpecialist normalizes each marker with NormalizeFa before searching the ad text, so
// Latin markers must be listed in a form that survives lowercasing (hence "ent", not "ENT").
private static readonly string[] SpecialistMarkers =
{ "متخصص", "فوق تخصص", "فوقتخصص", "فلوشیپ", "فلوشیب", "بورد تخصصی", "ساب اسپشالیتی", "ent" };
/// <summary>True when the ad text contains one of the <c>SpecialistMarkers</c>. Persian markers
/// match as substrings (prefixes/suffixes are routinely attached in Persian). Markers made only
/// of Latin letters must stand alone as a word: a plain substring test for "ent" also fires on
/// "dentist", "center", "treatment"…, which wrongly flipped GP ads to «پزشک متخصص».</summary>
/// <param name="rawText">Raw ad text; null or empty yields false.</param>
private static bool LooksSpecialist(string? rawText)
{
    var text = NormalizeFa(rawText); // lowercased, so Latin markers compare case-insensitively
    if (text.Length == 0) return false;

    foreach (var marker in SpecialistMarkers)
    {
        var needle = NormalizeFa(marker);
        if (needle.Length == 0) continue; // an empty marker would match every text

        var isLatinWord = needle.All(c => c is >= 'a' and <= 'z');
        var found = isLatinWord
            // No Latin letter directly before or after → "ent" matches, "dentist" does not.
            ? Regex.IsMatch(text, $@"(?<![a-z]){Regex.Escape(needle)}(?![a-z])")
            : text.Contains(needle, StringComparison.Ordinal);
        if (found) return true;
    }
    return false;
}
// Gender/seniority tokens that don't belong in a role name (they go to tags / the Gender field).
// StripRoleModifiers removes these only as WHOLE whitespace-separated tokens (compared after
// NormalizeFa), never as substrings of a longer word.
private static readonly string[] RoleModifierWords =
{ "آقا", "خانم", "خانوم", "بانو", "مرد", "زن", "کارآموز", "کارورز", "ارشد", "مبتدی" };
// Availability phrases that the model sometimes glues onto the role («کمک بهیار آماده به کار»);
// removed as whole substrings before token-stripping (so «به»/«کار» tokens stay safe elsewhere).
// Each phrase is normalized with NormalizeFa by StripRoleModifiers before it is searched for.
private static readonly string[] RolePhraseNoise =
{ "آماده به کار", "آماده همکاری", "آماده بکار", "آماده به همکاری", "جویای کار", "دنبال کار", "جهت همکاری" };
/// <summary>Removes availability phrases (<c>RolePhraseNoise</c>) and gender/seniority tokens
/// (<c>RoleModifierWords</c>) from a role name, keeping the base profession. Never strips to
/// empty: if nothing is left, the trimmed original is returned.</summary>
/// <remarks>Characters and whitespace are unified the same way NormalizeFa does, but the text is
/// NOT lowercased. Callers store the result as the display name of newly created roles, and
/// lowercasing turned «پرستار ICU» into «پرستار icu». Matching is unaffected because callers
/// normalize again before comparing.</remarks>
private static string StripRoleModifiers(string name)
{
    var original = name ?? "";

    // Mirror of NormalizeFa minus ToLowerInvariant: Arabic yeh/kaf → Persian, ZWNJ → space,
    // trim, collapse whitespace runs.
    var text = Regex.Replace(
        original.Replace('\u064A', '\u06CC').Replace('\u0643', '\u06A9').Replace('\u200C', ' ').Trim(),
        @"\s+", " ");

    // Phrases go first, as whole substrings, so their words («به», «کار») are never stripped
    // individually from other role names.
    foreach (var phrase in RolePhraseNoise)
    {
        text = text.Replace(NormalizeFa(phrase), " ");
    }

    // Modifier tokens are compared in normalized form; the surviving tokens keep their casing.
    var modifiers = RoleModifierWords.Select(m => NormalizeFa(m)).ToHashSet();
    var kept = text.Split(' ', StringSplitOptions.RemoveEmptyEntries)
        .Where(token => !modifiers.Contains(NormalizeFa(token)))
        .ToList();

    return kept.Count > 0 ? string.Join(" ", kept) : original.Trim();
}
/// <summary>Creates fresh <c>ContactMethod</c> rows for one published listing (talent, job or
/// shift). Parser contacts come first, numbered 0..n-1 in their original order. If the AI
/// extracted a phone and the parser found no Mobile/Phone contact, the AI phone is inserted at the
/// front as a Mobile contact with SortOrder -1.</summary>
private static List<ContactMethod> BuildContacts(AiStructured? d, ParsedListing parsed)
{
    var result = new List<ContactMethod>();
    var position = 0;
    foreach (var c in parsed.Contacts)
    {
        result.Add(new ContactMethod { Type = c.Type, Value = c.Value, SortOrder = position });
        position++;
    }

    var aiPhone = d?.Phone;
    if (string.IsNullOrWhiteSpace(aiPhone))
    {
        return result;
    }

    var alreadyHasNumber = result.Any(c => c.Type is ContactType.Mobile or ContactType.Phone);
    if (!alreadyHasNumber)
    {
        result.Insert(0, new ContactMethod { Type = ContactType.Mobile, Value = aiPhone.Trim(), SortOrder = -1 });
    }
    return result;
}
/// <summary>Maps the AI's shift-type keyword (case-insensitive: "day", "evening", "night",
/// "oncall") to a <c>ShiftType</c>. Any other value, including null, falls back to the parser's
/// value, and to <c>ShiftType.Day</c> when that is null too.</summary>
private static ShiftType MapShiftType(string? ai, ShiftType? parsed)
{
    switch (ai?.ToLowerInvariant())
    {
        case "day":
            return ShiftType.Day;
        case "evening":
            return ShiftType.Evening;
        case "night":
            return ShiftType.Night;
        case "oncall":
            return ShiftType.OnCall;
        default:
            return parsed ?? ShiftType.Day;
    }
}
/// <summary>Maps the AI's employment keyword (case-insensitive: "parttime", "contract", "plan",
/// "fulltime") to an <c>EmploymentType</c>. Any other value, including null, falls back to the
/// parser's value, and to <c>EmploymentType.FullTime</c> when that is null too.</summary>
private static EmploymentType MapEmployment(string? ai, EmploymentType? parsed)
{
    switch (ai?.ToLowerInvariant())
    {
        case "parttime":
            return EmploymentType.PartTime;
        case "contract":
            return EmploymentType.Contract;
        case "plan":
            return EmploymentType.Plan;
        case "fulltime":
            return EmploymentType.FullTime;
        default:
            return parsed ?? EmploymentType.FullTime;
    }
}
/// <summary>Default (start, end) clock times for a shift type: Day 08:00–14:00, Evening
/// 14:00–20:00, Night 20:00–08:00; every other type gets 08:00–08:00.</summary>
private static (TimeOnly, TimeOnly) DefaultTimes(ShiftType t)
{
    static TimeOnly At(int hour) => new TimeOnly(hour, 0);

    switch (t)
    {
        case ShiftType.Day:
            return (At(8), At(14));
        case ShiftType.Evening:
            return (At(14), At(20));
        case ShiftType.Night:
            return (At(20), At(8)); // ends the next morning
        default:
            return (At(8), At(8));
    }
}
/// <summary>Joins two note fragments as "a | b"; returns <paramref name="a"/> alone when
/// <paramref name="b"/> is null or empty.</summary>
private static string? Join(string a, string? b)
{
    if (string.IsNullOrEmpty(b))
    {
        return a;
    }
    return a + " | " + b;
}
/// <summary>Content fingerprint of an ad: lowercase hex SHA-256 of the UTF-8 text after trimming
/// and collapsing every whitespace run to one space, so re-posts that differ only in spacing hash
/// identically. Null is hashed as the empty string.</summary>
private static string Hash(string text)
{
    var trimmed = (text ?? "").Trim();
    var collapsed = Regex.Replace(trimmed, @"\s+", " ");
    byte[] digest = SHA256.HashData(Encoding.UTF8.GetBytes(collapsed));
    var upperHex = Convert.ToHexString(digest);
    return upperHex.ToLowerInvariant();
}
/// <summary>Age of a post in whole days. When the item carries a <c>PostedAt</c> timestamp the
/// age is the floored day difference to <c>DateTime.UtcNow</c>, never below zero; otherwise the
/// value comes from <c>HtmlUtil.AgeDaysFromPersianText</c> on the raw text (a Persian "time ago"
/// phrase, e.g. on Divar). Null means the age is unknown, so the post is NOT filtered out.</summary>
private static int? PostAgeDays(ScrapedItem item)
{
    if (item.PostedAt is not DateTime posted)
    {
        return HtmlUtil.AgeDaysFromPersianText(item.RawText);
    }

    var elapsed = DateTime.UtcNow - posted;
    var wholeDays = (int)Math.Floor(elapsed.TotalDays);
    return wholeDays < 0 ? 0 : wholeDays; // a future timestamp counts as "today"
}
}