using System.Security.Cryptography;
using System.Text;
using System.Text.RegularExpressions;
using JobsMedical.Web.Data;
using JobsMedical.Web.Models;
using Microsoft.EntityFrameworkCore;

namespace JobsMedical.Web.Services.Scraping;

/// <summary>Counters for one source (or one reprocess pass) within a single run.</summary>
public record SourceResult(string Source, int Fetched, int Queued, int Published, int Flagged, int Spam, int Duplicates);

/// <summary>Result of a run: the per-source counters plus totals summed across all of them.</summary>
public record IngestionSummary(List<SourceResult> Sources)
{
    public int TotalFetched => Sources.Sum(s => s.Fetched);
    public int TotalQueued => Sources.Sum(s => s.Queued);
    public int TotalPublished => Sources.Sum(s => s.Published);
    public int TotalFlagged => Sources.Sum(s => s.Flagged);
    public int TotalSpam => Sources.Sum(s => s.Spam);
    public int TotalDuplicates => Sources.Sum(s => s.Duplicates);
}

/// <summary>
/// The scrape engine. For every enabled source: dedupe by content hash → parse → rule-validate →
/// (optional) AI audit → decide. Decision depends on admin settings:
///   • spam → Discarded
///   • AI on: AI verdict drives approve/reject/review; approve + Automatic + AiAutoApprove → publish
///   • AI off: Automatic + confidence ≥ threshold → publish; else queue/flag
/// "Publish" resolves-or-creates an (unverified) facility and creates the Shift/JobOpening.
/// </summary>
public class IngestionService
{
/// <summary>Applicant posts older than this (by the source's date, or a Persian "time ago"
/// phrase in the text) are skipped at ingest — availability goes stale fast.</summary>
private const int TalentMaxAgeDays = 7;

private readonly AppDbContext _db;
// NOTE(review): the element type argument of this IEnumerable (and of the matching constructor
// parameter) is not visible in this view of the file — restore it from the original declaration.
private readonly IEnumerable _sources;
private readonly IListingParser _parser;
private readonly ListingValidator _validator;
private readonly IAiAuditor _ai;
private readonly SettingsService _settings;
private readonly ILogger<IngestionService> _log;

/// <summary>Stores the injected collaborators; performs no work.</summary>
public IngestionService(AppDbContext db, IEnumerable sources, IListingParser parser,
    ListingValidator validator, IAiAuditor ai, SettingsService settings, ILogger<IngestionService> log)
{
    _db = db;
    _sources = sources;
    _parser = parser;
    _validator = validator;
    _ai = ai;
    _settings = settings;
    _log = log;
}

/// <summary>Names of all registered sources, in registration order.</summary>
public IReadOnlyList<string> SourceNames => _sources.Select(s => s.Name).ToList();

/// <summary>Shared placeholder facility name for unnamed ads — kept identical to
/// Review.ResolveFacilityIdAsync so the auto-publish and manual-review flows reuse ONE record.</summary>
private const string UnknownFacilityName = "نامشخص / ثبت نشده";

/// <summary>
/// Runs one full crawl: fetches every source, dedupes each item by content hash, parses, validates,
/// optionally AI-audits, decides a status and (when the decision is Normalized) publishes. Changes
/// are saved once per source. Afterwards it collapses applicant reposts, runs the post-ingest
/// cleanup and writes an IngestionRun log row (only when at least one source returned items).
/// </summary>
/// <param name="ct">Cancellation token flowed to every fetch, AI call and database call.</param>
/// <returns>Per-source counters and their totals.</returns>
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
{
    var settings = await _settings.GetAsync();
    var roles = await _db.Roles.ToListAsync(ct);
    var cities = await _db.Cities.ToListAsync(ct);
    var districts = await _db.Districts.ToListAsync(ct);
    var facilities = await _db.Facilities.ToListAsync(ct); // fuzzy-matched + grown as we create
    var roleNames = roles.Select(r => r.Name).ToList();
    var cityNames = cities.Select(c => c.Name).ToList();
    var districtNames = districts.Select(d => d.Name).ToList();

    var results = new List<SourceResult>();
    foreach (var source in _sources)
    {
        int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0, dupes = 0;

        // NOTE(review): element type argument not visible in this view — restore from the original.
        IReadOnlyList items;
        try
        {
            items = await source.FetchAsync(settings, ct);
        }
        // A broken source must not stop the others, but a requested cancellation must propagate
        // instead of being logged as a source failure for every remaining source.
        catch (Exception ex) when (!ct.IsCancellationRequested)
        {
            _log.LogError(ex, "Source {Source} failed", source.Name);
            continue;
        }
        if (items.Count == 0) continue; // disabled/unconfigured source

        foreach (var item in items)
        {
            fetched++;
            var hash = Hash(item.RawText);

            // Check the change tracker before the database: rows added earlier in this batch are
            // not saved yet (SaveChanges runs once per source), so a DB-only lookup would let the
            // same ad be inserted twice when a source returns it more than once in one fetch.
            var existing = _db.RawListings.Local.FirstOrDefault(r => r.ContentHash == hash)
                ?? await _db.RawListings.FirstOrDefaultAsync(r => r.ContentHash == hash, ct);
            if (existing is not null)
            {
                // Best-effort geo retry: coords are normally captured only on first ingest, but a
                // re-fetch may now expose a map center the first fetch lacked (Divar can fail-soft to
                // null on a bad response / out-of-bbox). Backfill the cached row when this fetch has
                // coords and the row has none, so an item still sitting in the queue can be placed on
                // the map when an admin publishes it. (A full refresh is the purge-and-reingest flow.)
                if (existing.Lat is null && item.Lat is not null)
                {
                    existing.Lat = item.Lat;
                    existing.Lng = item.Lng;
                }
                dupes++;
                continue;
            }

            var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
            var val = _validator.Validate(item.RawText, parsed);

            // Drop STALE applicant («آماده به کار») posts — a person's availability goes cold fast.
            // Age = the source's real timestamp, else a Persian "time ago" phrase in the text
            // (Divar embeds «۲ هفته پیش»…). Recorded as Discarded (keeps the dedupe hash + audit
            // trail; no AI spend). Shifts/jobs are NOT aged out — their dates are in the future.
            if (parsed.Kind == ListingKind.Talent && PostAgeDays(item) is int age && age > TalentMaxAgeDays)
            {
                _db.RawListings.Add(new RawListing
                {
                    SourceChannel = item.Source,
                    SourceUrl = item.SourceUrl,
                    RawText = item.RawText.Trim(),
                    ContentHash = hash,
                    Confidence = 0,
                    Status = RawListingStatus.Discarded,
                    ValidationNotes = $"آماده‌به‌کارِ قدیمی ({age} روز) — نادیده گرفته شد",
                    Lat = item.Lat,
                    Lng = item.Lng,
                });
                spam++;
                continue;
            }

            AiAuditResult? ai = null;
            if (settings.AiEnabled && !val.IsSpam) ai = await _ai.AuditAsync(item.RawText, settings, ct);

            var (status, reason, confidence) = Decide(settings, val, ai);
            var raw = new RawListing
            {
                SourceChannel = item.Source,
                SourceUrl = item.SourceUrl,
                RawText = item.RawText.Trim(),
                ContentHash = hash,
                Confidence = confidence,
                ValidationNotes = reason,
                Status = status,
                Lat = item.Lat,
                Lng = item.Lng, // approx. map coords (Divar) → facility on publish
            };
            _db.RawListings.Add(raw);

            if (status == RawListingStatus.Normalized)
            {
                try
                {
                    Publish(parsed, ai, raw, roles, cities, districts, facilities);
                    // Publish can veto: it marks an unreachable applicant Discarded and returns
                    // without creating a listing, so count by the status it left on the row.
                    if (raw.Status == RawListingStatus.Discarded) spam++;
                    else published++;
                }
                catch (Exception ex)
                {
                    _log.LogWarning(ex, "Auto-publish failed; queueing instead");
                    raw.Status = RawListingStatus.New;
                    queued++;
                }
            }
            else if (status == RawListingStatus.New) queued++;
            else if (status == RawListingStatus.Flagged) flagged++;
            else spam++;
        }

        await _db.SaveChangesAsync(ct);
        results.Add(new SourceResult(source.Name, fetched, queued, published, flagged, spam, dupes));
        _log.LogInformation("Ingest {S}: fetched={F} queued={Q} published={P} flagged={Fl} spam={Sp} dupes={D}",
            source.Name, fetched, queued, published, flagged, spam, dupes);
    }

    var summary = new IngestionSummary(results);
    await DedupeTalentAsync(ct); // collapse same-ad reposts the exact-hash dedup can't catch

    // Self-clean after every crawl so the board stays tidy with no manual admin clicks: archive
    // out-of-scope/duplicate listings, merge duplicate + fold junk facilities, backfill coords.
    var cleanup = results.Count > 0 ? await RunPostIngestCleanupAsync(ct) : default;

    // Persist a run-log row so admins get a crawl history (with a per-source breakdown).
    if (results.Count > 0)
    {
        var detail = string.Join("؛ ", results.Select(r =>
                $"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}"))
            + $" || پاک‌سازیِ خودکار: {cleanup.archived} بایگانی، {cleanup.dedupedJobs} استخدامِ تکراری، {cleanup.mergedFac} مرکزِ ادغام، {cleanup.cleanedFac} مرکزِ حذف، {cleanup.coords} مختصات";
        _db.IngestionRuns.Add(new IngestionRun
        {
            Fetched = summary.TotalFetched,
            Queued = summary.TotalQueued,
            Published = summary.TotalPublished,
            Flagged = summary.TotalFlagged,
            Spam = summary.TotalSpam,
            Duplicates = summary.TotalDuplicates,
            Detail = detail.Length > 2000 ? detail[..2000] : detail, // truncated to 2000 chars
        });
        await _db.SaveChangesAsync(ct);
    }
    return summary;
}

/// <summary>
/// Re-run the CURRENT parser/AI/publish pipeline over every already-crawled RawListing, WITHOUT
/// re-fetching from sources. Use this after improving the pipeline to clean up existing aggregated
/// content (de-dupe, fix roles/categories/tags) — unlike <see cref="RunAsync"/> + the purge-cache
/// flow, it keeps every raw text, so nothing is lost to sources only exposing recent posts.
/// Deletes the old aggregated posts, then republishes from the stored raw text. Long-running
/// (one AI call per item) — call it on a background scope, not inside a request.
/// </summary>
/// <param name="talentOnly">SEO-safe default: only «آماده به کار» (which is NoIndex/Disallow) is
/// deleted and rebuilt, so no INDEXED url changes. Shift/Job detail pages are indexed + in the
/// sitemap, so churning their IDs would 404 ranked pages — instead they self-clean via turnover.
/// Pass false only when you accept that SEO hit.</param>
/// <param name="ct">Cancellation token; also checked at the top of every item.</param>
/// <returns>A summary with a single pseudo-source row holding the reprocess counters.</returns>
public async Task<IngestionSummary> ReprocessAsync(bool talentOnly = true, CancellationToken ct = default)
{
    var settings = await _settings.GetAsync();
    var roles = await _db.Roles.ToListAsync(ct);
    var cities = await _db.Cities.ToListAsync(ct);
    var districts = await _db.Districts.ToListAsync(ct);
    var facilities = await _db.Facilities.ToListAsync(ct); // reused (not deleted) → no facility churn
    var roleNames = roles.Select(r => r.Name).ToList();
    var cityNames = cities.Select(c => c.Name).ToList();
    var districtNames = districts.Select(d => d.Name).ToList();

    // Drop previously-published aggregated content; it's regenerated below from the raw text.
    // DB cascade clears their ContactMethods/Applications/InterestEvents; RawListing back-refs SetNull.
    await _db.TalentListings.Where(t => t.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
    if (!talentOnly)
    {
        await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
        await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
    }

    int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0;
    var raws = await _db.RawListings.OrderBy(r => r.Id).ToListAsync(ct);
    foreach (var raw in raws)
    {
        ct.ThrowIfCancellationRequested();
        var parsed = _parser.Parse(raw.RawText, roleNames, cityNames, districtNames);

        // SEO-safe scope: in talent-only mode, leave indexed shift/job listings (and their
        // RawListing links/status) completely untouched — only applicants are rebuilt.
        if (talentOnly && parsed.Kind != ListingKind.Talent) continue;

        fetched++;
        raw.LinkedTalentId = null; // talent rows were just deleted
        if (!talentOnly) raw.LinkedShiftId = null;

        var val = _validator.Validate(raw.RawText, parsed);

        // Stale-applicant filter — age from the Persian "time ago" phrase in the text (Divar).
        if (parsed.Kind == ListingKind.Talent
            && HtmlUtil.AgeDaysFromPersianText(raw.RawText) is int age && age > TalentMaxAgeDays)
        {
            raw.Status = RawListingStatus.Discarded;
            raw.Confidence = 0;
            raw.ValidationNotes = $"آماده‌به‌کارِ قدیمی ({age} روز) — نادیده گرفته شد";
            spam++;
            continue;
        }

        AiAuditResult? ai = null;
        if (settings.AiEnabled && !val.IsSpam) ai = await _ai.AuditAsync(raw.RawText, settings, ct);

        var (status, reason, confidence) = Decide(settings, val, ai);
        raw.Status = status;
        raw.ValidationNotes = reason;
        raw.Confidence = confidence;

        if (status == RawListingStatus.Normalized)
        {
            try
            {
                Publish(parsed, ai, raw, roles, cities, districts, facilities);
                // Publish can veto (unreachable applicant → Discarded); count what actually happened.
                if (raw.Status == RawListingStatus.Discarded) spam++;
                else published++;
            }
            catch (Exception ex)
            {
                _log.LogWarning(ex, "Reprocess publish failed; queueing");
                raw.Status = RawListingStatus.New;
                queued++;
            }
        }
        else if (status == RawListingStatus.New) queued++;
        else if (status == RawListingStatus.Flagged) flagged++;
        else spam++;

        if (fetched % 50 == 0) await _db.SaveChangesAsync(ct); // incremental progress on long runs
    }
    await _db.SaveChangesAsync(ct);

    var deduped = await DedupeTalentAsync(ct); // collapse reposts the exact-hash dedup missed
    _db.IngestionRuns.Add(new IngestionRun
    {
        Fetched = fetched,
        Queued = queued,
        Published = published,
        Flagged = flagged,
        Spam = spam,
        Duplicates = deduped,
        Detail = $"پردازش مجدد آیتم‌های ذخیره‌شده — {fetched} آیتم: {published} منتشر، {queued} صف، {flagged} پرچم، {spam} ردشده/قدیمی، {deduped} تکراریِ حذف‌شده",
    });
    await _db.SaveChangesAsync(ct);

    _log.LogInformation("Reprocess done: items={F} published={P} queued={Q} flagged={Fl} discarded={S} deduped={D}",
        fetched, published, queued, flagged, spam, deduped);
    return new IngestionSummary(new List<SourceResult>
    {
        new("پردازش مجدد", fetched, queued, published, flagged, spam, deduped)
    });
}

/// <summary>
/// Collapse near-duplicate aggregated APPLICANTS left when a source reposts the same ad (different
/// text → different ContentHash, so exact dedup missed them). Two high-precision signals: an
/// identical phone, or identical (role, city, normalized description core with digits/«… پیش»
/// time-phrases removed). Keeps the NEWEST of each group, deletes the rest. Returns the count removed.
/// </summary>
public async Task<int> DedupeTalentAsync(CancellationToken ct = default)
{
    // Only open, crawl-generated applicants are candidates; project just the fields the key needs.
    var rows = await _db.TalentListings
        .Where(t => t.Status == ShiftStatus.Open && t.Source == ShiftSource.Aggregated)
        .Select(t => new { t.Id, t.Phone, t.RoleId, t.CityId, t.Description, t.CreatedAt })
        .ToListAsync(ct);

    // Grouping key for one row, or null when there is too little signal to group it at all.
    string? Sig(string? phone, int roleId, int cityId, string? desc)
    {
        var p = DigitsOnly(phone ?? "");
        if (p.Length >= 7) return "p:" + p; // same number = same person/repost

        // Strip digits and relative-time phrases so a repost with a new "N days ago" still matches.
        var core = NormalizeFa(Regex.Replace(desc ?? "",
            @"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim();
        if (core.Length < 15) return null; // too little to call it a dup safely
        return $"t:{roleId}:{cityId}:{(core.Length > 100 ? core[..100] : core)}";
    }

    var toRemove = rows
        .Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.Phone, r.RoleId, r.CityId, r.Description) })
        .Where(x => x.Key is not null)
        .GroupBy(x => x.Key)
        // Keep the newest row of each group; the Id tie-break makes the kept row deterministic
        // when two reposts carry the same CreatedAt.
        .SelectMany(g => g.OrderByDescending(x => x.CreatedAt).ThenByDescending(x => x.Id)
            .Skip(1).Select(x => x.Id))
        .ToList();
    if (toRemove.Count == 0) return 0;

    var removed = await _db.TalentListings.Where(t => toRemove.Contains(t.Id)).ExecuteDeleteAsync(ct);
    _log.LogInformation("Deduped {N} near-duplicate applicants.", removed);
    return removed;
}

/// <summary>
/// In-place geocoding backfill: for existing AGGREGATED listings in Tehran that still have no map
/// coords, derive an APPROXIMATE neighbourhood center from the stored ad text (TehranGeo) and fill
/// Lat/Lng. Unlike <see cref="ReprocessAsync"/> it never deletes or recreates rows, so listing IDs —
/// and the indexed shift/job URLs in the sitemap — are untouched; safe to run on the live board.
/// Only ever FILLS a null coordinate; a real point (Divar/employer/AI) is never overwritten.
/// Returns how many listings were newly placed on the map.
/// </summary>
public async Task<int> BackfillCoordsAsync(CancellationToken ct = default)
{
    // The backfill is Tehran-only; without that city row there is nothing to do.
    var tehran = await _db.Cities.FirstOrDefaultAsync(c => c.Name == "تهران", ct);
    if (tehran is null) return 0;

    int filled = 0;

    // Jobs and shifts are located through their facility's city and geocoded from the description.
    var jobs = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Lat == null
                    && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id)
        .ToListAsync(ct);
    foreach (var j in jobs)
    {
        if (TehranGeo.Locate(j.Description) is { } g)
        {
            j.Lat = g.lat;
            j.Lng = g.lng;
            filled++;
        }
    }

    var shifts = await _db.Shifts
        .Where(s => s.Status == ShiftStatus.Open && s.Lat == null
                    && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id)
        .ToListAsync(ct);
    foreach (var s in shifts)
    {
        if (TehranGeo.Locate(s.Description) is { } g)
        {
            s.Lat = g.lat;
            s.Lng = g.lng;
            filled++;
        }
    }

    // Applicants carry their own CityId; the area note is passed ahead of the full description.
    var talent = await _db.TalentListings
        .Where(t => t.Status == ShiftStatus.Open && t.Lat == null
                    && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id)
        .ToListAsync(ct);
    foreach (var t in talent)
    {
        if (TehranGeo.Locate(t.AreaNote, t.Description) is { } g)
        {
            t.Lat = g.lat;
            t.Lng = g.lng;
            filled++;
        }
    }

    if (filled > 0) await _db.SaveChangesAsync(ct);
    _log.LogInformation("Coordinate backfill placed {N} aggregated listings on the map.", filled);
    return filled;
}

/// <summary>
/// The self-cleaning pass run automatically at the end of every crawl (and available on demand):
/// archive out-of-scope/duplicate listings, merge duplicate + fold junk facilities, and backfill
/// missing Tehran map coords. All in-place — reversible (archive, not delete) for listings, guarded
/// (never touches employer/verified facilities) — and pure DB + CPU (no AI, no network), so it's
/// cheap to run on every ingest. Keeps the board tidy without the admin clicking the cleanup buttons.
/// </summary>
public async Task<(int archived, int dedupedJobs, int mergedFac, int cleanedFac, int coords)> RunPostIngestCleanupAsync(CancellationToken ct = default)
{
    // Three sequential passes: listing purge (+ job dedupe), facility merge/clean, coordinate backfill.
    var purge = await PurgeInvalidAggregatedAsync(ct);
    var facilityPass = await MergeAndCleanFacilitiesAsync(ct);
    var placed = await BackfillCoordsAsync(ct);

    _log.LogInformation("Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C}",
        purge.archived, purge.deduped, facilityPass.merged, facilityPass.cleaned, placed);

    return (purge.archived, purge.deduped, facilityPass.merged, facilityPass.cleaned, placed);
}

/// <summary>
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board. Every Open listing's stored
/// text is re-screened through the CURRENT parser + validator, and the ones the validator now marks
/// as spam (its IsSpam verdict, which covers domestic-helper «امور منزل», promotional/training and
/// plain spam ads) are ARCHIVED (Status → Archived), not deleted. Merely-incomplete-but-legit ads
/// are kept. Near-duplicate job reposts are then collapsed the same way. Archiving (vs hard delete)
/// is the project convention: the row is retained for analysis and the change is reversible, the
/// listing drops out of every public screen + the sitemap (which filter Status == Open), and its
/// detail page returns 410 Gone. Valid listings are never touched, so their IDs/URLs stay stable.
/// </summary>
/// <returns>(archived, deduped): out-of-scope listings archived, and duplicate jobs archived.</returns>
public async Task<(int archived, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
{
    var knownRoles = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
    var knownCities = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
    var knownDistricts = await _db.Districts.Select(d => d.Name).ToListAsync(ct);

    // True when the validator's spam verdict fires for the text (spam | promo | domestic-helper).
    bool IsOutOfScope(string? text)
    {
        var body = text ?? "";
        return _validator.Validate(body, _parser.Parse(body, knownRoles, knownCities, knownDistricts)).IsSpam;
    }

    var archivedTotal = 0;

    // Jobs: load id + text, screen in memory, archive the offenders with one set-based update.
    var openJobs = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
        .Select(j => new { j.Id, j.Description })
        .ToListAsync(ct);
    var rejectedJobIds = openJobs.Where(j => IsOutOfScope(j.Description)).Select(j => j.Id).ToList();
    if (rejectedJobIds.Count > 0)
    {
        archivedTotal += await _db.JobOpenings
            .Where(j => rejectedJobIds.Contains(j.Id))
            .ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
    }

    // Shifts: same screening, same reversible archive.
    var openShifts = await _db.Shifts
        .Where(s => s.Status == ShiftStatus.Open && s.Source == ShiftSource.Aggregated)
        .Select(s => new { s.Id, s.Description })
        .ToListAsync(ct);
    var rejectedShiftIds = openShifts.Where(s => IsOutOfScope(s.Description)).Select(s => s.Id).ToList();
    if (rejectedShiftIds.Count > 0)
    {
        archivedTotal += await _db.Shifts
            .Where(s => rejectedShiftIds.Contains(s.Id))
            .ExecuteUpdateAsync(u => u.SetProperty(s => s.Status, ShiftStatus.Archived), ct);
    }

    var dedupedJobs = await DedupeJobsAsync(ct);
    _log.LogInformation("Purge archived {R} out-of-scope aggregated listings; deduped {D} jobs.", archivedTotal, dedupedJobs);
    return (archivedTotal, dedupedJobs);
}

/// <summary>
/// Collapse near-duplicate aggregated JOB reposts the exact-hash dedupe missed (same ad re-crawled
/// with slightly different surrounding text → different ContentHash). Signature = facility +
/// normalized description core with digits/«… پیش» time-phrases stripped; RoleId is NOT part of the
/// signature, so copies of one ad published under several roles collapse into one. Within each
/// group the kept row prefers a role other than «پزشک عمومی», then the newest; the rest are
/// ARCHIVED (Status → Archived, reversible — same rationale as the purge). Returns the count archived.
/// </summary>
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
{
    // Id of the generic «پزشک عمومی» role, or null when no such role exists (then no row is de-prioritized).
    var gpId = await _db.Roles.Where(r => r.Name == "پزشک عمومی").Select(r => (int?)r.Id).FirstOrDefaultAsync(ct);

    var rows = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
        .Select(j => new { j.Id, j.RoleId, j.FacilityId, j.Description, j.CreatedAt })
        .ToListAsync(ct);

    // Signature = facility + normalized description core (digits/«… پیش» stripped). RoleId is
    // deliberately NOT in the key, so the old role fan-out — the SAME ad published once per
    // extracted/typo role (پزشک عمومی، پرستار، بهیار، «بیهیار»…) — collapses into one.
    string? Sig(int facId, string? desc)
    {
        var core = NormalizeFa(Regex.Replace(desc ?? "",
            @"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim();
        if (core.Length < 15) return null; // too little to call it a dup safely
        return $"j:{facId}:{(core.Length > 120 ? core[..120] : core)}";
    }

    // Keep one per group — prefer a non-«پزشک عمومی» role (the fan-out's GP copy is the usual
    // mislabel), then the newest.
    var toArchive = rows
        .Select(r => new { r.Id, r.RoleId, r.CreatedAt, Key = Sig(r.FacilityId, r.Description) })
        .Where(x => x.Key is not null)
        .GroupBy(x => x.Key)
        .SelectMany(g => g.OrderBy(x => x.RoleId == gpId ? 1 : 0)
            .ThenByDescending(x => x.CreatedAt)
            .Skip(1)
            .Select(x => x.Id))
        .ToList();
    if (toArchive.Count == 0) return 0;

    // Archive rather than delete, so the change is reversible.
    var archived = await _db.JobOpenings.Where(j => toArchive.Contains(j.Id))
        .ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
    _log.LogInformation("Archived {N} near-duplicate aggregated jobs.", archived);
    return archived;
}

///
/// Clean up the crawl-generated facility table: (1) fold listings of junk-named facilities
/// («بیمارستان هستم», «... از مدجابز», bare «کلینیک») into the shared placeholder and delete the
/// junk record; (2) merge Persian-fuzzy duplicates («سازمان برنامه جنوبی» ×3) into one keeper,
/// repointing their shifts/jobs.
/// HARD GUARD: only ever removes facilities that are purely
/// crawl-generated (no owner, not verified, Unverified) and never the placeholder — employer- and
/// admin-managed facilities are untouched. Listings are always repointed first, so no ad is lost.
/// Returns (merged, cleaned).
///
public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default)
{
    var facilities = await _db.Facilities.ToListAsync(ct);
    var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId)
        .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
    var shiftCounts = await _db.Shifts.GroupBy(s => s.FacilityId)
        .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);

    // Jobs + shifts attached to a facility, as counted before any repointing below.
    int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id);

    // The shared "unknown" placeholder is worded differently in older data
    // («مرکز درمانی (نامشخص)») than the current constant, so an exact-name lookup found nothing and
    // the junk-fold step silently no-op'd. Match by the «نامشخص» marker and pick the bucket actually
    // used by the most listings — that's the real placeholder junk should fold into.
    var placeholder = facilities
        .Where(f => f.Name == UnknownFacilityName || FacilityMatcher.Normalize(f.Name).Contains("نامشخص"))
        .OrderByDescending(f => Listings(f.Id))
        .FirstOrDefault();
    var placeholderId = placeholder?.Id ?? -1;

    // Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a
    // verified facility (those carry real employer data / verification).
    bool Removable(Facility f) =>
        f.OwnerUserId is null && !f.IsVerified && f.Verification == VerificationStatus.Unverified
        && (placeholder is null || f.Id != placeholder.Id);

    // Repoint every shift and job from one facility to another, then delete the emptied facility.
    async Task AbsorbAsync(int fromId, int toId)
    {
        await _db.Shifts.Where(s => s.FacilityId == fromId)
            .ExecuteUpdateAsync(u => u.SetProperty(s => s.FacilityId, toId), ct);
        await _db.JobOpenings.Where(j => j.FacilityId == fromId)
            .ExecuteUpdateAsync(u => u.SetProperty(j => j.FacilityId, toId), ct);
        await _db.Facilities.Where(f => f.Id == fromId).ExecuteDeleteAsync(ct); // cascades stray docs/reviews
    }

    int merged = 0, cleaned = 0;

    // 1) Junk-named crawl facilities → fold into the shared placeholder.
    if (placeholder is not null)
    {
        foreach (var junk in facilities.Where(x => Removable(x) && FacilityMatcher.IsJunkName(x.Name)).ToList())
        {
            await AbsorbAsync(junk.Id, placeholder.Id);
            cleaned++;
        }
    }

    // 2) Merge same-city Persian-fuzzy duplicates into the best keeper (never the placeholder).
    // Re-read from the database so facilities removed in step 1 are no longer candidates.
    var remaining = (await _db.Facilities.ToListAsync(ct)).Where(f => f.Id != placeholderId).ToList();
    var done = new HashSet<int>();
    foreach (var f in remaining)
    {
        if (done.Contains(f.Id)) continue;
        done.Add(f.Id);

        var cluster = remaining
            .Where(o => o.Id != f.Id && !done.Contains(o.Id) && o.CityId == f.CityId
                        && FacilityMatcher.IsSame(o.Name, f.Name))
            .ToList();
        if (cluster.Count == 0) continue;
        cluster.Add(f);

        // keeper: verified > owned > most listings > lowest id (oldest).
        var keeper = cluster.OrderByDescending(x => x.IsVerified)
            .ThenByDescending(x => x.OwnerUserId.HasValue)
            .ThenByDescending(x => Listings(x.Id))
            .ThenBy(x => x.Id)
            .First();

        foreach (var dup in cluster.Where(x => x.Id != keeper.Id))
        {
            done.Add(dup.Id);
            if (!Removable(dup)) continue; // never delete an employer/verified facility
            await AbsorbAsync(dup.Id, keeper.Id);
            merged++;
        }
    }

    _log.LogInformation("Facility cleanup: merged {M} duplicates, removed {C} junk facilities.", merged, cleaned);
    return (merged, cleaned);
}

/// <summary>
/// In-place fix for EXISTING aggregated listings the AI mislabeled «پزشک عمومی» when the ad text
/// actually names a more specific role (dentist, endocrinologist/«متخصص», lab, …). Re-runs the
/// keyword parser + the same doctor-role guard over the stored text and updates RoleId (and the
/// generic «استخدام پزشک عمومی» title) IN PLACE — no AI call, no delete/recreate, so IDs and
/// indexed URLs are untouched. Only ever changes rows currently labeled «پزشک عمومی». Returns the
/// number corrected.
/// </summary>
public async Task<int> RecorrectDoctorRolesAsync(CancellationToken ct = default)
{
    var roles = await _db.Roles.ToListAsync(ct);
    var roleNames = roles.Select(r => r.Name).ToList();
    var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
    var districtNames = await _db.Districts.Select(d => d.Name).ToListAsync(ct);

    var gp = roles.FirstOrDefault(r => r.Name == "پزشک عمومی");
    if (gp is null) return 0; // nothing can be labeled GP without the GP role

    // The better role for a text, or null when the parser finds nothing more specific than GP.
    Role? Corrected(string? text)
    {
        var parsed = _parser.Parse(text ?? "", roleNames, cityNames, districtNames);
        var specific = parsed.RoleNames.FirstOrDefault(n => NormalizeFa(n) != NormalizeFa("پزشک عمومی"));
        if (specific is not null) return ResolveOrCreateRole(roles, specific, null);
        if (LooksSpecialist(text)) return ResolveOrCreateRole(roles, "پزشک متخصص", "پزشک");
        return null;
    }

    // NOTE(review): the loops below copy nr.Id into RoleId. If ResolveOrCreateRole can return a
    // newly created role that has not been saved yet, that Id may not be assigned at this point —
    // confirm against ResolveOrCreateRole (not visible here).
    int fixedCount = 0;

    var jobs = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated && j.RoleId == gp.Id)
        .ToListAsync(ct);
    foreach (var j in jobs)
    {
        if (Corrected(j.Description) is { } nr && nr.Id != j.RoleId)
        {
            // Only replace a missing title or the generic auto-generated one.
            if (string.IsNullOrWhiteSpace(j.Title) || j.Title == "استخدام پزشک عمومی")
                j.Title = $"استخدام {nr.Name}";
            j.RoleId = nr.Id;
            fixedCount++;
        }
    }

    var talent = await _db.TalentListings
        .Where(t => t.Status == ShiftStatus.Open && t.Source == ShiftSource.Aggregated && t.RoleId == gp.Id)
        .ToListAsync(ct);
    foreach (var t in talent)
    {
        if (Corrected(t.Description) is { } nr && nr.Id != t.RoleId)
        {
            t.RoleId = nr.Id;
            fixedCount++;
        }
    }

    if (fixedCount > 0) await _db.SaveChangesAsync(ct);
    _log.LogInformation("Recorrected {N} «پزشک عمومی»-mislabeled aggregated listings.", fixedCount);
    return fixedCount;
}

/// <summary>
/// Collapse the role taxonomy that the dynamic ingestion let sprawl: exact duplicates («پرستار
/// کودک» ×3), multi-role compounds («پرستار و بهیار»، «ماما / پرستار»), and typos («بیهیار»→بهیار).
/// Each role is mapped to a canonical form (strip modifiers → collapse compound to first base role →
/// alias) and same-canonical roles merge into one keeper, repointing every shift/job/talent/
/// preference/alert/profile first (mirrors the manual /Admin/Roles merge). Returns roles removed.
/// </summary>
public async Task<int> MergeDuplicateRolesAsync(CancellationToken ct = default)
{
    var roles = await _db.Roles.ToListAsync(ct);

    // Canonical grouping key for a role name: modifiers stripped, compound collapsed, alias applied.
    string Canon(string rawName)
    {
        var name = StripRoleModifiers(rawName);
        if (CollapseCompound(roles, name) is { } b) name = b;
        var norm = NormalizeFa(name);
        return RoleAliases.TryGetValue(norm, out var c) ? NormalizeFa(c) : norm;
    }

    int merged = 0;
    foreach (var g in roles.GroupBy(r => Canon(r.Name)).Where(g => g.Count() > 1))
    {
        // Keeper: a role whose own name IS the canonical (a clean base role), then the lowest Id.
        var keeper = g.OrderBy(r => NormalizeFa(r.Name) == g.Key ? 0 : 1).ThenBy(r => r.Id).First();
        foreach (var dup in g.Where(r => r.Id != keeper.Id))
        {
            // Repoint every referencing table before the role row itself is deleted.
            await _db.Shifts.Where(x => x.RoleId == dup.Id).ExecuteUpdateAsync(u => u.SetProperty(x => x.RoleId, keeper.Id), ct);
            await _db.JobOpenings.Where(x => x.RoleId == dup.Id).ExecuteUpdateAsync(u => u.SetProperty(x => x.RoleId, keeper.Id), ct);
            await _db.TalentListings.Where(x => x.RoleId == dup.Id).ExecuteUpdateAsync(u => u.SetProperty(x => x.RoleId, keeper.Id), ct);
            await _db.UserPreferences.Where(x => x.RoleId == dup.Id).ExecuteUpdateAsync(u => u.SetProperty(x => x.RoleId, (int?)keeper.Id), ct);
            await _db.JobAlerts.Where(x => x.RoleId == dup.Id).ExecuteUpdateAsync(u => u.SetProperty(x => x.RoleId, (int?)keeper.Id), ct);
            await _db.DoctorProfiles.Where(x => x.RoleId == dup.Id).ExecuteUpdateAsync(u => u.SetProperty(x => x.RoleId, (int?)keeper.Id), ct);
            await _db.Roles.Where(r => r.Id == dup.Id).ExecuteDeleteAsync(ct);
            merged++;
        }
    }

    _log.LogInformation("Merged {N} duplicate/compound/typo roles.", merged);
    return merged;
}

/// <summary>Converts digits to Latin (HtmlUtil.ToLatinDigits) and keeps only the digit characters.</summary>
private static string DigitsOnly(string s) => new(HtmlUtil.ToLatinDigits(s).Where(char.IsDigit).ToArray());

/// <summary>
/// Maps validator output, the optional AI verdict and the admin settings to a raw-listing status.
/// Spam is always Discarded. With an AI verdict: reject → Discarded; approve → Normalized only in
/// Automatic mode with AiAutoApprove (otherwise New), and Flagged when the validator saw no medical
/// signal; anything else → Flagged. Without AI: invalid → Flagged; Automatic mode with confidence at
/// or above the threshold → Normalized; otherwise New.
/// </summary>
/// <returns>The status, the joined notes for ValidationNotes, and the confidence to store.</returns>
private static (RawListingStatus status, string? reason, int confidence) Decide(
    AppSetting s, ValidationResult val, AiAuditResult? ai)
{
    var notes = val.Issues.Count > 0 ? string.Join("؛ ", val.Issues) : null;
    if (val.IsSpam) return (RawListingStatus.Discarded, Join("اسپم", notes), val.Confidence);

    if (ai is not null)
    {
        var aiNote = Join($"AI: {ai.Decision} ({ai.Confidence}٪)" + (ai.Reason is null ? "" : $" — {ai.Reason}"), notes);
        if (ai.Reject) return (RawListingStatus.Discarded, aiNote, ai.Confidence);
        if (ai.Approve)
        {
            // MEDICAL GATE: the rule-validator's medical signal vetoes an AI approval. The AI can
            // hallucinate (e.g. approved a GeekVape product ad 95% as a «پرستار» job) — when our
            // own keyword/role check sees nothing clinical, never auto-publish; send to review.
            if (!val.LooksMedical)
                return (RawListingStatus.Flagged, Join("هوش مصنوعی تأیید کرد ولی نشانهٔ کادر درمان یافت نشد — بررسی دستی", aiNote), ai.Confidence);
            return (s.Mode == IngestionMode.Automatic && s.AiAutoApprove ? RawListingStatus.Normalized : RawListingStatus.New, aiNote, ai.Confidence);
        }
        return (RawListingStatus.Flagged, aiNote, ai.Confidence); // review
    }

    if (!val.IsValid) return (RawListingStatus.Flagged, notes, val.Confidence);
    if (s.Mode == IngestionMode.Automatic && val.Confidence >= s.AutoPublishMinConfidence)
        return (RawListingStatus.Normalized, notes, val.Confidence);
    return (RawListingStatus.New, notes, val.Confidence);
}

// NOTE(review): Publish continues beyond this span; its opening lines are reproduced here unchanged.
private void Publish(ParsedListing parsed, AiAuditResult? ai, RawListing raw, List roles, List cities, List districts, List facilities)
{
    var d = ai?.Data;
    var cityName = d?.City ?? parsed.CityName;
    var districtName = d?.District ?? parsed.DistrictName;

    // One ad can name several roles («پرستار سالمند و کودک و همراه بیمار») — resolve them all
    // and publish one listing per role so each is browsable/filterable. Capped to avoid spam.
    // The AI's role (+ its category) is the trusted, possibly-new one; parser names are already
    // canonical matches. Unknown roles are CREATED (dynamic taxonomy), not dropped.
    var candidates = new List<(string name, string?
category)>(); if (!string.IsNullOrWhiteSpace(d?.Role)) candidates.Add((d!.Role!.Trim(), d.Category)); foreach (var n in parsed.RoleNames) candidates.Add((n, null)); if (parsed.RoleName is not null) candidates.Add((parsed.RoleName, null)); var pubRoles = new List(); foreach (var (name, category) in candidates) { if (string.IsNullOrWhiteSpace(name)) continue; var role = ResolveOrCreateRole(roles, name, category); if (!pubRoles.Contains(role)) pubRoles.Add(role); if (pubRoles.Count >= 4) break; } if (pubRoles.Count == 0) pubRoles.Add(roles.First()); // Doctor-role guard. «پزشک عمومی» is the AI's fallback when it's unsure, so it mislabels // clearly-specific doctor ads — a dentist ad («دعوت به همکاری دندانپزشک») or an ENT/specialist // one published as «استخدام پزشک عمومی». Rather than patch role-by-role, trust the keyword // parser: if IT already found a more specific role in the same text, use that; otherwise fall // back to «پزشک متخصص» when the text says specialist. Only ever overrides the weak GP default. if (pubRoles[0].Name == "پزشک عمومی") { var specific = parsed.RoleNames.FirstOrDefault(n => NormalizeFa(n) != NormalizeFa("پزشک عمومی")); if (specific is not null) pubRoles[0] = ResolveOrCreateRole(roles, specific, null); else if (LooksSpecialist(raw.RawText)) pubRoles[0] = ResolveOrCreateRole(roles, "پزشک متخصص", "پزشک"); } var city = cities.FirstOrDefault(c => c.Name == cityName) ?? cities.FirstOrDefault(c => c.IsActive) ?? cities.First(); var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id); // Approx. coords for the map: the source ad's point (Divar) when present; otherwise, for a // Tehran ad that only NAMES a neighborhood (Medjobs/Telegram), geocode that name to a rough // center. Shown as a «محدودهٔ تقریبی» circle, never a precise pin. double? 
appLat = raw.Lat, appLng = raw.Lng; // Geocode from the structured location fields first, then fall back to scanning the ad body // itself — many Tehran ads name the neighbourhood only in free text («… نیم ساعت پیش در سهروردی») // and never populate a district/area field, which is why most aggregated listings had no map. if (appLat is null && city.Name == "تهران" && TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote, raw.RawText) is { } g) { appLat = g.lat; appLng = g.lng; } // Last resort — the AI model's inferred coords, but ONLY when they fall inside greater Tehran // (rejects a hallucinated point elsewhere). Uses the registered model where the rules can't decide. if (appLat is null && d?.Lat is double aLat && d?.Lng is double aLng && InTehran(aLat, aLng)) { appLat = aLat; appLng = aLng; } var kindStr = (d?.Kind ?? parsed.Kind.ToString()).ToLowerInvariant(); // «آماده به کار» — a worker offering themselves. No facility involved. if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده")) { // Skip uncontactable applicants: no phone, no other contact channel, AND no source link to // click through to — such a card is useless to an employer. (A source URL is enough, since // the detail page now offers a «مشاهده آگهی در منبع» link for any source.) var reachable = !string.IsNullOrWhiteSpace(d?.Phone) || !string.IsNullOrWhiteSpace(parsed.Phone) || parsed.Contacts.Count > 0 || !string.IsNullOrWhiteSpace(raw.SourceUrl); if (!reachable) { raw.Status = RawListingStatus.Discarded; return; } // ONE person = ONE listing. Do NOT fan out across roles: an applicant has a single // profession, and «پرستار» + «پرستار کودک» from the same ad were producing duplicate // cards. Use the primary (AI) role; any secondary role names become searchable tags. var role = pubRoles[0]; var extraRoleTags = pubRoles.Skip(1).Select(r => r.Name); var tPay = d?.PayAmount ?? parsed.PayAmount; var tShare = d?.SharePercent ?? 
parsed.SharePercent; _db.TalentListings.Add(new TalentListing { Role = role, City = city, DistrictId = district?.Id, PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName, YearsExperience = d?.YearsExperience ?? parsed.YearsExperience, IsLicensed = d?.IsLicensed ?? parsed.IsLicensed, AreaNote = parsed.AreaNote, Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType), Gender = parsed.Gender, PayType = tShare is not null && tPay is null ? PayType.Percentage : tPay is null ? PayType.Negotiable : PayType.PerShift, PayAmount = tPay, SharePercent = tShare, Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl, Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center Contacts = BuildContacts(d, parsed), Tags = BuildTags(parsed, d, role, city, extraRoleTags), }); raw.Status = RawListingStatus.Normalized; return; } // Never surface the crawl source (e.g. «مدجابز») in a public facility name. An unnamed ad // falls back to ONE shared placeholder (same string as the manual-review flow, so both // pipelines reuse a single record). That placeholder is shared by every unnamed ad in a // city, so it must NEVER receive a single ad's fuzzy coords — that would mis-place dozens of // unrelated listings on the map and in «near me». Mirrors Review.ResolveFacilityIdAsync. bool unnamed = string.IsNullOrWhiteSpace(d?.FacilityName) && string.IsNullOrWhiteSpace(parsed.FacilityName); var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim() : !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim() : UnknownFacilityName; // Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one. 
var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id); if (facility is null) { facility = new Facility { Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id, Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false, Lat = unnamed ? null : raw.Lat, Lng = unnamed ? null : raw.Lng, // approx. Divar map center }; _db.Facilities.Add(facility); facilities.Add(facility); // so later listings in this run match it too } else if (!unnamed && facility.Lat is null && facility.Lng is null && raw.Lat is not null) { // Backfill coords only when the matched (real, named) facility has none — never overwrite a // real (employer-set or verified) location with Divar's fuzzy point. facility.Lat = raw.Lat; facility.Lng = raw.Lng; } // ONE ad = ONE listing. Do NOT fan out across roles. A single ad naming a few role-ish words // («استخدام بهیار جهت دستیار پزشک و تزریقات») was exploding into 5 near-duplicate listings — // one per extracted/typo role (پزشک عمومی، پرستار، دستیار پزشک، بهیار، «بیهیار»). Publish only // the primary (guard-corrected) role; the rest stay findable via the full description text. var primaryRole = pubRoles[0]; if (kindStr.Contains("job") || kindStr.Contains("استخدام")) { _db.JobOpenings.Add(new JobOpening { Facility = facility, Role = primaryRole, Title = $"استخدام {primaryRole.Name}", EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType), // Prefer the AI-extracted salary, falling back to the parser's (matching the talent path). SalaryMin = d?.PayAmount ?? 
parsed.PayAmount, Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl, Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) }); } else { var st = MapShiftType(d?.ShiftType, parsed.ShiftType); var (start, end) = DefaultTimes(st); _db.Shifts.Add(new Shift { Facility = facility, Role = primaryRole, Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1), StartTime = start, EndTime = end, ShiftType = st, SpecialtyRequired = primaryRole.Name, Description = raw.RawText, PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage : parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift, PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl, Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) }); } raw.Status = RawListingStatus.Normalized; } /// Space-separated searchable tags: parsed cert/skill tags + AI-detected skills/requirements /// + secondary role names + this listing's role/category + city. Pay/contact/location noise and /// sentence fragments are filtered out so chips stay clinical. Drives deep search + tag chips. private static string BuildTags(ParsedListing parsed, AiStructured? d, Role role, City city, IEnumerable? 
extraRoles = null) { var tags = new List(parsed.Tags) { role.Name, role.Category, city.Name }; if (extraRoles is not null) tags.AddRange(extraRoles); if (d?.Tags is not null) tags.AddRange(d.Tags.Where(t => !string.IsNullOrWhiteSpace(t)).Select(t => t.Trim())); return string.Join(" ", tags .Where(t => !string.IsNullOrWhiteSpace(t) && !IsNoiseTag(t)) .Select(t => t.Trim()) .Distinct()); } // Words/phrases that are NOT clinical skills — pay, contact, generic verbs, sentence fragments — // that were polluting the tag chips («پرداخت توافقی»، «مراقبت از»…). private static readonly string[] TagStopWords = { "توافقی", "پرداخت", "پرداخت توافقی", "حقوق", "دستمزد", "تماس", "شماره", "شماره تماس", "مراقبت از", "مراقبت", "همکاری", "آماده", "آماده به کار", "نیرو", "استخدام", "جذب", // personality / filler — not clinical skills "خوش‌اخلاق", "خوش اخلاق", "خوشاخلاق", "دلسوز", "منظم", "مسئولیت‌پذیر", "مسئولیت پذیر", "باتجربه", "مجرب", }; private static bool IsNoiseTag(string tag) { var t = NormalizeFa(tag); if (t.Length < 2 || t.EndsWith(" از") || t.EndsWith("-از")) return true; // dangling «… از» return TagStopWords.Any(w => NormalizeFa(w) == t); } /// Resolve a role name to an existing Role; if it's genuinely new, create it (dynamic /// taxonomy). Matching is layered so a differently-worded-but-same-meaning role maps to the /// canonical one instead of forking: (1) exact normalized name, (2) synonym/abbreviation alias /// → canonical (دکتر→پزشک عمومی، نرس→پرستار…), (3) create. Only TRUE synonyms collapse — real /// sub-specialties («پرستار ICU») stay distinct on purpose. // Separators that join SEVERAL roles in one ad («پرستار و بهیار»، «ماما / پرستار»، «پزشک و پرستار // و بهیار»). A specialty name that legitimately contains «و» (قلب و عروق، پوست و مو) is NOT split, // because its first segment isn't itself a known role. 
private static readonly Regex RoleSeparators =
    new(@"\s*/\s*|\s*،\s*|\s*,\s*|\s+یا\s+|\s+و\s+|\s*\+\s*", RegexOptions.Compiled);

/// <summary>
/// If <paramref name="name"/> is a multi-role compound whose FIRST segment is (or aliases to)
/// an existing role, returns that base role's name; otherwise null. So «پرستار و بهیار» →
/// «پرستار», but «قلب و عروق» / «پوست و مو» are left whole.
/// </summary>
/// <param name="roles">Roles known so far in this run.</param>
/// <param name="name">Candidate role name, possibly naming several roles.</param>
private static string? CollapseCompound(List<Role> roles, string name)
{
    // Segments of a single character are dropped as separator debris.
    var segs = RoleSeparators.Split(name)
        .Select(s => s.Trim())
        .Where(s => s.Length > 1)
        .ToList();
    if (segs.Count < 2) return null;

    var firstNorm = NormalizeFa(segs[0]);
    if (roles.Any(r => NormalizeFa(r.Name) == firstNorm)) return segs[0];

    // The first segment may be a synonym; collapse only when its canonical role already exists.
    if (RoleAliases.TryGetValue(firstNorm, out var canon))
    {
        var canonNorm = NormalizeFa(canon);
        if (roles.Any(r => NormalizeFa(r.Name) == canonNorm)) return canon;
    }

    return null;
}

/// <summary>
/// Resolves a role name to an existing <c>Role</c>, or creates one when it is genuinely new.
/// Matching is layered: (1) exact normalized name, (2) synonym/abbreviation alias → canonical
/// role, (3) create. A created role is added to both the context and <paramref name="roles"/>
/// so it is reused for the rest of the run; it is not saved here.
/// </summary>
/// <param name="roles">Roles known so far in this run; grows when a role is created.</param>
/// <param name="name">Role name as written by the AI or the parser.</param>
/// <param name="category">Suggested category for a NEW role; mapped onto the fixed category set.</param>
private Role ResolveOrCreateRole(List<Role> roles, string name, string? category)
{
    // Drop gender/seniority modifiers baked into the role («پرستار آقا»→«پرستار»,
    // «کارآموز تکنسین داروخانه»→«تکنسین داروخانه»). The modifier still lives on as a tag /
    // the Gender field.
    name = StripRoleModifiers(name);

    // Collapse a multi-role compound to its first base role so we don't mint «پرستار و بهیار».
    if (CollapseCompound(roles, name) is { } baseName) name = baseName;

    var norm = NormalizeFa(name);

    // (1) Already a known role (same word or spelling variant).
    var match = roles.FirstOrDefault(r => NormalizeFa(r.Name) == norm);
    if (match is not null) return match;

    // (2) A synonym of a canonical role → use that role; don't create a duplicate.
    if (RoleAliases.TryGetValue(norm, out var canonical))
    {
        var canonNorm = NormalizeFa(canonical);
        var aliased = roles.FirstOrDefault(r => NormalizeFa(r.Name) == canonNorm);
        if (aliased is not null) return aliased;
        name = canonical; // canonical not seeded yet → create under its proper name
    }

    // (3) Genuinely new role — create it under a canonical-resolved category.
    var created = new Role
    {
        Name = Clamp(name.Trim(), 100),                 // respect Role.Name MaxLength(100)
        Category = Clamp(ResolveCategory(category), 50), // closed set → respect MaxLength(50)
        IsActive = true,
        SortOrder = (roles.Count == 0 ? 0 : roles.Max(r => r.SortOrder)) + 1, // append after the last
    };
    _db.Roles.Add(created);
    roles.Add(created); // reuse within this run (saved with the batch at end of source)
    _log.LogInformation("Ingestion introduced new role «{Role}» (category «{Category}») from AI.",
        created.Name, created.Category);
    return created;
}

/// <summary>
/// Maps an AI-suggested category to one of the FIXED groups (پزشک/پرستار/ماما/تکنسین/دندانپزشک).
/// Categories are a closed taxonomy — they drive the filter chips — so unlike roles they are
/// NEVER invented: a synonym resolves to its canonical group, anything else (including null)
/// → «سایر».
/// </summary>
private static string ResolveCategory(string? category) =>
    CategoryAliases.TryGetValue(NormalizeFa(category), out var canonical) ? canonical : "سایر";

// Synonyms/abbreviations → canonical ROLE name, so the AI naming a role differently maps onto an
// existing role instead of forking the taxonomy. Keys are matched after NormalizeFa. Add freely.
private static readonly Dictionary<string, string> RoleAliases = BuildAliasMap(new Dictionary<string, string[]>
{
    ["پزشک عمومی"] = new[] { "دکتر", "طبیب", "پزشک", "جی پی", "gp", "general practitioner" },
    ["پزشک متخصص"] = new[] { "متخصص", "فوق تخصص", "اسپشالیست", "specialist" },
    ["پرستار"] = new[] { "نرس", "nurse", "پرستاری", "کارشناس پرستاری" },
    ["پرستار سالمندان"] = new[] { "مراقب سالمند", "مراقب سالمندان", "پرستار سالمند", "نگهدار سالمند", "مراقبت سالمند" },
    ["ماما"] = new[] { "مامایی", "کارشناس مامایی", "midwife" },
    ["تکنسین اتاق عمل"] = new[] { "اتاق عمل", "اسکراب", "scrub", "تکنولوژیست اتاق عمل" },
    ["تکنسین فوریت‌های پزشکی"] = new[] { "فوریت پزشکی", "تکنسین اورژانس", "پارامدیک", "paramedic", "emt", "اورژانس ۱۱۵" },
    ["کارشناس آزمایشگاه"] = new[] { "علوم آزمایشگاهی", "تکنسین آزمایشگاه", "آزمایشگاهی", "لابراتوار", "lab", "laboratory" },
    ["دندانپزشک"] = new[] { "دندان پزشک", "دندون پزشک", "dentist" },
    ["کمک بهیار"] = new[] { "کمک‌یار", "کمکیار", "کمک یار", "کمک‌بهیار", "کمک بیمار" },
    ["بهیار"] = new[] { "بیهیار", "بیار", "بیهی", "بهییار", "بهیار پرستار" },
    ["پرستار کودک"] = new[] { "پرستار بچه", "مراقب کودک", "پرستار مراقب کودک", "کودکیار", "مادر یار کودک", "پرستار اطفال" },
    ["فیزیوتراپیست"] = new[] { "فیزیوتراپ", "فیزیوتراپی" },
    ["تکنسین داروخانه"] = new[] { "نسخه پیچ", "تکنسین نسخه پیچ" },
});

// Synonyms → canonical CATEGORY (the role-group used for filters/chips). Lookups are done on
// NormalizeFa'd text; anything not listed here is resolved to «سایر» by ResolveCategory.
private static readonly Dictionary<string, string> CategoryAliases = BuildAliasMap(new Dictionary<string, string[]>
{
    ["پزشک"] = new[] { "دکتر", "طبیب", "doctor", "پزشکی" },
    ["پرستار"] = new[] { "پرستاری", "nurse", "nursing" },
    ["ماما"] = new[] { "مامایی", "midwifery" },
    ["تکنسین"] = new[] { "تکنیسین", "تکنولوژیست", "technician", "کاردان فنی" },
    ["دندانپزشک"] = new[] { "دندان پزشک", "دندانپزشکی", "dental" },
});

/// <summary>Flatten {canonical → [synonyms]} into a {normalized synonym → canonical} lookup,
/// also mapping each canonical's own normalized form to itself.</summary>
private static Dictionary<string, string> BuildAliasMap(Dictionary<string, string[]> src)
{
    var map = new Dictionary<string, string>();

    // Pass 1: every synonym → its canonical name (a synonym listed twice keeps the last one).
    foreach (var (canonical, aliases) in src)
    {
        foreach (var alias in aliases) map[NormalizeFa(alias)] = canonical;
    }

    // Pass 2: canonical names map to themselves and always win, so a synonym that happens to
    // equal another group's canonical name can never redirect that canonical elsewhere.
    foreach (var canonical in src.Keys)
    {
        map[NormalizeFa(canonical)] = canonical;
    }

    return map;
}

/// <summary>
/// Normalizes a Persian string for comparison/dedupe: unifies Arabic ي→ی and ك→ک, replaces
/// ZWNJ (U+200C) with a plain space, trims, collapses whitespace runs to one space, and
/// lowercases (so Latin tags like "ICU"/"icu" also match). Null yields an empty string.
/// </summary>
/// <remarks>
/// Deliberately uses the static <c>Regex.Replace</c> rather than a cached static field: this
/// method runs from static field initializers (the alias maps), and a field declared after
/// them would still be null at that point.
/// </remarks>
private static string NormalizeFa(string? s)
{
    var unified = (s ?? "")
        .Replace('\u064A', '\u06CC') // Arabic yeh → Persian yeh
        .Replace('\u0643', '\u06A9') // Arabic kaf → Persian kaf
        .Replace('\u200C', ' ')      // ZWNJ → space (so «کمک‌یار» and «کمک یار» compare equal)
        .Trim();
    return Regex.Replace(unified, @"\s+", " ").ToLowerInvariant();
}

/// <summary>Truncates <paramref name="s"/> to at most <paramref name="max"/> characters,
/// trimming whitespace left at the cut; shorter strings are returned untouched.</summary>
private static string Clamp(string s, int max)
{
    if (s.Length <= max) return s;
    return s[..max].Trim();
}

/// <summary>Greater-Tehran bounding box — rejects out-of-area (hallucinated) AI coordinates.</summary>
private static bool InTehran(double lat, double lng)
{
    var latOk = lat is >= 35.4 and <= 35.95;
    var lngOk = lng is >= 51.0 and <= 51.8;
    return latOk && lngOk;
}

// Markers that mean a doctor role is a SPECIALIST, not a GP — used to correct a «پزشک عمومی»
// mislabel on a clearly-specialist ad (e.g. an ENT post showing as «استخدام پزشک عمومی»).
private static readonly string[] SpecialistMarkers =
{
    "متخصص", "فوق تخصص", "فوقتخصص", "فلوشیپ", "فلوشیب", "بورد تخصصی", "ساب اسپشالیتی", "ent",
};

/// <summary>
/// True when the ad text contains a specialist marker. Persian markers match as substrings
/// (so inflected forms still hit); purely Latin markers such as "ent" must stand alone as a
/// word, otherwise "center", "dentist" or "management" would all read as ENT.
/// </summary>
private static bool LooksSpecialist(string? rawText)
{
    var text = NormalizeFa(rawText);
    if (text.Length == 0) return false;

    foreach (var marker in SpecialistMarkers)
    {
        var m = NormalizeFa(marker);
        if (m.Length == 0) continue;

        var hit = m.All(IsLatinLetterOrDigit)
            ? ContainsLatinWord(text, m)
            : text.Contains(m, StringComparison.Ordinal);
        if (hit) return true;
    }

    return false;
}

/// <summary>True when <paramref name="word"/> occurs in <paramref name="text"/> with no Latin
/// letter or digit directly before or after it.</summary>
private static bool ContainsLatinWord(string text, string word)
{
    for (var at = text.IndexOf(word, StringComparison.Ordinal);
         at >= 0;
         at = text.IndexOf(word, at + 1, StringComparison.Ordinal))
    {
        var end = at + word.Length;
        var leftFree = at == 0 || !IsLatinLetterOrDigit(text[at - 1]);
        var rightFree = end == text.Length || !IsLatinLetterOrDigit(text[end]);
        if (leftFree && rightFree) return true;
    }

    return false;
}

/// <summary>ASCII letter or digit test (no dependency on newer <c>char</c> helpers).</summary>
private static bool IsLatinLetterOrDigit(char c) =>
    c is (>= 'a' and <= 'z') or (>= 'A' and <= 'Z') or (>= '0' and <= '9');

// Gender/seniority tokens that don't belong in a role name (they go to tags / the Gender field).
private static readonly string[] RoleModifierWords =
{
    "آقا", "خانم", "خانوم", "بانو", "مرد", "زن", "کارآموز", "کارورز", "ارشد", "مبتدی",
};

// Availability phrases that the model sometimes glues onto the role («کمک بهیار آماده به کار»);
// removed as whole substrings before token-stripping (so «به»/«کار» tokens stay safe elsewhere).
private static readonly string[] RolePhraseNoise =
{
    "آماده به کار", "آماده همکاری", "آماده بکار", "آماده به همکاری", "جویای کار", "دنبال کار", "جهت همکاری",
};

/// <summary>
/// Removes availability phrases and gender/seniority tokens from a role name, keeping the base
/// profession. Never strips to empty (falls back to the trimmed original). Latin casing is
/// preserved, so a newly created role keeps «ICU» rather than «icu»; role matching elsewhere
/// compares normalized names, so it is unaffected by casing.
/// </summary>
private static string StripRoleModifiers(string name)
{
    // Same character unification as NormalizeFa, minus the lowercasing: the result may become
    // a role's display name.
    var unified = (name ?? "")
        .Replace('\u064A', '\u06CC') // Arabic yeh → Persian yeh
        .Replace('\u0643', '\u06A9') // Arabic kaf → Persian kaf
        .Replace('\u200C', ' ')      // ZWNJ → space
        .Trim();
    var cleaned = Regex.Replace(unified, @"\s+", " ");

    // Whole-phrase removal first, so the «به»/«کار» tokens of other role names stay intact.
    foreach (var phrase in RolePhraseNoise)
        cleaned = cleaned.Replace(NormalizeFa(phrase), " ", StringComparison.Ordinal);

    var kept = cleaned.Split(' ', StringSplitOptions.RemoveEmptyEntries)
        .Where(token =>
        {
            var normalized = NormalizeFa(token);
            return !RoleModifierWords.Any(m => NormalizeFa(m) == normalized);
        })
        .ToList();

    return kept.Count > 0 ? string.Join(" ", kept) : (name ?? "").Trim();
}

/// <summary>
/// Fresh <c>ContactMethod</c> rows for one published listing: the parser's contacts in their
/// original order, plus the AI-extracted phone placed first when the parser found no phone or
/// mobile number of its own.
/// </summary>
private static List<ContactMethod> BuildContacts(AiStructured? d, ParsedListing parsed)
{
    var contacts = parsed.Contacts
        .Select((c, i) => new ContactMethod { Type = c.Type, Value = c.Value, SortOrder = i })
        .ToList();

    var parserHasNumber = contacts.Any(c => c.Type is ContactType.Mobile or ContactType.Phone);
    if (!string.IsNullOrWhiteSpace(d?.Phone) && !parserHasNumber)
    {
        // SortOrder -1 keeps the AI phone ahead of the parser rows (which start at 0).
        contacts.Insert(0, new ContactMethod { Type = ContactType.Mobile, Value = d!.Phone!.Trim(), SortOrder = -1 });
    }

    return contacts;
}

/// <summary>
/// Canonical form of an AI-supplied enum-like token: lowercased with whitespace, hyphens and
/// underscores removed, so "Part-Time", "part_time" and " parttime " all read as "parttime".
/// </summary>
private static string NormalizeEnumToken(string? value) =>
    string.IsNullOrWhiteSpace(value)
        ? ""
        : string.Concat(value.Where(c => !char.IsWhiteSpace(c) && c != '-' && c != '_')).ToLowerInvariant();

/// <summary>Shift type from the AI's label when recognized, else the parser's, else Day.</summary>
private static ShiftType MapShiftType(string? ai, ShiftType? parsed) => NormalizeEnumToken(ai) switch
{
    "day" => ShiftType.Day,
    "evening" => ShiftType.Evening,
    "night" => ShiftType.Night,
    "oncall" => ShiftType.OnCall,
    _ => parsed ?? ShiftType.Day,
};

/// <summary>Employment type from the AI's label when recognized, else the parser's, else FullTime.</summary>
private static EmploymentType MapEmployment(string? ai, EmploymentType? parsed) => NormalizeEnumToken(ai) switch
{
    "parttime" => EmploymentType.PartTime,
    "contract" => EmploymentType.Contract,
    "plan" => EmploymentType.Plan,
    "fulltime" => EmploymentType.FullTime,
    _ => parsed ?? EmploymentType.FullTime,
};

/// <summary>
/// Placeholder start/end times per shift type: Day 08–14, Evening 14–20, Night 20–08 (ends the
/// next morning); any other type gets 08–08.
/// </summary>
private static (TimeOnly, TimeOnly) DefaultTimes(ShiftType t) => t switch
{
    ShiftType.Day => (new TimeOnly(8, 0), new TimeOnly(14, 0)),
    ShiftType.Evening => (new TimeOnly(14, 0), new TimeOnly(20, 0)),
    ShiftType.Night => (new TimeOnly(20, 0), new TimeOnly(8, 0)),
    _ => (new TimeOnly(8, 0), new TimeOnly(8, 0)),
};

/// <summary>Joins two notes with " | "; returns <paramref name="a"/> alone when
/// <paramref name="b"/> is null or empty.</summary>
private static string? Join(string a, string? b) => string.IsNullOrEmpty(b) ? a : $"{a} | {b}";

/// <summary>
/// Content hash used for dedupe: lowercase hex SHA-256 of the text after trimming and collapsing
/// whitespace runs to a single space. This normalization must not change — it would alter the
/// hash of every listing, and hashes are compared against stored ones.
/// </summary>
private static string Hash(string text)
{
    var normalized = Regex.Replace((text ?? "").Trim(), @"\s+", " ");
    var digest = SHA256.HashData(Encoding.UTF8.GetBytes(normalized));
    return Convert.ToHexString(digest).ToLowerInvariant();
}

/// <summary>
/// Age of a post in whole days — from the source's real timestamp when present, else a Persian
/// "time ago" phrase in the text (Divar). Null when neither is available (= unknown age, so it's
/// NOT filtered out). A future-dated timestamp counts as age 0.
/// </summary>
private static int? PostAgeDays(ScrapedItem item)
{
    if (item.PostedAt is DateTime posted)
    {
        // Compare like with like: a local-kind timestamp is converted before subtracting from
        // UtcNow. Utc/Unspecified values are taken as already being UTC.
        var postedUtc = posted.Kind == DateTimeKind.Local ? posted.ToUniversalTime() : posted;
        return Math.Max(0, (int)Math.Floor((DateTime.UtcNow - postedUtc).TotalDays));
    }

    return HtmlUtil.AgeDaysFromPersianText(item.RawText);
}
}