using System.Security.Cryptography;
using System.Text;
using System.Text.RegularExpressions;
using JobsMedical.Web.Data;
using JobsMedical.Web.Models;
using Microsoft.EntityFrameworkCore;

namespace JobsMedical.Web.Services.Scraping;

/// <summary>Per-source counters produced by one ingestion run.</summary>
public record SourceResult(string Source, int Fetched, int Queued, int Published, int Flagged, int Spam, int Duplicates);

/// <summary>Result of a whole ingestion run: the per-source rows plus totals summed across them.</summary>
public record IngestionSummary(List<SourceResult> Sources)
{
    public int TotalFetched => Sources.Sum(s => s.Fetched);
    public int TotalQueued => Sources.Sum(s => s.Queued);
    public int TotalPublished => Sources.Sum(s => s.Published);
    public int TotalFlagged => Sources.Sum(s => s.Flagged);
    public int TotalSpam => Sources.Sum(s => s.Spam);
    public int TotalDuplicates => Sources.Sum(s => s.Duplicates);
}

/// <summary>
/// The scrape engine. For every source: dedupe by content hash → parse → rule-validate →
/// (optional) AI audit → decide. The decision depends on admin settings:
/// <list type="bullet">
/// <item>spam → Discarded</item>
/// <item>AI on: the AI verdict drives approve/reject/review; approve + Automatic + AiAutoApprove → publish</item>
/// <item>AI off: Automatic + confidence ≥ threshold → publish; otherwise queue/flag</item>
/// </list>
/// "Publish" resolves-or-creates an (unverified) facility and creates the Shift / JobOpening,
/// or a TalentListing for applicant posts.
/// </summary>
public class IngestionService
{
    /// <summary>
    /// Applicant posts older than this (by the source's date, or a Persian "time ago" phrase in
    /// the text) are recorded as Discarded at ingest — availability goes stale fast.
    /// </summary>
    private const int TalentMaxAgeDays = 7;

    /// <summary>Upper bound on how many role-specific listings a single ad may fan out into.</summary>
    private const int MaxRolesPerListing = 4;

    /// <summary>
    /// Shared placeholder facility name for unnamed ads — kept identical to
    /// Review.ResolveFacilityIdAsync so the auto-publish and manual-review flows reuse ONE record.
    /// </summary>
    private const string UnknownFacilityName = "نامشخص / ثبت نشده";

    private readonly AppDbContext _db;
    // NOTE(review): the element type's name is not visible in this file (the generic argument was
    // lost in extraction). It must expose `Name` and `FetchAsync(settings, ct)`; confirm the name.
    private readonly IEnumerable<IListingSource> _sources;
    private readonly IListingParser _parser;
    private readonly ListingValidator _validator;
    private readonly IAiAuditor _ai;
    private readonly SettingsService _settings;
    private readonly ILogger<IngestionService> _log;

    public IngestionService(AppDbContext db, IEnumerable<IListingSource> sources, IListingParser parser,
        ListingValidator validator, IAiAuditor ai, SettingsService settings, ILogger<IngestionService> log)
    {
        _db = db;
        _sources = sources;
        _parser = parser;
        _validator = validator;
        _ai = ai;
        _settings = settings;
        _log = log;
    }

    /// <summary>Names of all registered sources, in registration order.</summary>
    public IReadOnlyList<string> SourceNames => _sources.Select(s => s.Name).ToList();

    /// <summary>
    /// Runs one ingestion pass over every source and returns the per-source counters.
    /// A source whose fetch throws is logged and skipped; a source returning no items is skipped
    /// without a result row. Changes are saved once per source, and a run-log row is persisted
    /// when at least one source produced a result.
    /// </summary>
    /// <param name="ct">Cancellation token; cancellation propagates to the caller.</param>
    /// <returns>Summary with one <see cref="SourceResult"/> per source that returned items.</returns>
    public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
    {
        var settings = await _settings.GetAsync();
        var roles = await _db.Roles.ToListAsync(ct);
        var cities = await _db.Cities.ToListAsync(ct);
        var districts = await _db.Districts.ToListAsync(ct);
        var facilities = await _db.Facilities.ToListAsync(ct); // fuzzy-matched + grown as we create
        var roleNames = roles.Select(r => r.Name).ToList();
        var cityNames = cities.Select(c => c.Name).ToList();
        var districtNames = districts.Select(d => d.Name).ToList();

        var results = new List<SourceResult>();

        // Hashes already handled in THIS run. The database lookup below only sees saved rows, and
        // rows are saved once per source, so the same text appearing twice in one fetch would
        // otherwise be inserted (and possibly published) twice.
        var seenHashes = new HashSet<string>(StringComparer.Ordinal);

        foreach (var source in _sources)
        {
            int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0, dupes = 0;

            IReadOnlyList<ScrapedItem> items;
            try
            {
                items = await source.FetchAsync(settings, ct);
            }
            // A caller-requested cancellation is not a source failure: let it propagate instead of
            // logging an error and moving on to the next source.
            catch (Exception ex) when (!ct.IsCancellationRequested)
            {
                _log.LogError(ex, "Source {Source} failed", source.Name);
                continue;
            }

            if (items.Count == 0)
            {
                continue; // disabled/unconfigured source
            }

            foreach (var item in items)
            {
                fetched++;
                var hash = Hash(item.RawText);

                if (!seenHashes.Add(hash))
                {
                    dupes++;
                    continue;
                }

                var existing = await _db.RawListings.FirstOrDefaultAsync(r => r.ContentHash == hash, ct);
                if (existing is not null)
                {
                    // Best-effort geo retry: coords are normally captured only on first ingest, but
                    // a re-fetch may now expose a map centre the first fetch lacked. Backfill the
                    // stored row when this fetch has coords and the row has none, so an item still
                    // sitting in the queue can be placed on the map when an admin publishes it.
                    if (existing.Lat is null && item.Lat is not null)
                    {
                        existing.Lat = item.Lat;
                        existing.Lng = item.Lng;
                    }

                    dupes++;
                    continue;
                }

                var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
                var val = _validator.Validate(item.RawText, parsed);

                // Drop STALE applicant posts. Recorded as Discarded so the dedupe hash and an audit
                // trail are kept, and before the AI audit so no AI call is spent on them.
                // Shifts/jobs are not aged out. Posts of unknown age (null) are kept.
                if (parsed.Kind == ListingKind.Talent && PostAgeDays(item) is int age && age > TalentMaxAgeDays)
                {
                    _db.RawListings.Add(new RawListing
                    {
                        SourceChannel = item.Source,
                        SourceUrl = item.SourceUrl,
                        RawText = item.RawText.Trim(),
                        ContentHash = hash,
                        Confidence = 0,
                        Status = RawListingStatus.Discarded,
                        ValidationNotes = $"آماده‌به‌کارِ قدیمی ({age} روز) — نادیده گرفته شد",
                        Lat = item.Lat,
                        Lng = item.Lng,
                    });
                    spam++;
                    continue;
                }

                AiAuditResult? ai = null;
                if (settings.AiEnabled && !val.IsSpam)
                {
                    ai = await _ai.AuditAsync(item.RawText, settings, ct);
                }

                var (status, reason, confidence) = Decide(settings, val, ai);
                var raw = new RawListing
                {
                    SourceChannel = item.Source,
                    SourceUrl = item.SourceUrl,
                    RawText = item.RawText.Trim(),
                    ContentHash = hash,
                    Confidence = confidence,
                    ValidationNotes = reason,
                    Status = status,
                    Lat = item.Lat,
                    Lng = item.Lng, // approximate map coords → copied to the facility on publish
                };
                _db.RawListings.Add(raw);

                if (status == RawListingStatus.Normalized)
                {
                    try
                    {
                        Publish(parsed, ai, raw, roles, cities, districts, facilities);
                        published++;
                    }
                    catch (Exception ex)
                    {
                        // Fall back to the manual queue rather than losing the listing.
                        _log.LogWarning(ex, "Auto-publish failed; queueing instead");
                        raw.Status = RawListingStatus.New;
                        queued++;
                    }
                }
                else if (status == RawListingStatus.New)
                {
                    queued++;
                }
                else if (status == RawListingStatus.Flagged)
                {
                    flagged++;
                }
                else
                {
                    spam++;
                }
            }

            await _db.SaveChangesAsync(ct);
            results.Add(new SourceResult(source.Name, fetched, queued, published, flagged, spam, dupes));
            _log.LogInformation("Ingest {S}: fetched={F} queued={Q} published={P} flagged={Fl} spam={Sp} dupes={D}",
                source.Name, fetched, queued, published, flagged, spam, dupes);
        }

        var summary = new IngestionSummary(results);

        // Persist a run-log row so admins get a crawl history (with a per-source breakdown).
        if (results.Count > 0)
        {
            var detail = string.Join("؛ ", results.Select(r =>
                $"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}"));
            _db.IngestionRuns.Add(new IngestionRun
            {
                Fetched = summary.TotalFetched,
                Queued = summary.TotalQueued,
                Published = summary.TotalPublished,
                Flagged = summary.TotalFlagged,
                Spam = summary.TotalSpam,
                Duplicates = summary.TotalDuplicates,
                Detail = detail.Length > 2000 ? detail[..2000] : detail,
            });
            await _db.SaveChangesAsync(ct);
        }

        return summary;
    }

    /// <summary>
    /// Turns the rule-validation result and the optional AI verdict into a status, a human-readable
    /// note and a confidence. Precedence: rule spam → AI verdict (when present) → rule validity →
    /// auto-publish threshold.
    /// </summary>
    /// <param name="s">Admin settings (mode, AI auto-approve switch, auto-publish threshold).</param>
    /// <param name="val">Rule-validator output.</param>
    /// <param name="ai">AI audit result, or null when the AI audit did not run.</param>
    private static (RawListingStatus status, string? reason, int confidence) Decide(
        AppSetting s, ValidationResult val, AiAuditResult? ai)
    {
        var notes = val.Issues.Count > 0 ? string.Join("؛ ", val.Issues) : null;

        if (val.IsSpam)
        {
            return (RawListingStatus.Discarded, Join("اسپم", notes), val.Confidence);
        }

        if (ai is not null)
        {
            var aiNote = Join(
                $"AI: {ai.Decision} ({ai.Confidence}٪)" + (ai.Reason is null ? "" : $" — {ai.Reason}"),
                notes);

            if (ai.Reject)
            {
                return (RawListingStatus.Discarded, aiNote, ai.Confidence);
            }

            if (ai.Approve)
            {
                // MEDICAL GATE: the rule-validator's medical signal vetoes an AI approval. The AI
                // can hallucinate, so when our own keyword/role check sees nothing clinical we
                // never auto-publish — the listing goes to manual review instead.
                if (!val.LooksMedical)
                {
                    return (RawListingStatus.Flagged,
                        Join("هوش مصنوعی تأیید کرد ولی نشانهٔ کادر درمان یافت نشد — بررسی دستی", aiNote),
                        ai.Confidence);
                }

                var approved = s.Mode == IngestionMode.Automatic && s.AiAutoApprove
                    ? RawListingStatus.Normalized
                    : RawListingStatus.New;
                return (approved, aiNote, ai.Confidence);
            }

            return (RawListingStatus.Flagged, aiNote, ai.Confidence); // neither approve nor reject → review
        }

        if (!val.IsValid)
        {
            return (RawListingStatus.Flagged, notes, val.Confidence);
        }

        if (s.Mode == IngestionMode.Automatic && val.Confidence >= s.AutoPublishMinConfidence)
        {
            return (RawListingStatus.Normalized, notes, val.Confidence);
        }

        return (RawListingStatus.New, notes, val.Confidence);
    }

    /// <summary>
    /// Creates the public listing(s) for an approved raw listing: one TalentListing, JobOpening or
    /// Shift per resolved role (at most <see cref="MaxRolesPerListing"/>). AI-structured fields are
    /// preferred over the heuristic parser where the code consults both. Entities are only added to
    /// the context here; the caller saves them.
    /// </summary>
    /// <remarks>
    /// Throws if no role can be resolved and <paramref name="roles"/> is empty, or if
    /// <paramref name="cities"/> is empty (both via <c>First()</c>).
    /// NOTE(review): <c>District</c> as the element type of <paramref name="districts"/> is inferred
    /// from <c>_db.Districts</c>; the type name is not visible in this file — confirm.
    /// </remarks>
    private void Publish(ParsedListing parsed, AiAuditResult? ai, RawListing raw,
        List<Role> roles, List<City> cities, List<District> districts, List<Facility> facilities)
    {
        var d = ai?.Data;
        var cityName = d?.City ?? parsed.CityName;
        var districtName = d?.District ?? parsed.DistrictName;

        // One ad can name several roles — resolve them all and publish one listing per role so each
        // is browsable/filterable. The AI's role (+ its category) goes first; parser names follow.
        // Unknown roles are CREATED (dynamic taxonomy), not dropped.
        var candidates = new List<(string name, string? category)>();
        if (!string.IsNullOrWhiteSpace(d?.Role))
        {
            candidates.Add((d!.Role!.Trim(), d.Category));
        }
        foreach (var n in parsed.RoleNames)
        {
            candidates.Add((n, null));
        }
        if (parsed.RoleName is not null)
        {
            candidates.Add((parsed.RoleName, null));
        }

        var pubRoles = new List<Role>();
        foreach (var (name, category) in candidates)
        {
            if (string.IsNullOrWhiteSpace(name))
            {
                continue;
            }

            var role = ResolveOrCreateRole(roles, name, category);
            if (!pubRoles.Contains(role))
            {
                pubRoles.Add(role);
            }
            if (pubRoles.Count >= MaxRolesPerListing)
            {
                break; // capped so one ad cannot fan out into a flood of listings
            }
        }
        if (pubRoles.Count == 0)
        {
            pubRoles.Add(roles.First());
        }

        // City fallback chain: exact name → first active city → first city.
        var city = cities.FirstOrDefault(c => c.Name == cityName)
                   ?? cities.FirstOrDefault(c => c.IsActive)
                   ?? cities.First();
        var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id);
        var kindStr = (d?.Kind ?? parsed.Kind.ToString()).ToLowerInvariant();

        // Applicant post — a worker offering themselves. No facility involved.
        if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده"))
        {
            // Prefer the AI's values when present, else the heuristic parser.
            var tPay = d?.PayAmount ?? parsed.PayAmount;
            var tShare = d?.SharePercent ?? parsed.SharePercent;

            foreach (var role in pubRoles)
            {
                _db.TalentListings.Add(new TalentListing
                {
                    Role = role,
                    City = city,
                    DistrictId = district?.Id,
                    PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
                    YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
                    IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
                    AreaNote = parsed.AreaNote,
                    Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
                    Gender = parsed.Gender,
                    PayType = tShare is not null && tPay is null ? PayType.Percentage
                        : tPay is null ? PayType.Negotiable
                        : PayType.PerShift,
                    PayAmount = tPay,
                    SharePercent = tShare,
                    Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
                    Description = raw.RawText,
                    Status = ShiftStatus.Open,
                    Source = ShiftSource.Aggregated,
                    SourceUrl = raw.SourceUrl,
                    Contacts = BuildContacts(d, parsed), // fresh instances per listing
                    Tags = BuildTags(parsed, d, role, city),
                });
            }

            raw.Status = RawListingStatus.Normalized;
            return;
        }

        // Never surface the crawl source in a public facility name. An unnamed ad falls back to ONE
        // shared placeholder (same string as the manual-review flow, so both pipelines reuse a
        // single record). That placeholder is shared by every unnamed ad in a city, so it must
        // NEVER receive a single ad's fuzzy coords — that would mis-place unrelated listings.
        bool unnamed = string.IsNullOrWhiteSpace(d?.FacilityName) && string.IsNullOrWhiteSpace(parsed.FacilityName);
        var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
            : !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
            : UnknownFacilityName;

        // Reuse an existing facility before creating a new one.
        var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
        if (facility is null)
        {
            facility = new Facility
            {
                Name = facilityName,
                Type = FacilityType.Clinic,
                City = city,
                DistrictId = district?.Id,
                // NOTE(review): for an unnamed ad this puts one ad's phone on the shared placeholder
                // facility, unlike the coords below which are deliberately withheld — confirm intent.
                Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
                IsVerified = false,
                Lat = unnamed ? null : raw.Lat,
                Lng = unnamed ? null : raw.Lng,
            };
            _db.Facilities.Add(facility);
            facilities.Add(facility); // so later listings in this run match it too
        }
        else if (!unnamed && facility.Lat is null && facility.Lng is null && raw.Lat is not null)
        {
            // Backfill coords only when the matched, named facility has none — never overwrite an
            // existing location with the scraped approximate point.
            facility.Lat = raw.Lat;
            facility.Lng = raw.Lng;
        }

        if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
        {
            foreach (var role in pubRoles)
            {
                _db.JobOpenings.Add(new JobOpening
                {
                    Facility = facility,
                    Role = role,
                    // The AI title describes the whole ad, so it is only used for a single-role ad.
                    Title = !string.IsNullOrWhiteSpace(d?.Title) && pubRoles.Count == 1
                        ? d!.Title!.Trim()
                        : $"استخدام {role.Name}",
                    EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
                    SalaryMin = parsed.PayAmount,
                    Description = raw.RawText,
                    Status = ShiftStatus.Open,
                    Source = ShiftSource.Aggregated,
                    SourceUrl = raw.SourceUrl,
                    Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
                });
            }
        }
        else
        {
            var st = MapShiftType(d?.ShiftType, parsed.ShiftType);
            var (start, end) = DefaultTimes(st);

            foreach (var role in pubRoles)
            {
                _db.Shifts.Add(new Shift
                {
                    Facility = facility,
                    Role = role,
                    Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1), // placeholder: tomorrow (UTC)
                    StartTime = start,
                    EndTime = end,
                    ShiftType = st,
                    SpecialtyRequired = role.Name,
                    Description = raw.RawText,
                    PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage
                        : parsed.PayAmount is null ? PayType.Negotiable
                        : PayType.PerShift,
                    PayAmount = parsed.PayAmount,
                    SharePercent = parsed.SharePercent,
                    Status = ShiftStatus.Open,
                    Source = ShiftSource.Aggregated,
                    SourceUrl = raw.SourceUrl,
                    Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
                });
            }
        }

        raw.Status = RawListingStatus.Normalized;
    }

    /// <summary>
    /// Space-separated searchable tags: the parser's tags, this listing's role name, role category
    /// and city name, plus the AI's tags (trimmed). Blank entries are dropped and duplicates removed.
    /// </summary>
    private static string BuildTags(ParsedListing parsed, AiStructured? d, Role role, City city)
    {
        var tags = new List<string>(parsed.Tags) { role.Name, role.Category, city.Name };
        if (d?.Tags is not null)
        {
            tags.AddRange(d.Tags.Where(t => !string.IsNullOrWhiteSpace(t)).Select(t => t.Trim()));
        }

        return string.Join(" ", tags.Where(t => !string.IsNullOrWhiteSpace(t)).Distinct());
    }

    /// <summary>
    /// Resolves a role name to an existing Role, creating it when it is genuinely new (dynamic
    /// taxonomy). Matching is layered so a differently-worded role maps to the canonical one
    /// instead of forking: (1) exact normalized name, (2) alias → canonical role, (3) create.
    /// Only names listed in the alias table collapse; anything else stays a distinct role.
    /// </summary>
    /// <param name="roles">In-memory role list for this run; a created role is appended to it.</param>
    /// <param name="name">Role name as parsed or as suggested by the AI.</param>
    /// <param name="category">Optional category suggestion, used only when a role is created.</param>
    private Role ResolveOrCreateRole(List<Role> roles, string name, string? category)
    {
        var norm = NormalizeFa(name);

        // (1) Already a known role (same word or spelling variant).
        var match = roles.FirstOrDefault(r => NormalizeFa(r.Name) == norm);
        if (match is not null)
        {
            return match;
        }

        // (2) A synonym of a canonical role → use that role; don't create a duplicate.
        if (RoleAliases.TryGetValue(norm, out var canonical))
        {
            var canonNorm = NormalizeFa(canonical);
            var aliased = roles.FirstOrDefault(r => NormalizeFa(r.Name) == canonNorm);
            if (aliased is not null)
            {
                return aliased;
            }

            name = canonical; // canonical role does not exist yet → create it under its proper name
        }

        // (3) Genuinely new role — create it under a canonical-resolved category.
        var created = new Role
        {
            Name = Clamp(name.Trim(), 100),                          // respect Role.Name MaxLength(100)
            Category = Clamp(ResolveCategory(roles, category), 50),  // respect Role.Category MaxLength(50)
            IsActive = true,
            SortOrder = (roles.Count == 0 ? 0 : roles.Max(r => r.SortOrder)) + 1,
        };
        _db.Roles.Add(created);
        roles.Add(created); // reused within this run; saved with the rest of the source's batch
        _log.LogInformation("Ingestion introduced new role «{Role}» (category «{Category}») from AI.",
            created.Name, created.Category);
        return created;
    }

    /// <summary>
    /// Maps a suggested category to a canonical one: alias lookup first, then any category string
    /// already stored on a role that normalizes the same, else the alias/raw value as-is.
    /// A null or blank suggestion is treated as «سایر».
    /// </summary>
    private static string ResolveCategory(List<Role> roles, string? category)
    {
        var raw = string.IsNullOrWhiteSpace(category) ? "سایر" : category!.Trim();

        // ALWAYS prefer a category string already stored on a role — even after an alias maps to a
        // canonical — so a second spelling variant of the same group is never introduced.
        var target = CategoryAliases.TryGetValue(NormalizeFa(raw), out var canonical) ? canonical : raw;
        var targetNorm = NormalizeFa(target);

        return roles.Select(r => r.Category)
                    .FirstOrDefault(c => !string.IsNullOrWhiteSpace(c) && NormalizeFa(c) == targetNorm)
               ?? target;
    }

    // Synonyms/abbreviations → canonical ROLE name, so the AI naming a role differently maps onto an
    // existing role instead of forking the taxonomy. Keys are matched after NormalizeFa. Add freely.
// Role alias table: lookup is by NormalizeFa(key); each canonical name also maps to itself.
    private static readonly Dictionary<string, string> RoleAliases = BuildAliasMap(new()
    {
        ["پزشک عمومی"] = new[] { "دکتر", "طبیب", "پزشک", "جی پی", "gp", "general practitioner" },
        ["پزشک متخصص"] = new[] { "متخصص", "فوق تخصص", "اسپشالیست", "specialist" },
        ["پرستار"] = new[] { "نرس", "nurse", "پرستاری", "کارشناس پرستاری" },
        ["پرستار سالمندان"] = new[] { "مراقب سالمند", "مراقب سالمندان", "پرستار سالمند", "نگهدار سالمند", "مراقبت سالمند" },
        ["ماما"] = new[] { "مامایی", "کارشناس مامایی", "midwife" },
        ["تکنسین اتاق عمل"] = new[] { "اتاق عمل", "اسکراب", "scrub", "تکنولوژیست اتاق عمل" },
        ["تکنسین فوریت‌های پزشکی"] = new[] { "فوریت پزشکی", "تکنسین اورژانس", "پارامدیک", "paramedic", "emt", "اورژانس ۱۱۵" },
        ["کارشناس آزمایشگاه"] = new[] { "علوم آزمایشگاهی", "تکنسین آزمایشگاه", "آزمایشگاهی", "لابراتوار", "lab", "laboratory" },
        ["دندانپزشک"] = new[] { "دندان پزشک", "دندون پزشک", "dentist" },
    });

    /// <summary>Synonyms → canonical CATEGORY (the role-group string stored on Role.Category).</summary>
    private static readonly Dictionary<string, string> CategoryAliases = BuildAliasMap(new()
    {
        ["پزشک"] = new[] { "دکتر", "طبیب", "doctor", "پزشکی" },
        ["پرستار"] = new[] { "پرستاری", "nurse", "nursing" },
        ["ماما"] = new[] { "مامایی", "midwifery" },
        ["تکنسین"] = new[] { "تکنیسین", "تکنولوژیست", "technician", "کاردان فنی" },
        ["دندانپزشک"] = new[] { "دندان پزشک", "دندانپزشکی", "dental" },
    });

    /// <summary>
    /// Flattens {canonical → [synonyms]} into a {normalized synonym → canonical} lookup, also
    /// mapping each canonical's own normalized form to itself. If two entries normalize to the
    /// same key, the one enumerated later wins.
    /// </summary>
    private static Dictionary<string, string> BuildAliasMap(Dictionary<string, string[]> src)
    {
        var map = new Dictionary<string, string>();
        foreach (var (canonical, aliases) in src)
        {
            map[NormalizeFa(canonical)] = canonical;
            foreach (var alias in aliases)
            {
                map[NormalizeFa(alias)] = canonical;
            }
        }

        return map;
    }

    /// <summary>
    /// Normalizes a Persian string for comparison: Arabic yeh/kaf → Persian yeh/kaf, zero-width
    /// non-joiner → ordinary space (so joined and spaced spellings compare equal), trim, collapse
    /// whitespace runs to one space, and lowercase (so Latin tags like "ICU"/"icu" also match).
    /// Null is treated as the empty string.
    /// </summary>
    private static string NormalizeFa(string? s)
    {
        var unified = (s ?? "")
            .Replace('ي', 'ی')        // Arabic yeh → Persian yeh
            .Replace('ك', 'ک')        // Arabic kaf → Persian kaf
            .Replace('\u200C', ' ')   // ZWNJ → space
            .Trim();
        return Regex.Replace(unified, @"\s+", " ").ToLowerInvariant();
    }

    /// <summary>
    /// Truncates <paramref name="s"/> to at most <paramref name="max"/> UTF-16 chars and trims the
    /// result; strings already within the limit are returned unchanged (untrimmed).
    /// </summary>
    private static string Clamp(string s, int max)
    {
        if (s.Length <= max)
        {
            return s;
        }

        // Never cut a surrogate pair in half: a lone high surrogate is not valid text.
        var cut = max > 0 && char.IsHighSurrogate(s[max - 1]) ? max - 1 : max;
        return s[..cut].Trim();
    }

    /// <summary>
    /// Builds fresh ContactMethod instances for ONE listing (talent, job or shift) from the
    /// parser's contacts, in parser order. The AI's phone is inserted first (as Mobile) only when
    /// the parser found no Mobile/Phone contact at all.
    /// </summary>
    private static List<ContactMethod> BuildContacts(AiStructured? d, ParsedListing parsed)
    {
        var contacts = parsed.Contacts
            .Select((c, i) => new ContactMethod { Type = c.Type, Value = c.Value, SortOrder = i })
            .ToList();

        if (!string.IsNullOrWhiteSpace(d?.Phone)
            && !contacts.Any(c => c.Type is ContactType.Mobile or ContactType.Phone))
        {
            // SortOrder -1 keeps the AI phone ahead of the parser's 0-based entries.
            contacts.Insert(0, new ContactMethod { Type = ContactType.Mobile, Value = d!.Phone!.Trim(), SortOrder = -1 });
        }

        return contacts;
    }

    /// <summary>
    /// Maps the AI's shift-type string (case-insensitive, surrounding whitespace ignored) to the
    /// enum; unrecognized or null falls back to the parser's value, then to Day.
    /// </summary>
    private static ShiftType MapShiftType(string? ai, ShiftType? parsed) => ai?.Trim().ToLowerInvariant() switch
    {
        "day" => ShiftType.Day,
        "evening" => ShiftType.Evening,
        "night" => ShiftType.Night,
        "oncall" => ShiftType.OnCall,
        _ => parsed ?? ShiftType.Day,
    };

    /// <summary>
    /// Maps the AI's employment-type string (case-insensitive, surrounding whitespace ignored) to
    /// the enum; unrecognized or null falls back to the parser's value, then to FullTime.
    /// </summary>
    private static EmploymentType MapEmployment(string? ai, EmploymentType? parsed) => ai?.Trim().ToLowerInvariant() switch
    {
        "parttime" => EmploymentType.PartTime,
        "contract" => EmploymentType.Contract,
        "plan" => EmploymentType.Plan,
        "fulltime" => EmploymentType.FullTime,
        _ => parsed ?? EmploymentType.FullTime,
    };

    /// <summary>
    /// Default start/end times per shift type: Day 08–14, Evening 14–20, Night 20–08 (end is
    /// earlier than start, i.e. it crosses midnight), anything else 08–08.
    /// </summary>
    private static (TimeOnly, TimeOnly) DefaultTimes(ShiftType t) => t switch
    {
        ShiftType.Day => (new TimeOnly(8, 0), new TimeOnly(14, 0)),
        ShiftType.Evening => (new TimeOnly(14, 0), new TimeOnly(20, 0)),
        ShiftType.Night => (new TimeOnly(20, 0), new TimeOnly(8, 0)),
        _ => (new TimeOnly(8, 0), new TimeOnly(8, 0)),
    };

    /// <summary>Joins two notes with " | "; returns <paramref name="a"/> alone when <paramref name="b"/> is null or empty.</summary>
    private static string? Join(string a, string? b) => string.IsNullOrEmpty(b) ? a : $"{a} | {b}";

    /// <summary>
    /// Content hash used for dedupe: lowercase hex SHA-256 of the UTF-8 text after trimming and
    /// collapsing whitespace runs to a single space. Null hashes as the empty string.
    /// </summary>
    private static string Hash(string text)
    {
        var normalized = Regex.Replace((text ?? "").Trim(), @"\s+", " ");
        return Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes(normalized))).ToLowerInvariant();
    }

    /// <summary>
    /// Age of a post in whole days (never negative) — from the item's timestamp when present, else
    /// whatever <c>HtmlUtil.AgeDaysFromPersianText</c> extracts from the text. Callers treat null
    /// as "unknown age" and do not filter such posts out.
    /// </summary>
    private static int? PostAgeDays(ScrapedItem item)
    {
        if (item.PostedAt is DateTime posted)
        {
            // DateTime subtraction ignores Kind, so a Local timestamp would be skewed by the UTC
            // offset. Unspecified values are left as-is (treated as already UTC).
            var postedUtc = posted.Kind == DateTimeKind.Local ? posted.ToUniversalTime() : posted;
            return Math.Max(0, (int)Math.Floor((DateTime.UtcNow - postedUtc).TotalDays));
        }

        return HtmlUtil.AgeDaysFromPersianText(item.RawText);
    }
}