38031cb189
Phone fix: shifts/jobs showed Facility.Phone, but unnamed ads all share one placeholder facility, so every such listing displayed the same stale number while the ad's real phone sat unused in the description. ContactMethod is now attachable to a Shift/JobOpening (not just talent); ingestion stores the ad's own number(s) on each listing and the detail pages render them (new _ContactList partial), falling back to the facility phone only when the ad had none. Migration ShiftJobContacts (nullable owner FKs) — auto-applies on deploy. Stale applicants: skip «آماده به کار» posts older than 7 days at ingest, by the source's real timestamp (Telegram <time>, Bale date) or a Persian time-ago phrase in the text (Divar «۲ هفته پیش»). Recorded as Discarded; shifts/jobs are not aged out. Admin: Review page now shows a «مشاهده آگهی در منبع» link (RawListing.SourceUrl) so the source post can be checked before publishing. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
485 lines
27 KiB
C#
485 lines
27 KiB
C#
using System.Security.Cryptography;
|
||
using System.Text;
|
||
using System.Text.RegularExpressions;
|
||
using JobsMedical.Web.Data;
|
||
using JobsMedical.Web.Models;
|
||
using Microsoft.EntityFrameworkCore;
|
||
|
||
namespace JobsMedical.Web.Services.Scraping;
|
||
|
||
/// <summary>Per-source counters produced by one ingestion pass.</summary>
/// <param name="Source">Name of the listing source the counters belong to.</param>
/// <param name="Fetched">Number of items the source returned.</param>
/// <param name="Queued">Items stored for manual review.</param>
/// <param name="Published">Items published automatically.</param>
/// <param name="Flagged">Items stored as flagged.</param>
/// <param name="Spam">Items stored as discarded.</param>
/// <param name="Duplicates">Items skipped because their content hash was already known.</param>
public record SourceResult(
    string Source,
    int Fetched,
    int Queued,
    int Published,
    int Flagged,
    int Spam,
    int Duplicates);
|
||
|
||
/// <summary>Result of a whole ingestion run: the per-source results plus totals summed across them.</summary>
/// <param name="Sources">One <see cref="SourceResult"/> per source that was processed.</param>
public record IngestionSummary(List<SourceResult> Sources)
{
    /// <summary>Sum of <c>Fetched</c> over all sources.</summary>
    public int TotalFetched => Total(r => r.Fetched);

    /// <summary>Sum of <c>Queued</c> over all sources.</summary>
    public int TotalQueued => Total(r => r.Queued);

    /// <summary>Sum of <c>Published</c> over all sources.</summary>
    public int TotalPublished => Total(r => r.Published);

    /// <summary>Sum of <c>Flagged</c> over all sources.</summary>
    public int TotalFlagged => Total(r => r.Flagged);

    /// <summary>Sum of <c>Spam</c> over all sources.</summary>
    public int TotalSpam => Total(r => r.Spam);

    /// <summary>Sum of <c>Duplicates</c> over all sources.</summary>
    public int TotalDuplicates => Total(r => r.Duplicates);

    // Adds up one counter across every source result.
    private int Total(Func<SourceResult, int> counter) => Sources.Sum(counter);
}
|
||
|
||
/// <summary>
|
||
/// The scrape engine. For every enabled source: dedupe by content hash → parse → rule-validate →
|
||
/// (optional) AI audit → decide. Decision depends on admin settings:
|
||
/// • spam → Discarded
|
||
/// • AI on: AI verdict drives approve/reject/review; approve + Automatic + AiAutoApprove → publish
|
||
/// • AI off: Automatic + confidence ≥ threshold → publish; else queue/flag
|
||
/// "Publish" resolves-or-creates an (unverified) facility and creates the Shift/JobOpening.
|
||
/// </summary>
|
||
public class IngestionService
|
||
{
|
||
/// <summary>Maximum age, in days, of an applicant post; older ones (by the source's date, or a
/// Persian "time ago" phrase in the text) are skipped at ingest.</summary>
private const int TalentMaxAgeDays = 7;

// Database context: raw listings, taxonomy lookups and published records go through it.
private readonly AppDbContext _db;

// Every registered listing source; each one is fetched in turn during a run.
private readonly IEnumerable<IListingSource> _sources;

// Heuristic parser that extracts structured fields from an ad's raw text.
private readonly IListingParser _parser;

// Rule-based validator that scores the parsed ad.
private readonly ListingValidator _validator;

// AI auditor, consulted only when enabled in settings and the ad is not already spam.
private readonly IAiAuditor _ai;

// Provider of the admin settings read at the start of a run.
private readonly SettingsService _settings;

private readonly ILogger<IngestionService> _log;

/// <summary>Stores the injected collaborators; no work is done here.</summary>
public IngestionService(AppDbContext db, IEnumerable<IListingSource> sources, IListingParser parser,
    ListingValidator validator, IAiAuditor ai, SettingsService settings, ILogger<IngestionService> log)
{
    _db = db;
    _sources = sources;
    _parser = parser;
    _validator = validator;
    _ai = ai;
    _settings = settings;
    _log = log;
}

/// <summary>Names of all registered sources, materialized into a new list on each access.</summary>
public IReadOnlyList<string> SourceNames
{
    get { return _sources.Select(s => s.Name).ToList(); }
}

/// <summary>Shared placeholder facility name for unnamed ads. Per the original note it must stay
/// identical to the name used by Review.ResolveFacilityIdAsync (not visible in this file) so the
/// auto-publish and manual-review flows reuse ONE record.</summary>
private const string UnknownFacilityName = "نامشخص / ثبت نشده";
|
||
|
||
/// <summary>
/// Runs one ingestion pass over every registered source. Each fetched item is de-duplicated by
/// content hash, parsed, rule-validated, optionally AI-audited and then stored as a
/// <c>RawListing</c> whose status comes from <c>Decide</c>; items decided as Normalized are
/// published immediately. Changes are saved once per source, and a run-log row is written at the
/// end when at least one source produced items.
/// </summary>
/// <param name="ct">Cancellation token passed to the source fetch, database calls and AI audit.</param>
/// <returns>Per-source counters for the run.</returns>
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
{
    var settings = await _settings.GetAsync();
    var roles = await _db.Roles.ToListAsync(ct);
    var cities = await _db.Cities.ToListAsync(ct);
    var districts = await _db.Districts.ToListAsync(ct);
    var facilities = await _db.Facilities.ToListAsync(ct); // fuzzy-matched + grown as we create
    var roleNames = roles.Select(r => r.Name).ToList();
    var cityNames = cities.Select(c => c.Name).ToList();
    var districtNames = districts.Select(d => d.Name).ToList();

    var results = new List<SourceResult>();

    // Rows added during the current source batch that are not saved yet. The database lookup
    // below cannot see them, so without this map the same text appearing twice in one fetch
    // would be inserted (and possibly published) twice.
    var pendingByHash = new Dictionary<string, RawListing>(StringComparer.Ordinal);

    foreach (var source in _sources)
    {
        int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0, dupes = 0;
        IReadOnlyList<ScrapedItem> items;
        try { items = await source.FetchAsync(settings, ct); }
        catch (Exception ex) { _log.LogError(ex, "Source {Source} failed", source.Name); continue; }
        if (items.Count == 0) continue; // disabled/unconfigured source

        foreach (var item in items)
        {
            fetched++;
            var hash = Hash(item.RawText);

            // Check the unsaved rows of this batch first, then the database.
            if (!pendingByHash.TryGetValue(hash, out var existing))
                existing = await _db.RawListings.FirstOrDefaultAsync(r => r.ContentHash == hash, ct);

            if (existing is not null)
            {
                // Best-effort geo retry: coords are normally captured only on first ingest, but a
                // re-fetch may now expose a map center the first fetch lacked (Divar can fail-soft to
                // null on a bad response / out-of-bbox). Backfill the cached row when this fetch has
                // coords and the row has none, so an item still sitting in the queue can be placed on
                // the map when an admin publishes it. (A full refresh is the purge-and-reingest flow.)
                if (existing.Lat is null && item.Lat is not null) { existing.Lat = item.Lat; existing.Lng = item.Lng; }
                dupes++;
                continue;
            }

            var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
            var val = _validator.Validate(item.RawText, parsed);

            // Drop STALE applicant posts — a person's availability goes cold fast.
            // Age = the source's real timestamp, else a Persian "time ago" phrase in the text.
            // Recorded as Discarded (keeps the dedupe hash + audit trail; no AI spend) and counted
            // under the spam counter. Shifts/jobs are NOT aged out — their dates are in the future.
            if (parsed.Kind == ListingKind.Talent && PostAgeDays(item) is int age && age > TalentMaxAgeDays)
            {
                var stale = new RawListing
                {
                    SourceChannel = item.Source, SourceUrl = item.SourceUrl, RawText = item.RawText.Trim(),
                    ContentHash = hash, Confidence = 0, Status = RawListingStatus.Discarded,
                    ValidationNotes = $"آمادهبهکارِ قدیمی ({age} روز) — نادیده گرفته شد",
                    Lat = item.Lat, Lng = item.Lng,
                };
                _db.RawListings.Add(stale);
                pendingByHash[hash] = stale;
                spam++;
                continue;
            }

            AiAuditResult? ai = null;
            if (settings.AiEnabled && !val.IsSpam)
                ai = await _ai.AuditAsync(item.RawText, settings, ct);

            var (status, reason, confidence) = Decide(settings, val, ai);

            var raw = new RawListing
            {
                SourceChannel = item.Source,
                SourceUrl = item.SourceUrl,
                RawText = item.RawText.Trim(),
                ContentHash = hash,
                Confidence = confidence,
                ValidationNotes = reason,
                Status = status,
                Lat = item.Lat, Lng = item.Lng, // approx. map coords (Divar) → facility on publish
            };
            _db.RawListings.Add(raw);
            pendingByHash[hash] = raw;

            if (status == RawListingStatus.Normalized)
            {
                try { Publish(parsed, ai, raw, roles, cities, districts, facilities); published++; }
                catch (Exception ex) { _log.LogWarning(ex, "Auto-publish failed; queueing instead"); raw.Status = RawListingStatus.New; queued++; }
            }
            else if (status == RawListingStatus.New) queued++;
            else if (status == RawListingStatus.Flagged) flagged++;
            else spam++;
        }

        await _db.SaveChangesAsync(ct);
        pendingByHash.Clear(); // saved rows are now visible to the database lookup

        results.Add(new SourceResult(source.Name, fetched, queued, published, flagged, spam, dupes));
        _log.LogInformation("Ingest {S}: fetched={F} queued={Q} published={P} flagged={Fl} spam={Sp} dupes={D}",
            source.Name, fetched, queued, published, flagged, spam, dupes);
    }

    var summary = new IngestionSummary(results);

    // Persist a run-log row so admins get a crawl history (with a per-source breakdown).
    if (results.Count > 0)
    {
        var detail = string.Join("؛ ", results.Select(r =>
            $"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}"));
        _db.IngestionRuns.Add(new IngestionRun
        {
            Fetched = summary.TotalFetched,
            Queued = summary.TotalQueued,
            Published = summary.TotalPublished,
            Flagged = summary.TotalFlagged,
            Spam = summary.TotalSpam,
            Duplicates = summary.TotalDuplicates,
            Detail = detail.Length > 2000 ? detail[..2000] : detail, // cut the breakdown at 2000 chars
        });
        await _db.SaveChangesAsync(ct);
    }

    return summary;
}
|
||
|
||
/// <summary>
/// Chooses the status, note and confidence to store for one ad.
/// Spam is discarded. With an AI verdict: reject → Discarded; approve → Flagged when the
/// rule-validator saw no medical signal, otherwise Normalized (Automatic mode + AiAutoApprove) or
/// New; any other verdict → Flagged. Without AI: invalid → Flagged; Automatic mode with
/// confidence at or above the threshold → Normalized; otherwise New.
/// </summary>
/// <param name="s">Admin settings (mode, AI auto-approve flag, auto-publish threshold).</param>
/// <param name="val">Result of the rule-based validation.</param>
/// <param name="ai">AI audit result, or null when the AI was not consulted.</param>
/// <returns>The status to store, the note text (may be null) and the confidence to record.</returns>
private static (RawListingStatus status, string? reason, int confidence) Decide(
    AppSetting s, ValidationResult val, AiAuditResult? ai)
{
    string? notes = null;
    if (val.Issues.Count > 0)
        notes = string.Join("؛ ", val.Issues);

    if (val.IsSpam)
        return (RawListingStatus.Discarded, Join("اسپم", notes), val.Confidence);

    // No AI verdict: rules and the confidence threshold decide.
    if (ai is null)
    {
        if (!val.IsValid)
            return (RawListingStatus.Flagged, notes, val.Confidence);

        var meetsThreshold = s.Mode == IngestionMode.Automatic && val.Confidence >= s.AutoPublishMinConfidence;
        return (meetsThreshold ? RawListingStatus.Normalized : RawListingStatus.New, notes, val.Confidence);
    }

    var verdict = $"AI: {ai.Decision} ({ai.Confidence}٪)";
    if (ai.Reason is not null)
        verdict += $" — {ai.Reason}";
    var aiNote = Join(verdict, notes);

    if (ai.Reject)
        return (RawListingStatus.Discarded, aiNote, ai.Confidence);

    if (!ai.Approve)
        return (RawListingStatus.Flagged, aiNote, ai.Confidence); // review

    // MEDICAL GATE: the rule-validator's medical signal vetoes an AI approval. The AI can
    // hallucinate (e.g. approving a product ad as a nursing job) — when our own keyword/role
    // check sees nothing clinical, never auto-publish; send to review.
    if (!val.LooksMedical)
        return (RawListingStatus.Flagged, Join("هوش مصنوعی تأیید کرد ولی نشانهٔ کادر درمان یافت نشد — بررسی دستی", aiNote), ai.Confidence);

    var approvedStatus = s.Mode == IngestionMode.Automatic && s.AiAutoApprove
        ? RawListingStatus.Normalized
        : RawListingStatus.New;
    return (approvedStatus, aiNote, ai.Confidence);
}
|
||
|
||
/// <summary>
/// Creates the public records for one accepted ad and marks <paramref name="raw"/> as Normalized.
/// Up to four roles are resolved (AI role first, then parser roles; unknown ones are created) and
/// one listing is added per role: a TalentListing for applicant posts, otherwise a JobOpening or
/// a Shift attached to a matched-or-newly-created facility. Entities are only added to the
/// context; saving is left to the caller.
/// </summary>
/// <param name="parsed">Heuristic parse of the ad text.</param>
/// <param name="ai">AI audit result; its structured data, when present, is preferred over the parser's.</param>
/// <param name="raw">The raw listing row being published (text, source URL, approximate coords).</param>
/// <param name="roles">In-memory role list; grows when a role is created.</param>
/// <param name="cities">In-memory city list used to resolve the ad's city.</param>
/// <param name="districts">In-memory district list used to resolve the ad's district.</param>
/// <param name="facilities">In-memory facility list; grows when a facility is created.</param>
private void Publish(ParsedListing parsed, AiAuditResult? ai, RawListing raw,
    List<Role> roles, List<City> cities, List<District> districts, List<Facility> facilities)
{
    var d = ai?.Data;
    var cityName = d?.City ?? parsed.CityName;
    var districtName = d?.District ?? parsed.DistrictName;

    // One ad can name several roles — resolve them all and publish one listing per role so each
    // is browsable/filterable. Capped to avoid spam. The AI's role (+ its category) is the
    // trusted, possibly-new one; parser names are already canonical matches. Unknown roles are
    // CREATED (dynamic taxonomy), not dropped.
    var candidates = new List<(string name, string? category)>();
    if (!string.IsNullOrWhiteSpace(d?.Role)) candidates.Add((d!.Role!.Trim(), d.Category));
    foreach (var n in parsed.RoleNames) candidates.Add((n, null));
    if (parsed.RoleName is not null) candidates.Add((parsed.RoleName, null));

    var pubRoles = new List<Role>();
    foreach (var (name, category) in candidates)
    {
        if (string.IsNullOrWhiteSpace(name)) continue;
        var role = ResolveOrCreateRole(roles, name, category);
        if (!pubRoles.Contains(role)) pubRoles.Add(role);
        if (pubRoles.Count >= 4) break;
    }
    // No role could be resolved: fall back to the first known role (throws when none exist).
    if (pubRoles.Count == 0) pubRoles.Add(roles.First());

    // City: exact name match, else the first active city, else the first city at all.
    var city = cities.FirstOrDefault(c => c.Name == cityName)
        ?? cities.FirstOrDefault(c => c.IsActive) ?? cities.First();
    var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id);

    var kindStr = (d?.Kind ?? parsed.Kind.ToString()).ToLowerInvariant();

    // Applicant post — a worker offering themselves. No facility involved.
    if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده"))
    {
        // Prefer the AI's values when present, else the heuristic parser.
        var tPay = d?.PayAmount ?? parsed.PayAmount;
        var tShare = d?.SharePercent ?? parsed.SharePercent;
        foreach (var role in pubRoles)
        {
            _db.TalentListings.Add(new TalentListing
            {
                Role = role, City = city, DistrictId = district?.Id,
                PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
                YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
                IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
                AreaNote = parsed.AreaNote,
                Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
                Gender = parsed.Gender,
                PayType = tShare is not null && tPay is null ? PayType.Percentage
                    : tPay is null ? PayType.Negotiable : PayType.PerShift,
                PayAmount = tPay, SharePercent = tShare,
                Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
                Description = raw.RawText,
                Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
                Contacts = BuildContacts(d, parsed), // fresh instances per listing
                Tags = BuildTags(parsed, d, role, city),
            });
        }
        raw.Status = RawListingStatus.Normalized;
        return;
    }

    // Never surface the crawl source in a public facility name. An unnamed ad falls back to ONE
    // shared placeholder. That placeholder is shared by every unnamed ad in a city, so it must
    // NEVER receive a single ad's own data: neither its fuzzy coords (that would mis-place
    // unrelated listings on the map) nor its phone (every later unnamed listing without contacts
    // of its own would fall back to that unrelated number).
    bool unnamed = string.IsNullOrWhiteSpace(d?.FacilityName) && string.IsNullOrWhiteSpace(parsed.FacilityName);
    var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
        : !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
        : UnknownFacilityName;

    // Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
    var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
    if (facility is null)
    {
        facility = new Facility
        {
            Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
            // The ad's own number is kept on each listing's Contacts; the shared placeholder gets none.
            Phone = unnamed ? null
                : !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
            IsVerified = false,
            Lat = unnamed ? null : raw.Lat, Lng = unnamed ? null : raw.Lng, // approx. Divar map center
        };
        _db.Facilities.Add(facility);
        facilities.Add(facility); // so later listings in this run match it too
    }
    else if (!unnamed && facility.Lat is null && facility.Lng is null && raw.Lat is not null)
    {
        // Backfill coords only when the matched (real, named) facility has none — never overwrite a
        // real (employer-set or verified) location with Divar's fuzzy point.
        facility.Lat = raw.Lat; facility.Lng = raw.Lng;
    }

    if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
    {
        foreach (var role in pubRoles)
        {
            _db.JobOpenings.Add(new JobOpening
            {
                Facility = facility, Role = role,
                // The AI title is used only for a single-role ad; multi-role ads get a per-role title.
                Title = !string.IsNullOrWhiteSpace(d?.Title) && pubRoles.Count == 1 ? d!.Title!.Trim() : $"استخدام {role.Name}",
                EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
                SalaryMin = parsed.PayAmount,
                Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
                SourceUrl = raw.SourceUrl,
                Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
            });
        }
    }
    else
    {
        var st = MapShiftType(d?.ShiftType, parsed.ShiftType);
        var (start, end) = DefaultTimes(st);
        foreach (var role in pubRoles)
        {
            _db.Shifts.Add(new Shift
            {
                Facility = facility, Role = role,
                Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1), // tomorrow (UTC date)
                StartTime = start, EndTime = end, ShiftType = st,
                SpecialtyRequired = role.Name, Description = raw.RawText,
                PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage
                    : parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift,
                PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent,
                Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
                Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
            });
        }
    }
    raw.Status = RawListingStatus.Normalized;
}
|
||
|
||
/// <summary>Builds the space-separated tag string for a listing: the parser's tags, then the
/// role name, role category and city name, then the AI's tags (trimmed). Blank entries are
/// dropped and duplicates removed, keeping first-seen order.</summary>
/// <param name="parsed">Parser output supplying the base tags.</param>
/// <param name="d">AI structured data; its tags are appended when present.</param>
/// <param name="role">Role of the listing being built.</param>
/// <param name="city">City of the listing being built.</param>
/// <returns>The distinct, non-blank tags joined with single spaces.</returns>
private static string BuildTags(ParsedListing parsed, AiStructured? d, Role role, City city)
{
    var collected = new List<string>(parsed.Tags);
    collected.Add(role.Name);
    collected.Add(role.Category);
    collected.Add(city.Name);

    if (d?.Tags is not null)
    {
        foreach (var aiTag in d.Tags)
        {
            if (string.IsNullOrWhiteSpace(aiTag)) continue;
            collected.Add(aiTag.Trim());
        }
    }

    var usable = collected.Where(tag => !string.IsNullOrWhiteSpace(tag)).Distinct();
    return string.Join(" ", usable);
}
|
||
|
||
/// <summary>Maps a role name onto an existing <c>Role</c>, creating one when nothing matches.
/// Lookup order: (1) a role whose normalized name equals the normalized input, (2) the canonical
/// role the input is an alias of, (3) a new role — named after the canonical when an alias
/// matched but that canonical does not exist yet. A created role is added to both the context and
/// <paramref name="roles"/> so it can be reused later in the same run.</summary>
/// <param name="roles">In-memory role list; appended to when a role is created.</param>
/// <param name="name">Role name as found in the ad or returned by the AI.</param>
/// <param name="category">Suggested category for a newly created role; may be null.</param>
/// <returns>The matching or newly created role.</returns>
private Role ResolveOrCreateRole(List<Role> roles, string name, string? category)
{
    // Finds a known role by its normalized name.
    Role? FindByNormalizedName(string normalized) =>
        roles.FirstOrDefault(r => NormalizeFa(r.Name) == normalized);

    var key = NormalizeFa(name);

    // (1) Already a known role (same word or spelling variant).
    if (FindByNormalizedName(key) is { } known)
        return known;

    // (2) A synonym of a canonical role → use that role; don't create a duplicate.
    if (RoleAliases.TryGetValue(key, out var canonical))
    {
        if (FindByNormalizedName(NormalizeFa(canonical)) is { } viaAlias)
            return viaAlias;

        name = canonical; // canonical not seeded yet → create under its proper name
    }

    // (3) Genuinely new role — create it under a canonical-resolved category.
    var roleName = Clamp(name.Trim(), 100);                             // respect Role.Name MaxLength(100)
    var roleCategory = Clamp(ResolveCategory(roles, category), 50);     // respect Role.Category MaxLength(50)
    var nextSortOrder = (roles.Count == 0 ? 0 : roles.Max(r => r.SortOrder)) + 1;

    var created = new Role
    {
        Name = roleName,
        Category = roleCategory,
        IsActive = true,
        SortOrder = nextSortOrder,
    };

    _db.Roles.Add(created);
    roles.Add(created); // reuse within this run (saved with the batch at end of source)
    _log.LogInformation("Ingestion introduced new role «{Role}» (category «{Category}») from AI.",
        created.Name, created.Category);

    return created;
}
|
||
|
||
/// <summary>Turns a suggested category into the one to store: blank input becomes «سایر», a known
/// alias becomes its canonical, and if any existing role already carries a category that
/// normalizes to the same text, that stored spelling is returned instead.</summary>
/// <param name="roles">Existing roles whose categories are checked for a match.</param>
/// <param name="category">Suggested category; may be null or blank.</param>
/// <returns>The category string to use for a new role.</returns>
private static string ResolveCategory(List<Role> roles, string? category)
{
    var requested = string.IsNullOrWhiteSpace(category) ? "سایر" : category!.Trim();

    // Alias → canonical first; fall back to the requested text when no alias matches.
    if (!CategoryAliases.TryGetValue(NormalizeFa(requested), out var wanted))
        wanted = requested;

    var wantedKey = NormalizeFa(wanted);

    // Always prefer a spelling already stored on a role, so the same group is never forked.
    foreach (var role in roles)
    {
        var stored = role.Category;
        if (!string.IsNullOrWhiteSpace(stored) && NormalizeFa(stored) == wantedKey)
            return stored;
    }

    return wanted;
}
|
||
|
||
// Lookup from a normalized synonym/abbreviation to the canonical ROLE name, built once from the
// {canonical → synonyms} table below. It lets a differently-worded role name map onto the
// canonical role instead of creating a new one. Lookups must pass a NormalizeFa-normalized key.
private static readonly Dictionary<string, string> RoleAliases = BuildAliasMap(
    new Dictionary<string, string[]>
    {
        ["پزشک عمومی"] = new string[] { "دکتر", "طبیب", "پزشک", "جی پی", "gp", "general practitioner" },
        ["پزشک متخصص"] = new string[] { "متخصص", "فوق تخصص", "اسپشالیست", "specialist" },
        ["پرستار"] = new string[] { "نرس", "nurse", "پرستاری", "کارشناس پرستاری" },
        ["پرستار سالمندان"] = new string[] { "مراقب سالمند", "مراقب سالمندان", "پرستار سالمند", "نگهدار سالمند", "مراقبت سالمند" },
        ["ماما"] = new string[] { "مامایی", "کارشناس مامایی", "midwife" },
        ["تکنسین اتاق عمل"] = new string[] { "اتاق عمل", "اسکراب", "scrub", "تکنولوژیست اتاق عمل" },
        ["تکنسین فوریتهای پزشکی"] = new string[] { "فوریت پزشکی", "تکنسین اورژانس", "پارامدیک", "paramedic", "emt", "اورژانس ۱۱۵" },
        ["کارشناس آزمایشگاه"] = new string[] { "علوم آزمایشگاهی", "تکنسین آزمایشگاه", "آزمایشگاهی", "لابراتوار", "lab", "laboratory" },
        ["دندانپزشک"] = new string[] { "دندان پزشک", "دندون پزشک", "dentist" },
    });
|
||
|
||
// Lookup from a normalized synonym to the canonical CATEGORY (the role group), built once from
// the {canonical → synonyms} table below. Lookups must pass a NormalizeFa-normalized key.
private static readonly Dictionary<string, string> CategoryAliases = BuildAliasMap(
    new Dictionary<string, string[]>
    {
        ["پزشک"] = new string[] { "دکتر", "طبیب", "doctor", "پزشکی" },
        ["پرستار"] = new string[] { "پرستاری", "nurse", "nursing" },
        ["ماما"] = new string[] { "مامایی", "midwifery" },
        ["تکنسین"] = new string[] { "تکنیسین", "تکنولوژیست", "technician", "کاردان فنی" },
        ["دندانپزشک"] = new string[] { "دندان پزشک", "دندانپزشکی", "dental" },
    });
|
||
|
||
/// <summary>Inverts a {canonical → synonyms} table into a {normalized text → canonical} lookup.
/// Each canonical's own normalized form is registered too. When two entries normalize to the same
/// key, the one processed later wins.</summary>
/// <param name="src">Canonical names mapped to their synonyms.</param>
/// <returns>A new dictionary keyed by normalized synonym or canonical text.</returns>
private static Dictionary<string, string> BuildAliasMap(Dictionary<string, string[]> src)
{
    var lookup = new Dictionary<string, string>();

    foreach (var entry in src)
    {
        var canonical = entry.Key;
        lookup[NormalizeFa(canonical)] = canonical;

        foreach (var synonym in entry.Value)
        {
            lookup[NormalizeFa(synonym)] = canonical;
        }
    }

    return lookup;
}
|
||
|
||
/// <summary>Normalizes a Persian string for matching/dedupe: unifies Arabic ي→ی and ك→ک, turns the
/// zero-width non-joiner (U+200C) into a plain space, trims, collapses every whitespace run to a
/// single space and lower-cases (so Latin tags like "ICU"/"icu" also match).</summary>
/// <param name="s">Text to normalize; null is treated as empty.</param>
/// <returns>The normalized text; never null.</returns>
private static string NormalizeFa(string? s)
{
    var unified = (s ?? "")
        .Replace('ي', 'ی')       // Arabic yeh → Persian yeh
        .Replace('ك', 'ک')       // Arabic kaf → Persian kaf
        // The ZWNJ is written as an escape on purpose: as a literal it is invisible and is
        // silently dropped by some tools, which leaves an empty (invalid) char literal behind.
        .Replace('\u200C', ' ')
        .Trim();

    return Regex.Replace(unified, @"\s+", " ").ToLowerInvariant();
}
|
||
|
||
/// <summary>Returns <paramref name="s"/> unchanged when it has at most <paramref name="max"/>
/// characters; otherwise its first <paramref name="max"/> characters with surrounding whitespace
/// trimmed.</summary>
private static string Clamp(string s, int max)
{
    if (s.Length <= max)
    {
        return s;
    }

    return s.Substring(0, max).Trim();
}
|
||
|
||
/// <summary>Builds a fresh list of <c>ContactMethod</c> rows for one listing (talent, job or
/// shift): one row per parser contact, numbered 0, 1, 2… in order. When the AI supplied a phone
/// and the parser found no mobile/phone contact, the AI phone is inserted first as a Mobile
/// contact with sort order -1.</summary>
/// <param name="d">AI structured data; only its phone is read. May be null.</param>
/// <param name="parsed">Parser output supplying the base contacts.</param>
/// <returns>A new list of new instances on every call.</returns>
private static List<ContactMethod> BuildContacts(AiStructured? d, ParsedListing parsed)
{
    var rows = new List<ContactMethod>();
    var position = 0;
    foreach (var found in parsed.Contacts)
    {
        rows.Add(new ContactMethod { Type = found.Type, Value = found.Value, SortOrder = position });
        position++;
    }

    var aiPhone = d?.Phone;
    if (string.IsNullOrWhiteSpace(aiPhone))
    {
        return rows;
    }

    var hasNumber = rows.Any(c => c.Type is ContactType.Mobile or ContactType.Phone);
    if (!hasNumber)
    {
        rows.Insert(0, new ContactMethod { Type = ContactType.Mobile, Value = aiPhone.Trim(), SortOrder = -1 });
    }

    return rows;
}
|
||
|
||
/// <summary>Maps the AI's shift-type text (case-insensitive "day", "evening", "night", "oncall")
/// to a <c>ShiftType</c>. Any other value, including null, yields the parser's value, or Day when
/// the parser has none.</summary>
private static ShiftType MapShiftType(string? ai, ShiftType? parsed)
{
    switch (ai?.ToLowerInvariant())
    {
        case "day":
            return ShiftType.Day;
        case "evening":
            return ShiftType.Evening;
        case "night":
            return ShiftType.Night;
        case "oncall":
            return ShiftType.OnCall;
        default:
            return parsed ?? ShiftType.Day;
    }
}
|
||
|
||
/// <summary>Maps the AI's employment-type text (case-insensitive "parttime", "contract", "plan",
/// "fulltime") to an <c>EmploymentType</c>. Any other value, including null, yields the parser's
/// value, or FullTime when the parser has none.</summary>
private static EmploymentType MapEmployment(string? ai, EmploymentType? parsed)
{
    switch (ai?.ToLowerInvariant())
    {
        case "parttime":
            return EmploymentType.PartTime;
        case "contract":
            return EmploymentType.Contract;
        case "plan":
            return EmploymentType.Plan;
        case "fulltime":
            return EmploymentType.FullTime;
        default:
            return parsed ?? EmploymentType.FullTime;
    }
}
|
||
|
||
/// <summary>Default start/end times for a shift type: Day 08:00–14:00, Evening 14:00–20:00,
/// Night 20:00–08:00, anything else 08:00–08:00.</summary>
private static (TimeOnly, TimeOnly) DefaultTimes(ShiftType t)
{
    var morning = new TimeOnly(8, 0);
    var afternoon = new TimeOnly(14, 0);
    var evening = new TimeOnly(20, 0);

    switch (t)
    {
        case ShiftType.Day:
            return (morning, afternoon);
        case ShiftType.Evening:
            return (afternoon, evening);
        case ShiftType.Night:
            return (evening, morning);
        default:
            return (morning, morning);
    }
}
|
||
|
||
/// <summary>Joins two note fragments as "a | b"; returns <paramref name="a"/> alone when
/// <paramref name="b"/> is null or empty.</summary>
private static string? Join(string a, string? b)
{
    if (string.IsNullOrEmpty(b))
    {
        return a;
    }

    return a + " | " + b;
}
|
||
|
||
/// <summary>Content hash used for dedupe: the text is trimmed, every whitespace run is collapsed
/// to one space, and the UTF-8 bytes are hashed with SHA-256. Null is treated as empty.</summary>
/// <returns>The digest as a lower-case hex string.</returns>
private static string Hash(string text)
{
    var trimmed = text is null ? "" : text.Trim();
    var collapsed = Regex.Replace(trimmed, @"\s+", " ");
    byte[] digest = SHA256.HashData(Encoding.UTF8.GetBytes(collapsed));
    return Convert.ToHexString(digest).ToLowerInvariant();
}
|
||
|
||
/// <summary>Age of a post in whole days. Uses the item's <c>PostedAt</c> timestamp when it has
/// one (measured against the current UTC time, never below zero); otherwise defers to
/// <c>HtmlUtil.AgeDaysFromPersianText</c> on the raw text. Per the original note, a null result
/// means the age is unknown, so the post is not filtered out.</summary>
/// <param name="item">The scraped item to date.</param>
/// <returns>Whole days since posting, or whatever the text-based helper returns.</returns>
private static int? PostAgeDays(ScrapedItem item)
{
    if (item.PostedAt is not DateTime posted)
    {
        return HtmlUtil.AgeDaysFromPersianText(item.RawText);
    }

    var elapsed = DateTime.UtcNow - posted;
    var wholeDays = (int)Math.Floor(elapsed.TotalDays);
    return Math.Max(0, wholeDays); // a timestamp in the future counts as age 0
}
|
||
}
|