Files
hamkadr/src/JobsMedical.Web/Services/ListingParser.cs
T
soroush.asadi 1f628d971e
CI/CD / CI · dotnet build (push) Successful in 1m54s
CI/CD / Deploy · hamkadr (push) Successful in 2m19s
Default aggregated ads to Job, not Shift (stop fabricating shift dates/times)
A generic hiring ad like «پرستار درمانگاه» was published as a dated SHIFT with an invented date
(«فردا») and default hours («۰۸:۰۰–۱۴:۰۰») the source never stated — because classification defaulted
to Shift. Now a dated Shift is only produced when the text carries an explicit shift signal
(شیفت/آنکال/کشیک/نوبت); everything else is an ongoing hiring post → Job (no date to invent). Fixed in
both the parser default and the Publish branch (so an AI mislabel can't force a shift either).

ReclassifyMisclassifiedShiftsAsync (in the post-ingest auto-cleanup) converts the existing signal-less
aggregated shifts into jobs in place — copies the content to a JobOpening and archives the old shift
(its URL 410s). After one pass it's a no-op since new ads no longer become shifts.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-23 07:08:47 +03:30

451 lines
26 KiB
C#
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Text.RegularExpressions;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services;
/// <summary>One contact channel pulled from a post: the channel type plus the extracted value.</summary>
/// <param name="Type">Kind of channel (mobile, landline, e-mail, Instagram, …).</param>
/// <param name="Value">The extracted value: phone digits, e-mail address, handle or URL.</param>
public record ParsedContact(ContactType Type, string Value);
/// <summary>
/// Structured guess extracted from a raw channel post. All fields are best-effort: a property keeps
/// its initializer/default value when the parser finds no evidence for it.
/// </summary>
public class ParsedListing
{
// The declared default is Shift, but HeuristicListingParser.Parse always assigns Kind explicitly.
public ListingKind Kind { get; set; } = ListingKind.Shift;
public string? RoleName { get; set; } // primary role (first entry of RoleNames)
public List<string> RoleNames { get; set; } = new(); // every role named in the ad (an ad can list several)
public ShiftType? ShiftType { get; set; }
public EmploymentType? EmploymentType { get; set; }
public long? PayAmount { get; set; } // shift pay or single salary figure
public int? SharePercent { get; set; } // profit-share percentage, when the ad pays by share of income
public bool PayNegotiable { get; set; }
public Gender Gender { get; set; } = Gender.Any; // required gender, Any when none is stated
public string? CityName { get; set; }
public string? DistrictName { get; set; }
public string? FacilityName { get; set; } // hospital/clinic name guessed from the text
public string? Phone { get; set; }
// Talent ("ready to work") extras — populated when Kind == Talent.
public string? PersonName { get; set; } // title plus name words, e.g. "Dr. <first> <last>"
public int? YearsExperience { get; set; } // years of experience
public bool IsLicensed { get; set; } // holds a practice licence
public string? AreaNote { get; set; } // area restriction phrase, e.g. "district 1"
public List<ParsedContact> Contacts { get; set; } = new(); // phones, email, socials…
public List<string> Tags { get; set; } = new(); // cert/skill keywords for search
public List<string> Notes { get; set; } = new(); // what was/wasn't detected (shown to admin)
}
/// <summary>
/// Turns a messy Persian channel/Divar post into a structured listing guess. This is the
/// Stage-1 implementation: transparent keyword + regex heuristics, no AI dependency (important
/// since LLM APIs are blocked from Iran). A future LlmListingParser can implement the same
/// interface and be swapped in via DI without touching the admin queue.
/// </summary>
public interface IListingParser
{
/// <summary>Parses one raw post into a best-effort <see cref="ParsedListing"/>.</summary>
/// <param name="rawText">The post text as received from the source.</param>
/// <param name="knownRoles">Role names the parser may report.</param>
/// <param name="knownCities">City names to look for in the text.</param>
/// <param name="knownDistricts">District names to look for in the text.</param>
/// <returns>The structured guess for this post.</returns>
ParsedListing Parse(string rawText, IEnumerable<string> knownRoles,
IEnumerable<string> knownCities, IEnumerable<string> knownDistricts);
}
public class HeuristicListingParser : IListingParser
{
/// <summary>
/// Parses one raw post into a best-effort <see cref="ParsedListing"/> using keyword and regex
/// heuristics only. Human-readable remarks about what was (or was not) detected are collected
/// in <see cref="ParsedListing.Notes"/>.
/// </summary>
/// <param name="raw">The post text; <c>null</c> is treated as an empty post.</param>
/// <param name="knownRoles">Taxonomy role names; only these can be reported as roles.</param>
/// <param name="knownCities">City names to look for; the first one found in the text wins.</param>
/// <param name="knownDistricts">District names to look for; the longest one found wins.</param>
/// <returns>A new <see cref="ParsedListing"/>; never <c>null</c>.</returns>
public ParsedListing Parse(string raw, IEnumerable<string> knownRoles,
    IEnumerable<string> knownCities, IEnumerable<string> knownDistricts)
{
    var p = new ParsedListing();
    // A null post is parsed as empty text instead of throwing inside Normalize.
    var text = Normalize(raw ?? string.Empty);
    // Digit-normalized copy shared by the percentage and years-of-experience regexes.
    var latin = ToLatinDigits(text);

    // --- Kind: talent (worker offers themselves) vs shift vs hiring ---
    // Talent is checked first: these phrases mean the *person* is available, which is distinct
    // from an employer's invitation to cooperate.
    bool talentSignals = ContainsAny(text,
        "آماده به کار", "آماده‌به‌کار", "آماده همکاری", "آماده‌ی همکاری", "آماده ی همکاری",
        "آماده فعالیت", "جویای کار", "جویای کار هستم", "متقاضی کار", "نیازمند کار",
        "آماده انجام", "می‌توانم همکاری", "میتوانم همکاری", "حاضر به همکاری");
    bool shiftSignals = ContainsAny(text, "شیفت", "آنکال", "انکال", "نوبت", "کشیک");
    if (talentSignals)
    {
        p.Kind = ListingKind.Talent;
        p.Notes.Add("نوع: آماده به کار (تشخیص خودکار)");
    }
    else
    {
        // A dated SHIFT requires an explicit shift signal. Otherwise the ad is an ongoing hiring
        // post → Job. (Defaulting to Shift forced a fabricated date/time onto generic ads, which
        // the source never stated.) Hiring keywords therefore no longer influence the decision.
        p.Kind = shiftSignals ? ListingKind.Shift : ListingKind.Job;
        p.Notes.Add(p.Kind == ListingKind.Shift ? "نوع: شیفت (تشخیص خودکار)" : "نوع: استخدام (تشخیص خودکار)");
    }

    // --- Roles (an ad can name several at once) ---
    var known = knownRoles.ToList();
    var hits = new List<string>();
    // Exact taxonomy matches, longest first so a specific role is seen before its generic stem.
    foreach (var role in known.OrderByDescending(r => r.Length))
    {
        if (text.Contains(Normalize(role))) hits.Add(role);
    }
    // Drop a role that is a substring of a longer matched role.
    hits = hits.Where(r => !hits.Any(o => o != r && o.Contains(r))).Distinct().ToList();

    // Synonyms → canonical role names (covers terms not written verbatim). Only add a canonical
    // that actually exists in the taxonomy, and isn't already a hit.
    void AddSyn(string canonical, params string[] needles)
    {
        if (ContainsAny(text, needles) && known.Contains(canonical) && !hits.Contains(canonical))
            hits.Add(canonical);
    }
    AddSyn("پرستار سالمندان", "سالمند", "سالمندان", "نگهداری سالمند");
    AddSyn("دندانپزشک", "دندان", "دندانپزشک", "دندان‌پزشک");
    AddSyn("تکنسین اتاق عمل", "اتاق عمل", "اسکراب");
    AddSyn("تکنسین فوریت‌های پزشکی", "فوریت", "اورژانس پیش بیمارستانی", "آمبولانس");
    AddSyn("کارشناس آزمایشگاه", "آزمایشگاه", "علوم آزمایشگاهی", "نمونه گیر");
    AddSyn("ماما", "مامایی");
    AddSyn("پرستار", "بهیار", "کمک بهیار", "کمک پرستار", "بیماربر", "مراقب", "همراه بیمار",
        "کودک", "اطفال", "نوزاد", "تزریقات", "پانسمان");
    AddSyn("پزشک متخصص", "فوق تخصص", "متخصص");
    AddSyn("پزشک عمومی", "پزشک", "دکتر", "طبیب");
    p.RoleNames = hits.Distinct().Take(4).ToList(); // cap fan-out
    p.RoleName = p.RoleNames.FirstOrDefault();
    p.Notes.Add(p.RoleNames.Count == 0 ? "نقش: تشخیص داده نشد" : $"نقش‌ها: {string.Join("، ", p.RoleNames)}");

    // --- Shift type (first matching keyword wins: on-call, night, evening, day) ---
    if (ContainsAny(text, "آنکال", "انکال")) p.ShiftType = Models.ShiftType.OnCall;
    else if (text.Contains("شب")) p.ShiftType = Models.ShiftType.Night;
    else if (text.Contains("عصر")) p.ShiftType = Models.ShiftType.Evening;
    else if (ContainsAny(text, "صبح", "روز")) p.ShiftType = Models.ShiftType.Day;

    // --- Employment type ---
    if (ContainsAny(text, "پاره وقت", "پاره‌وقت", "پارت تایم")) p.EmploymentType = Models.EmploymentType.PartTime;
    else if (text.Contains("طرح")) p.EmploymentType = Models.EmploymentType.Plan;
    else if (text.Contains("قرارداد")) p.EmploymentType = Models.EmploymentType.Contract;
    else if (ContainsAny(text, "تمام وقت", "تمام‌وقت")) p.EmploymentType = Models.EmploymentType.FullTime;

    // --- Gender requirement (female keywords are checked before male ones) ---
    if (ContainsAny(text, "خانم", "خانوم", "بانو", "زن ", "مامای")) p.Gender = Gender.Female;
    else if (ContainsAny(text, "آقا", "اقا", "مرد ", "مرد،", "پسر")) p.Gender = Gender.Male;
    if (p.Gender != Gender.Any)
        p.Notes.Add($"جنسیت: {(p.Gender == Gender.Female ? "خانم" : "آقا")}");

    // --- City / district ---
    p.CityName = knownCities.FirstOrDefault(c => text.Contains(Normalize(c)));
    p.DistrictName = knownDistricts.OrderByDescending(d => d.Length)
        .FirstOrDefault(d => text.Contains(Normalize(d)));

    // --- Profit share: "<n>%" first, then "%<n>" ---
    var share = Regex.Match(latin, @"(\d{1,3})\s*(?:٪|%|درصد)");
    if (!share.Success) share = Regex.Match(latin, @"(?:٪|%)\s*(\d{1,3})");
    if (share.Success && int.TryParse(share.Groups[1].Value, out var pct) && pct is > 0 and <= 100)
    {
        p.SharePercent = pct;
        p.Notes.Add($"سهم درآمد: {pct}٪");
    }
    else if (ContainsAny(text, "درصدی", "سهم درآمد", "شراکت", "پورسانت"))
    {
        p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)");
    }

    // --- Fixed pay (strip phone numbers first so they're never read as money) ---
    // A STATED amount wins over "negotiable": ads often give a number AND say the rest is
    // negotiable; showing the figure is more useful. Fall back to negotiable only when no
    // amount is found.
    var amount = ExtractAmount(StripPhones(text));
    if (amount is not null)
    {
        p.PayAmount = amount;
        p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان");
    }
    else if (ContainsAny(text, "توافقی", "توافق"))
    {
        p.PayNegotiable = true;
        p.Notes.Add("حقوق: توافقی");
    }
    else if (p.SharePercent is null)
    {
        p.Notes.Add("حقوق: تشخیص داده نشد");
    }

    // --- Talent extras (only meaningful for talent posts) ---
    if (p.Kind == ListingKind.Talent)
    {
        var exp = Regex.Match(latin, @"سابقه[^\d]{0,8}(\d{1,2})\s*سال");
        if (!exp.Success) exp = Regex.Match(latin, @"(\d{1,2})\s*سال\s*سابقه");
        if (exp.Success && int.TryParse(exp.Groups[1].Value, out var yrs) && yrs is > 0 and <= 60)
        {
            p.YearsExperience = yrs;
            p.Notes.Add($"سابقه: {yrs} سال");
        }

        p.IsLicensed = ContainsAny(text, "پروانه دار", "پروانه‌دار", "دارای پروانه", "پروانه فعالیت", "پروانه طبابت");
        if (p.IsLicensed) p.Notes.Add("پروانه‌دار");

        p.PersonName = ExtractPersonName(text);
        if (p.PersonName is not null) p.Notes.Add($"نام: {p.PersonName}");

        var area = Regex.Match(text, @"منطقه\s*[۰-۹0-9]{1,2}");
        if (area.Success)
        {
            p.AreaNote = area.Value.Trim();
            p.Notes.Add($"محدوده: {p.AreaNote}");
        }
    }

    // --- Facility name (facility keyword + the distinctive name); skipped for talent posts ---
    if (p.Kind != ListingKind.Talent)
    {
        p.FacilityName = ExtractFacilityName(text);
        if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");
    }

    // --- Tags (certs/skills for deep search), plus the detected roles ---
    p.Tags = ExtractTags(text);
    if (p.RoleNames.Count > 0) p.Tags.AddRange(p.RoleNames);
    if (p.IsLicensed && !p.Tags.Contains("پروانه‌دار")) p.Tags.Add("پروانه‌دار");
    p.Tags = p.Tags.Distinct().ToList();

    // --- Contacts (phones, email, socials — one ad may have several) ---
    // Contacts are read from the un-normalized post so handles/URLs are not altered.
    p.Contacts = ExtractContacts(raw ?? text);
    p.Phone = p.Contacts.FirstOrDefault(c => c.Type is ContactType.Mobile or ContactType.Phone)?.Value;
    if (p.Contacts.Count > 0)
        p.Notes.Add("راه‌های ارتباطی: " + string.Join("، ", p.Contacts.Select(c => ContactLabel(c.Type))));

    return p;
}
// Words that introduce a facility name. ExtractFacilityName tries them in array order and returns
// the first keyword that yields a usable name, so multi-word variants are listed before their
// generic stem (e.g. the "medical centre" entries come before the bare "centre" entry).
private static readonly string[] FacilityKeywords =
{
"بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
"مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
"آزمایشگاه", "مطب", "خانه سالمندان", "سرای سالمندان",
};
// Words that clearly aren't part of a name (prepositions, hiring/shift/contact vocabulary, fillers).
// Both ExtractFacilityName and ExtractPersonName stop collecting name words at the first of these.
private static readonly string[] NameStops =
{
"جهت", "برای", "به", "با", "در", "از", "که", "نیاز", "نیازمند", "استخدام", "جذب",
"دعوت", "همکاری", "واقع", "آدرس", "تلفن", "شماره", "شیفت", "ساعت", "حقوق", "روز",
"شب", "صبح", "عصر", "می", "ها", "این", "یک", "محترم",
};
/// <summary>
/// Best-effort facility (hospital/clinic) name: a facility keyword followed by up to three
/// name words taken from the text right after the keyword's first occurrence.
/// </summary>
/// <param name="text">Normalized post text.</param>
/// <returns>Keyword plus collected words, or <c>null</c> when no keyword yields a usable name.</returns>
private static string? ExtractFacilityName(string text)
{
    char[] separators = { ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/', '«', '»', '"' };

    // A word can belong to a name only if it is not a stop word, contains no digit
    // (numbers/phones aren't names), contains a letter (emoji/punctuation isn't a name)
    // and is longer than a single stray character.
    static bool LooksLikeNameWord(string word) =>
        !NameStops.Contains(word)
        && !Regex.IsMatch(word, @"\d")
        && word.Any(char.IsLetter)
        && word.Length != 1;

    foreach (var keyword in FacilityKeywords)
    {
        var at = text.IndexOf(keyword, StringComparison.Ordinal);
        if (at >= 0)
        {
            var nameWords = text[(at + keyword.Length)..]
                .Split(separators, StringSplitOptions.RemoveEmptyEntries)
                .TakeWhile(LooksLikeNameWord)
                .Take(3)
                .ToArray();

            // A bare keyword with nothing after it isn't useful; try the next keyword.
            if (nameWords.Length > 0)
            {
                var candidate = (keyword + " " + string.Join(" ", nameWords)).Trim();
                // Names that are only filler/verb/source noise are rejected, so a later keyword
                // (or the caller's fallback) gets a chance instead.
                if (!Scraping.FacilityMatcher.IsJunkName(candidate)) return candidate;
            }
        }
    }

    return null;
}
// Titles that introduce a person's name in talent ("ready to work") posts. ExtractPersonName tries
// them in array order and uses the first occurrence of each title in the text.
private static readonly string[] PersonTitles = { "دکتر", "خانم دکتر", "آقای دکتر", "مهندس", "سرکار خانم", "جناب آقای", "خانم", "آقای" };
// Words that are NOT a person's name — verbs/fillers/availability/role words that can follow a
// title (e.g. "I am", "ready", a profession). ExtractPersonName stops collecting at the first one.
private static readonly string[] NameNoise =
{
"هستم", "هستیم", "هستش", "ام", "بودم", "میباشم", "میباشد", "باشم", "آماده", "آماده‌ام",
"جویای", "بکار", "به‌کار", "کار", "همکاری", "نیازمند", "استخدام", "جذب", "عزیز", "محترم",
"گرامی", "خانم", "آقا", "اقا", "دکتر", "پزشک", "پرستار", "بهیار", "ماما", "دندانپزشک",
"داروساز", "تکنسین", "کارشناس", "متخصص", "عمومی", "مراقب", "کمک",
};
/// <summary>
/// Best-effort person name: a title from <c>PersonTitles</c> plus up to two words that follow
/// the title's first occurrence in the text.
/// </summary>
/// <param name="text">Normalized post text.</param>
/// <returns>Title plus collected words, or <c>null</c> when no title is followed by a name word.</returns>
private static string? ExtractPersonName(string text)
{
    char[] separators = { ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/' };

    // Collection stops at the first stop word, noise word (verb/filler/role), token containing
    // a digit, or single-character token.
    static bool IsNameWord(string word)
    {
        if (NameStops.Contains(word)) return false;
        if (NameNoise.Any(n => Normalize(n) == Normalize(word))) return false;
        if (Regex.IsMatch(word, @"[\d]")) return false;
        return word.Length != 1;
    }

    foreach (var title in PersonTitles)
    {
        var at = text.IndexOf(title, StringComparison.Ordinal);
        if (at < 0) continue;

        var nameWords = text[(at + title.Length)..]
            .Split(separators, StringSplitOptions.RemoveEmptyEntries)
            .TakeWhile(IsNameWord)
            .Take(2)
            .ToList();

        if (nameWords.Count > 0)
            return (title + " " + string.Join(" ", nameWords)).Trim();
    }

    return null;
}
/// <summary>
/// Blanks out phone numbers so they are not mistaken for money: first the remainder of any line
/// starting at a "contact/mobile/landline number" label, then (after converting digits to Latin)
/// mobile numbers and landline numbers.
/// </summary>
/// <param name="text">Post text.</param>
/// <returns>The text with Latin digits and every matched phone span replaced by a space.</returns>
private static string StripPhones(string text)
{
    var withoutLabelledNumbers = Regex.Replace(text, @"شماره\s*(?:تماس|موبایل|همراه|ثابت|تلفن)[^\n]*", " ");
    var latin = ToLatinDigits(withoutLabelledNumbers);
    var withoutMobiles = Regex.Replace(latin, @"(?<!\d)(?:\+?98|0)?9\d{9}(?!\d)", " "); // mobile
    return Regex.Replace(withoutMobiles, @"(?<!\d)0\d{2,3}[\s-]?\d{7,8}(?!\d)", " "); // landline
}
/// <summary>
/// Pulls a pay figure out of free text and normalizes it to toman (rial → toman = ÷10),
/// handling the "million" multiplier and Persian/Arabic-Indic digits.
/// </summary>
/// <param name="text">Post text, ideally with phone numbers already removed.</param>
/// <returns>The amount in toman, or <c>null</c> when no plausible figure is found.</returns>
private static long? ExtractAmount(string text)
{
    var latin = ToLatinDigits(text);
    bool hasToman = latin.Contains("تومان") || latin.Contains("تومن");
    bool hasRial = (latin.Contains("ریال") || latin.Contains("ريال")) && !hasToman;

    // Iranian salary shorthand: a 1–3 digit number means MILLIONS of toman ("15 toman",
    // "40 to 50 toman", "20 million", "20m"). Take the LOWER bound of a range.
    // The lookbehind rejects a preceding digit AND a preceding decimal/thousands separator, so
    // the tail of "2.5 million" or "1,500,500 toman" is never read as a standalone shorthand
    // figure. The lookaheads reject a following digit, and a following rial unit so that
    // "<n> million rial" reaches the ÷10 branch below instead of being taken as toman.
    var collo = Regex.Match(latin,
        @"(?<![\d.,\u066B\u066C])(\d{1,3})(?:\s*تا\s*(\d{1,3}))?\s*(?:میلیون|م(?![ا-یA-Za-z])|تومان|تومن)(?!\s*\d)(?!\s*(?:ریال|ريال))");
    if (collo.Success && int.TryParse(collo.Groups[1].Value, out var lo) && lo is > 0 and <= 500)
        return (long)lo * 1_000_000;

    // Explicit millions, possibly fractional and possibly in rial: "2 million",
    // "2.5 million [rial]".
    var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون\s*(ریال|ريال)?");
    if (million.Success && double.TryParse(million.Groups[1].Value.Replace(",", "."),
        System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var m))
    {
        var val = (long)(m * 1_000_000);
        if (million.Groups[2].Success) val /= 10; // "million rial" → toman
        return val;
    }

    // Largest plain number that looks like money (6–10 digits, no leading zero — a leading
    // zero or 11+ digits means it's a phone/id). Convert rial → toman by the unit next to the
    // number, else by the ad's overall currency.
    long best = 0;
    foreach (Match num in Regex.Matches(latin, @"(?<!\d)([1-9][\d٬,،.]{4,})\s*(ریال|ريال|تومان|تومن)?"))
    {
        var digits = Regex.Replace(num.Groups[1].Value, @"[^\d]", "");
        if (digits.Length is < 6 or > 10 || !long.TryParse(digits, out var v)) continue;
        var unit = num.Groups[2].Value;
        bool isRial = unit is "ریال" or "ريال" || (unit.Length == 0 && hasRial);
        if (isRial) v /= 10;
        if (v > best) best = v;
    }

    // Sanity: a monthly figure of 200M+ toman is implausible — if the ad never said toman,
    // it was almost certainly rial, so normalize.
    if (best >= 200_000_000 && !hasToman) best /= 10;
    return best > 0 ? best : null;
}
// Matches an e-mail address: local part, '@', domain, and a final label of at least two ASCII letters.
private static readonly Regex EmailRx = new(@"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}", RegexOptions.Compiled);
// Matches an http:// or https:// URL up to the next whitespace character.
private static readonly Regex UrlRx = new(@"https?://[^\s]+", RegexOptions.Compiled);
// Display label for a contact type; delegates to ContactInfo.Label.
private static string ContactLabel(ContactType t) => ContactInfo.Label(t);
/// <summary>
/// Pulls every contact channel out of a post: e-mail addresses, socials (Instagram / Telegram /
/// Bale / WhatsApp / website) via URLs and Persian keyword cues, then mobile and landline numbers.
/// </summary>
/// <param name="raw">The post text (digits may be Persian/Arabic-Indic; they are converted first).</param>
/// <returns>At most eight contacts, de-duplicated per type (case-insensitive), in detection order.</returns>
private static List<ParsedContact> ExtractContacts(string raw)
{
    var latin = ToLatinDigits(raw);
    var list = new List<ParsedContact>();

    // Cleans surrounding punctuation and adds the contact unless it is too short or a duplicate.
    void Add(ContactType t, string v)
    {
        v = v.Trim().Trim('.', '،', ',', ')', '(', ':', '«', '»', '"', '/').Trim();
        if (v.Length < 2) return;
        if (!list.Any(c => c.Type == t && string.Equals(c.Value, v, StringComparison.OrdinalIgnoreCase)))
            list.Add(new ParsedContact(t, v));
    }

    // Lower-cased host of an http(s) URL (text between "://" and the first '/', '?', '#' or ':',
    // minus any "user@" prefix). UrlRx guarantees "://" is present.
    static string HostOf(string url)
    {
        var start = url.IndexOf("://", StringComparison.Ordinal) + 3;
        var end = url.IndexOfAny(new[] { '/', '?', '#', ':' }, start);
        var host = end < 0 ? url[start..] : url[start..end];
        var at = host.LastIndexOf('@');
        if (at >= 0) host = host[(at + 1)..];
        return host.ToLowerInvariant();
    }

    // True when the host IS one of the domains or a sub-domain of it. Matching on the host —
    // not on a substring of the whole URL — keeps e.g. "recruit.medical.ir" (which contains
    // the characters "t.me") from being labelled as a Telegram link.
    static bool IsOn(string host, params string[] domains) =>
        domains.Any(d => host == d || host.EndsWith("." + d, StringComparison.Ordinal));

    foreach (Match m in EmailRx.Matches(latin)) Add(ContactType.Email, m.Value);

    foreach (Match m in UrlRx.Matches(latin))
    {
        var u = m.Value.TrimEnd('.', '،', ')', '(', '"');
        var host = HostOf(u);
        if (IsOn(host, "instagram.com", "instagr.am")) Add(ContactType.Instagram, UrlHandle(u));
        else if (IsOn(host, "t.me", "telegram.me")) Add(ContactType.Telegram, UrlHandle(u));
        else if (IsOn(host, "ble.ir", "bale.ai")) Add(ContactType.Bale, UrlHandle(u));
        else if (IsOn(host, "wa.me", "whatsapp.com")) Add(ContactType.WhatsApp, UrlHandle(u));
        else Add(ContactType.Website, u);
    }

    // Persian keyword → handle (latin handles only, so Persian words after the cue don't match).
    void Keyed(ContactType t, params string[] kws)
    {
        foreach (var kw in kws)
            foreach (Match m in Regex.Matches(latin, kw + @"\s*[:]?\s*@?([A-Za-z0-9_.]{3,30})"))
                Add(t, m.Groups[1].Value);
    }
    Keyed(ContactType.Instagram, "اینستاگرام", "اینستگرام", "اینستا", "پیج");
    Keyed(ContactType.Telegram, "تلگرام");
    Keyed(ContactType.WhatsApp, "واتساپ", "واتس اپ");

    // Phones — mobiles then landlines (multiple), boundary-guarded. Mobiles are normalized to
    // the 11-digit national form "09…".
    foreach (Match m in Regex.Matches(latin, @"(?<!\d)(?:\+?98|0)?9\d{9}(?!\d)"))
    {
        var d = Regex.Replace(m.Value, @"\D", "");
        // Only a 12-digit match actually carries the 98 country code; a bare 10-digit number
        // that merely begins with "98" must keep all of its digits.
        if (d.Length == 12 && d.StartsWith("98", StringComparison.Ordinal)) d = "0" + d[2..];
        else if (d.Length == 10) d = "0" + d;
        Add(ContactType.Mobile, d);
    }

    // Landline area codes start 0[1-8] (021 Tehran, 026 Karaj, …) — never 09, which is a MOBILE.
    foreach (Match m in Regex.Matches(latin, @"(?<!\d)0[1-8]\d{1,2}[\s-]?\d{7,8}(?!\d)"))
        Add(ContactType.Phone, Regex.Replace(m.Value, @"\D", ""));

    return list.Take(8).ToList();
}
// Canonical tag → trigger words. ExtractTags emits Tag when any of its Needles occurs in the post.
// Latin needles are written in lower case; several Persian needles are listed in more than one
// spelling (spaced and joined with a zero-width non-joiner).
private static readonly (string Tag, string[] Needles)[] TagDict =
{
("mmt", new[] { "mmt", "ام ام تی", "ام‌ام‌تی" }),
("ICU", new[] { "icu", "آی سی یو", "آی‌سی‌یو" }),
("CCU", new[] { "ccu", "سی سی یو", "سی‌سی‌یو" }),
("NICU", new[] { "nicu", "ان آی سی یو", "نوزادان" }),
("BLS", new[] { "bls" }),
("ACLS", new[] { "acls" }),
("دیالیز", new[] { "دیالیز" }),
("اتاق عمل", new[] { "اتاق عمل", "اسکراب" }),
("بیهوشی", new[] { "بیهوشی" }),
("تریاژ", new[] { "تریاژ" }),
("تزریقات", new[] { "تزریقات", "تزریق" }),
("پانسمان", new[] { "پانسمان", "زخم" }),
("سونوگرافی", new[] { "سونوگرافی" }),
("رادیولوژی", new[] { "رادیولوژی" }),
("اورژانس", new[] { "اورژانس", "فوریت" }),
("مسئول فنی", new[] { "مسئول فنی" }),
("طرح", new[] { "طرح" }),
("سالمند", new[] { "سالمند" }),
("کودک", new[] { "کودک", "اطفال" }),
("همراه بیمار", new[] { "همراه بیمار" }),
("پروانه‌دار", new[] { "پروانه" }),
};
/// <summary>
/// Returns the canonical tags from <c>TagDict</c> whose trigger words occur in the post.
/// </summary>
/// <param name="text">Post text.</param>
/// <returns>Matching tags in <c>TagDict</c> order; empty when nothing matches.</returns>
private static List<string> ExtractTags(string text)
{
    var tags = new List<string>();
    foreach (var (tag, needles) in TagDict)
    {
        // The Latin needles are lower-case ("icu", "ccu", "bls", …) while ads usually write the
        // abbreviations in capitals, so the match must ignore case. Persian has no letter case,
        // so the Persian needles are unaffected.
        if (needles.Any(n => text.Contains(n, StringComparison.OrdinalIgnoreCase)))
            tags.Add(tag);
    }
    return tags;
}
/// <summary>
/// Extracts the handle from a profile URL: the last path segment after dropping the query string
/// and trailing slashes. Falls back to the whole URL when that segment is blank.
/// </summary>
private static string UrlHandle(string url)
{
    var queryAt = url.IndexOf('?');
    var path = (queryAt < 0 ? url : url[..queryAt]).TrimEnd('/');

    var lastSlash = path.LastIndexOf('/');
    var handle = lastSlash < 0 ? path : path[(lastSlash + 1)..];

    if (string.IsNullOrWhiteSpace(handle)) return url;
    return handle;
}
/// <summary>
/// Canonicalizes Persian text for matching: Arabic yeh/kaf become their Persian forms, the
/// zero-width non-joiner becomes a regular space, and surrounding whitespace is trimmed.
/// </summary>
/// <param name="s">Text to normalize; <c>null</c> or empty yields an empty string.</param>
private static string Normalize(string s)
{
    if (string.IsNullOrEmpty(s)) return string.Empty;

    return s
        .Replace('ي', 'ی')      // Arabic yeh → Persian yeh
        .Replace('ك', 'ک')      // Arabic kaf → Persian keheh
        // Written as an escape so the invisible zero-width non-joiner is visible in source.
        .Replace('\u200C', ' ')
        .Trim();
}
/// <summary>True when <paramref name="text"/> contains at least one of the given needles.</summary>
private static bool ContainsAny(string text, params string[] needles)
{
    foreach (var needle in needles)
    {
        if (text.Contains(needle)) return true;
    }
    return false;
}
/// <summary>
/// Returns a copy of <paramref name="s"/> in which Persian (۰–۹) and Arabic-Indic (٠–٩) digits
/// are replaced by the corresponding ASCII digits; every other character is kept as is.
/// </summary>
private static string ToLatinDigits(string s)
{
    static char ToLatin(char c) => c switch
    {
        >= '۰' and <= '۹' => (char)('0' + (c - '۰')),
        >= '٠' and <= '٩' => (char)('0' + (c - '٠')),
        _ => c,
    };

    return new string(s.Select(ToLatin).ToArray());
}
}