using System.Text.RegularExpressions;
using JobsMedical.Web.Models;

namespace JobsMedical.Web.Services;

/// <summary>One contact channel pulled from a post (type + raw value).</summary>
/// <param name="Type">Which kind of channel this is (mobile, email, Instagram, …).</param>
/// <param name="Value">The value as extracted from the post (number, address, or handle).</param>
public record ParsedContact(ContactType Type, string Value);

/// <summary>
/// Structured guess extracted from a raw channel post. All fields are best-effort: anything
/// the parser could not detect keeps its default (null / false / empty list).
/// </summary>
public class ParsedListing
{
    /// <summary>Kind of post; defaults to <c>ListingKind.Shift</c> until a parser decides otherwise.</summary>
    public ListingKind Kind { get; set; } = ListingKind.Shift;

    /// <summary>Primary role (first match).</summary>
    public string? RoleName { get; set; }

    /// <summary>All roles named in the ad (e.g. سالمند + کودک).</summary>
    public List<string> RoleNames { get; set; } = new();

    public ShiftType? ShiftType { get; set; }

    public EmploymentType? EmploymentType { get; set; }

    /// <summary>Shift pay or single salary figure.</summary>
    public long? PayAmount { get; set; }

    /// <summary>Profit-share % (درصدی / سهم درآمد).</summary>
    public int? SharePercent { get; set; }

    public bool PayNegotiable { get; set; }

    /// <summary>Required gender; <c>Gender.Any</c> when the ad does not state one.</summary>
    public Gender Gender { get; set; } = Gender.Any;

    public string? CityName { get; set; }

    public string? DistrictName { get; set; }

    /// <summary>Hospital/clinic name guessed from the text.</summary>
    public string? FacilityName { get; set; }

    public string? Phone { get; set; }

    // «آماده به کار» (talent) extras — populated when Kind == Talent.

    /// <summary>Person's name with title, e.g. «دکتر سپیده علیزاده».</summary>
    public string? PersonName { get; set; }

    /// <summary>Years of experience (سابقه).</summary>
    public int? YearsExperience { get; set; }

    /// <summary>Whether the person states they hold a licence (پروانه‌دار).</summary>
    public bool IsLicensed { get; set; }

    /// <summary>Area restriction as written, e.g. «فقط منطقه ۱».</summary>
    public string? AreaNote { get; set; }

    /// <summary>Phones, email, socials…</summary>
    public List<ParsedContact> Contacts { get; set; } = new();

    /// <summary>Cert/skill keywords for search.</summary>
    public List<string> Tags { get; set; } = new();

    /// <summary>What was/wasn't detected (shown to admin).</summary>
    public List<string> Notes { get; set; } = new();
}

///
/// Turns a messy Persian channel/Divar post into a structured listing guess. This is the
/// Stage-1 implementation: transparent keyword + regex heuristics, no AI dependency (important
/// since LLM APIs are blocked from Iran). A future LlmListingParser can implement the same
/// interface and be swapped in via DI without touching the admin queue.
///
public interface IListingParser
{
    /// <summary>Parses one raw post into a best-effort <see cref="ParsedListing"/>.</summary>
    /// <param name="rawText">The post text exactly as scraped.</param>
    /// <param name="knownRoles">Role names of the taxonomy; only these can be reported as roles.</param>
    /// <param name="knownCities">City names to look for in the text.</param>
    /// <param name="knownDistricts">District names to look for in the text.</param>
    /// <returns>A new listing guess; undetected fields keep their defaults.</returns>
    ParsedListing Parse(string rawText, IEnumerable<string> knownRoles,
        IEnumerable<string> knownCities, IEnumerable<string> knownDistricts);
}

/// <summary>
/// Keyword + regex implementation of <see cref="IListingParser"/>. Every decision it makes is
/// also recorded as a human-readable line in <see cref="ParsedListing.Notes"/>.
/// </summary>
public class HeuristicListingParser : IListingParser
{
    /// <summary>
    /// Runs all heuristics over <paramref name="raw"/>: kind, roles, shift/employment type, gender,
    /// city/district, profit share, pay, talent extras, facility name, tags and contacts.
    /// </summary>
    /// <param name="raw">Raw post text. A null value is treated as an empty post.</param>
    /// <param name="knownRoles">Taxonomy role names (enumerated once).</param>
    /// <param name="knownCities">Known city names; the first one found in the text wins.</param>
    /// <param name="knownDistricts">Known district names; the longest one found in the text wins.</param>
    /// <returns>The populated <see cref="ParsedListing"/>.</returns>
    public ParsedListing Parse(string raw, IEnumerable<string> knownRoles,
        IEnumerable<string> knownCities, IEnumerable<string> knownDistricts)
    {
        var p = new ParsedListing();

        // Normalize() dereferences its argument, so a null post used to throw before the old
        // «raw ?? text» fallback at the contacts step could ever apply. Coalesce up front instead.
        raw ??= string.Empty;
        var text = Normalize(raw);

        // `text` went through Normalize (Arabic ي/ك → Persian ی/ک, and — assuming the invisible
        // character in Normalize is U+200C — ZWNJ → space). Needles must get the same character
        // mapping, otherwise ZWNJ-spelled needles such as «می‌توانم همکاری» can never match.
        // Deliberately no Trim: «زن » / «مرد » rely on their trailing space. The literal needle is
        // still tried first, so everything that matched before keeps matching.
        static string Fold(string s) =>
            s.Replace('\u064A', '\u06CC').Replace('\u0643', '\u06A9').Replace('\u200C', ' ');
        bool Has(params string[] needles) =>
            needles.Any(n => text.Contains(n) || text.Contains(Fold(n)));

        // --- Kind: talent (worker offers themselves) vs shift vs hiring ---
        // Talent is checked first: «آماده به کار/همکاری», «جویای کار» mean the *person* is
        // available — distinct from an employer's «دعوت به همکاری».
        bool talentSignals = Has(
            "آماده به کار", "آماده‌به‌کار", "آماده همکاری", "آماده‌ی همکاری", "آماده ی همکاری",
            "آماده فعالیت", "جویای کار", "جویای کار هستم", "متقاضی کار", "نیازمند کار",
            "آماده انجام", "می‌توانم همکاری", "میتوانم همکاری", "حاضر به همکاری");
        bool shiftSignals = Has("شیفت", "آنکال", "انکال", "نوبت", "کشیک");

        if (talentSignals)
        {
            p.Kind = ListingKind.Talent;
            p.Notes.Add("نوع: آماده به کار (تشخیص خودکار)");
        }
        else
        {
            // A dated SHIFT requires an explicit shift signal («شیفت/آنکال/کشیک/نوبت»). Otherwise the ad
            // is an ongoing hiring post → Job. (Defaulting to Shift forced a fabricated date/time onto
            // generic ads like «پرستار درمانگاه», which the source never stated.) Because Job is the
            // fallback, explicit hiring keywords are not consulted here.
            p.Kind = shiftSignals ? ListingKind.Shift : ListingKind.Job;
            p.Notes.Add(p.Kind == ListingKind.Shift
                ? "نوع: شیفت (تشخیص خودکار)"
                : "نوع: استخدام (تشخیص خودکار)");
        }

        // --- Roles (an ad can name several at once: «پرستار سالمند و کودک و همراه بیمار») ---
        var known = knownRoles.ToList();
        var hits = new List<string>();

        // Exact taxonomy matches (longest first so «پزشک متخصص» beats «پزشک»).
        foreach (var role in known.OrderByDescending(r => r.Length))
        {
            if (text.Contains(Normalize(role)))
            {
                hits.Add(role);
            }
        }

        // Drop a role that's a substring of a longer matched role (پرستار ⊂ پرستار سالمندان).
        hits = hits.Where(r => !hits.Any(o => o != r && o.Contains(r))).Distinct().ToList();

        // Synonyms → canonical role names (covers terms not written verbatim). Only add a canonical
        // that actually exists in the taxonomy, and isn't already a hit.
        void AddSyn(string canonical, params string[] needles)
        {
            if (Has(needles) && known.Contains(canonical) && !hits.Contains(canonical))
            {
                hits.Add(canonical);
            }
        }

        AddSyn("پرستار سالمندان", "سالمند", "سالمندان", "نگهداری سالمند");
        AddSyn("دندانپزشک", "دندان", "دندانپزشک", "دندان‌پزشک");
        AddSyn("تکنسین اتاق عمل", "اتاق عمل", "اسکراب");
        AddSyn("تکنسین فوریت‌های پزشکی", "فوریت", "اورژانس پیش بیمارستانی", "آمبولانس");
        AddSyn("کارشناس آزمایشگاه", "آزمایشگاه", "علوم آزمایشگاهی", "نمونه گیر");
        AddSyn("ماما", "مامایی");
        AddSyn("پرستار", "بهیار", "کمک بهیار", "کمک پرستار", "بیماربر", "مراقب", "همراه بیمار",
            "کودک", "اطفال", "نوزاد", "تزریقات", "پانسمان");
        AddSyn("پزشک متخصص", "فوق تخصص", "متخصص");
        AddSyn("پزشک عمومی", "پزشک", "دکتر", "طبیب");

        p.RoleNames = hits.Distinct().Take(4).ToList(); // cap fan-out
        p.RoleName = p.RoleNames.FirstOrDefault();
        p.Notes.Add(p.RoleNames.Count == 0
            ? "نقش: تشخیص داده نشد"
            : $"نقش‌ها: {string.Join("، ", p.RoleNames)}");

        // --- Shift type (first matching cue wins: on-call, night, evening, day) ---
        if (Has("آنکال", "انکال")) p.ShiftType = Models.ShiftType.OnCall;
        else if (text.Contains("شب")) p.ShiftType = Models.ShiftType.Night;
        else if (text.Contains("عصر")) p.ShiftType = Models.ShiftType.Evening;
        else if (Has("صبح", "روز")) p.ShiftType = Models.ShiftType.Day;

        // --- Employment type ---
        if (Has("پاره وقت", "پاره‌وقت", "پارت تایم")) p.EmploymentType = Models.EmploymentType.PartTime;
        else if (text.Contains("طرح")) p.EmploymentType = Models.EmploymentType.Plan;
        else if (text.Contains("قرارداد")) p.EmploymentType = Models.EmploymentType.Contract;
        else if (Has("تمام وقت", "تمام‌وقت")) p.EmploymentType = Models.EmploymentType.FullTime;

        // --- Gender requirement (female cues are checked before male cues) ---
        if (Has("خانم", "خانوم", "بانو", "زن ", "مامای")) p.Gender = Gender.Female;
        else if (Has("آقا", "اقا", "مرد ", "مرد،", "پسر")) p.Gender = Gender.Male;
        if (p.Gender != Gender.Any)
        {
            p.Notes.Add($"جنسیت: {(p.Gender == Gender.Female ? "خانم" : "آقا")}");
        }

        // --- City / district ---
        p.CityName = knownCities.FirstOrDefault(c => text.Contains(Normalize(c)));
        p.DistrictName = knownDistricts.OrderByDescending(d => d.Length)
            .FirstOrDefault(d => text.Contains(Normalize(d)));

        // --- Profit share (درصدی / سهم) ---
        var latinForShare = ToLatinDigits(text);
        var share = Regex.Match(latinForShare, @"(\d{1,3})\s*(?:٪|%|درصد)");
        if (!share.Success)
        {
            // Sign written before the number.
            share = Regex.Match(latinForShare, @"(?:٪|%)\s*(\d{1,3})");
        }

        if (share.Success && int.TryParse(share.Groups[1].Value, out var pct) && pct is > 0 and <= 100)
        {
            p.SharePercent = pct;
            p.Notes.Add($"سهم درآمد: {pct}٪");
        }
        else if (Has("درصدی", "سهم درآمد", "شراکت", "پورسانت"))
        {
            p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)");
        }

        // --- Fixed pay (strip phone numbers first so they're never read as money) ---
        // A STATED amount wins over «توافقی»: ads often say a number AND «… بقیه توافقی»; showing the
        // figure is far more useful than «توافقی». Fall back to negotiable only when no amount is found.
        var amount = ExtractAmount(StripPhones(text));
        if (amount is not null)
        {
            p.PayAmount = amount;
            p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان");
        }
        else if (Has("توافقی", "توافق"))
        {
            p.PayNegotiable = true;
            p.Notes.Add("حقوق: توافقی");
        }
        else if (p.SharePercent is null)
        {
            p.Notes.Add("حقوق: تشخیص داده نشد");
        }

        // --- Talent extras (only meaningful for «آماده به کار») ---
        if (p.Kind == ListingKind.Talent)
        {
            var latinT = ToLatinDigits(text);
            var exp = Regex.Match(latinT, @"سابقه[^\d]{0,8}(\d{1,2})\s*سال");
            if (!exp.Success)
            {
                exp = Regex.Match(latinT, @"(\d{1,2})\s*سال\s*سابقه");
            }

            if (exp.Success && int.TryParse(exp.Groups[1].Value, out var yrs) && yrs is > 0 and <= 60)
            {
                p.YearsExperience = yrs;
                p.Notes.Add($"سابقه: {yrs} سال");
            }

            p.IsLicensed = Has("پروانه دار", "پروانه‌دار", "دارای پروانه", "پروانه فعالیت", "پروانه طبابت");
            if (p.IsLicensed)
            {
                p.Notes.Add("پروانه‌دار");
            }

            p.PersonName = ExtractPersonName(text);
            if (p.PersonName is not null)
            {
                p.Notes.Add($"نام: {p.PersonName}");
            }

            var area = Regex.Match(text, @"منطقه\s*[۰-۹0-9]{1,2}");
            if (area.Success)
            {
                p.AreaNote = area.Value.Trim();
                p.Notes.Add($"محدوده: {p.AreaNote}");
            }
        }

        // --- Facility name (بیمارستان/درمانگاه/کلینیک ... + the distinctive name) ---
        if (p.Kind != ListingKind.Talent)
        {
            p.FacilityName = ExtractFacilityName(text);
            if (p.FacilityName is not null)
            {
                p.Notes.Add($"مرکز: {p.FacilityName}");
            }
        }

        // --- Tags (certs/skills for deep search): mmt, icu, پروانه‌دار, اتاق عمل … ---
        p.Tags = ExtractTags(text);
        if (p.RoleNames.Count > 0)
        {
            p.Tags.AddRange(p.RoleNames);
        }

        if (p.IsLicensed && !p.Tags.Contains("پروانه‌دار"))
        {
            p.Tags.Add("پروانه‌دار");
        }

        p.Tags = p.Tags.Distinct().ToList();

        // --- Contacts (phones, email, socials — one ad may have several) ---
        // Uses the raw (un-normalized) text rather than `text`.
        p.Contacts = ExtractContacts(raw);
        p.Phone = p.Contacts
            .FirstOrDefault(c => c.Type is ContactType.Mobile or ContactType.Phone)?.Value;
        if (p.Contacts.Count > 0)
        {
            p.Notes.Add("راه‌های ارتباطی: " +
                string.Join("، ", p.Contacts.Select(c => ContactLabel(c.Type))));
        }

        return p;
    }

    // Words that introduce a facility name, longest/most-specific first. The array order is the
    // priority order: the first keyword that yields a usable name wins, wherever it sits in the text.
    private static readonly string[] FacilityKeywords =
    {
        "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
        "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
        "آزمایشگاه", "مطب", "خانه سالمندان", "سرای سالمندان",
    };

    // Words that clearly aren't part of a facility's name — stop collecting here.
    private static readonly string[] NameStops =
    {
        "جهت", "برای", "به", "با", "در", "از", "که", "نیاز", "نیازمند", "استخدام", "جذب",
        "دعوت", "همکاری", "واقع", "آدرس", "تلفن", "شماره", "شیفت", "ساعت", "حقوق", "روز",
        "شب", "صبح", "عصر", "می", "ها", "این", "یک", "محترم",
    };

    // Characters that split the text following a facility keyword into candidate name words.
    private static readonly char[] FacilityNameSeparators =
    {
        ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/', '«', '»', '"',
    };

    /// <summary>Best-effort hospital/clinic name: a facility keyword plus up to three name words.</summary>
    /// <param name="text">Normalized post text.</param>
    /// <returns>
    /// «keyword name…», or null when no keyword is followed by at least one acceptable word.
    /// Only the first occurrence of each keyword is examined.
    /// </returns>
    private static string? ExtractFacilityName(string text)
    {
        foreach (var kw in FacilityKeywords)
        {
            var idx = text.IndexOf(kw, StringComparison.Ordinal);
            if (idx < 0)
            {
                continue;
            }

            var after = text[(idx + kw.Length)..];
            var words = after.Split(FacilityNameSeparators, StringSplitOptions.RemoveEmptyEntries);

            var picked = new List<string>();
            foreach (var w in words)
            {
                if (NameStops.Contains(w)) break;
                if (Regex.IsMatch(w, @"\d")) break;     // numbers/phones aren't names
                if (!w.Any(char.IsLetter)) break;       // emoji / punctuation («📍») isn't a name
                if (w.Length == 1) break;               // stray letters
                picked.Add(w);
                if (picked.Count >= 3) break;
            }

            if (picked.Count == 0)
            {
                continue; // bare keyword (e.g. just «بیمارستان») isn't useful
            }

            var candidate = (kw + " " + string.Join(" ", picked)).Trim();

            // Reject names that are only filler/verb/source noise («بیمارستان هستم», «... از مدجابز»):
            // no real name could be extracted, so try the next keyword. (Per the original note, a
            // shared placeholder is used downstream when nothing is returned.)
            if (Scraping.FacilityMatcher.IsJunkName(candidate))
            {
                continue;
            }

            return candidate;
        }

        return null;
    }

    // Titles that introduce a person's name in «آماده به کار» posts. Tried in array order, so the
    // bare «دکتر» is attempted before the compound «خانم دکتر» / «آقای دکتر».
    private static readonly string[] PersonTitles =
    {
        "دکتر", "خانم دکتر", "آقای دکتر", "مهندس", "سرکار خانم", "جناب آقای", "خانم", "آقای",
    };

    // Words that are NOT a person's name — verbs/fillers/availability/role words the extractor was
    // grabbing after a title («خانم هستم»، «دکتر ام»، «دکتر داروساز آماده»). Stop collecting at one.
    private static readonly string[] NameNoise =
    {
        "هستم", "هستیم", "هستش", "ام", "بودم", "میباشم", "میباشد", "باشم",
        "آماده", "آماده‌ام", "جویای", "بکار", "به‌کار", "کار", "همکاری", "نیازمند", "استخدام", "جذب",
        "عزیز", "محترم", "گرامی", "خانم", "آقا", "اقا", "دکتر",
        "پزشک", "پرستار", "بهیار", "ماما", "دندانپزشک", "داروساز", "تکنسین", "کارشناس",
        "متخصص", "عمومی", "مراقب", "کمک",
    };

    // NameNoise after Normalize(), computed once instead of once per candidate word.
    // Must stay declared after NameNoise (static initializers run in textual order).
    private static readonly HashSet<string> NameNoiseNormalized = new(NameNoise.Select(Normalize));

    // Characters that split the text following a title into candidate name words.
    private static readonly char[] PersonNameSeparators =
    {
        ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/',
    };

    /// <summary>Best-effort person name: a title (دکتر/خانم/…) plus up to two following words.</summary>
    /// <param name="text">Normalized post text.</param>
    /// <returns>«title name…», or null when no title is followed by an acceptable word.</returns>
    private static string? ExtractPersonName(string text)
    {
        foreach (var title in PersonTitles)
        {
            var idx = text.IndexOf(title, StringComparison.Ordinal);
            if (idx < 0)
            {
                continue;
            }

            var after = text[(idx + title.Length)..];
            var words = after.Split(PersonNameSeparators, StringSplitOptions.RemoveEmptyEntries);

            var picked = new List<string>();
            foreach (var w in words)
            {
                if (NameStops.Contains(w)) break;
                if (NameNoiseNormalized.Contains(Normalize(w))) break; // «خانم هستم»/«دکتر ام»…
                if (Regex.IsMatch(w, @"[\d]")) break;
                if (w.Length == 1) break;
                picked.Add(w);
                if (picked.Count >= 2) break;
            }

            if (picked.Count == 0)
            {
                continue;
            }

            return (title + " " + string.Join(" ", picked)).Trim();
        }

        return null;
    }

    /// <summary>Remove phone numbers (and «شماره تماس…» lines) so they're not mistaken for money.</summary>
private static string StripPhones(string text) { var t = Regex.Replace(text, @"شماره\s*(?:تماس|موبایل|همراه|ثابت|تلفن)[^\n]*", " "); t = ToLatinDigits(t); t = Regex.Replace(t, @"(?Pull a figure out of free text and normalize to TOMAN (ریال → تومان = ÷۱۰), /// handling «میلیون» and Persian digits. private static long? ExtractAmount(string text) { var latin = ToLatinDigits(text); bool hasToman = latin.Contains("تومان") || latin.Contains("تومن"); bool hasRial = (latin.Contains("ریال") || latin.Contains("ريال")) && !hasToman; // Iranian salary shorthand: a 1–3 digit number means MILLIONS of toman — «۱۵ تومان»، // «۴۰ تا ۵۰ تومان»، «۲۰ میلیون»، «۲۰م». Take the LOWER bound of a range. The lookarounds keep // this from ever matching part of a long literal-toman number (the digits must end at the unit). var collo = Regex.Match(latin, @"(? 0 and <= 500) return (long)lo * 1_000_000; // e.g. "۲ میلیون" / "2.5 میلیون [ریال]" var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون\s*(ریال|ريال)?"); if (million.Success && double.TryParse(million.Groups[1].Value.Replace(",", "."), System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var m)) { var val = (long)(m * 1_000_000); if (million.Groups[2].Success) val /= 10; // «میلیون ریال» return val; } // Largest plain number that looks like money (6–10 digits, no leading zero — a leading // zero or 11+ digits means it's a phone/id). Convert ریال→تومان by the unit next to the // number, else by the ad's overall currency. long best = 0; foreach (Match num in Regex.Matches(latin, @"(? 10 || !long.TryParse(digits, out var v)) continue; var unit = num.Groups[2].Value; bool isRial = unit is "ریال" or "ريال" || (unit.Length == 0 && hasRial); if (isRial) v /= 10; if (v > best) best = v; } // Sanity: a monthly figure of 200M+ تومان is implausible in Iran — if the ad never said // «تومان», it was almost certainly ریال, so normalize. 
if (best >= 200_000_000 && !hasToman) best /= 10; return best > 0 ? best : null; } private static readonly Regex EmailRx = new(@"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}", RegexOptions.Compiled); private static readonly Regex UrlRx = new(@"https?://[^\s]+", RegexOptions.Compiled); private static string ContactLabel(ContactType t) => ContactInfo.Label(t); /// Pull every contact channel out of a post: phones, email, and socials (Instagram / /// Telegram / Bale / WhatsApp / website) via URLs and Persian keyword cues. private static List ExtractContacts(string raw) { var latin = ToLatinDigits(raw); var list = new List(); void Add(ContactType t, string v) { v = v.Trim().Trim('.', '،', ',', ')', '(', ':', '«', '»', '"', '/').Trim(); if (v.Length < 2) return; if (!list.Any(c => c.Type == t && string.Equals(c.Value, v, StringComparison.OrdinalIgnoreCase))) list.Add(new ParsedContact(t, v)); } foreach (Match m in EmailRx.Matches(latin)) Add(ContactType.Email, m.Value); foreach (Match m in UrlRx.Matches(latin)) { var u = m.Value.TrimEnd('.', '،', ')', '(', '"'); var low = u.ToLowerInvariant(); if (low.Contains("instagram.com") || low.Contains("instagr.am")) Add(ContactType.Instagram, UrlHandle(u)); else if (low.Contains("t.me") || low.Contains("telegram.me")) Add(ContactType.Telegram, UrlHandle(u)); else if (low.Contains("ble.ir") || low.Contains("bale.ai")) Add(ContactType.Bale, UrlHandle(u)); else if (low.Contains("wa.me") || low.Contains("whatsapp")) Add(ContactType.WhatsApp, UrlHandle(u)); else Add(ContactType.Website, u); } // Persian keyword → handle (latin handles only, so Persian words after the cue don't match). 
void Keyed(ContactType t, params string[] kws) { foreach (var kw in kws) foreach (Match m in Regex.Matches(latin, kw + @"\s*[::]?\s*@?([A-Za-z0-9_.]{3,30})")) Add(t, m.Groups[1].Value); } Keyed(ContactType.Instagram, "اینستاگرام", "اینستگرام", "اینستا", "پیج"); Keyed(ContactType.Telegram, "تلگرام"); Keyed(ContactType.WhatsApp, "واتساپ", "واتس اپ"); // phones — mobiles then landlines (multiple), boundary-guarded. foreach (Match m in Regex.Matches(latin, @"(? ExtractTags(string text) { var tags = new List(); foreach (var (tag, needles) in TagDict) if (ContainsAny(text, needles)) tags.Add(tag); return tags; } private static string UrlHandle(string url) { var u = url.Split('?')[0].TrimEnd('/'); var seg = u.Contains('/') ? u[(u.LastIndexOf('/') + 1)..] : u; return string.IsNullOrWhiteSpace(seg) ? url : seg; } private static string Normalize(string s) => s .Replace('ي', 'ی').Replace('ك', 'ک').Replace('‌', ' ').Trim(); private static bool ContainsAny(string text, params string[] needles) => needles.Any(n => text.Contains(n)); private static string ToLatinDigits(string s) { var chars = s.ToCharArray(); for (var i = 0; i < chars.Length; i++) { if (chars[i] >= '۰' && chars[i] <= '۹') chars[i] = (char)('0' + (chars[i] - '۰')); else if (chars[i] >= '٠' && chars[i] <= '٩') chars[i] = (char)('0' + (chars[i] - '٠')); } return new string(chars); } }