213af9db48
AI (when enabled, now that the server proxy is up): - AiStructured gains phone, personName, yearsExperience, isLicensed. - The auditor appends an authoritative output-schema to the admin prompt so classification stays correct even with an older stored prompt — it now classifies kind as shift|job|talent and extracts the contact phone and talent details. - Ingestion publish prefers the AI's tags (kind/role/city/facility/phone + talent fields) over the heuristic parser when present. - Default prompt updated to describe the three kinds + new fields. Phone extraction from websites (Medjobs / generic sites), where the number sits behind a "تماس با این آگهی" reveal: - HtmlUtil.HarvestPhones scans the full markup for tel: links, JSON-LD "telephone", data-*phone* attributes, and inline Iranian mobile/landline numbers (Persian digits folded), normalized (mobiles 09…, landlines 0…). - Medjobs + Website sources append harvested numbers to the ad text so the parser/AI capture them; manual review then prefills the phone too. - Parser phone extraction now also captures a landline as a fallback. Note: if a site loads the number purely via XHR (not in HTML), a per-source reveal endpoint would be a follow-up. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
284 lines
15 KiB
C#
284 lines
15 KiB
C#
using System.Text.RegularExpressions;
|
||
using JobsMedical.Web.Models;
|
||
|
||
namespace JobsMedical.Web.Services;
|
||
|
||
/// <summary>
/// Best-effort structured reading of a raw channel post. Any field may be left at its
/// default when the parser could not detect it.
/// </summary>
public class ParsedListing
{
    /// <summary>Listing category; defaults to <see cref="ListingKind.Shift"/>.</summary>
    public ListingKind Kind { get; set; } = ListingKind.Shift;

    /// <summary>Role name that was recognised in the text, if any.</summary>
    public string? RoleName { get; set; }

    /// <summary>Detected shift type, if any.</summary>
    public ShiftType? ShiftType { get; set; }

    /// <summary>Detected employment type, if any.</summary>
    public EmploymentType? EmploymentType { get; set; }

    /// <summary>Shift pay or a single salary figure.</summary>
    public long? PayAmount { get; set; }

    /// <summary>Profit-share percentage (percentage-based / revenue-share pay).</summary>
    public int? SharePercent { get; set; }

    /// <summary>Whether pay is marked as negotiable.</summary>
    public bool PayNegotiable { get; set; }

    /// <summary>Required gender; defaults to <see cref="Gender.Any"/>.</summary>
    public Gender Gender { get; set; } = Gender.Any;

    /// <summary>City name found in the text, if any.</summary>
    public string? CityName { get; set; }

    /// <summary>District name found in the text, if any.</summary>
    public string? DistrictName { get; set; }

    /// <summary>Hospital/clinic name guessed from the text.</summary>
    public string? FacilityName { get; set; }

    /// <summary>Contact phone number, if one was found.</summary>
    public string? Phone { get; set; }

    // "Available for work" (talent) extras — populated when Kind == Talent.

    /// <summary>Person's name including the title it was introduced with.</summary>
    public string? PersonName { get; set; }

    /// <summary>Experience, in years.</summary>
    public int? YearsExperience { get; set; }

    /// <summary>Whether the person states that they hold a licence.</summary>
    public bool IsLicensed { get; set; }

    /// <summary>Free-text area restriction (for example "district 1 only").</summary>
    public string? AreaNote { get; set; }

    /// <summary>What was and was not detected; shown to the admin.</summary>
    public List<string> Notes { get; set; } = new List<string>();
}
|
||
|
||
/// <summary>
/// Converts a messy Persian channel/Divar post into a structured listing guess.
/// The Stage-1 implementation relies on transparent keyword + regex heuristics and has no
/// AI dependency (important since LLM APIs are blocked from Iran). Because callers depend
/// only on this interface, a future LlmListingParser can be registered through DI in its
/// place without touching the admin queue.
/// </summary>
public interface IListingParser
{
    /// <summary>Parses one raw post into a best-effort <see cref="ParsedListing"/>.</summary>
    /// <param name="rawText">The raw post text.</param>
    /// <param name="knownRoles">Role names the parser may match against the text.</param>
    /// <param name="knownCities">City names the parser may match against the text.</param>
    /// <param name="knownDistricts">District names the parser may match against the text.</param>
    /// <returns>The structured guess for the post.</returns>
    ParsedListing Parse(
        string rawText,
        IEnumerable<string> knownRoles,
        IEnumerable<string> knownCities,
        IEnumerable<string> knownDistricts);
}
|
||
|
||
public class HeuristicListingParser : IListingParser
|
||
{
|
||
/// <summary>
/// Builds a best-effort <see cref="ParsedListing"/> from a raw post using keyword and regex
/// heuristics only. Most detections (and several non-detections) are also recorded in
/// <see cref="ParsedListing.Notes"/>.
/// </summary>
/// <param name="raw">Raw post text; <c>null</c> is treated as an empty post.</param>
/// <param name="knownRoles">Role names to look for; the longest name found in the text wins.</param>
/// <param name="knownCities">City names to look for; the first one found in the text wins.</param>
/// <param name="knownDistricts">District names to look for; the longest name found in the text wins.</param>
/// <returns>A new <see cref="ParsedListing"/>; undetected fields keep their defaults.</returns>
public ParsedListing Parse(string raw, IEnumerable<string> knownRoles,
    IEnumerable<string> knownCities, IEnumerable<string> knownDistricts)
{
    var p = new ParsedListing();
    var text = Normalize(raw ?? string.Empty);
    // One digit-folded copy shared by every numeric regex below (share %, experience, phone).
    var latin = ToLatinDigits(text);

    // True when a known name occurs in the post. Blank names are skipped because
    // Contains("") is always true and would "match" every post.
    bool Mentions(string name)
    {
        var needle = Normalize(name);
        return needle.Length > 0 && text.Contains(needle);
    }

    // --- Kind: talent (worker offers themselves) vs shift vs hiring ---
    // Talent is checked first: phrases meaning "available for work" / "looking for work" say
    // the *person* is available — distinct from an employer's invitation to cooperate.
    // NOTE(review): some entries below look identical or differ only by a missing space;
    // presumably they originally differed by an invisible ZWNJ — confirm against the repo.
    bool talentSignals = ContainsAny(text,
        "آماده به کار", "آمادهبهکار", "آماده همکاری", "آمادهی همکاری", "آماده ی همکاری",
        "آماده فعالیت", "جویای کار", "جویای کار هستم", "متقاضی کار", "نیازمند کار",
        "آماده انجام", "میتوانم همکاری", "میتوانم همکاری", "حاضر به همکاری");
    bool jobSignals = ContainsAny(text, "استخدام", "جذب", "دعوت به همکاری", "نیازمندیم", "نیازمند است", "حقوق ثابت");
    bool shiftSignals = ContainsAny(text, "شیفت", "آنکال", "انکال", "نوبت", "کشیک");
    if (talentSignals)
    {
        p.Kind = ListingKind.Talent;
        p.Notes.Add("نوع: آماده به کار (تشخیص خودکار)");
    }
    else
    {
        // Hiring wording only counts as a job when no shift wording is present.
        p.Kind = (jobSignals && !shiftSignals) ? ListingKind.Job : ListingKind.Shift;
        p.Notes.Add(p.Kind == ListingKind.Job ? "نوع: استخدام (تشخیص خودکار)" : "نوع: شیفت (تشخیص خودکار)");
    }

    // --- Role (longest match first so a more specific role beats its shorter prefix) ---
    foreach (var role in knownRoles.OrderByDescending(r => r.Length))
    {
        if (Mentions(role)) { p.RoleName = role; break; }
    }
    // Synonyms common on Divar/Medjobs → canonical seeded role names.
    if (p.RoleName is null)
    {
        p.RoleName =
            ContainsAny(text, "اتاق عمل", "اسکراب") ? "تکنسین اتاق عمل"
            : ContainsAny(text, "فوریت", "اورژانس پیش بیمارستانی", "آمبولانس") ? "تکنسین فوریتهای پزشکی"
            : ContainsAny(text, "آزمایشگاه", "علوم آزمایشگاهی", "نمونه گیر") ? "کارشناس آزمایشگاه"
            : ContainsAny(text, "بهیار", "کمک بهیار", "کمک پرستار", "بیماربر", "مراقب", "سالمند", "همراه بیمار", "تزریقات", "پانسمان") ? "پرستار"
            : ContainsAny(text, "ماما", "مامایی") ? "ماما"
            : ContainsAny(text, "فوق تخصص", "متخصص") ? "پزشک متخصص"
            : ContainsAny(text, "پزشک", "دکتر", "طبیب") ? "پزشک عمومی"
            : null;
    }
    p.Notes.Add(p.RoleName is null ? "نقش: تشخیص داده نشد" : $"نقش: {p.RoleName}");

    // --- Shift type (first matching keyword wins: on-call, night, evening, day) ---
    if (ContainsAny(text, "آنکال", "انکال")) p.ShiftType = Models.ShiftType.OnCall;
    else if (text.Contains("شب")) p.ShiftType = Models.ShiftType.Night;
    else if (text.Contains("عصر")) p.ShiftType = Models.ShiftType.Evening;
    else if (ContainsAny(text, "صبح", "روز")) p.ShiftType = Models.ShiftType.Day;

    // --- Employment type ---
    if (ContainsAny(text, "پاره وقت", "پارهوقت", "پارت تایم")) p.EmploymentType = Models.EmploymentType.PartTime;
    else if (text.Contains("طرح")) p.EmploymentType = Models.EmploymentType.Plan;
    else if (text.Contains("قرارداد")) p.EmploymentType = Models.EmploymentType.Contract;
    else if (ContainsAny(text, "تمام وقت", "تماموقت")) p.EmploymentType = Models.EmploymentType.FullTime;

    // --- Gender requirement (female keywords are checked before male ones) ---
    if (ContainsAny(text, "خانم", "خانوم", "بانو", "زن ", "مامای")) p.Gender = Gender.Female;
    else if (ContainsAny(text, "آقا", "اقا", "مرد ", "مرد،", "پسر")) p.Gender = Gender.Male;
    if (p.Gender != Gender.Any)
        p.Notes.Add($"جنسیت: {(p.Gender == Gender.Female ? "خانم" : "آقا")}");

    // --- City / district ---
    p.CityName = knownCities.FirstOrDefault(c => Mentions(c));
    p.DistrictName = knownDistricts.OrderByDescending(d => d.Length)
        .FirstOrDefault(d => Mentions(d));

    // --- Profit share (percentage / revenue share) ---
    var share = Regex.Match(latin, @"(\d{1,3})\s*(?:٪|%|درصد)");
    if (!share.Success) share = Regex.Match(latin, @"(?:٪|%)\s*(\d{1,3})");
    if (share.Success && int.TryParse(share.Groups[1].Value, out var pct) && pct is > 0 and <= 100)
    { p.SharePercent = pct; p.Notes.Add($"سهم درآمد: {pct}٪"); }
    else if (ContainsAny(text, "درصدی", "سهم درآمد", "شراکت", "پورسانت"))
    { p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)"); }

    // --- Fixed pay (a "negotiable" keyword suppresses amount extraction) ---
    if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); }
    else
    {
        var amount = ExtractAmount(text);
        if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); }
        else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
    }

    // --- Talent extras (only meaningful for "available for work" posts) ---
    if (p.Kind == ListingKind.Talent)
    {
        var exp = Regex.Match(latin, @"سابقه[^\d]{0,8}(\d{1,2})\s*سال");
        if (!exp.Success) exp = Regex.Match(latin, @"(\d{1,2})\s*سال\s*سابقه");
        if (exp.Success && int.TryParse(exp.Groups[1].Value, out var yrs) && yrs is > 0 and <= 60)
        { p.YearsExperience = yrs; p.Notes.Add($"سابقه: {yrs} سال"); }

        p.IsLicensed = ContainsAny(text, "پروانه دار", "پروانهدار", "دارای پروانه", "پروانه فعالیت", "پروانه طبابت");
        if (p.IsLicensed) p.Notes.Add("پروانهدار");

        p.PersonName = ExtractPersonName(text);
        if (p.PersonName is not null) p.Notes.Add($"نام: {p.PersonName}");

        // Matched on the un-folded text so the note keeps the digits as the author wrote them.
        var area = Regex.Match(text, @"منطقه\s*[۰-۹0-9]{1,2}");
        if (area.Success) { p.AreaNote = area.Value.Trim(); p.Notes.Add($"محدوده: {p.AreaNote}"); }
    }

    // --- Facility name (facility keyword + the distinctive name); skipped for talent posts ---
    if (p.Kind != ListingKind.Talent)
    {
        p.FacilityName = ExtractFacilityName(text);
        if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");
    }

    // --- Phone (mobile preferred, landline as fallback) ---
    p.Phone = ExtractPhone(latin);

    return p;
}

/// <summary>
/// Finds a contact number in digit-folded text: a mobile number if present (returned in the
/// 11-digit 09… form), otherwise a landline with its separators removed.
/// </summary>
/// <param name="latin">Post text whose Persian/Arabic digits were already folded to 0-9.</param>
/// <returns>Digits-only phone number, or <c>null</c> when nothing phone-like was found.</returns>
private static string? ExtractPhone(string latin)
{
    var mobile = Regex.Match(latin, @"(?:\+?98|0)?9\d{9}");
    if (mobile.Success)
    {
        var digits = Regex.Replace(mobile.Value, @"\D", "");
        // Only a 12-digit match carries the 98 country code. A bare 10-digit match that merely
        // begins with "98" must keep those digits, so the length is checked before stripping.
        if (digits.Length == 12 && digits.StartsWith("98", StringComparison.Ordinal))
            digits = "0" + digits[2..];
        else if (digits.Length == 10)
            digits = "0" + digits;
        return digits;
    }

    var land = Regex.Match(latin, @"0\d{2,3}[\s-]?\d{7,8}");
    return land.Success ? Regex.Replace(land.Value, @"\D", "") : null;
}
|
||
|
||
// Words that introduce a facility name, longest/most-specific first.
private static readonly string[] FacilityKeywords =
{
    "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
    "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
    "آزمایشگاه", "مطب", "خانه سالمندان", "سرای سالمندان",
};

// Words that clearly aren't part of a facility's name — stop collecting here.
private static readonly string[] NameStops =
{
    "جهت", "برای", "به", "با", "در", "از", "که", "نیاز", "نیازمند", "استخدام", "جذب",
    "دعوت", "همکاری", "واقع", "آدرس", "تلفن", "شماره", "شیفت", "ساعت", "حقوق", "روز",
    "شب", "صبح", "عصر", "می", "ها", "این", "یک", "محترم",
};

/// <summary>
/// Best-effort hospital/clinic name: a facility keyword plus up to three name words.
/// Keywords are tried in <see cref="FacilityKeywords"/> order, and every occurrence of a
/// keyword is inspected, so a bare first mention does not hide a later, named mention of
/// the same keyword.
/// </summary>
/// <param name="text">Post text to search.</param>
/// <returns>Keyword and name words joined by single spaces, or <c>null</c> when no keyword
/// is followed by at least one usable name word.</returns>
private static string? ExtractFacilityName(string text)
{
    var separators = new[] { ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/', '«', '»', '"' };

    foreach (var kw in FacilityKeywords)
    {
        for (var idx = text.IndexOf(kw, StringComparison.Ordinal);
             idx >= 0;
             idx = text.IndexOf(kw, idx + kw.Length, StringComparison.Ordinal))
        {
            var words = text[(idx + kw.Length)..].Split(separators, StringSplitOptions.RemoveEmptyEntries);
            var picked = new List<string>();
            foreach (var w in words)
            {
                if (NameStops.Contains(w)) break;
                if (Regex.IsMatch(w, @"\d")) break;   // numbers/phones aren't names
                if (w.Length == 1) break;              // stray letters
                picked.Add(w);
                if (picked.Count >= 3) break;
            }

            // A bare keyword isn't useful; move on to its next occurrence, then the next keyword.
            if (picked.Count > 0)
                return (kw + " " + string.Join(" ", picked)).Trim();
        }
    }
    return null;
}
|
||
|
||
// Titles that introduce a person's name in "available for work" posts.
private static readonly string[] PersonTitles = { "دکتر", "خانم دکتر", "آقای دکتر", "مهندس", "سرکار خانم", "جناب آقای", "خانم", "آقای" };

/// <summary>
/// Best-effort person name: a title from <see cref="PersonTitles"/> plus up to two
/// following words. Titles are tried in array order, and every occurrence of a title is
/// inspected, so a title that is first used without a name does not hide a later
/// "title + name" mention.
/// </summary>
/// <param name="text">Post text to search.</param>
/// <returns>Title and name words joined by single spaces, or <c>null</c> when no title is
/// followed by at least one usable name word.</returns>
private static string? ExtractPersonName(string text)
{
    var separators = new[] { ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/' };

    foreach (var title in PersonTitles)
    {
        for (var idx = text.IndexOf(title, StringComparison.Ordinal);
             idx >= 0;
             idx = text.IndexOf(title, idx + title.Length, StringComparison.Ordinal))
        {
            var words = text[(idx + title.Length)..].Split(separators, StringSplitOptions.RemoveEmptyEntries);
            var picked = new List<string>();
            foreach (var w in words)
            {
                if (NameStops.Contains(w)) break;       // function words end the name
                if (Regex.IsMatch(w, @"[\d]")) break;   // digits aren't part of a name
                if (w.Length == 1) break;               // stray letters
                picked.Add(w);
                if (picked.Count >= 2) break;
            }

            // A title with no name after it is skipped; try its next occurrence, then the next title.
            if (picked.Count > 0)
                return (title + " " + string.Join(" ", picked)).Trim();
        }
    }
    return null;
}
|
||
|
||
/// <summary>
/// Pulls a Toman figure out of free text, handling the "million" unit word and
/// Persian/Arabic digits.
/// </summary>
/// <param name="text">Post text to search.</param>
/// <returns>
/// The amount written as "N million" (N may have one decimal separator) when present and
/// positive; otherwise the largest plain number of at least six digits that does not look
/// like a phone number; otherwise <c>null</c>.
/// </returns>
private static long? ExtractAmount(string text)
{
    // A digit run is treated as a phone number, not money, when it starts with 0
    // (mobile 09… or a landline area code), is written right after a '+', or is an
    // unseparated 10/12-digit mobile (9xxxxxxxxx / 989xxxxxxxxx).
    static bool LooksLikePhone(string source, Match candidate, string digits)
    {
        if (digits[0] == '0') return true;
        if (candidate.Index > 0 && source[candidate.Index - 1] == '+') return true;
        var unseparated = candidate.Value.Length == digits.Length;
        return unseparated && Regex.IsMatch(digits, @"^(?:98)?9\d{9}\z");
    }

    var latin = ToLatinDigits(text);

    // e.g. "2 million" / "2.5 million". Decimal arithmetic keeps the scaling exact; the range
    // check keeps the cast to long from overflowing on absurdly long digit runs.
    var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون");
    if (million.Success
        && decimal.TryParse(million.Groups[1].Value.Replace(',', '.'),
            System.Globalization.NumberStyles.AllowDecimalPoint,
            System.Globalization.CultureInfo.InvariantCulture, out var millions)
        && millions > 0m
        && millions <= long.MaxValue / 1_000_000m)
    {
        var toman = (long)(millions * 1_000_000m);
        if (toman > 0) return toman;
    }

    // Otherwise the largest plain number that looks like money (>= 6 digits after removing separators).
    long best = 0;
    foreach (Match num in Regex.Matches(latin, @"[\d٬,،.]{6,}"))
    {
        var digits = Regex.Replace(num.Value, @"[^\d]", "");
        if (digits.Length < 6 || LooksLikePhone(latin, num, digits)) continue;
        if (long.TryParse(digits, System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture, out var v) && v > best)
        {
            best = v;
        }
    }
    return best > 0 ? best : null;
}
|
||
|
||
/// <summary>
/// Canonicalises text before keyword matching: Arabic yeh and kaf are replaced by their
/// Persian forms, the zero-width non-joiner becomes a plain space, and surrounding
/// whitespace is trimmed.
/// </summary>
/// <param name="s">Text to normalise.</param>
/// <returns>The normalised text.</returns>
private static string Normalize(string s) => s
    .Replace('ي', 'ی')
    .Replace('ك', 'ک')
    // Written as an escape so the invisible character cannot be lost by editors or viewers.
    // NOTE(review): U+200C (ZWNJ) is inferred from the spaced/unspaced keyword pairs used
    // elsewhere in this class — confirm against the repository copy.
    .Replace('\u200c', ' ')
    .Trim();
|
||
|
||
/// <summary>Reports whether <paramref name="text"/> contains at least one of the given substrings.</summary>
/// <param name="text">Text to search.</param>
/// <param name="needles">Candidate substrings; an empty list yields <c>false</c>.</param>
/// <returns><c>true</c> as soon as one candidate is found, otherwise <c>false</c>.</returns>
private static bool ContainsAny(string text, params string[] needles)
{
    foreach (var candidate in needles)
    {
        if (text.Contains(candidate))
        {
            return true;
        }
    }

    return false;
}
|
||
|
||
/// <summary>
/// Returns a copy of <paramref name="s"/> in which Persian (۰-۹) and Arabic-Indic (٠-٩)
/// digits are replaced by the corresponding ASCII digits; all other characters are kept.
/// </summary>
/// <param name="s">Text whose digits should be folded.</param>
/// <returns>A string of the same length with only those digit characters replaced.</returns>
private static string ToLatinDigits(string s)
{
    var folded = new char[s.Length];
    var position = 0;
    foreach (var ch in s)
    {
        folded[position++] = ch switch
        {
            >= '۰' and <= '۹' => (char)('0' + (ch - '۰')),
            >= '٠' and <= '٩' => (char)('0' + (ch - '٠')),
            _ => ch,
        };
    }

    return new string(folded);
}
|
||
}
|