Files
hamkadr/src/JobsMedical.Web/Services/ListingParser.cs
T
soroush.asadi 213af9db48
CI/CD / CI · dotnet build (push) Successful in 2m37s
CI/CD / Deploy · hamkadr (push) Successful in 1m11s
AI tag/category assignment + phone extraction from web ads
AI (when enabled, now that the server proxy is up):
- AiStructured gains phone, personName, yearsExperience, isLicensed.
- The auditor appends an authoritative output-schema to the admin prompt
  so classification stays correct even with an older stored prompt — it
  now classifies kind as shift|job|talent and extracts the contact phone
  and talent details.
- Ingestion publish prefers the AI's tags (kind/role/city/facility/phone +
  talent fields) over the heuristic parser when present.
- Default prompt updated to describe the three kinds + new fields.

Phone extraction from websites (Medjobs / generic sites), where the
number sits behind a "تماس با این آگهی" reveal:
- HtmlUtil.HarvestPhones scans the full markup for tel: links, JSON-LD
  "telephone", data-*phone* attributes, and inline Iranian mobile/landline
  numbers (Persian digits folded), normalized (mobiles 09…, landlines 0…).
- Medjobs + Website sources append harvested numbers to the ad text so the
  parser/AI capture them; manual review then prefills the phone too.
- Parser phone extraction now also captures a landline as a fallback.

Note: if a site loads the number purely via XHR (not in HTML), a
per-source reveal endpoint would be a follow-up.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-08 08:11:14 +03:30

284 lines
15 KiB
C#
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Text.RegularExpressions;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services;
/// <summary>Structured guess extracted from a raw channel post. All fields are best-effort.</summary>
public class ParsedListing
{
public ListingKind Kind { get; set; } = ListingKind.Shift;
public string? RoleName { get; set; }
public ShiftType? ShiftType { get; set; }
public EmploymentType? EmploymentType { get; set; }
public long? PayAmount { get; set; } // shift pay or single salary figure
public int? SharePercent { get; set; } // profit-share % (درصدی / سهم درآمد)
public bool PayNegotiable { get; set; }
public Gender Gender { get; set; } = Gender.Any; // جنسیت مورد نیاز
public string? CityName { get; set; }
public string? DistrictName { get; set; }
public string? FacilityName { get; set; } // hospital/clinic name guessed from the text
public string? Phone { get; set; }
// «آماده به کار» (talent) extras — populated when Kind == Talent.
public string? PersonName { get; set; } // «دکتر سپیده علیزاده»
public int? YearsExperience { get; set; } // سابقه (سال)
public bool IsLicensed { get; set; } // پروانه‌دار
public string? AreaNote { get; set; } // «فقط منطقه ۱»
public List<string> Notes { get; set; } = new(); // what was/wasn't detected (shown to admin)
}
/// <summary>
/// Turns a messy Persian channel/Divar post into a structured listing guess. This is the
/// Stage-1 implementation: transparent keyword + regex heuristics, no AI dependency (important
/// since LLM APIs are blocked from Iran). A future LlmListingParser can implement the same
/// interface and be swapped in via DI without touching the admin queue.
/// </summary>
public interface IListingParser
{
ParsedListing Parse(string rawText, IEnumerable<string> knownRoles,
IEnumerable<string> knownCities, IEnumerable<string> knownDistricts);
}
public class HeuristicListingParser : IListingParser
{
public ParsedListing Parse(string raw, IEnumerable<string> knownRoles,
IEnumerable<string> knownCities, IEnumerable<string> knownDistricts)
{
var p = new ParsedListing();
var text = Normalize(raw);
// --- Kind: talent (worker offers themselves) vs shift vs hiring ---
// Talent is checked first: «آماده به کار/همکاری», «جویای کار» mean the *person* is
// available — distinct from an employer's «دعوت به همکاری».
bool talentSignals = ContainsAny(text,
"آماده به کار", "آماده‌به‌کار", "آماده همکاری", "آماده‌ی همکاری", "آماده ی همکاری",
"آماده فعالیت", "جویای کار", "جویای کار هستم", "متقاضی کار", "نیازمند کار",
"آماده انجام", "می‌توانم همکاری", "میتوانم همکاری", "حاضر به همکاری");
bool jobSignals = ContainsAny(text, "استخدام", "جذب", "دعوت به همکاری", "نیازمندیم", "نیازمند است", "حقوق ثابت");
bool shiftSignals = ContainsAny(text, "شیفت", "آنکال", "انکال", "نوبت", "کشیک");
if (talentSignals)
{
p.Kind = ListingKind.Talent;
p.Notes.Add("نوع: آماده به کار (تشخیص خودکار)");
}
else
{
p.Kind = (jobSignals && !shiftSignals) ? ListingKind.Job : ListingKind.Shift;
p.Notes.Add(p.Kind == ListingKind.Job ? "نوع: استخدام (تشخیص خودکار)" : "نوع: شیفت (تشخیص خودکار)");
}
// --- Role (longest match first so «پزشک متخصص» beats «پزشک») ---
foreach (var role in knownRoles.OrderByDescending(r => r.Length))
{
if (text.Contains(Normalize(role))) { p.RoleName = role; break; }
}
// Synonyms common on Divar/Medjobs → canonical seeded role names.
if (p.RoleName is null)
{
p.RoleName =
ContainsAny(text, "اتاق عمل", "اسکراب") ? "تکنسین اتاق عمل"
: ContainsAny(text, "فوریت", "اورژانس پیش بیمارستانی", "آمبولانس") ? "تکنسین فوریت‌های پزشکی"
: ContainsAny(text, "آزمایشگاه", "علوم آزمایشگاهی", "نمونه گیر") ? "کارشناس آزمایشگاه"
: ContainsAny(text, "بهیار", "کمک بهیار", "کمک پرستار", "بیماربر", "مراقب", "سالمند", "همراه بیمار", "تزریقات", "پانسمان") ? "پرستار"
: ContainsAny(text, "ماما", "مامایی") ? "ماما"
: ContainsAny(text, "فوق تخصص", "متخصص") ? "پزشک متخصص"
: ContainsAny(text, "پزشک", "دکتر", "طبیب") ? "پزشک عمومی"
: null;
}
p.Notes.Add(p.RoleName is null ? "نقش: تشخیص داده نشد" : $"نقش: {p.RoleName}");
// --- Shift type ---
if (ContainsAny(text, "آنکال", "انکال")) p.ShiftType = Models.ShiftType.OnCall;
else if (text.Contains("شب")) p.ShiftType = Models.ShiftType.Night;
else if (text.Contains("عصر")) p.ShiftType = Models.ShiftType.Evening;
else if (ContainsAny(text, "صبح", "روز")) p.ShiftType = Models.ShiftType.Day;
// --- Employment type ---
if (ContainsAny(text, "پاره وقت", "پاره‌وقت", "پارت تایم")) p.EmploymentType = Models.EmploymentType.PartTime;
else if (text.Contains("طرح")) p.EmploymentType = Models.EmploymentType.Plan;
else if (text.Contains("قرارداد")) p.EmploymentType = Models.EmploymentType.Contract;
else if (ContainsAny(text, "تمام وقت", "تمام‌وقت")) p.EmploymentType = Models.EmploymentType.FullTime;
// --- Gender requirement ---
if (ContainsAny(text, "خانم", "خانوم", "بانو", "زن ", "مامای")) p.Gender = Gender.Female;
else if (ContainsAny(text, "آقا", "اقا", "مرد ", "مرد،", "پسر")) p.Gender = Gender.Male;
if (p.Gender != Gender.Any)
p.Notes.Add($"جنسیت: {(p.Gender == Gender.Female ? "خانم" : "آقا")}");
// --- City / district ---
p.CityName = knownCities.FirstOrDefault(c => text.Contains(Normalize(c)));
p.DistrictName = knownDistricts.OrderByDescending(d => d.Length)
.FirstOrDefault(d => text.Contains(Normalize(d)));
// --- Profit share (درصدی / سهم) ---
var latinForShare = ToLatinDigits(text);
var share = Regex.Match(latinForShare, @"(\d{1,3})\s*(?:٪|%|درصد)");
if (!share.Success) share = Regex.Match(latinForShare, @"(?:٪|%)\s*(\d{1,3})");
if (share.Success && int.TryParse(share.Groups[1].Value, out var pct) && pct is > 0 and <= 100)
{ p.SharePercent = pct; p.Notes.Add($"سهم درآمد: {pct}٪"); }
else if (ContainsAny(text, "درصدی", "سهم درآمد", "شراکت", "پورسانت"))
{ p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)"); }
// --- Fixed pay ---
if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); }
else
{
var amount = ExtractAmount(text);
if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); }
else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
}
// --- Talent extras (only meaningful for «آماده به کار») ---
if (p.Kind == ListingKind.Talent)
{
var latinT = ToLatinDigits(text);
var exp = Regex.Match(latinT, @"سابقه[^\d]{0,8}(\d{1,2})\s*سال");
if (!exp.Success) exp = Regex.Match(latinT, @"(\d{1,2})\s*سال\s*سابقه");
if (exp.Success && int.TryParse(exp.Groups[1].Value, out var yrs) && yrs is > 0 and <= 60)
{ p.YearsExperience = yrs; p.Notes.Add($"سابقه: {yrs} سال"); }
p.IsLicensed = ContainsAny(text, "پروانه دار", "پروانه‌دار", "دارای پروانه", "پروانه فعالیت", "پروانه طبابت");
if (p.IsLicensed) p.Notes.Add("پروانه‌دار");
p.PersonName = ExtractPersonName(text);
if (p.PersonName is not null) p.Notes.Add($"نام: {p.PersonName}");
var area = Regex.Match(text, @"منطقه\s*[۰-۹0-9]{1,2}");
if (area.Success) { p.AreaNote = area.Value.Trim(); p.Notes.Add($"محدوده: {p.AreaNote}"); }
}
// --- Facility name (بیمارستان/درمانگاه/کلینیک ... + the distinctive name) ---
if (p.Kind != ListingKind.Talent)
{
p.FacilityName = ExtractFacilityName(text);
if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");
}
// --- Phone (mobile preferred, landline as fallback) ---
var latinPhone = ToLatinDigits(text);
var mobile = Regex.Match(latinPhone, @"(?:\+?98|0)?9\d{9}");
if (mobile.Success)
{
var d = Regex.Replace(mobile.Value, @"\D", "");
if (d.StartsWith("98")) d = "0" + d[2..];
if (d.Length == 10 && d.StartsWith("9")) d = "0" + d;
p.Phone = d;
}
else
{
var land = Regex.Match(latinPhone, @"0\d{2,3}[\s-]?\d{7,8}");
if (land.Success) p.Phone = Regex.Replace(land.Value, @"\D", "");
}
return p;
}
// Words that introduce a facility name, longest/most-specific first.
private static readonly string[] FacilityKeywords =
{
"بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
"مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
"آزمایشگاه", "مطب", "خانه سالمندان", "سرای سالمندان",
};
// Words that clearly aren't part of a facility's name — stop collecting here.
private static readonly string[] NameStops =
{
"جهت", "برای", "به", "با", "در", "از", "که", "نیاز", "نیازمند", "استخدام", "جذب",
"دعوت", "همکاری", "واقع", "آدرس", "تلفن", "شماره", "شیفت", "ساعت", "حقوق", "روز",
"شب", "صبح", "عصر", "می", "ها", "این", "یک", "محترم",
};
/// <summary>Best-effort hospital/clinic name: a facility keyword plus up to three name words.</summary>
private static string? ExtractFacilityName(string text)
{
foreach (var kw in FacilityKeywords)
{
var idx = text.IndexOf(kw, StringComparison.Ordinal);
if (idx < 0) continue;
var after = text[(idx + kw.Length)..];
var words = after.Split(
new[] { ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/', '«', '»', '"' },
StringSplitOptions.RemoveEmptyEntries);
var picked = new List<string>();
foreach (var w in words)
{
if (NameStops.Contains(w)) break;
if (Regex.IsMatch(w, @"\d")) break; // numbers/phones aren't names
if (w.Length == 1) break; // stray letters
picked.Add(w);
if (picked.Count >= 3) break;
}
if (picked.Count == 0) continue; // bare keyword (e.g. just «بیمارستان») isn't useful
return (kw + " " + string.Join(" ", picked)).Trim();
}
return null;
}
// Titles that introduce a person's name in «آماده به کار» posts.
private static readonly string[] PersonTitles = { "دکتر", "خانم دکتر", "آقای دکتر", "مهندس", "سرکار خانم", "جناب آقای", "خانم", "آقای" };
/// <summary>Best-effort person name: a title (دکتر/خانم/…) plus up to two following words.</summary>
private static string? ExtractPersonName(string text)
{
foreach (var title in PersonTitles)
{
var idx = text.IndexOf(title, StringComparison.Ordinal);
if (idx < 0) continue;
var after = text[(idx + title.Length)..];
var words = after.Split(
new[] { ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/' },
StringSplitOptions.RemoveEmptyEntries);
var picked = new List<string>();
foreach (var w in words)
{
if (NameStops.Contains(w)) break;
if (Regex.IsMatch(w, @"[\d]")) break;
if (w.Length == 1) break;
picked.Add(w);
if (picked.Count >= 2) break;
}
if (picked.Count == 0) continue;
return (title + " " + string.Join(" ", picked)).Trim();
}
return null;
}
/// <summary>Pull a Toman figure out of free text, handling «میلیون» and Persian digits.</summary>
private static long? ExtractAmount(string text)
{
var latin = ToLatinDigits(text);
// e.g. "۲ میلیون" / "2.5 میلیون"
var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون");
if (million.Success && double.TryParse(million.Groups[1].Value.Replace(",", "."),
System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var m))
return (long)(m * 1_000_000);
// Otherwise the largest plain number that looks like money (>= 6 digits after removing separators).
long best = 0;
foreach (Match num in Regex.Matches(latin, @"[\d٬,،.]{6,}"))
{
var digits = Regex.Replace(num.Value, @"[^\d]", "");
if (digits.Length >= 6 && long.TryParse(digits, out var v) && v > best) best = v;
}
return best > 0 ? best : null;
}
private static string Normalize(string s) => s
.Replace('ي', 'ی').Replace('ك', 'ک').Replace('', ' ').Trim();
private static bool ContainsAny(string text, params string[] needles)
=> needles.Any(n => text.Contains(n));
private static string ToLatinDigits(string s)
{
var chars = s.ToCharArray();
for (var i = 0; i < chars.Length; i++)
{
if (chars[i] >= '۰' && chars[i] <= '۹') chars[i] = (char)('0' + (chars[i] - '۰'));
else if (chars[i] >= '٠' && chars[i] <= '٩') chars[i] = (char)('0' + (chars[i] - '٠'));
}
return new string(chars);
}
}