From 213af9db48790145da2c5f044d7b29a1e13a81e2 Mon Sep 17 00:00:00 2001
From: "soroush.asadi"
Date: Mon, 8 Jun 2026 08:11:14 +0330
Subject: [PATCH] AI tag/category assignment + phone extraction from web ads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AI (when enabled, now that the server proxy is up):
- AiStructured gains phone, personName, yearsExperience, isLicensed.
- The auditor appends an authoritative output-schema to the admin prompt so
  classification stays correct even with an older stored prompt — it now
  classifies kind as shift|job|talent and extracts the contact phone and
  talent details.
- Ingestion publish prefers the AI's tags (kind/role/city/facility/phone +
  talent fields) over the heuristic parser when present.
- Default prompt updated to describe the three kinds + new fields.

Phone extraction from websites (Medjobs / generic sites), where the number
sits behind a "تماس با این آگهی" reveal:
- HtmlUtil.HarvestPhones scans the full markup for tel: links, JSON-LD
  "telephone", data-*phone* attributes, and inline Iranian mobile/landline
  numbers (Persian digits folded), normalized (mobiles 09…, landlines 0…).
- Medjobs + Website sources append harvested numbers to the ad text so the
  parser/AI capture them; manual review then prefills the phone too.
- Parser phone extraction now also captures a landline as a fallback.

Note: if a site loads the number purely via XHR (not in HTML), a per-source
reveal endpoint would be a follow-up.

Co-Authored-By: Claude Opus 4.8 --- src/JobsMedical.Web/Models/AppSetting.cs | 14 ++++-- src/JobsMedical.Web/Services/ListingParser.cs | 18 ++++++-- .../Services/Scraping/AiAuditor.cs | 29 ++++++++++-- .../Services/Scraping/IngestionService.cs | 23 ++++++---- .../Services/Scraping/MedjobsListingSource.cs | 9 +++- .../Scraping/TelegramListingSource.cs | 45 +++++++++++++++++++ .../Services/Scraping/WebsiteListingSource.cs | 8 +++- 7 files changed, 126 insertions(+), 20 deletions(-) diff --git a/src/JobsMedical.Web/Models/AppSetting.cs b/src/JobsMedical.Web/Models/AppSetting.cs index bdbfcea..cb4aa1f 100644 --- a/src/JobsMedical.Web/Models/AppSetting.cs +++ b/src/JobsMedical.Web/Models/AppSetting.cs @@ -115,14 +115,20 @@ public class AppSetting public const string DefaultPrompt = """ تو دستیار بررسی آگهی‌های کاری حوزه درمان برای پلتفرم «همکادر» هستی. هر آگهی خام را بخوان و تصمیم بگیر: - - approve: آگهی واقعی و مرتبط با شیفت/استخدام کادر درمان است و اطلاعات کافی دارد. + - approve: آگهی واقعی و مرتبط با کادر درمان است و اطلاعات کافی دارد. - reject: تبلیغ، اسپم، نامرتبط، یا فاقد اطلاعات حداقلی است. - review: مرتبط است اما ناقص/مبهم و نیاز به بررسی انسانی دارد. - نقش، شهر/محله، نوع شیفت، نوع همکاری، مبلغ یا درصد سهم، و عنوان را در صورت وجود استخراج کن. + سه نوع آگهی داریم: + - shift: مرکز درمانی برای یک شیفت نیرو می‌خواهد. + - job: مرکز درمانی برای استخدام دائم نیرو می‌خواهد. + - talent: خودِ کادر درمان اعلام «آماده به کار / آماده همکاری» کرده است. + نقش، شهر/محله، نوع شیفت/همکاری، مبلغ یا درصد سهم، عنوان، نام مرکز، و شماره تماس را در صورت وجود استخراج کن. + برای talent: نام فرد، سال سابقه و پروانه‌دار بودن را هم استخراج کن. 
فقط با یک شیء JSON پاسخ بده با کلیدهای: decision (approve|reject|review)، confidence (0-100)، reason (فارسی کوتاه)، - kind (shift|job)، role، city، district، shiftType (day|evening|night|oncall)، + kind (shift|job|talent)، role، city، district، shiftType (day|evening|night|oncall)، employmentType (fulltime|parttime|contract|plan)، payAmount (عدد تومان یا null)، - sharePercent (0-100 یا null)، title، facilityName. + sharePercent (0-100 یا null)، title، facilityName، phone، + personName، yearsExperience (عدد یا null)، isLicensed (true|false). """; } diff --git a/src/JobsMedical.Web/Services/ListingParser.cs b/src/JobsMedical.Web/Services/ListingParser.cs index 68172d7..98096af 100644 --- a/src/JobsMedical.Web/Services/ListingParser.cs +++ b/src/JobsMedical.Web/Services/ListingParser.cs @@ -154,9 +154,21 @@ public class HeuristicListingParser : IListingParser if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}"); } - // --- Phone --- - var phone = Regex.Match(ToLatinDigits(text), @"0?9\d{9}"); - if (phone.Success) p.Phone = phone.Value; + // --- Phone (mobile preferred, landline as fallback) --- + var latinPhone = ToLatinDigits(text); + var mobile = Regex.Match(latinPhone, @"(?:\+?98|0)?9\d{9}"); + if (mobile.Success) + { + var d = Regex.Replace(mobile.Value, @"\D", ""); + if (d.StartsWith("98")) d = "0" + d[2..]; + if (d.Length == 10 && d.StartsWith("9")) d = "0" + d; + p.Phone = d; + } + else + { + var land = Regex.Match(latinPhone, @"0\d{2,3}[\s-]?\d{7,8}"); + if (land.Success) p.Phone = Regex.Replace(land.Value, @"\D", ""); + } return p; } diff --git a/src/JobsMedical.Web/Services/Scraping/AiAuditor.cs b/src/JobsMedical.Web/Services/Scraping/AiAuditor.cs index 41c271e..6a5aed7 100644 --- a/src/JobsMedical.Web/Services/Scraping/AiAuditor.cs +++ b/src/JobsMedical.Web/Services/Scraping/AiAuditor.cs @@ -7,7 +7,8 @@ namespace JobsMedical.Web.Services.Scraping; public record AiStructured( string? Kind, string? Role, string? City, string? 
District, string? ShiftType, - string? EmploymentType, long? PayAmount, int? SharePercent, string? Title, string? FacilityName); + string? EmploymentType, long? PayAmount, int? SharePercent, string? Title, string? FacilityName, + string? Phone = null, string? PersonName = null, int? YearsExperience = null, bool? IsLicensed = null); /// An AI verdict on a raw listing. public record AiAuditResult(string Decision, int Confidence, string? Reason, AiStructured? Data) @@ -30,6 +31,24 @@ public interface IAiAuditor /// public class OpenAiCompatibleAuditor : IAiAuditor { + // Authoritative output contract appended to the admin prompt so tags/categories stay correct + // (including the «آماده به کار» type and contact phone) regardless of the stored prompt text. + private const string OutputSchema = """ + فقط یک شیء JSON با این کلیدها برگردان (هر فیلد نامشخص = null): + decision: approve|reject|review + confidence: عدد ۰ تا ۱۰۰ + reason: توضیح کوتاه فارسی + kind: shift (شیفت توسط مرکز) | job (استخدام توسط مرکز) | talent (کادر درمان که خودش «آماده به کار» است) + role: عنوان دقیق نقش درمانی (مثل پرستار، پزشک عمومی، دندانپزشک، تکنسین اتاق عمل، ماما، کارشناس آزمایشگاه) + city, district: نام شهر و محله/منطقه در صورت ذکر + shiftType: day|evening|night|oncall (فقط برای shift) + employmentType: fulltime|parttime|contract|plan + payAmount: عدد تومان یا null ، sharePercent: عدد ۰ تا ۱۰۰ یا null (مثل «۵۰٪ تسویه») + title: عنوان کوتاه ، facilityName: نام مرکز درمانی (فقط برای shift/job) + phone: شماره تماس (موبایل یا ثابت) به‌صورت رقم لاتین، یا null + personName: نام فرد (فقط برای talent) ، yearsExperience: سال سابقه عدد یا null ، isLicensed: true/false (پروانه‌دار) + """; + private readonly ScrapeHttpClients _clients; private readonly ILogger _log; @@ -52,7 +71,9 @@ public class OpenAiCompatibleAuditor : IAiAuditor response_format = new { type = "json_object" }, messages = new object[] { - new { role = "system", content = s.AiSystemPrompt }, + // Admin prompt + an authoritative output 
schema, so classification/tags stay + // correct even if the stored prompt predates the talent/phone fields. + new { role = "system", content = s.AiSystemPrompt + "\n\n" + OutputSchema }, new { role = "user", content = "آگهی خام:\n" + rawText + "\n\nفقط با JSON پاسخ بده." }, }, }; @@ -100,10 +121,12 @@ public class OpenAiCompatibleAuditor : IAiAuditor int I(string k, int d) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : d; long? L(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt64(out var n) ? n : null; int? NI(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : null; + bool? B(string k) => r.TryGetProperty(k, out var v) && (v.ValueKind == JsonValueKind.True || v.ValueKind == JsonValueKind.False) ? v.GetBoolean() : null; var decision = (S("decision") ?? "review").ToLowerInvariant(); var data = new AiStructured(S("kind"), S("role"), S("city"), S("district"), S("shiftType"), - S("employmentType"), L("payAmount"), NI("sharePercent"), S("title"), S("facilityName")); + S("employmentType"), L("payAmount"), NI("sharePercent"), S("title"), S("facilityName"), + Phone: S("phone"), PersonName: S("personName"), YearsExperience: NI("yearsExperience"), IsLicensed: B("isLicensed")); return new AiAuditResult(decision, Math.Clamp(I("confidence", 50), 0, 100), S("reason"), data); } } diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs index fdcbe33..e45dd52 100644 --- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs +++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs @@ -175,16 +175,23 @@ public class IngestionService // «آماده به کار» — a worker offering themselves. No facility involved. 
if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده")) { + // Prefer the AI's tags when present, else the heuristic parser. + var tPay = d?.PayAmount ?? parsed.PayAmount; + var tShare = d?.SharePercent ?? parsed.SharePercent; _db.TalentListings.Add(new TalentListing { Role = role, City = city, DistrictId = district?.Id, - PersonName = parsed.PersonName, YearsExperience = parsed.YearsExperience, - IsLicensed = parsed.IsLicensed, AreaNote = parsed.AreaNote, - Availability = parsed.EmploymentType, Gender = parsed.Gender, - PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage - : parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift, - PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent, - Phone = parsed.Phone, Description = raw.RawText, + PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName, + YearsExperience = d?.YearsExperience ?? parsed.YearsExperience, + IsLicensed = d?.IsLicensed ?? parsed.IsLicensed, + AreaNote = parsed.AreaNote, + Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType), + Gender = parsed.Gender, + PayType = tShare is not null && tPay is null ? PayType.Percentage + : tPay is null ? PayType.Negotiable : PayType.PerShift, + PayAmount = tPay, SharePercent = tShare, + Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, + Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl, }); raw.Status = RawListingStatus.Normalized; @@ -201,7 +208,7 @@ public class IngestionService facility = new Facility { Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id, - Phone = parsed.Phone, IsVerified = false, + Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? 
d!.Phone!.Trim() : parsed.Phone, IsVerified = false, }; _db.Facilities.Add(facility); facilities.Add(facility); // so later listings in this run match it too diff --git a/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs b/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs index bba29fd..592b8a0 100644 --- a/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs +++ b/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs @@ -93,7 +93,14 @@ public class MedjobsListingSource : IListingSource var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p)); var text = HtmlUtil.ToPlainText(string.Join("\n", parts)); - return text.Length > 1800 ? text[..1800] : text; + if (text.Length > 1800) text = text[..1800]; + + // The contact number is often outside the description (in a tel: link / data attribute the + // page reveals on click). Harvest it from the full HTML and append so the parser/AI see it. + var phones = HtmlUtil.HarvestPhones(html); + if (phones.Count > 0 && !phones.Any(text.Contains)) + text += "\nشماره تماس: " + string.Join("، ", phones); + return text; } private static string? Meta(string html, string prop) diff --git a/src/JobsMedical.Web/Services/Scraping/TelegramListingSource.cs b/src/JobsMedical.Web/Services/Scraping/TelegramListingSource.cs index 8c8e1e4..91b2a6b 100644 --- a/src/JobsMedical.Web/Services/Scraping/TelegramListingSource.cs +++ b/src/JobsMedical.Web/Services/Scraping/TelegramListingSource.cs @@ -62,4 +62,49 @@ internal static class HtmlUtil s = Regex.Replace(s, "[ \\t]+", " "); return s.Trim(); } + + /// Convert Persian/Arabic-Indic digits to Latin. 
+ public static string ToLatinDigits(string s) + { + var a = s.ToCharArray(); + for (var i = 0; i < a.Length; i++) + { + if (a[i] >= '۰' && a[i] <= '۹') a[i] = (char)('0' + (a[i] - '۰')); + else if (a[i] >= '٠' && a[i] <= '٩') a[i] = (char)('0' + (a[i] - '٠')); + } + return new string(a); + } + + /// + /// Pull Iranian phone numbers out of a page's HTML — including ones a site reveals on click + /// (often still present as a tel: link, a data-*phone* attribute, or JSON-LD "telephone"). + /// Returns normalized numbers (mobiles as 09xxxxxxxxx, landlines with leading 0), mobiles first. + /// + public static List HarvestPhones(string html) + { + if (string.IsNullOrEmpty(html)) return new(); + var latin = ToLatinDigits(html); + var found = new List(); + + void Add(string raw) + { + var d = Regex.Replace(raw, @"\D", ""); + if (d.StartsWith("0098")) d = "0" + d[4..]; + else if (d.StartsWith("98") && d.Length >= 12) d = "0" + d[2..]; + if (Regex.IsMatch(d, @"^9\d{9}$")) d = "0" + d; // 9xxxxxxxxx → 09xxxxxxxxx + bool ok = Regex.IsMatch(d, @"^09\d{9}$") // mobile + || Regex.IsMatch(d, @"^0\d{10}$"); // landline w/ area code + if (ok && !found.Contains(d)) found.Add(d); + } + + // Highest-signal sources first. + foreach (Match m in Regex.Matches(latin, @"tel:\+?([\d\s\-]{7,})")) Add(m.Groups[1].Value); + foreach (Match m in Regex.Matches(latin, "\"telephone\"\\s*:\\s*\"([^\"]+)\"")) Add(m.Groups[1].Value); + foreach (Match m in Regex.Matches(latin, "data-[\\w-]*phone[\\w-]*=[\"']([^\"']+)[\"']", RegexOptions.IgnoreCase)) Add(m.Groups[1].Value); + // Then bare numbers anywhere in the markup — mobiles, then landlines. 
+ foreach (Match m in Regex.Matches(latin, @"(?:\+?98|0)?9\d{9}")) Add(m.Value); + foreach (Match m in Regex.Matches(latin, @"0\d{2,3}[\s-]?\d{7,8}")) Add(m.Value); + + return found.Take(3).ToList(); + } } diff --git a/src/JobsMedical.Web/Services/Scraping/WebsiteListingSource.cs b/src/JobsMedical.Web/Services/Scraping/WebsiteListingSource.cs index 65d0257..c50e8df 100644 --- a/src/JobsMedical.Web/Services/Scraping/WebsiteListingSource.cs +++ b/src/JobsMedical.Web/Services/Scraping/WebsiteListingSource.cs @@ -51,7 +51,13 @@ public class WebsiteListingSource : IListingSource string? body = Between(html, "rtcl-description") ?? Between(html, "entry-content") ?? Between(html, "job-description") ?? Meta(html, "og:description"); var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(x => !string.IsNullOrWhiteSpace(x)))); - return text.Length > 1800 ? text[..1800] : text; + if (text.Length > 1800) text = text[..1800]; + + // Append any contact number found in the full markup (tel:/data-phone/JSON-LD/inline). + var phones = HtmlUtil.HarvestPhones(html); + if (phones.Count > 0 && !phones.Any(text.Contains)) + text += "\nشماره تماس: " + string.Join("، ", phones); + return text; } private static string? Meta(string html, string prop)