AI tag/category assignment + phone extraction from web ads
CI/CD / CI · dotnet build (push) Successful in 2m37s
CI/CD / Deploy · hamkadr (push) Successful in 1m11s

AI (when enabled, now that the server proxy is up):
- AiStructured gains phone, personName, yearsExperience, isLicensed.
- The auditor appends an authoritative output-schema to the admin prompt
  so classification stays correct even with an older stored prompt — it
  now classifies kind as shift|job|talent and extracts the contact phone
  and talent details.
- Ingestion publish prefers the AI's tags (kind/role/city/facility/phone +
  talent fields) over the heuristic parser when present.
- Default prompt updated to describe the three kinds + new fields.

Phone extraction from websites (Medjobs / generic sites), where the
number sits behind a "تماس با این آگهی" reveal:
- HtmlUtil.HarvestPhones scans the full markup for tel: links, JSON-LD
  "telephone", data-*phone* attributes, and inline Iranian mobile/landline
  numbers (Persian digits folded), normalized (mobiles 09…, landlines 0…).
- Medjobs + Website sources append harvested numbers to the ad text so the
  parser/AI capture them; manual review then prefills the phone too.
- Parser phone extraction now also captures a landline as a fallback.

Note: if a site loads the number purely via XHR (not in HTML), a
per-source reveal endpoint would be a follow-up.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-08 08:11:14 +03:30
parent 4e5df73cf7
commit 213af9db48
7 changed files with 126 additions and 20 deletions
+10 -4
View File
@@ -115,14 +115,20 @@ public class AppSetting
public const string DefaultPrompt = """ public const string DefaultPrompt = """
تو دستیار بررسی آگهیهای کاری حوزه درمان برای پلتفرم «همکادر» هستی. تو دستیار بررسی آگهیهای کاری حوزه درمان برای پلتفرم «همکادر» هستی.
هر آگهی خام را بخوان و تصمیم بگیر: هر آگهی خام را بخوان و تصمیم بگیر:
- approve: آگهی واقعی و مرتبط با شیفت/استخدام کادر درمان است و اطلاعات کافی دارد. - approve: آگهی واقعی و مرتبط با کادر درمان است و اطلاعات کافی دارد.
- reject: تبلیغ، اسپم، نامرتبط، یا فاقد اطلاعات حداقلی است. - reject: تبلیغ، اسپم، نامرتبط، یا فاقد اطلاعات حداقلی است.
- review: مرتبط است اما ناقص/مبهم و نیاز به بررسی انسانی دارد. - review: مرتبط است اما ناقص/مبهم و نیاز به بررسی انسانی دارد.
نقش، شهر/محله، نوع شیفت، نوع همکاری، مبلغ یا درصد سهم، و عنوان را در صورت وجود استخراج کن. سه نوع آگهی داریم:
- shift: مرکز درمانی برای یک شیفت نیرو میخواهد.
- job: مرکز درمانی برای استخدام دائم نیرو میخواهد.
- talent: خودِ کادر درمان اعلام «آماده به کار / آماده همکاری» کرده است.
نقش، شهر/محله، نوع شیفت/همکاری، مبلغ یا درصد سهم، عنوان، نام مرکز، و شماره تماس را در صورت وجود استخراج کن.
برای talent: نام فرد، سال سابقه و پروانهدار بودن را هم استخراج کن.
فقط با یک شیء JSON پاسخ بده با کلیدهای: فقط با یک شیء JSON پاسخ بده با کلیدهای:
decision (approve|reject|review)، confidence (0-100)، reason (فارسی کوتاه)، decision (approve|reject|review)، confidence (0-100)، reason (فارسی کوتاه)،
kind (shift|job)، role، city، district، shiftType (day|evening|night|oncall)، kind (shift|job|talent)، role، city، district، shiftType (day|evening|night|oncall)،
employmentType (fulltime|parttime|contract|plan)، payAmount (عدد تومان یا null)، employmentType (fulltime|parttime|contract|plan)، payAmount (عدد تومان یا null)،
sharePercent (0-100 یا null)، title، facilityName. sharePercent (0-100 یا null)، title، facilityName، phone،
personName، yearsExperience (عدد یا null)، isLicensed (true|false).
"""; """;
} }
+15 -3
View File
@@ -154,9 +154,21 @@ public class HeuristicListingParser : IListingParser
if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}"); if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");
} }
// --- Phone --- // --- Phone (mobile preferred, landline as fallback) ---
var phone = Regex.Match(ToLatinDigits(text), @"0?9\d{9}"); var latinPhone = ToLatinDigits(text);
if (phone.Success) p.Phone = phone.Value; var mobile = Regex.Match(latinPhone, @"(?:\+?98|0)?9\d{9}");
if (mobile.Success)
{
var d = Regex.Replace(mobile.Value, @"\D", "");
if (d.StartsWith("98")) d = "0" + d[2..];
if (d.Length == 10 && d.StartsWith("9")) d = "0" + d;
p.Phone = d;
}
else
{
var land = Regex.Match(latinPhone, @"0\d{2,3}[\s-]?\d{7,8}");
if (land.Success) p.Phone = Regex.Replace(land.Value, @"\D", "");
}
return p; return p;
} }
@@ -7,7 +7,8 @@ namespace JobsMedical.Web.Services.Scraping;
public record AiStructured( public record AiStructured(
string? Kind, string? Role, string? City, string? District, string? ShiftType, string? Kind, string? Role, string? City, string? District, string? ShiftType,
string? EmploymentType, long? PayAmount, int? SharePercent, string? Title, string? FacilityName); string? EmploymentType, long? PayAmount, int? SharePercent, string? Title, string? FacilityName,
string? Phone = null, string? PersonName = null, int? YearsExperience = null, bool? IsLicensed = null);
/// <summary>An AI verdict on a raw listing.</summary> /// <summary>An AI verdict on a raw listing.</summary>
public record AiAuditResult(string Decision, int Confidence, string? Reason, AiStructured? Data) public record AiAuditResult(string Decision, int Confidence, string? Reason, AiStructured? Data)
@@ -30,6 +31,24 @@ public interface IAiAuditor
/// </summary> /// </summary>
public class OpenAiCompatibleAuditor : IAiAuditor public class OpenAiCompatibleAuditor : IAiAuditor
{ {
// Authoritative output contract for the model's JSON reply. It is concatenated after the
// admin-configured system prompt when the chat request is built, so the expected keys
// (including the "talent" / ready-to-work kind, the contact phone and the talent details)
// are always spelled out, regardless of what the stored prompt text says.
// The text is Persian because it is sent to the model as-is; each line names one or more
// JSON keys together with their allowed values.
private const string OutputSchema = """
فقط یک شیء JSON با این کلیدها برگردان (هر فیلد نامشخص = null):
decision: approve|reject|review
confidence: عدد ۰ تا ۱۰۰
reason: توضیح کوتاه فارسی
kind: shift (شیفت توسط مرکز) | job (استخدام توسط مرکز) | talent (کادر درمان که خودش «آماده به کار» است)
role: عنوان دقیق نقش درمانی (مثل پرستار، پزشک عمومی، دندانپزشک، تکنسین اتاق عمل، ماما، کارشناس آزمایشگاه)
city, district: نام شهر و محله/منطقه در صورت ذکر
shiftType: day|evening|night|oncall (فقط برای shift)
employmentType: fulltime|parttime|contract|plan
payAmount: عدد تومان یا null ، sharePercent: عدد ۰ تا ۱۰۰ یا null (مثل «۵۰٪ تسویه»)
title: عنوان کوتاه ، facilityName: نام مرکز درمانی (فقط برای shift/job)
phone: شماره تماس (موبایل یا ثابت) بهصورت رقم لاتین، یا null
personName: نام فرد (فقط برای talent) ، yearsExperience: سال سابقه عدد یا null ، isLicensed: true/false (پروانهدار)
""";
private readonly ScrapeHttpClients _clients; private readonly ScrapeHttpClients _clients;
private readonly ILogger<OpenAiCompatibleAuditor> _log; private readonly ILogger<OpenAiCompatibleAuditor> _log;
@@ -52,7 +71,9 @@ public class OpenAiCompatibleAuditor : IAiAuditor
response_format = new { type = "json_object" }, response_format = new { type = "json_object" },
messages = new object[] messages = new object[]
{ {
new { role = "system", content = s.AiSystemPrompt }, // Admin prompt + an authoritative output schema, so classification/tags stay
// correct even if the stored prompt predates the talent/phone fields.
new { role = "system", content = s.AiSystemPrompt + "\n\n" + OutputSchema },
new { role = "user", content = "آگهی خام:\n" + rawText + "\n\nفقط با JSON پاسخ بده." }, new { role = "user", content = "آگهی خام:\n" + rawText + "\n\nفقط با JSON پاسخ بده." },
}, },
}; };
@@ -100,10 +121,12 @@ public class OpenAiCompatibleAuditor : IAiAuditor
int I(string k, int d) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : d; int I(string k, int d) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : d;
long? L(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt64(out var n) ? n : null; long? L(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt64(out var n) ? n : null;
int? NI(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : null; int? NI(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : null;
bool? B(string k) => r.TryGetProperty(k, out var v) && (v.ValueKind == JsonValueKind.True || v.ValueKind == JsonValueKind.False) ? v.GetBoolean() : null;
var decision = (S("decision") ?? "review").ToLowerInvariant(); var decision = (S("decision") ?? "review").ToLowerInvariant();
var data = new AiStructured(S("kind"), S("role"), S("city"), S("district"), S("shiftType"), var data = new AiStructured(S("kind"), S("role"), S("city"), S("district"), S("shiftType"),
S("employmentType"), L("payAmount"), NI("sharePercent"), S("title"), S("facilityName")); S("employmentType"), L("payAmount"), NI("sharePercent"), S("title"), S("facilityName"),
Phone: S("phone"), PersonName: S("personName"), YearsExperience: NI("yearsExperience"), IsLicensed: B("isLicensed"));
return new AiAuditResult(decision, Math.Clamp(I("confidence", 50), 0, 100), S("reason"), data); return new AiAuditResult(decision, Math.Clamp(I("confidence", 50), 0, 100), S("reason"), data);
} }
} }
@@ -175,16 +175,23 @@ public class IngestionService
// «آماده به کار» — a worker offering themselves. No facility involved. // «آماده به کار» — a worker offering themselves. No facility involved.
if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده")) if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده"))
{ {
// Prefer the AI's tags when present, else the heuristic parser.
var tPay = d?.PayAmount ?? parsed.PayAmount;
var tShare = d?.SharePercent ?? parsed.SharePercent;
_db.TalentListings.Add(new TalentListing _db.TalentListings.Add(new TalentListing
{ {
Role = role, City = city, DistrictId = district?.Id, Role = role, City = city, DistrictId = district?.Id,
PersonName = parsed.PersonName, YearsExperience = parsed.YearsExperience, PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
IsLicensed = parsed.IsLicensed, AreaNote = parsed.AreaNote, YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
Availability = parsed.EmploymentType, Gender = parsed.Gender, IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage AreaNote = parsed.AreaNote,
: parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift, Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent, Gender = parsed.Gender,
Phone = parsed.Phone, Description = raw.RawText, PayType = tShare is not null && tPay is null ? PayType.Percentage
: tPay is null ? PayType.Negotiable : PayType.PerShift,
PayAmount = tPay, SharePercent = tShare,
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
Description = raw.RawText,
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
}); });
raw.Status = RawListingStatus.Normalized; raw.Status = RawListingStatus.Normalized;
@@ -201,7 +208,7 @@ public class IngestionService
facility = new Facility facility = new Facility
{ {
Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id, Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
Phone = parsed.Phone, IsVerified = false, Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false,
}; };
_db.Facilities.Add(facility); _db.Facilities.Add(facility);
facilities.Add(facility); // so later listings in this run match it too facilities.Add(facility); // so later listings in this run match it too
@@ -93,7 +93,14 @@ public class MedjobsListingSource : IListingSource
var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p)); var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));
var text = HtmlUtil.ToPlainText(string.Join("\n", parts)); var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
return text.Length > 1800 ? text[..1800] : text; if (text.Length > 1800) text = text[..1800];
// The contact number is often outside the description (in a tel: link / data attribute the
// page reveals on click). Harvest it from the full HTML and append so the parser/AI see it.
var phones = HtmlUtil.HarvestPhones(html);
if (phones.Count > 0 && !phones.Any(text.Contains))
text += "\nشماره تماس: " + string.Join("، ", phones);
return text;
} }
private static string? Meta(string html, string prop) private static string? Meta(string html, string prop)
@@ -62,4 +62,49 @@ internal static class HtmlUtil
s = Regex.Replace(s, "[ \\t]+", " "); s = Regex.Replace(s, "[ \\t]+", " ");
return s.Trim(); return s.Trim();
} }
/// <summary>
/// Returns a copy of <paramref name="s"/> in which every Persian (U+06F0–U+06F9) or
/// Arabic-Indic (U+0660–U+0669) digit is replaced by the matching ASCII digit.
/// Every other character is copied through unchanged.
/// </summary>
public static string ToLatinDigits(string s)
{
    var folded = new char[s.Length];
    for (var pos = 0; pos < folded.Length; pos++)
    {
        var ch = s[pos];
        folded[pos] = ch switch
        {
            // Persian (Extended Arabic-Indic) digits, U+06F0..U+06F9.
            >= '۰' and <= '۹' => (char)('0' + (ch - '۰')),
            // Arabic-Indic digits, U+0660..U+0669.
            >= '٠' and <= '٩' => (char)('0' + (ch - '٠')),
            _ => ch,
        };
    }
    return new string(folded);
}
/// <summary>
/// Pull Iranian phone numbers out of a page's HTML — including ones a site reveals on click
/// (often still present as a tel: link, a data-*phone* attribute, or JSON-LD "telephone").
/// </summary>
/// <param name="html">Raw page markup. Null or empty input yields an empty list.</param>
/// <returns>
/// At most three distinct, normalized numbers (11 digits with a leading 0; mobiles as
/// 09xxxxxxxxx), in discovery order: tel: links, JSON-LD "telephone" values, data-*phone*
/// attributes, then bare mobiles and finally bare landlines found anywhere in the markup.
/// </returns>
public static List<string> HarvestPhones(string html)
{
    if (string.IsNullOrEmpty(html)) return new();

    // Fold Persian/Arabic-Indic digits first so every pattern below only deals with 0-9.
    var latin = ToLatinDigits(html);
    var found = new List<string>();

    // Normalizes one candidate and records it if it is a plausible, not-yet-seen number.
    void Add(string raw)
    {
        var d = Regex.Replace(raw, @"\D", "");

        // Strip the country code: 0098… / 98… → 0…
        if (d.StartsWith("0098", System.StringComparison.Ordinal)) d = "0" + d[4..];
        else if (d.StartsWith("98", System.StringComparison.Ordinal) && d.Length >= 12) d = "0" + d[2..];

        if (Regex.IsMatch(d, @"^9\d{9}$")) d = "0" + d; // 9xxxxxxxxx → 09xxxxxxxxx

        // Mobiles (09…) and landlines with area code are both 0 + ten digits. The second digit
        // is never 0 ("00" is the international dialling prefix, not an area code).
        var ok = Regex.IsMatch(d, @"^0[1-9]\d{9}$");
        if (ok && !found.Contains(d)) found.Add(d);
    }

    // Highest-signal sources first.
    foreach (Match m in Regex.Matches(latin, @"tel:\+?([\d\s\-]{7,})")) Add(m.Groups[1].Value);
    foreach (Match m in Regex.Matches(latin, "\"telephone\"\\s*:\\s*\"([^\"]+)\"")) Add(m.Groups[1].Value);
    foreach (Match m in Regex.Matches(latin, "data-[\\w-]*phone[\\w-]*=[\"']([^\"']+)[\"']", RegexOptions.IgnoreCase)) Add(m.Groups[1].Value);

    // Then bare numbers anywhere in the markup — mobiles, then landlines. The (?<!\d) / (?!\d)
    // guards require the match to be a whole digit run; without them a "number" is carved out
    // of any longer digit sequence in the page (timestamps, numeric IDs, asset hashes).
    foreach (Match m in Regex.Matches(latin, @"(?<!\d)(?:(?:\+|00)?98|0)?9\d{9}(?!\d)")) Add(m.Value);
    foreach (Match m in Regex.Matches(latin, @"(?<!\d)0\d{2,3}[\s-]?\d{7,8}(?!\d)")) Add(m.Value);

    return found.Take(3).ToList();
}
} }
@@ -51,7 +51,13 @@ public class WebsiteListingSource : IListingSource
string? body = Between(html, "rtcl-description") ?? Between(html, "entry-content") string? body = Between(html, "rtcl-description") ?? Between(html, "entry-content")
?? Between(html, "job-description") ?? Meta(html, "og:description"); ?? Between(html, "job-description") ?? Meta(html, "og:description");
var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(x => !string.IsNullOrWhiteSpace(x)))); var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(x => !string.IsNullOrWhiteSpace(x))));
return text.Length > 1800 ? text[..1800] : text; if (text.Length > 1800) text = text[..1800];
// Append any contact number found in the full markup (tel:/data-phone/JSON-LD/inline).
var phones = HtmlUtil.HarvestPhones(html);
if (phones.Count > 0 && !phones.Any(text.Contains))
text += "\nشماره تماس: " + string.Join("، ", phones);
return text;
} }
private static string? Meta(string html, string prop) private static string? Meta(string html, string prop)