AI tag/category assignment + phone extraction from web ads
CI/CD / CI · dotnet build (push) Successful in 2m37s
CI/CD / Deploy · hamkadr (push) Successful in 1m11s

AI (when enabled, now that the server proxy is up):
- AiStructured gains phone, personName, yearsExperience, isLicensed.
- The auditor appends an authoritative output-schema to the admin prompt
  so classification stays correct even with an older stored prompt — it
  now classifies kind as shift|job|talent and extracts the contact phone
  and talent details.
- Ingestion publish prefers the AI's tags (kind/role/city/facility/phone +
  talent fields) over the heuristic parser when present.
- Default prompt updated to describe the three kinds + new fields.

Phone extraction from websites (Medjobs / generic sites), where the
number sits behind a "تماس با این آگهی" reveal:
- HtmlUtil.HarvestPhones scans the full markup for tel: links, JSON-LD
  "telephone", data-*phone* attributes, and inline Iranian mobile/landline
  numbers (Persian digits folded), normalized (mobiles 09…, landlines 0…).
- Medjobs + Website sources append harvested numbers to the ad text so the
  parser/AI capture them; manual review then prefills the phone too.
- Parser phone extraction now also captures a landline as a fallback.

Note: if a site loads the number purely via XHR (not in HTML), a
per-source reveal endpoint would be a follow-up.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-08 08:11:14 +03:30
parent 4e5df73cf7
commit 213af9db48
7 changed files with 126 additions and 20 deletions
+10 -4
View File
@@ -115,14 +115,20 @@ public class AppSetting
public const string DefaultPrompt = """
تو دستیار بررسی آگهیهای کاری حوزه درمان برای پلتفرم «همکادر» هستی.
هر آگهی خام را بخوان و تصمیم بگیر:
- approve: آگهی واقعی و مرتبط با شیفت/استخدام کادر درمان است و اطلاعات کافی دارد.
- approve: آگهی واقعی و مرتبط با کادر درمان است و اطلاعات کافی دارد.
- reject: تبلیغ، اسپم، نامرتبط، یا فاقد اطلاعات حداقلی است.
- review: مرتبط است اما ناقص/مبهم و نیاز به بررسی انسانی دارد.
نقش، شهر/محله، نوع شیفت، نوع همکاری، مبلغ یا درصد سهم، و عنوان را در صورت وجود استخراج کن.
سه نوع آگهی داریم:
- shift: مرکز درمانی برای یک شیفت نیرو میخواهد.
- job: مرکز درمانی برای استخدام دائم نیرو میخواهد.
- talent: خودِ کادر درمان اعلام «آماده به کار / آماده همکاری» کرده است.
نقش، شهر/محله، نوع شیفت/همکاری، مبلغ یا درصد سهم، عنوان، نام مرکز، و شماره تماس را در صورت وجود استخراج کن.
برای talent: نام فرد، سال سابقه و پروانهدار بودن را هم استخراج کن.
فقط با یک شیء JSON پاسخ بده با کلیدهای:
decision (approve|reject|review)، confidence (0-100)، reason (فارسی کوتاه)،
kind (shift|job)، role، city، district، shiftType (day|evening|night|oncall)،
kind (shift|job|talent)، role، city، district، shiftType (day|evening|night|oncall)،
employmentType (fulltime|parttime|contract|plan)، payAmount (عدد تومان یا null)،
sharePercent (0-100 یا null)، title، facilityName.
sharePercent (0-100 یا null)، title، facilityName، phone،
personName، yearsExperience (عدد یا null)، isLicensed (true|false).
""";
}
+15 -3
View File
@@ -154,9 +154,21 @@ public class HeuristicListingParser : IListingParser
if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");
}
// --- Phone ---
var phone = Regex.Match(ToLatinDigits(text), @"0?9\d{9}");
if (phone.Success) p.Phone = phone.Value;
// --- Phone (mobile preferred, landline as fallback) ---
var latinPhone = ToLatinDigits(text);
var mobile = Regex.Match(latinPhone, @"(?:\+?98|0)?9\d{9}");
if (mobile.Success)
{
var d = Regex.Replace(mobile.Value, @"\D", "");
if (d.StartsWith("98")) d = "0" + d[2..];
if (d.Length == 10 && d.StartsWith("9")) d = "0" + d;
p.Phone = d;
}
else
{
var land = Regex.Match(latinPhone, @"0\d{2,3}[\s-]?\d{7,8}");
if (land.Success) p.Phone = Regex.Replace(land.Value, @"\D", "");
}
return p;
}
@@ -7,7 +7,8 @@ namespace JobsMedical.Web.Services.Scraping;
public record AiStructured(
string? Kind, string? Role, string? City, string? District, string? ShiftType,
string? EmploymentType, long? PayAmount, int? SharePercent, string? Title, string? FacilityName);
string? EmploymentType, long? PayAmount, int? SharePercent, string? Title, string? FacilityName,
string? Phone = null, string? PersonName = null, int? YearsExperience = null, bool? IsLicensed = null);
/// <summary>An AI verdict on a raw listing.</summary>
public record AiAuditResult(string Decision, int Confidence, string? Reason, AiStructured? Data)
@@ -30,6 +31,24 @@ public interface IAiAuditor
/// </summary>
public class OpenAiCompatibleAuditor : IAiAuditor
{
// Authoritative output contract appended to the admin prompt so tags/categories stay correct
// (including the «آماده به کار» type and contact phone) regardless of the stored prompt text.
private const string OutputSchema = """
فقط یک شیء JSON با این کلیدها برگردان (هر فیلد نامشخص = null):
decision: approve|reject|review
confidence: عدد ۰ تا ۱۰۰
reason: توضیح کوتاه فارسی
kind: shift (شیفت توسط مرکز) | job (استخدام توسط مرکز) | talent (کادر درمان که خودش «آماده به کار» است)
role: عنوان دقیق نقش درمانی (مثل پرستار، پزشک عمومی، دندانپزشک، تکنسین اتاق عمل، ماما، کارشناس آزمایشگاه)
city, district: نام شهر و محله/منطقه در صورت ذکر
shiftType: day|evening|night|oncall (فقط برای shift)
employmentType: fulltime|parttime|contract|plan
payAmount: عدد تومان یا null ، sharePercent: عدد ۰ تا ۱۰۰ یا null (مثل «۵۰٪ تسویه»)
title: عنوان کوتاه ، facilityName: نام مرکز درمانی (فقط برای shift/job)
phone: شماره تماس (موبایل یا ثابت) بهصورت رقم لاتین، یا null
personName: نام فرد (فقط برای talent) ، yearsExperience: سال سابقه عدد یا null ، isLicensed: true/false (پروانهدار)
""";
private readonly ScrapeHttpClients _clients;
private readonly ILogger<OpenAiCompatibleAuditor> _log;
@@ -52,7 +71,9 @@ public class OpenAiCompatibleAuditor : IAiAuditor
response_format = new { type = "json_object" },
messages = new object[]
{
new { role = "system", content = s.AiSystemPrompt },
// Admin prompt + an authoritative output schema, so classification/tags stay
// correct even if the stored prompt predates the talent/phone fields.
new { role = "system", content = s.AiSystemPrompt + "\n\n" + OutputSchema },
new { role = "user", content = "آگهی خام:\n" + rawText + "\n\nفقط با JSON پاسخ بده." },
},
};
@@ -100,10 +121,12 @@ public class OpenAiCompatibleAuditor : IAiAuditor
int I(string k, int d) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : d;
long? L(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt64(out var n) ? n : null;
int? NI(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : null;
bool? B(string k) => r.TryGetProperty(k, out var v) && (v.ValueKind == JsonValueKind.True || v.ValueKind == JsonValueKind.False) ? v.GetBoolean() : null;
var decision = (S("decision") ?? "review").ToLowerInvariant();
var data = new AiStructured(S("kind"), S("role"), S("city"), S("district"), S("shiftType"),
S("employmentType"), L("payAmount"), NI("sharePercent"), S("title"), S("facilityName"));
S("employmentType"), L("payAmount"), NI("sharePercent"), S("title"), S("facilityName"),
Phone: S("phone"), PersonName: S("personName"), YearsExperience: NI("yearsExperience"), IsLicensed: B("isLicensed"));
return new AiAuditResult(decision, Math.Clamp(I("confidence", 50), 0, 100), S("reason"), data);
}
}
@@ -175,16 +175,23 @@ public class IngestionService
// «آماده به کار» — a worker offering themselves. No facility involved.
if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده"))
{
// Prefer the AI's tags when present, else the heuristic parser.
var tPay = d?.PayAmount ?? parsed.PayAmount;
var tShare = d?.SharePercent ?? parsed.SharePercent;
_db.TalentListings.Add(new TalentListing
{
Role = role, City = city, DistrictId = district?.Id,
PersonName = parsed.PersonName, YearsExperience = parsed.YearsExperience,
IsLicensed = parsed.IsLicensed, AreaNote = parsed.AreaNote,
Availability = parsed.EmploymentType, Gender = parsed.Gender,
PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage
: parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift,
PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent,
Phone = parsed.Phone, Description = raw.RawText,
PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
AreaNote = parsed.AreaNote,
Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
Gender = parsed.Gender,
PayType = tShare is not null && tPay is null ? PayType.Percentage
: tPay is null ? PayType.Negotiable : PayType.PerShift,
PayAmount = tPay, SharePercent = tShare,
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
Description = raw.RawText,
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
});
raw.Status = RawListingStatus.Normalized;
@@ -201,7 +208,7 @@ public class IngestionService
facility = new Facility
{
Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
Phone = parsed.Phone, IsVerified = false,
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false,
};
_db.Facilities.Add(facility);
facilities.Add(facility); // so later listings in this run match it too
@@ -93,7 +93,14 @@ public class MedjobsListingSource : IListingSource
var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));
var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
return text.Length > 1800 ? text[..1800] : text;
if (text.Length > 1800) text = text[..1800];
// The contact number is often outside the description (in a tel: link / data attribute the
// page reveals on click). Harvest it from the full HTML and append so the parser/AI see it.
var phones = HtmlUtil.HarvestPhones(html);
if (phones.Count > 0 && !phones.Any(text.Contains))
text += "\nشماره تماس: " + string.Join("، ", phones);
return text;
}
private static string? Meta(string html, string prop)
@@ -62,4 +62,49 @@ internal static class HtmlUtil
s = Regex.Replace(s, "[ \\t]+", " ");
return s.Trim();
}
/// <summary>
/// Fold Persian (۰–۹) and Arabic-Indic (٠–٩) digits to their Latin equivalents.
/// Every other character is copied through unchanged, so the result always has
/// the same length as the input.
/// </summary>
/// <param name="s">Text that may contain non-Latin digits.</param>
/// <returns>A new string with each Persian/Arabic-Indic digit replaced by '0'–'9'.</returns>
public static string ToLatinDigits(string s)
{
    // Map one character: digit blocks are contiguous, so the offset from the
    // block's zero is the digit's numeric value.
    static char Fold(char c) => c switch
    {
        >= '۰' and <= '۹' => (char)('0' + (c - '۰')),
        >= '٠' and <= '٩' => (char)('0' + (c - '٠')),
        _ => c,
    };

    return string.Create(s.Length, s, (buffer, source) =>
    {
        for (var pos = 0; pos < source.Length; pos++)
        {
            buffer[pos] = Fold(source[pos]);
        }
    });
}
/// <summary>
/// Pull Iranian phone numbers out of a page's HTML — including ones a site reveals on click
/// (often still present as a tel: link, a data-*phone* attribute, or JSON-LD "telephone").
/// </summary>
/// <param name="html">Raw page markup. Persian/Arabic-Indic digits are folded to Latin before scanning.</param>
/// <returns>
/// Up to three distinct normalized numbers (mobiles as 09xxxxxxxxx, landlines as 0 + area code +
/// subscriber number, 11 digits total). Results are ordered by source, not by type: tel: links,
/// then JSON-LD "telephone", then data-*phone* attributes, then bare mobiles, then bare landlines.
/// Empty when <paramref name="html"/> is null/empty or nothing qualifies.
/// </returns>
public static List<string> HarvestPhones(string html)
{
    if (string.IsNullOrEmpty(html)) return new();
    var latin = ToLatinDigits(html);
    var found = new List<string>();

    // Normalize one candidate and keep it if it looks like a real Iranian number.
    void Add(string raw)
    {
        var d = Regex.Replace(raw, @"\D", "");

        // Strip the country code (0098… / 98…). Ordinal comparison: these are digit strings,
        // not linguistic text. The length guard keeps a 10-digit mobile that merely begins
        // with "98" (9 8x xxx xxxx) from being mistaken for a country-code prefix.
        if (d.StartsWith("0098", System.StringComparison.Ordinal)) d = "0" + d[4..];
        else if (d.StartsWith("98", System.StringComparison.Ordinal) && d.Length >= 12) d = "0" + d[2..];

        if (Regex.IsMatch(d, @"^9\d{9}$")) d = "0" + d; // 9xxxxxxxxx → 09xxxxxxxxx

        bool ok = Regex.IsMatch(d, @"^09\d{9}$")        // mobile
               || Regex.IsMatch(d, @"^0[1-8]\d{9}$");   // landline: area codes are 01x–08x, never 00x
        if (ok && !found.Contains(d)) found.Add(d);
    }

    // Highest-signal sources first.
    foreach (Match m in Regex.Matches(latin, @"tel:\+?([\d\s\-]{7,})")) Add(m.Groups[1].Value);
    foreach (Match m in Regex.Matches(latin, "\"telephone\"\\s*:\\s*\"([^\"]+)\"")) Add(m.Groups[1].Value);
    foreach (Match m in Regex.Matches(latin, "data-[\\w-]*phone[\\w-]*=[\"']([^\"']+)[\"']", RegexOptions.IgnoreCase)) Add(m.Groups[1].Value);

    // Then bare numbers anywhere in the markup — mobiles, then landlines.
    // The (?<!\d) / (?!\d) guards require the match to be a whole digit run; without them a
    // "phone" is carved out of any longer number (e.g. the epoch-millis value 1709123456789
    // contains 09123456789). "00" is accepted before 98 so 0098… still matches as one run.
    foreach (Match m in Regex.Matches(latin, @"(?<!\d)(?:(?:\+|00)?98|0)?9\d{9}(?!\d)")) Add(m.Value);
    foreach (Match m in Regex.Matches(latin, @"(?<!\d)0\d{2,3}[\s-]?\d{7,8}(?!\d)")) Add(m.Value);

    return found.Take(3).ToList();
}
}
@@ -51,7 +51,13 @@ public class WebsiteListingSource : IListingSource
string? body = Between(html, "rtcl-description") ?? Between(html, "entry-content")
?? Between(html, "job-description") ?? Meta(html, "og:description");
var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(x => !string.IsNullOrWhiteSpace(x))));
return text.Length > 1800 ? text[..1800] : text;
if (text.Length > 1800) text = text[..1800];
// Append any contact number found in the full markup (tel:/data-phone/JSON-LD/inline).
var phones = HtmlUtil.HarvestPhones(html);
if (phones.Count > 0 && !phones.Any(text.Contains))
text += "\nشماره تماس: " + string.Join("، ", phones);
return text;
}
private static string? Meta(string html, string prop)