AI tag/category assignment + phone extraction from web ads
AI (when enabled, now that the server proxy is up): - AiStructured gains phone, personName, yearsExperience, isLicensed. - The auditor appends an authoritative output-schema to the admin prompt so classification stays correct even with an older stored prompt — it now classifies kind as shift|job|talent and extracts the contact phone and talent details. - Ingestion publish prefers the AI's tags (kind/role/city/facility/phone + talent fields) over the heuristic parser when present. - Default prompt updated to describe the three kinds + new fields. Phone extraction from websites (Medjobs / generic sites), where the number sits behind a "تماس با این آگهی" reveal: - HtmlUtil.HarvestPhones scans the full markup for tel: links, JSON-LD "telephone", data-*phone* attributes, and inline Iranian mobile/landline numbers (Persian digits folded), normalized (mobiles 09…, landlines 0…). - Medjobs + Website sources append harvested numbers to the ad text so the parser/AI capture them; manual review then prefills the phone too. - Parser phone extraction now also captures a landline as a fallback. Note: if a site loads the number purely via XHR (not in HTML), a per-source reveal endpoint would be a follow-up. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -115,14 +115,20 @@ public class AppSetting
|
||||
public const string DefaultPrompt = """
|
||||
تو دستیار بررسی آگهیهای کاری حوزه درمان برای پلتفرم «همکادر» هستی.
|
||||
هر آگهی خام را بخوان و تصمیم بگیر:
|
||||
- approve: آگهی واقعی و مرتبط با شیفت/استخدام کادر درمان است و اطلاعات کافی دارد.
|
||||
- approve: آگهی واقعی و مرتبط با کادر درمان است و اطلاعات کافی دارد.
|
||||
- reject: تبلیغ، اسپم، نامرتبط، یا فاقد اطلاعات حداقلی است.
|
||||
- review: مرتبط است اما ناقص/مبهم و نیاز به بررسی انسانی دارد.
|
||||
نقش، شهر/محله، نوع شیفت، نوع همکاری، مبلغ یا درصد سهم، و عنوان را در صورت وجود استخراج کن.
|
||||
سه نوع آگهی داریم:
|
||||
- shift: مرکز درمانی برای یک شیفت نیرو میخواهد.
|
||||
- job: مرکز درمانی برای استخدام دائم نیرو میخواهد.
|
||||
- talent: خودِ کادر درمان اعلام «آماده به کار / آماده همکاری» کرده است.
|
||||
نقش، شهر/محله، نوع شیفت/همکاری، مبلغ یا درصد سهم، عنوان، نام مرکز، و شماره تماس را در صورت وجود استخراج کن.
|
||||
برای talent: نام فرد، سال سابقه و پروانهدار بودن را هم استخراج کن.
|
||||
فقط با یک شیء JSON پاسخ بده با کلیدهای:
|
||||
decision (approve|reject|review)، confidence (0-100)، reason (فارسی کوتاه)،
|
||||
kind (shift|job)، role، city، district، shiftType (day|evening|night|oncall)،
|
||||
kind (shift|job|talent)، role، city، district، shiftType (day|evening|night|oncall)،
|
||||
employmentType (fulltime|parttime|contract|plan)، payAmount (عدد تومان یا null)،
|
||||
sharePercent (0-100 یا null)، title، facilityName.
|
||||
sharePercent (0-100 یا null)، title، facilityName، phone،
|
||||
personName، yearsExperience (عدد یا null)، isLicensed (true|false).
|
||||
""";
|
||||
}
|
||||
|
||||
@@ -154,9 +154,21 @@ public class HeuristicListingParser : IListingParser
|
||||
if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");
|
||||
}
|
||||
|
||||
// --- Phone ---
|
||||
var phone = Regex.Match(ToLatinDigits(text), @"0?9\d{9}");
|
||||
if (phone.Success) p.Phone = phone.Value;
|
||||
// --- Phone (mobile preferred, landline as fallback) ---
|
||||
var latinPhone = ToLatinDigits(text);
|
||||
var mobile = Regex.Match(latinPhone, @"(?:\+?98|0)?9\d{9}");
|
||||
if (mobile.Success)
|
||||
{
|
||||
var d = Regex.Replace(mobile.Value, @"\D", "");
|
||||
if (d.StartsWith("98")) d = "0" + d[2..];
|
||||
if (d.Length == 10 && d.StartsWith("9")) d = "0" + d;
|
||||
p.Phone = d;
|
||||
}
|
||||
else
|
||||
{
|
||||
var land = Regex.Match(latinPhone, @"0\d{2,3}[\s-]?\d{7,8}");
|
||||
if (land.Success) p.Phone = Regex.Replace(land.Value, @"\D", "");
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
@@ -7,7 +7,8 @@ namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
public record AiStructured(
|
||||
string? Kind, string? Role, string? City, string? District, string? ShiftType,
|
||||
string? EmploymentType, long? PayAmount, int? SharePercent, string? Title, string? FacilityName);
|
||||
string? EmploymentType, long? PayAmount, int? SharePercent, string? Title, string? FacilityName,
|
||||
string? Phone = null, string? PersonName = null, int? YearsExperience = null, bool? IsLicensed = null);
|
||||
|
||||
/// <summary>An AI verdict on a raw listing.</summary>
|
||||
public record AiAuditResult(string Decision, int Confidence, string? Reason, AiStructured? Data)
|
||||
@@ -30,6 +31,24 @@ public interface IAiAuditor
|
||||
/// </summary>
|
||||
public class OpenAiCompatibleAuditor : IAiAuditor
|
||||
{
|
||||
// Authoritative output contract appended to the admin prompt so tags/categories stay correct
|
||||
// (including the «آماده به کار» type and contact phone) regardless of the stored prompt text.
|
||||
private const string OutputSchema = """
|
||||
فقط یک شیء JSON با این کلیدها برگردان (هر فیلد نامشخص = null):
|
||||
decision: approve|reject|review
|
||||
confidence: عدد ۰ تا ۱۰۰
|
||||
reason: توضیح کوتاه فارسی
|
||||
kind: shift (شیفت توسط مرکز) | job (استخدام توسط مرکز) | talent (کادر درمان که خودش «آماده به کار» است)
|
||||
role: عنوان دقیق نقش درمانی (مثل پرستار، پزشک عمومی، دندانپزشک، تکنسین اتاق عمل، ماما، کارشناس آزمایشگاه)
|
||||
city, district: نام شهر و محله/منطقه در صورت ذکر
|
||||
shiftType: day|evening|night|oncall (فقط برای shift)
|
||||
employmentType: fulltime|parttime|contract|plan
|
||||
payAmount: عدد تومان یا null ، sharePercent: عدد ۰ تا ۱۰۰ یا null (مثل «۵۰٪ تسویه»)
|
||||
title: عنوان کوتاه ، facilityName: نام مرکز درمانی (فقط برای shift/job)
|
||||
phone: شماره تماس (موبایل یا ثابت) بهصورت رقم لاتین، یا null
|
||||
personName: نام فرد (فقط برای talent) ، yearsExperience: سال سابقه عدد یا null ، isLicensed: true/false (پروانهدار)
|
||||
""";
|
||||
|
||||
private readonly ScrapeHttpClients _clients;
|
||||
private readonly ILogger<OpenAiCompatibleAuditor> _log;
|
||||
|
||||
@@ -52,7 +71,9 @@ public class OpenAiCompatibleAuditor : IAiAuditor
|
||||
response_format = new { type = "json_object" },
|
||||
messages = new object[]
|
||||
{
|
||||
new { role = "system", content = s.AiSystemPrompt },
|
||||
// Admin prompt + an authoritative output schema, so classification/tags stay
|
||||
// correct even if the stored prompt predates the talent/phone fields.
|
||||
new { role = "system", content = s.AiSystemPrompt + "\n\n" + OutputSchema },
|
||||
new { role = "user", content = "آگهی خام:\n" + rawText + "\n\nفقط با JSON پاسخ بده." },
|
||||
},
|
||||
};
|
||||
@@ -100,10 +121,12 @@ public class OpenAiCompatibleAuditor : IAiAuditor
|
||||
int I(string k, int d) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : d;
|
||||
long? L(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt64(out var n) ? n : null;
|
||||
int? NI(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : null;
|
||||
bool? B(string k) => r.TryGetProperty(k, out var v) && (v.ValueKind == JsonValueKind.True || v.ValueKind == JsonValueKind.False) ? v.GetBoolean() : null;
|
||||
|
||||
var decision = (S("decision") ?? "review").ToLowerInvariant();
|
||||
var data = new AiStructured(S("kind"), S("role"), S("city"), S("district"), S("shiftType"),
|
||||
S("employmentType"), L("payAmount"), NI("sharePercent"), S("title"), S("facilityName"));
|
||||
S("employmentType"), L("payAmount"), NI("sharePercent"), S("title"), S("facilityName"),
|
||||
Phone: S("phone"), PersonName: S("personName"), YearsExperience: NI("yearsExperience"), IsLicensed: B("isLicensed"));
|
||||
return new AiAuditResult(decision, Math.Clamp(I("confidence", 50), 0, 100), S("reason"), data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -175,16 +175,23 @@ public class IngestionService
|
||||
// «آماده به کار» — a worker offering themselves. No facility involved.
|
||||
if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده"))
|
||||
{
|
||||
// Prefer the AI's tags when present, else the heuristic parser.
|
||||
var tPay = d?.PayAmount ?? parsed.PayAmount;
|
||||
var tShare = d?.SharePercent ?? parsed.SharePercent;
|
||||
_db.TalentListings.Add(new TalentListing
|
||||
{
|
||||
Role = role, City = city, DistrictId = district?.Id,
|
||||
PersonName = parsed.PersonName, YearsExperience = parsed.YearsExperience,
|
||||
IsLicensed = parsed.IsLicensed, AreaNote = parsed.AreaNote,
|
||||
Availability = parsed.EmploymentType, Gender = parsed.Gender,
|
||||
PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage
|
||||
: parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift,
|
||||
PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent,
|
||||
Phone = parsed.Phone, Description = raw.RawText,
|
||||
PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
|
||||
YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
|
||||
IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
|
||||
AreaNote = parsed.AreaNote,
|
||||
Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
|
||||
Gender = parsed.Gender,
|
||||
PayType = tShare is not null && tPay is null ? PayType.Percentage
|
||||
: tPay is null ? PayType.Negotiable : PayType.PerShift,
|
||||
PayAmount = tPay, SharePercent = tShare,
|
||||
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
|
||||
Description = raw.RawText,
|
||||
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
|
||||
});
|
||||
raw.Status = RawListingStatus.Normalized;
|
||||
@@ -201,7 +208,7 @@ public class IngestionService
|
||||
facility = new Facility
|
||||
{
|
||||
Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
|
||||
Phone = parsed.Phone, IsVerified = false,
|
||||
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false,
|
||||
};
|
||||
_db.Facilities.Add(facility);
|
||||
facilities.Add(facility); // so later listings in this run match it too
|
||||
|
||||
@@ -93,7 +93,14 @@ public class MedjobsListingSource : IListingSource
|
||||
|
||||
var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));
|
||||
var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
|
||||
return text.Length > 1800 ? text[..1800] : text;
|
||||
if (text.Length > 1800) text = text[..1800];
|
||||
|
||||
// The contact number is often outside the description (in a tel: link / data attribute the
|
||||
// page reveals on click). Harvest it from the full HTML and append so the parser/AI see it.
|
||||
var phones = HtmlUtil.HarvestPhones(html);
|
||||
if (phones.Count > 0 && !phones.Any(text.Contains))
|
||||
text += "\nشماره تماس: " + string.Join("، ", phones);
|
||||
return text;
|
||||
}
|
||||
|
||||
private static string? Meta(string html, string prop)
|
||||
|
||||
@@ -62,4 +62,49 @@ internal static class HtmlUtil
|
||||
s = Regex.Replace(s, "[ \\t]+", " ");
|
||||
return s.Trim();
|
||||
}
|
||||
|
||||
/// <summary>
/// Folds Persian digits (۰–۹) and Arabic-Indic digits (٠–٩) to their ASCII
/// counterparts ('0'–'9'). Every other character is copied through unchanged.
/// </summary>
/// <param name="s">Text that may contain non-Latin digits.</param>
/// <returns>A new string of the same length with those digits replaced.</returns>
public static string ToLatinDigits(string s)
{
    var chars = s.ToCharArray();
    for (var pos = 0; pos < chars.Length; pos++)
    {
        var c = chars[pos];
        // Each digit block is contiguous, so the offset from the block's zero is the digit value.
        chars[pos] = c switch
        {
            >= '۰' and <= '۹' => (char)('0' + (c - '۰')),
            >= '٠' and <= '٩' => (char)('0' + (c - '٠')),
            _ => c,
        };
    }
    return new string(chars);
}
|
||||
|
||||
/// <summary>
/// Pull Iranian phone numbers out of a page's HTML — including ones a site reveals on click
/// (often still present as a tel: link, a data-*phone* attribute, or JSON-LD "telephone").
/// </summary>
/// <param name="html">Raw page markup. Null or empty input yields an empty list.</param>
/// <returns>
/// At most three distinct numbers, each normalized to 11 digits with a leading 0
/// (mobiles as 09xxxxxxxxx). Order is discovery order: tel: links, JSON-LD "telephone",
/// data-*phone* attributes, then inline mobiles, then inline landlines.
/// </returns>
public static List<string> HarvestPhones(string html)
{
    const int MaxResults = 3;

    if (string.IsNullOrEmpty(html)) return new();
    var latin = ToLatinDigits(html);
    var found = new List<string>(MaxResults);

    // Normalizes one candidate and records it if it is a plausible, not-yet-seen number.
    void Add(string raw)
    {
        // Only the first MaxResults hits are returned, so later candidates can be ignored.
        if (found.Count >= MaxResults) return;

        var d = Regex.Replace(raw, @"\D", "");
        if (d.StartsWith("0098")) d = "0" + d[4..];
        // The length guard keeps a 10/11-digit number that merely begins with "98" intact.
        else if (d.StartsWith("98") && d.Length >= 12) d = "0" + d[2..];
        if (Regex.IsMatch(d, @"^9\d{9}$")) d = "0" + d;   // 9xxxxxxxxx → 09xxxxxxxxx

        // Eleven digits with a leading 0 covers both mobiles (09…) and landlines with area code.
        if (Regex.IsMatch(d, @"^0\d{10}$") && !found.Contains(d)) found.Add(d);
    }

    // Highest-signal sources first.
    foreach (Match m in Regex.Matches(latin, @"tel:\+?([\d\s\-]{7,})")) Add(m.Groups[1].Value);
    foreach (Match m in Regex.Matches(latin, "\"telephone\"\\s*:\\s*\"([^\"]+)\"")) Add(m.Groups[1].Value);
    foreach (Match m in Regex.Matches(latin, "data-[\\w-]*phone[\\w-]*=[\"']([^\"']+)[\"']", RegexOptions.IgnoreCase)) Add(m.Groups[1].Value);

    // Then bare numbers anywhere in the markup — mobiles, then landlines. The digit-boundary
    // lookarounds stop these from matching a phone-shaped slice of a longer digit run
    // (numeric IDs, timestamps, prices) that full-page markup is full of.
    foreach (Match m in Regex.Matches(latin, @"(?<!\d)(?:(?:\+|00)?98|0)?9\d{9}(?!\d)")) Add(m.Value);
    foreach (Match m in Regex.Matches(latin, @"(?<!\d)0\d{2,3}[\s-]?\d{7,8}(?!\d)")) Add(m.Value);

    return found;
}
|
||||
}
|
||||
|
||||
@@ -51,7 +51,13 @@ public class WebsiteListingSource : IListingSource
|
||||
string? body = Between(html, "rtcl-description") ?? Between(html, "entry-content")
|
||||
?? Between(html, "job-description") ?? Meta(html, "og:description");
|
||||
var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(x => !string.IsNullOrWhiteSpace(x))));
|
||||
return text.Length > 1800 ? text[..1800] : text;
|
||||
if (text.Length > 1800) text = text[..1800];
|
||||
|
||||
// Append any contact number found in the full markup (tel:/data-phone/JSON-LD/inline).
|
||||
var phones = HtmlUtil.HarvestPhones(html);
|
||||
if (phones.Count > 0 && !phones.Any(text.Contains))
|
||||
text += "\nشماره تماس: " + string.Join("، ", phones);
|
||||
return text;
|
||||
}
|
||||
|
||||
private static string? Meta(string html, string prop)
|
||||
|
||||
Reference in New Issue
Block a user