AI tag/category assignment + phone extraction from web ads
AI (when enabled, now that the server proxy is up):
- AiStructured gains phone, personName, yearsExperience, isLicensed.
- The auditor appends an authoritative output schema to the admin prompt so classification stays correct even with an older stored prompt — it now classifies kind as shift|job|talent and extracts the contact phone and talent details.
- Ingestion publish prefers the AI's tags (kind/role/city/facility/phone + talent fields) over the heuristic parser when present.
- Default prompt updated to describe the three kinds + new fields.

Phone extraction from websites (Medjobs / generic sites), where the number sits behind a "تماس با این آگهی" reveal:
- HtmlUtil.HarvestPhones scans the full markup for tel: links, JSON-LD "telephone", data-*phone* attributes, and inline Iranian mobile/landline numbers (Persian digits folded), and normalizes them (mobiles 09…, landlines 0…).
- Medjobs + Website sources append harvested numbers to the ad text so the parser/AI capture them; manual review then prefills the phone too.
- Parser phone extraction now also captures a landline as a fallback.

Note: if a site loads the number purely via XHR (not in HTML), a per-source reveal endpoint would be a follow-up.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -115,14 +115,20 @@ public class AppSetting
|
|||||||
public const string DefaultPrompt = """
|
public const string DefaultPrompt = """
|
||||||
تو دستیار بررسی آگهیهای کاری حوزه درمان برای پلتفرم «همکادر» هستی.
|
تو دستیار بررسی آگهیهای کاری حوزه درمان برای پلتفرم «همکادر» هستی.
|
||||||
هر آگهی خام را بخوان و تصمیم بگیر:
|
هر آگهی خام را بخوان و تصمیم بگیر:
|
||||||
- approve: آگهی واقعی و مرتبط با شیفت/استخدام کادر درمان است و اطلاعات کافی دارد.
|
- approve: آگهی واقعی و مرتبط با کادر درمان است و اطلاعات کافی دارد.
|
||||||
- reject: تبلیغ، اسپم، نامرتبط، یا فاقد اطلاعات حداقلی است.
|
- reject: تبلیغ، اسپم، نامرتبط، یا فاقد اطلاعات حداقلی است.
|
||||||
- review: مرتبط است اما ناقص/مبهم و نیاز به بررسی انسانی دارد.
|
- review: مرتبط است اما ناقص/مبهم و نیاز به بررسی انسانی دارد.
|
||||||
نقش، شهر/محله، نوع شیفت، نوع همکاری، مبلغ یا درصد سهم، و عنوان را در صورت وجود استخراج کن.
|
سه نوع آگهی داریم:
|
||||||
|
- shift: مرکز درمانی برای یک شیفت نیرو میخواهد.
|
||||||
|
- job: مرکز درمانی برای استخدام دائم نیرو میخواهد.
|
||||||
|
- talent: خودِ کادر درمان اعلام «آماده به کار / آماده همکاری» کرده است.
|
||||||
|
نقش، شهر/محله، نوع شیفت/همکاری، مبلغ یا درصد سهم، عنوان، نام مرکز، و شماره تماس را در صورت وجود استخراج کن.
|
||||||
|
برای talent: نام فرد، سال سابقه و پروانهدار بودن را هم استخراج کن.
|
||||||
فقط با یک شیء JSON پاسخ بده با کلیدهای:
|
فقط با یک شیء JSON پاسخ بده با کلیدهای:
|
||||||
decision (approve|reject|review)، confidence (0-100)، reason (فارسی کوتاه)،
|
decision (approve|reject|review)، confidence (0-100)، reason (فارسی کوتاه)،
|
||||||
kind (shift|job)، role، city، district، shiftType (day|evening|night|oncall)،
|
kind (shift|job|talent)، role، city، district، shiftType (day|evening|night|oncall)،
|
||||||
employmentType (fulltime|parttime|contract|plan)، payAmount (عدد تومان یا null)،
|
employmentType (fulltime|parttime|contract|plan)، payAmount (عدد تومان یا null)،
|
||||||
sharePercent (0-100 یا null)، title، facilityName.
|
sharePercent (0-100 یا null)، title، facilityName، phone،
|
||||||
|
personName، yearsExperience (عدد یا null)، isLicensed (true|false).
|
||||||
""";
|
""";
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -154,9 +154,21 @@ public class HeuristicListingParser : IListingParser
|
|||||||
if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");
|
if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Phone ---
|
// --- Phone (mobile preferred, landline as fallback) ---
|
||||||
var phone = Regex.Match(ToLatinDigits(text), @"0?9\d{9}");
|
var latinPhone = ToLatinDigits(text);
|
||||||
if (phone.Success) p.Phone = phone.Value;
|
var mobile = Regex.Match(latinPhone, @"(?:\+?98|0)?9\d{9}");
|
||||||
|
if (mobile.Success)
|
||||||
|
{
|
||||||
|
var d = Regex.Replace(mobile.Value, @"\D", "");
|
||||||
|
if (d.StartsWith("98")) d = "0" + d[2..];
|
||||||
|
if (d.Length == 10 && d.StartsWith("9")) d = "0" + d;
|
||||||
|
p.Phone = d;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
var land = Regex.Match(latinPhone, @"0\d{2,3}[\s-]?\d{7,8}");
|
||||||
|
if (land.Success) p.Phone = Regex.Replace(land.Value, @"\D", "");
|
||||||
|
}
|
||||||
|
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,7 +7,8 @@ namespace JobsMedical.Web.Services.Scraping;
|
|||||||
|
|
||||||
public record AiStructured(
|
public record AiStructured(
|
||||||
string? Kind, string? Role, string? City, string? District, string? ShiftType,
|
string? Kind, string? Role, string? City, string? District, string? ShiftType,
|
||||||
string? EmploymentType, long? PayAmount, int? SharePercent, string? Title, string? FacilityName);
|
string? EmploymentType, long? PayAmount, int? SharePercent, string? Title, string? FacilityName,
|
||||||
|
string? Phone = null, string? PersonName = null, int? YearsExperience = null, bool? IsLicensed = null);
|
||||||
|
|
||||||
/// <summary>An AI verdict on a raw listing.</summary>
|
/// <summary>An AI verdict on a raw listing.</summary>
|
||||||
public record AiAuditResult(string Decision, int Confidence, string? Reason, AiStructured? Data)
|
public record AiAuditResult(string Decision, int Confidence, string? Reason, AiStructured? Data)
|
||||||
@@ -30,6 +31,24 @@ public interface IAiAuditor
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public class OpenAiCompatibleAuditor : IAiAuditor
|
public class OpenAiCompatibleAuditor : IAiAuditor
|
||||||
{
|
{
|
||||||
|
// Authoritative output contract appended to the admin prompt so tags/categories stay correct
|
||||||
|
// (including the «آماده به کار» type and contact phone) regardless of the stored prompt text.
|
||||||
|
private const string OutputSchema = """
|
||||||
|
فقط یک شیء JSON با این کلیدها برگردان (هر فیلد نامشخص = null):
|
||||||
|
decision: approve|reject|review
|
||||||
|
confidence: عدد ۰ تا ۱۰۰
|
||||||
|
reason: توضیح کوتاه فارسی
|
||||||
|
kind: shift (شیفت توسط مرکز) | job (استخدام توسط مرکز) | talent (کادر درمان که خودش «آماده به کار» است)
|
||||||
|
role: عنوان دقیق نقش درمانی (مثل پرستار، پزشک عمومی، دندانپزشک، تکنسین اتاق عمل، ماما، کارشناس آزمایشگاه)
|
||||||
|
city, district: نام شهر و محله/منطقه در صورت ذکر
|
||||||
|
shiftType: day|evening|night|oncall (فقط برای shift)
|
||||||
|
employmentType: fulltime|parttime|contract|plan
|
||||||
|
payAmount: عدد تومان یا null ، sharePercent: عدد ۰ تا ۱۰۰ یا null (مثل «۵۰٪ تسویه»)
|
||||||
|
title: عنوان کوتاه ، facilityName: نام مرکز درمانی (فقط برای shift/job)
|
||||||
|
phone: شماره تماس (موبایل یا ثابت) بهصورت رقم لاتین، یا null
|
||||||
|
personName: نام فرد (فقط برای talent) ، yearsExperience: سال سابقه عدد یا null ، isLicensed: true/false (پروانهدار)
|
||||||
|
""";
|
||||||
|
|
||||||
private readonly ScrapeHttpClients _clients;
|
private readonly ScrapeHttpClients _clients;
|
||||||
private readonly ILogger<OpenAiCompatibleAuditor> _log;
|
private readonly ILogger<OpenAiCompatibleAuditor> _log;
|
||||||
|
|
||||||
@@ -52,7 +71,9 @@ public class OpenAiCompatibleAuditor : IAiAuditor
|
|||||||
response_format = new { type = "json_object" },
|
response_format = new { type = "json_object" },
|
||||||
messages = new object[]
|
messages = new object[]
|
||||||
{
|
{
|
||||||
new { role = "system", content = s.AiSystemPrompt },
|
// Admin prompt + an authoritative output schema, so classification/tags stay
|
||||||
|
// correct even if the stored prompt predates the talent/phone fields.
|
||||||
|
new { role = "system", content = s.AiSystemPrompt + "\n\n" + OutputSchema },
|
||||||
new { role = "user", content = "آگهی خام:\n" + rawText + "\n\nفقط با JSON پاسخ بده." },
|
new { role = "user", content = "آگهی خام:\n" + rawText + "\n\nفقط با JSON پاسخ بده." },
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
@@ -100,10 +121,12 @@ public class OpenAiCompatibleAuditor : IAiAuditor
|
|||||||
int I(string k, int d) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : d;
|
int I(string k, int d) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : d;
|
||||||
long? L(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt64(out var n) ? n : null;
|
long? L(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt64(out var n) ? n : null;
|
||||||
int? NI(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : null;
|
int? NI(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.Number && v.TryGetInt32(out var n) ? n : null;
|
||||||
|
bool? B(string k) => r.TryGetProperty(k, out var v) && (v.ValueKind == JsonValueKind.True || v.ValueKind == JsonValueKind.False) ? v.GetBoolean() : null;
|
||||||
|
|
||||||
var decision = (S("decision") ?? "review").ToLowerInvariant();
|
var decision = (S("decision") ?? "review").ToLowerInvariant();
|
||||||
var data = new AiStructured(S("kind"), S("role"), S("city"), S("district"), S("shiftType"),
|
var data = new AiStructured(S("kind"), S("role"), S("city"), S("district"), S("shiftType"),
|
||||||
S("employmentType"), L("payAmount"), NI("sharePercent"), S("title"), S("facilityName"));
|
S("employmentType"), L("payAmount"), NI("sharePercent"), S("title"), S("facilityName"),
|
||||||
|
Phone: S("phone"), PersonName: S("personName"), YearsExperience: NI("yearsExperience"), IsLicensed: B("isLicensed"));
|
||||||
return new AiAuditResult(decision, Math.Clamp(I("confidence", 50), 0, 100), S("reason"), data);
|
return new AiAuditResult(decision, Math.Clamp(I("confidence", 50), 0, 100), S("reason"), data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -175,16 +175,23 @@ public class IngestionService
|
|||||||
// «آماده به کار» — a worker offering themselves. No facility involved.
|
// «آماده به کار» — a worker offering themselves. No facility involved.
|
||||||
if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده"))
|
if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده"))
|
||||||
{
|
{
|
||||||
|
// Prefer the AI's tags when present, else the heuristic parser.
|
||||||
|
var tPay = d?.PayAmount ?? parsed.PayAmount;
|
||||||
|
var tShare = d?.SharePercent ?? parsed.SharePercent;
|
||||||
_db.TalentListings.Add(new TalentListing
|
_db.TalentListings.Add(new TalentListing
|
||||||
{
|
{
|
||||||
Role = role, City = city, DistrictId = district?.Id,
|
Role = role, City = city, DistrictId = district?.Id,
|
||||||
PersonName = parsed.PersonName, YearsExperience = parsed.YearsExperience,
|
PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
|
||||||
IsLicensed = parsed.IsLicensed, AreaNote = parsed.AreaNote,
|
YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
|
||||||
Availability = parsed.EmploymentType, Gender = parsed.Gender,
|
IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
|
||||||
PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage
|
AreaNote = parsed.AreaNote,
|
||||||
: parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift,
|
Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
|
||||||
PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent,
|
Gender = parsed.Gender,
|
||||||
Phone = parsed.Phone, Description = raw.RawText,
|
PayType = tShare is not null && tPay is null ? PayType.Percentage
|
||||||
|
: tPay is null ? PayType.Negotiable : PayType.PerShift,
|
||||||
|
PayAmount = tPay, SharePercent = tShare,
|
||||||
|
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
|
||||||
|
Description = raw.RawText,
|
||||||
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
|
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
|
||||||
});
|
});
|
||||||
raw.Status = RawListingStatus.Normalized;
|
raw.Status = RawListingStatus.Normalized;
|
||||||
@@ -201,7 +208,7 @@ public class IngestionService
|
|||||||
facility = new Facility
|
facility = new Facility
|
||||||
{
|
{
|
||||||
Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
|
Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
|
||||||
Phone = parsed.Phone, IsVerified = false,
|
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false,
|
||||||
};
|
};
|
||||||
_db.Facilities.Add(facility);
|
_db.Facilities.Add(facility);
|
||||||
facilities.Add(facility); // so later listings in this run match it too
|
facilities.Add(facility); // so later listings in this run match it too
|
||||||
|
|||||||
@@ -93,7 +93,14 @@ public class MedjobsListingSource : IListingSource
|
|||||||
|
|
||||||
var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));
|
var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));
|
||||||
var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
|
var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
|
||||||
return text.Length > 1800 ? text[..1800] : text;
|
if (text.Length > 1800) text = text[..1800];
|
||||||
|
|
||||||
|
// The contact number is often outside the description (in a tel: link / data attribute the
|
||||||
|
// page reveals on click). Harvest it from the full HTML and append so the parser/AI see it.
|
||||||
|
var phones = HtmlUtil.HarvestPhones(html);
|
||||||
|
if (phones.Count > 0 && !phones.Any(text.Contains))
|
||||||
|
text += "\nشماره تماس: " + string.Join("، ", phones);
|
||||||
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static string? Meta(string html, string prop)
|
private static string? Meta(string html, string prop)
|
||||||
|
|||||||
@@ -62,4 +62,49 @@ internal static class HtmlUtil
|
|||||||
s = Regex.Replace(s, "[ \\t]+", " ");
|
s = Regex.Replace(s, "[ \\t]+", " ");
|
||||||
return s.Trim();
|
return s.Trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Returns a copy of <paramref name="s"/> in which every Persian (۰–۹) or Arabic-Indic (٠–٩)
/// digit is replaced by the matching Latin digit 0–9; all other characters are copied unchanged.
/// </summary>
/// <param name="s">Text to fold; must not be null.</param>
/// <returns>A new string of the same length with the digits folded to Latin.</returns>
public static string ToLatinDigits(string s)
{
    const char PersianZero = '۰';
    const char PersianNine = '۹';
    const char ArabicZero = '٠';
    const char ArabicNine = '٩';

    var folded = new char[s.Length];
    var pos = 0;
    foreach (var ch in s)
    {
        // Both digit blocks are contiguous, so the offset from the block's zero is the digit value.
        folded[pos++] = ch switch
        {
            >= PersianZero and <= PersianNine => (char)('0' + (ch - PersianZero)),
            >= ArabicZero and <= ArabicNine => (char)('0' + (ch - ArabicZero)),
            _ => ch,
        };
    }
    return new string(folded);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Pull Iranian phone numbers out of a page's HTML — including ones a site reveals on click
/// (often still present as a tel: link, a data-*phone* attribute, or JSON-LD "telephone").
/// </summary>
/// <param name="html">Raw page markup. Persian/Arabic-Indic digits are folded to Latin before matching.</param>
/// <returns>
/// At most three distinct numbers in national format (mobiles as 09xxxxxxxxx, landlines as 0 plus
/// ten digits), in discovery order: tel: links, JSON-LD "telephone", data-*phone* attributes, then
/// bare mobiles and finally bare landlines anywhere in the markup. Empty when <paramref name="html"/>
/// is null or empty.
/// </returns>
public static List<string> HarvestPhones(string html)
{
    if (string.IsNullOrEmpty(html)) return new();
    var latin = ToLatinDigits(html);
    var found = new List<string>();

    // Normalizes one candidate to national format and keeps it only when it has the shape of an
    // Iranian mobile or landline and has not been collected already.
    void Add(string raw)
    {
        var d = Regex.Replace(raw, @"\D", "");
        if (d.StartsWith("0098")) d = "0" + d[4..];
        else if (d.StartsWith("98") && d.Length >= 12) d = "0" + d[2..];
        if (Regex.IsMatch(d, @"^9\d{9}$")) d = "0" + d;   // 9xxxxxxxxx → 09xxxxxxxxx
        bool ok = Regex.IsMatch(d, @"^09\d{9}$")          // mobile
               || Regex.IsMatch(d, @"^0\d{10}$");         // landline w/ area code
        if (ok && !found.Contains(d)) found.Add(d);
    }

    // Highest-signal sources first.
    foreach (Match m in Regex.Matches(latin, @"tel:\+?([\d\s\-]{7,})")) Add(m.Groups[1].Value);
    foreach (Match m in Regex.Matches(latin, "\"telephone\"\\s*:\\s*\"([^\"]+)\"")) Add(m.Groups[1].Value);
    foreach (Match m in Regex.Matches(latin, "data-[\\w-]*phone[\\w-]*=[\"']([^\"']+)[\"']", RegexOptions.IgnoreCase)) Add(m.Groups[1].Value);

    // Then bare numbers anywhere in the markup — mobiles, then landlines. The digit lookarounds
    // stop a phone-shaped slice from being cut out of a longer digit run (ids, timestamps, asset
    // hashes), which is common when scanning full markup rather than visible text.
    foreach (Match m in Regex.Matches(latin, @"(?<!\d)(?:(?:\+|00)?98|0)?9\d{9}(?!\d)")) Add(m.Value);
    foreach (Match m in Regex.Matches(latin, @"(?<!\d)0\d{2,3}[\s-]?\d{7,8}(?!\d)")) Add(m.Value);

    return found.Take(3).ToList();
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -51,7 +51,13 @@ public class WebsiteListingSource : IListingSource
|
|||||||
string? body = Between(html, "rtcl-description") ?? Between(html, "entry-content")
|
string? body = Between(html, "rtcl-description") ?? Between(html, "entry-content")
|
||||||
?? Between(html, "job-description") ?? Meta(html, "og:description");
|
?? Between(html, "job-description") ?? Meta(html, "og:description");
|
||||||
var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(x => !string.IsNullOrWhiteSpace(x))));
|
var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(x => !string.IsNullOrWhiteSpace(x))));
|
||||||
return text.Length > 1800 ? text[..1800] : text;
|
if (text.Length > 1800) text = text[..1800];
|
||||||
|
|
||||||
|
// Append any contact number found in the full markup (tel:/data-phone/JSON-LD/inline).
|
||||||
|
var phones = HtmlUtil.HarvestPhones(html);
|
||||||
|
if (phones.Count > 0 && !phones.Any(text.Contains))
|
||||||
|
text += "\nشماره تماس: " + string.Join("، ", phones);
|
||||||
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static string? Meta(string html, string prop)
|
private static string? Meta(string html, string prop)
|
||||||
|
|||||||
Reference in New Issue
Block a user