Divar geo-coords to facility map + medical gate + RawListing FK/geo migrations
This commit is contained in:
@@ -171,5 +171,14 @@ public class AppDbContext : DbContext, IDataProtectionKeyContext
|
|||||||
// Dedupe ingested listings by content hash.
|
// Dedupe ingested listings by content hash.
|
||||||
b.Entity<RawListing>().HasIndex(r => r.ContentHash);
|
b.Entity<RawListing>().HasIndex(r => r.ContentHash);
|
||||||
b.Entity<RawListing>().HasIndex(r => r.Status);
|
b.Entity<RawListing>().HasIndex(r => r.Status);
|
||||||
|
// A RawListing only LINKS to the post it produced — it must outlive that post (it's the
|
||||||
|
// dedupe cache). So deleting a Shift/Talent NULLs the back-reference rather than orphaning a
|
||||||
|
// dangling FK or blocking the delete. LinkedTalentId previously had no FK at all (orphan risk).
|
||||||
|
b.Entity<RawListing>()
|
||||||
|
.HasOne(r => r.LinkedShift).WithMany()
|
||||||
|
.HasForeignKey(r => r.LinkedShiftId).OnDelete(DeleteBehavior.SetNull);
|
||||||
|
b.Entity<RawListing>()
|
||||||
|
.HasOne(r => r.LinkedTalent).WithMany()
|
||||||
|
.HasForeignKey(r => r.LinkedTalentId).OnDelete(DeleteBehavior.SetNull);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
+1581
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,69 @@
|
|||||||
|
using Microsoft.EntityFrameworkCore.Migrations;
|
||||||
|
|
||||||
|
#nullable disable
|
||||||
|
|
||||||
|
namespace JobsMedical.Web.Migrations
{
    /// <summary>
    /// Re-creates the RawListings.LinkedShiftId foreign key with ON DELETE SET NULL and adds an
    /// indexed foreign key (also SET NULL) from RawListings.LinkedTalentId to TalentListings.
    /// </summary>
    public partial class RawListingLinkFks : Migration
    {
        private const string RawListingsTable = "RawListings";
        private const string ShiftsTable = "Shifts";
        private const string TalentListingsTable = "TalentListings";
        private const string KeyColumn = "Id";
        private const string LinkedShiftColumn = "LinkedShiftId";
        private const string LinkedTalentColumn = "LinkedTalentId";
        private const string ShiftFkName = "FK_RawListings_Shifts_LinkedShiftId";
        private const string TalentFkName = "FK_RawListings_TalentListings_LinkedTalentId";
        private const string TalentIndexName = "IX_RawListings_LinkedTalentId";

        /// <inheritdoc />
        protected override void Up(MigrationBuilder migrationBuilder)
        {
            // The existing Shifts FK is removed first; it is added back below with SET NULL.
            migrationBuilder.DropForeignKey(name: ShiftFkName, table: RawListingsTable);

            // LinkedTalentId had no FK until now, so rows may still reference talent listings that
            // no longer exist. Those references are nulled before the constraint is added;
            // otherwise AddForeignKey would fail on a populated database.
            const string clearOrphanedTalentLinks =
                "UPDATE \"RawListings\" r SET \"LinkedTalentId\" = NULL WHERE r.\"LinkedTalentId\" IS NOT NULL AND NOT EXISTS (SELECT 1 FROM \"TalentListings\" t WHERE t.\"Id\" = r.\"LinkedTalentId\");";
            migrationBuilder.Sql(clearOrphanedTalentLinks);

            migrationBuilder.CreateIndex(
                name: TalentIndexName,
                table: RawListingsTable,
                column: LinkedTalentColumn);

            migrationBuilder.AddForeignKey(
                name: ShiftFkName,
                table: RawListingsTable,
                column: LinkedShiftColumn,
                principalTable: ShiftsTable,
                principalColumn: KeyColumn,
                onDelete: ReferentialAction.SetNull);

            migrationBuilder.AddForeignKey(
                name: TalentFkName,
                table: RawListingsTable,
                column: LinkedTalentColumn,
                principalTable: TalentListingsTable,
                principalColumn: KeyColumn,
                onDelete: ReferentialAction.SetNull);
        }

        /// <inheritdoc />
        protected override void Down(MigrationBuilder migrationBuilder)
        {
            migrationBuilder.DropForeignKey(name: ShiftFkName, table: RawListingsTable);

            migrationBuilder.DropForeignKey(name: TalentFkName, table: RawListingsTable);

            migrationBuilder.DropIndex(name: TalentIndexName, table: RawListingsTable);

            // Restore the Shifts FK in its previous form, i.e. without an explicit delete action.
            migrationBuilder.AddForeignKey(
                name: ShiftFkName,
                table: RawListingsTable,
                column: LinkedShiftColumn,
                principalTable: ShiftsTable,
                principalColumn: KeyColumn);
        }
    }
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,38 @@
|
|||||||
|
using Microsoft.EntityFrameworkCore.Migrations;
|
||||||
|
|
||||||
|
#nullable disable
|
||||||
|
|
||||||
|
namespace JobsMedical.Web.Migrations
{
    /// <summary>
    /// Adds nullable "Lat" and "Lng" columns (double precision) to the RawListings table.
    /// </summary>
    public partial class RawListingGeo : Migration
    {
        private const string RawListingsTable = "RawListings";
        private const string CoordinateColumnType = "double precision";

        // Listed in the order the columns are added (Up) and dropped (Down).
        private static readonly string[] CoordinateColumns = { "Lat", "Lng" };

        /// <inheritdoc />
        protected override void Up(MigrationBuilder migrationBuilder)
        {
            foreach (var columnName in CoordinateColumns)
            {
                migrationBuilder.AddColumn<double>(
                    name: columnName,
                    table: RawListingsTable,
                    type: CoordinateColumnType,
                    nullable: true);
            }
        }

        /// <inheritdoc />
        protected override void Down(MigrationBuilder migrationBuilder)
        {
            foreach (var columnName in CoordinateColumns)
            {
                migrationBuilder.DropColumn(
                    name: columnName,
                    table: RawListingsTable);
            }
        }
    }
}
|
||||||
@@ -748,12 +748,18 @@ namespace JobsMedical.Web.Migrations
|
|||||||
b.Property<DateTime>("FetchedAt")
|
b.Property<DateTime>("FetchedAt")
|
||||||
.HasColumnType("timestamp with time zone");
|
.HasColumnType("timestamp with time zone");
|
||||||
|
|
||||||
|
b.Property<double?>("Lat")
|
||||||
|
.HasColumnType("double precision");
|
||||||
|
|
||||||
b.Property<int?>("LinkedShiftId")
|
b.Property<int?>("LinkedShiftId")
|
||||||
.HasColumnType("integer");
|
.HasColumnType("integer");
|
||||||
|
|
||||||
b.Property<int?>("LinkedTalentId")
|
b.Property<int?>("LinkedTalentId")
|
||||||
.HasColumnType("integer");
|
.HasColumnType("integer");
|
||||||
|
|
||||||
|
b.Property<double?>("Lng")
|
||||||
|
.HasColumnType("double precision");
|
||||||
|
|
||||||
b.Property<string>("ParsedJson")
|
b.Property<string>("ParsedJson")
|
||||||
.HasColumnType("text");
|
.HasColumnType("text");
|
||||||
|
|
||||||
@@ -783,6 +789,8 @@ namespace JobsMedical.Web.Migrations
|
|||||||
|
|
||||||
b.HasIndex("LinkedShiftId");
|
b.HasIndex("LinkedShiftId");
|
||||||
|
|
||||||
|
b.HasIndex("LinkedTalentId");
|
||||||
|
|
||||||
b.HasIndex("Status");
|
b.HasIndex("Status");
|
||||||
|
|
||||||
b.ToTable("RawListings");
|
b.ToTable("RawListings");
|
||||||
@@ -1415,9 +1423,17 @@ namespace JobsMedical.Web.Migrations
|
|||||||
{
|
{
|
||||||
b.HasOne("JobsMedical.Web.Models.Shift", "LinkedShift")
|
b.HasOne("JobsMedical.Web.Models.Shift", "LinkedShift")
|
||||||
.WithMany()
|
.WithMany()
|
||||||
.HasForeignKey("LinkedShiftId");
|
.HasForeignKey("LinkedShiftId")
|
||||||
|
.OnDelete(DeleteBehavior.SetNull);
|
||||||
|
|
||||||
|
b.HasOne("JobsMedical.Web.Models.TalentListing", "LinkedTalent")
|
||||||
|
.WithMany()
|
||||||
|
.HasForeignKey("LinkedTalentId")
|
||||||
|
.OnDelete(DeleteBehavior.SetNull);
|
||||||
|
|
||||||
b.Navigation("LinkedShift");
|
b.Navigation("LinkedShift");
|
||||||
|
|
||||||
|
b.Navigation("LinkedTalent");
|
||||||
});
|
});
|
||||||
|
|
||||||
modelBuilder.Entity("JobsMedical.Web.Models.Review", b =>
|
modelBuilder.Entity("JobsMedical.Web.Models.Review", b =>
|
||||||
|
|||||||
@@ -25,10 +25,16 @@ public class RawListing
|
|||||||
public Shift? LinkedShift { get; set; }
|
public Shift? LinkedShift { get; set; }
|
||||||
|
|
||||||
public int? LinkedTalentId { get; set; } // آگهی «آماده به کار» ساختهشده از این متن
|
public int? LinkedTalentId { get; set; } // آگهی «آماده به کار» ساختهشده از این متن
|
||||||
|
public TalentListing? LinkedTalent { get; set; }
|
||||||
|
|
||||||
[MaxLength(500)]
|
[MaxLength(500)]
|
||||||
public string? SourceUrl { get; set; }
|
public string? SourceUrl { get; set; }
|
||||||
|
|
||||||
|
/// <summary>Approximate coordinates harvested from the source (e.g. Divar's fuzzed map center).
|
||||||
|
/// Carried through the review queue so a manual publish can still place the facility on the map.</summary>
|
||||||
|
public double? Lat { get; set; }
|
||||||
|
public double? Lng { get; set; }
|
||||||
|
|
||||||
/// <summary>SHA-256 of the normalized text — used to dedupe across ingestion runs.</summary>
|
/// <summary>SHA-256 of the normalized text — used to dedupe across ingestion runs.</summary>
|
||||||
[MaxLength(64)]
|
[MaxLength(64)]
|
||||||
public string? ContentHash { get; set; }
|
public string? ContentHash { get; set; }
|
||||||
|
|||||||
@@ -40,6 +40,14 @@
|
|||||||
<p class="muted" style="font-size:11px; margin:8px 0 0;">
|
<p class="muted" style="font-size:11px; margin:8px 0 0;">
|
||||||
موتور: واکشی ← حذف تکراری ← تجزیه ← اعتبارسنجی ← صف بررسی.
|
موتور: واکشی ← حذف تکراری ← تجزیه ← اعتبارسنجی ← صف بررسی.
|
||||||
</p>
|
</p>
|
||||||
|
<form method="post" onsubmit="return confirm('⚠ همهی آیتمهای جمعآوریشده (کش) و همهی آگهیهای منتشرشده از جمعآوری حذف میشوند (آگهیهای ثبتشده توسط مراکز دستنخورده میمانند)، سپس همهچیز با هوش مصنوعی دوباره جمعآوری و افزوده میشود. این کار بازگشتناپذیر است. ادامه میدهی؟');">
|
||||||
|
<button type="submit" asp-page-handler="PurgeAndReingest" class="btn btn-outline btn-block" style="margin-top:8px; color:var(--danger); border-color:var(--danger);">
|
||||||
|
🔄 پاکسازی کش و جمعآوری مجدد با هوش مصنوعی
|
||||||
|
</button>
|
||||||
|
</form>
|
||||||
|
<p class="muted" style="font-size:11px; margin:6px 0 0;">
|
||||||
|
کش حذف تکراری و آگهیهای جمعآوریشده پاک و از نو با AI پردازش میشوند. (آگهیهای مراکز حذف نمیشوند.)
|
||||||
|
</p>
|
||||||
|
|
||||||
<hr style="border:none; border-top:1px solid var(--line); margin:16px 0;" />
|
<hr style="border:none; border-top:1px solid var(--line); margin:16px 0;" />
|
||||||
|
|
||||||
|
|||||||
@@ -65,6 +65,35 @@ public class IndexModel : PageModel
|
|||||||
return RedirectToPage();
|
return RedirectToPage();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// DESTRUCTIVE "purge and re-ingest" handler. Inside one transaction it:
///   1. deletes EVERY RawListing row (the dedupe/staging cache, whose ContentHash would
///      otherwise block the same ad from being ingested again), and
///   2. deletes Shifts, JobOpenings and TalentListings whose Source is Aggregated; rows with any
///      other Source are not touched by these deletes.
/// After the transaction commits it runs the ingestion pipeline once and stores a summary of the
/// deleted and re-ingested counts in <c>IngestMessage</c>, then redirects back to the page.
/// RawListings are removed before the posts they link to, so the SET NULL back-references have
/// nothing left to update.
/// NOTE(review): the original author states that DB cascades clear ContactMethods /
/// Applications / InterestEvents when the posts are deleted — not verifiable from this method.
/// </summary>
/// <returns>A redirect to the current page.</returns>
public async Task<IActionResult> OnPostPurgeAndReingestAsync()
{
    int purgedRaw, purgedShifts, purgedJobs, purgedTalent;

    await using (var transaction = await _db.Database.BeginTransactionAsync())
    {
        // Dedupe cache first, then the aggregated posts it pointed at.
        purgedRaw = await _db.RawListings.ExecuteDeleteAsync();

        purgedShifts = await _db.Shifts
            .Where(shift => shift.Source == ShiftSource.Aggregated)
            .ExecuteDeleteAsync();

        purgedJobs = await _db.JobOpenings
            .Where(job => job.Source == ShiftSource.Aggregated)
            .ExecuteDeleteAsync();

        purgedTalent = await _db.TalentListings
            .Where(listing => listing.Source == ShiftSource.Aggregated)
            .ExecuteDeleteAsync();

        await transaction.CommitAsync();
    }

    // The purge is already committed at this point; the re-ingest runs outside the transaction.
    var summary = await _ingest.RunAsync();

    var purgeReport = $"پاکسازی شد (حذف: {purgedRaw} آیتم کش، {purgedShifts} شیفت، {purgedJobs} استخدام، {purgedTalent} آمادهبهکارِ جمعآوریشده). ";
    var ingestReport = $"جمعآوری مجدد: {summary.TotalPublished} منتشر، {summary.TotalQueued} در صف، {summary.TotalFlagged} پرچم، {summary.TotalSpam} اسپم، {summary.TotalDuplicates} تکراری.";
    IngestMessage = purgeReport + ingestReport;

    return RedirectToPage();
}
|
||||||
|
|
||||||
private async Task LoadAsync()
|
private async Task LoadAsync()
|
||||||
{
|
{
|
||||||
Queue = await _db.RawListings
|
Queue = await _db.RawListings
|
||||||
|
|||||||
@@ -282,13 +282,26 @@ public class ReviewModel : PageModel
|
|||||||
if (cityId is null) return null; // no cities seeded — cannot create a facility
|
if (cityId is null) return null; // no cities seeded — cannot create a facility
|
||||||
|
|
||||||
// No facility named in the ad → use/create the shared placeholder.
|
// No facility named in the ad → use/create the shared placeholder.
|
||||||
var name = string.IsNullOrWhiteSpace(NewFacilityName) ? UnknownFacilityName : NewFacilityName.Trim();
|
var isPlaceholder = string.IsNullOrWhiteSpace(NewFacilityName);
|
||||||
|
var name = isPlaceholder ? UnknownFacilityName : NewFacilityName.Trim();
|
||||||
|
|
||||||
|
// Approximate coords carried from the crawl (e.g. Divar). NEVER apply them to the shared
|
||||||
|
// «نامشخص» placeholder — it's reused across many ads, so a single ad's point would mislead.
|
||||||
|
bool HasGeo() => !isPlaceholder && Raw?.Lat is not null;
|
||||||
|
|
||||||
// Reuse an existing facility that's exactly or closely the same (Persian-aware fuzzy
|
// Reuse an existing facility that's exactly or closely the same (Persian-aware fuzzy
|
||||||
// match), so we don't create duplicates like «بیمارستان میلاد» vs «میلاد».
|
// match), so we don't create duplicates like «بیمارستان میلاد» vs «میلاد».
|
||||||
var all = await _db.Facilities.ToListAsync();
|
var all = await _db.Facilities.ToListAsync();
|
||||||
var match = FacilityMatcher.FindBest(all, name, cityId);
|
var match = FacilityMatcher.FindBest(all, name, cityId);
|
||||||
if (match is not null) return match.Id;
|
if (match is not null)
|
||||||
|
{
|
||||||
|
if (HasGeo() && match.Lat is null && match.Lng is null) // backfill only, never overwrite
|
||||||
|
{
|
||||||
|
match.Lat = Raw!.Lat; match.Lng = Raw.Lng;
|
||||||
|
await _db.SaveChangesAsync();
|
||||||
|
}
|
||||||
|
return match.Id;
|
||||||
|
}
|
||||||
|
|
||||||
var facility = new Facility
|
var facility = new Facility
|
||||||
{
|
{
|
||||||
@@ -297,6 +310,8 @@ public class ReviewModel : PageModel
|
|||||||
Type = FacilityType.Hospital,
|
Type = FacilityType.Hospital,
|
||||||
Verification = VerificationStatus.Unverified,
|
Verification = VerificationStatus.Unverified,
|
||||||
IsVerified = false,
|
IsVerified = false,
|
||||||
|
Lat = HasGeo() ? Raw!.Lat : null,
|
||||||
|
Lng = HasGeo() ? Raw!.Lng : null,
|
||||||
};
|
};
|
||||||
_db.Facilities.Add(facility);
|
_db.Facilities.Add(facility);
|
||||||
await _db.SaveChangesAsync();
|
await _db.SaveChangesAsync();
|
||||||
|
|||||||
@@ -59,17 +59,25 @@ public class DivarListingSource : IListingSource
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
using var doc = JsonDocument.Parse(body);
|
using var doc = JsonDocument.Parse(body);
|
||||||
|
var cityLabel = CityLabel(s.DivarCity); // every result is from the city we searched
|
||||||
foreach (var (text, token) in Harvest(doc.RootElement).Take(25))
|
foreach (var (text, token) in Harvest(doc.RootElement).Take(25))
|
||||||
{
|
{
|
||||||
var url = token is not null ? $"https://divar.ir/v/{token}" : "https://divar.ir";
|
var url = token is not null ? $"https://divar.ir/v/{token}" : "https://divar.ir";
|
||||||
var withPhone = text;
|
var itemText = text;
|
||||||
|
// Stamp the city so the parser/AI always resolve a location (Divar's own location
|
||||||
|
// line isn't always in the search row; the searched city is authoritative).
|
||||||
|
if (!string.IsNullOrWhiteSpace(cityLabel) && !text.Contains(cityLabel))
|
||||||
|
itemText += $"\n📍 {cityLabel}";
|
||||||
|
double? lat = null, lng = null;
|
||||||
if (token is not null)
|
if (token is not null)
|
||||||
{
|
{
|
||||||
var phones = await RevealPhonesAsync(client, token, s, ct);
|
// One detail fetch yields BOTH the phone and the map coordinates.
|
||||||
if (phones.Count > 0 && !phones.Any(text.Contains))
|
var (phones, gLat, gLng) = await FetchDetailAsync(client, token, ct);
|
||||||
withPhone = text + "\nشماره تماس: " + string.Join("، ", phones);
|
if (phones.Count > 0 && !phones.Any(itemText.Contains))
|
||||||
|
itemText += "\nشماره تماس: " + string.Join("، ", phones);
|
||||||
|
lat = gLat; lng = gLng;
|
||||||
}
|
}
|
||||||
items.Add(new ScrapedItem("دیوار", withPhone, url));
|
items.Add(new ScrapedItem("دیوار", itemText, url, lat, lng));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (Exception ex) { _log.LogWarning(ex, "Divar fetch failed for query {Query}", q); }
|
catch (Exception ex) { _log.LogWarning(ex, "Divar fetch failed for query {Query}", q); }
|
||||||
@@ -95,16 +103,31 @@ public class DivarListingSource : IListingSource
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Maps the configured Divar city (a slug, a numeric id, or a Persian name) to the Persian name
/// that is stamped onto each harvested result. Matching ignores surrounding whitespace and
/// letter case.
/// </summary>
/// <param name="city">Configured city value; may be null.</param>
/// <returns>
/// The Persian city name for a recognized value; otherwise the trimmed input unchanged
/// (an empty string when <paramref name="city"/> is null or blank).
/// </returns>
private static string CityLabel(string? city)
{
    var trimmed = city?.Trim() ?? "";

    switch (trimmed.ToLowerInvariant())
    {
        case "1":
        case "tehran":
        case "تهران":
            return "تهران";

        case "3":
        case "isfahan":
        case "esfahan":
        case "اصفهان":
            return "اصفهان";

        case "4":
        case "mashhad":
        case "مشهد":
            return "مشهد";

        case "5":
        case "shiraz":
        case "شیراز":
            return "شیراز";

        case "6":
        case "tabriz":
        case "تبریز":
            return "تبریز";

        case "1745":
        case "karaj":
        case "کرج":
            return "کرج";

        default:
            // Unrecognized value: hand back what was configured, minus surrounding whitespace.
            return trimmed;
    }
}
|
||||||
|
|
||||||
// The post detail endpoint returns the FULL description — many Divar job ads write the phone
|
// The post detail endpoint returns the FULL description — many Divar job ads write the phone
|
||||||
// straight into the body, so we can harvest it without Divar's (login-gated) contact reveal.
|
// straight into the body, so we can harvest it without Divar's (login-gated) contact reveal.
|
||||||
private const string PostDetailUrl = "https://api.divar.ir/v8/posts-v2/web/";
|
private const string PostDetailUrl = "https://api.divar.ir/v8/posts-v2/web/";
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Fetch a post's detail JSON and harvest any contact number it contains (mostly numbers the
|
/// Fetch a post's detail JSON ONCE and harvest both (a) any contact number it contains (mostly
|
||||||
/// poster wrote into the description). Divar's true "نمایش شماره" reveal is auth-gated; this
|
/// numbers the poster wrote into the description; Divar's true "نمایش شماره" reveal is auth-gated)
|
||||||
/// covers the common case where the number is in the ad text. Fails soft.
|
/// and (b) the post's APPROXIMATE map coordinates (the privacy-fuzzed center Divar shows as a
|
||||||
|
/// circle). Fails soft — returns whatever it could extract.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private async Task<List<string>> RevealPhonesAsync(HttpClient client, string token, AppSetting s, CancellationToken ct)
|
private async Task<(List<string> phones, double? lat, double? lng)> FetchDetailAsync(
|
||||||
|
HttpClient client, string token, CancellationToken ct)
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
@@ -112,18 +135,68 @@ public class DivarListingSource : IListingSource
|
|||||||
req.Headers.TryAddWithoutValidation("User-Agent", Ua);
|
req.Headers.TryAddWithoutValidation("User-Agent", Ua);
|
||||||
req.Headers.TryAddWithoutValidation("Accept", "application/json");
|
req.Headers.TryAddWithoutValidation("Accept", "application/json");
|
||||||
using var resp = await client.SendAsync(req, ct);
|
using var resp = await client.SendAsync(req, ct);
|
||||||
if (!resp.IsSuccessStatusCode) return new();
|
if (!resp.IsSuccessStatusCode) return (new(), null, null);
|
||||||
var body = await resp.Content.ReadAsStringAsync(ct);
|
var body = await resp.Content.ReadAsStringAsync(ct);
|
||||||
if (body.Contains("BLOCKING_VIEW")) return new();
|
if (body.Contains("BLOCKING_VIEW")) return (new(), null, null);
|
||||||
return HtmlUtil.HarvestPhones(body);
|
var phones = HtmlUtil.HarvestPhones(body);
|
||||||
|
double? lat = null, lng = null;
|
||||||
|
try { using var doc = JsonDocument.Parse(body); if (FindLatLng(doc.RootElement) is { } g) { lat = g.lat; lng = g.lng; } }
|
||||||
|
catch (JsonException) { /* detail wasn't JSON — phones still harvested from text */ }
|
||||||
|
return (phones, lat, lng);
|
||||||
}
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
_log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token);
|
_log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token);
|
||||||
return new();
|
return (new(), null, null);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Iran's bounding box — guards against picking up an unrelated number pair (timestamps, ids…).
|
||||||
|
private const double MinLat = 24, MaxLat = 40, MinLng = 44, MaxLng = 64;
|
||||||
|
|
||||||
|
/// <summary>
/// Depth-first search of a JSON tree for an approximate coordinate pair.
/// For each OBJECT visited, the first latitude-like property (see <c>IsLatKey</c>) and the first
/// longitude-like property (see <c>IsLngKey</c>) with a numeric value are taken as a candidate
/// pair. The pair is accepted only when both exist in that same object and fall inside the
/// MinLat..MaxLat / MinLng..MaxLng bounds. If the object itself does not qualify, its property
/// values are searched in document order; arrays are searched element by element.
/// </summary>
/// <param name="el">Root of the (sub)tree to search.</param>
/// <returns>The first qualifying (lat, lng) pair, or null when none is found.</returns>
private static (double lat, double lng)? FindLatLng(JsonElement el)
{
    switch (el.ValueKind)
    {
        case JsonValueKind.Object:
        {
            double? latCandidate = null;
            double? lngCandidate = null;

            foreach (var property in el.EnumerateObject())
            {
                if (latCandidate is null && IsLatKey(property.Name) && TryNum(property.Value, out var latValue))
                {
                    latCandidate = latValue;
                }
                else if (lngCandidate is null && IsLngKey(property.Name) && TryNum(property.Value, out var lngValue))
                {
                    lngCandidate = lngValue;
                }
            }

            // Both halves must come from this one object and lie within the bounding box.
            if (latCandidate is double lat && lngCandidate is double lng
                && lat >= MinLat && lat <= MaxLat
                && lng >= MinLng && lng <= MaxLng)
            {
                return (lat, lng);
            }

            // No usable pair here — descend into each property value.
            foreach (var property in el.EnumerateObject())
            {
                var nested = FindLatLng(property.Value);
                if (nested is not null)
                {
                    return nested;
                }
            }

            break;
        }

        case JsonValueKind.Array:
        {
            foreach (var element in el.EnumerateArray())
            {
                var nested = FindLatLng(element);
                if (nested is not null)
                {
                    return nested;
                }
            }

            break;
        }
    }

    return null;
}
|
||||||
|
|
||||||
|
/// <summary>True when <paramref name="k"/> is "latitude" or "lat", ignoring case (ordinal).</summary>
private static bool IsLatKey(string k)
{
    if (k.Equals("latitude", StringComparison.OrdinalIgnoreCase))
    {
        return true;
    }

    return k.Equals("lat", StringComparison.OrdinalIgnoreCase);
}
|
||||||
|
/// <summary>
/// True when <paramref name="k"/> is "longitude", "lng", "lon" or "long", ignoring case (ordinal).
/// </summary>
private static bool IsLngKey(string k)
{
    if (k.Equals("longitude", StringComparison.OrdinalIgnoreCase))
    {
        return true;
    }

    if (k.Equals("lng", StringComparison.OrdinalIgnoreCase))
    {
        return true;
    }

    if (k.Equals("lon", StringComparison.OrdinalIgnoreCase))
    {
        return true;
    }

    return k.Equals("long", StringComparison.OrdinalIgnoreCase);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Reads a double from a JSON value that is either a JSON number or a numeric string such as
/// "35.7". Strings are parsed with <c>NumberStyles.Float</c> and the invariant culture.
/// </summary>
/// <param name="v">JSON value to read.</param>
/// <param name="d">The parsed number on success; 0 for any other kind of JSON value.</param>
/// <returns>True when a number was read; otherwise false.</returns>
private static bool TryNum(JsonElement v, out double d)
{
    switch (v.ValueKind)
    {
        case JsonValueKind.Number:
            return v.TryGetDouble(out d);

        case JsonValueKind.String:
            return double.TryParse(
                v.GetString(),
                System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture,
                out d);

        default:
            d = 0;
            return false;
    }
}
|
||||||
|
|
||||||
private static readonly string[] DescKeys =
|
private static readonly string[] DescKeys =
|
||||||
{ "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };
|
{ "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };
|
||||||
|
|
||||||
@@ -134,9 +207,11 @@ public class DivarListingSource : IListingSource
|
|||||||
if (el.TryGetProperty("title", out var t) && t.ValueKind == JsonValueKind.String)
|
if (el.TryGetProperty("title", out var t) && t.ValueKind == JsonValueKind.String)
|
||||||
{
|
{
|
||||||
var sb = new StringBuilder(t.GetString());
|
var sb = new StringBuilder(t.GetString());
|
||||||
|
// Append ALL present description fields — the location/time line («… در تهران، جنتآباد»)
|
||||||
|
// is usually in bottom_description_text, so don't stop at the first match.
|
||||||
foreach (var k in DescKeys)
|
foreach (var k in DescKeys)
|
||||||
if (el.TryGetProperty(k, out var d) && d.ValueKind == JsonValueKind.String)
|
if (el.TryGetProperty(k, out var d) && d.ValueKind == JsonValueKind.String && d.GetString() is { Length: > 0 } v)
|
||||||
{ sb.Append(" — ").Append(d.GetString()); break; }
|
sb.Append(" — ").Append(v);
|
||||||
var text = sb.ToString().Trim();
|
var text = sb.ToString().Trim();
|
||||||
if (text.Length >= 15) yield return (text, FindToken(el));
|
if (text.Length >= 15) yield return (text, FindToken(el));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,8 +2,11 @@ using JobsMedical.Web.Models;
|
|||||||
|
|
||||||
namespace JobsMedical.Web.Services.Scraping;
|
namespace JobsMedical.Web.Services.Scraping;
|
||||||
|
|
||||||
/// <summary>One raw post pulled from a source (a Telegram message, a Divar ad, etc.).</summary>
|
/// <summary>One raw post pulled from a source (a Telegram message, a Divar ad, etc.).
|
||||||
public record ScrapedItem(string Source, string RawText, string? SourceUrl = null);
|
/// Lat/Lng are an APPROXIMATE location when the source exposes one (e.g. Divar's privacy-fuzzed
|
||||||
|
/// map center) — used to place an aggregated facility on the map / enable «near me».</summary>
|
||||||
|
public record ScrapedItem(string Source, string RawText, string? SourceUrl = null,
|
||||||
|
double? Lat = null, double? Lng = null);
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// A pluggable source the ingestion engine pulls from. Configuration (enabled, channels, tokens)
|
/// A pluggable source the ingestion engine pulls from. Configuration (enabled, channels, tokens)
|
||||||
|
|||||||
@@ -46,6 +46,10 @@ public class IngestionService
|
|||||||
|
|
||||||
public IReadOnlyList<string> SourceNames => _sources.Select(s => s.Name).ToList();
|
public IReadOnlyList<string> SourceNames => _sources.Select(s => s.Name).ToList();
|
||||||
|
|
||||||
|
/// <summary>Shared placeholder facility name for unnamed ads — kept identical to
|
||||||
|
/// Review.ResolveFacilityIdAsync so the auto-publish and manual-review flows reuse ONE record.</summary>
|
||||||
|
private const string UnknownFacilityName = "نامشخص / ثبت نشده";
|
||||||
|
|
||||||
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
|
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
|
||||||
{
|
{
|
||||||
var settings = await _settings.GetAsync();
|
var settings = await _settings.GetAsync();
|
||||||
@@ -71,7 +75,17 @@ public class IngestionService
|
|||||||
{
|
{
|
||||||
fetched++;
|
fetched++;
|
||||||
var hash = Hash(item.RawText);
|
var hash = Hash(item.RawText);
|
||||||
if (await _db.RawListings.AnyAsync(r => r.ContentHash == hash, ct)) { dupes++; continue; }
|
var existing = await _db.RawListings.FirstOrDefaultAsync(r => r.ContentHash == hash, ct);
|
||||||
|
if (existing is not null)
|
||||||
|
{
|
||||||
|
// Best-effort geo retry: coords are normally captured only on first ingest, but a
|
||||||
|
// re-fetch may now expose a map center the first fetch lacked (Divar can fail-soft to
|
||||||
|
// null on a bad response / out-of-bbox). Backfill the cached row when this fetch has
|
||||||
|
// coords and the row has none, so an item still sitting in the queue can be placed on
|
||||||
|
// the map when an admin publishes it. (A full refresh is the purge-and-reingest flow.)
|
||||||
|
if (existing.Lat is null && item.Lat is not null) { existing.Lat = item.Lat; existing.Lng = item.Lng; }
|
||||||
|
dupes++; continue;
|
||||||
|
}
|
||||||
|
|
||||||
var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
|
var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
|
||||||
var val = _validator.Validate(item.RawText, parsed);
|
var val = _validator.Validate(item.RawText, parsed);
|
||||||
@@ -91,6 +105,7 @@ public class IngestionService
|
|||||||
Confidence = confidence,
|
Confidence = confidence,
|
||||||
ValidationNotes = reason,
|
ValidationNotes = reason,
|
||||||
Status = status,
|
Status = status,
|
||||||
|
Lat = item.Lat, Lng = item.Lng, // approx. map coords (Divar) → facility on publish
|
||||||
};
|
};
|
||||||
_db.RawListings.Add(raw);
|
_db.RawListings.Add(raw);
|
||||||
|
|
||||||
@@ -146,8 +161,15 @@ public class IngestionService
|
|||||||
var aiNote = Join($"AI: {ai.Decision} ({ai.Confidence}٪)" + (ai.Reason is null ? "" : $" — {ai.Reason}"), notes);
|
var aiNote = Join($"AI: {ai.Decision} ({ai.Confidence}٪)" + (ai.Reason is null ? "" : $" — {ai.Reason}"), notes);
|
||||||
if (ai.Reject) return (RawListingStatus.Discarded, aiNote, ai.Confidence);
|
if (ai.Reject) return (RawListingStatus.Discarded, aiNote, ai.Confidence);
|
||||||
if (ai.Approve)
|
if (ai.Approve)
|
||||||
|
{
|
||||||
|
// MEDICAL GATE: the rule-validator's medical signal vetoes an AI approval. The AI can
|
||||||
|
// hallucinate (e.g. approved a GeekVape product ad 95% as a «پرستار» job) — when our
|
||||||
|
// own keyword/role check sees nothing clinical, never auto-publish; send to review.
|
||||||
|
if (!val.LooksMedical)
|
||||||
|
return (RawListingStatus.Flagged, Join("هوش مصنوعی تأیید کرد ولی نشانهٔ کادر درمان یافت نشد — بررسی دستی", aiNote), ai.Confidence);
|
||||||
return (s.Mode == IngestionMode.Automatic && s.AiAutoApprove
|
return (s.Mode == IngestionMode.Automatic && s.AiAutoApprove
|
||||||
? RawListingStatus.Normalized : RawListingStatus.New, aiNote, ai.Confidence);
|
? RawListingStatus.Normalized : RawListingStatus.New, aiNote, ai.Confidence);
|
||||||
|
}
|
||||||
return (RawListingStatus.Flagged, aiNote, ai.Confidence); // review
|
return (RawListingStatus.Flagged, aiNote, ai.Confidence); // review
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -218,10 +240,15 @@ public class IngestionService
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Never surface the crawl source (e.g. «مدجابز») in a public facility name.
|
// Never surface the crawl source (e.g. «مدجابز») in a public facility name. An unnamed ad
|
||||||
|
// falls back to ONE shared placeholder (same string as the manual-review flow, so both
|
||||||
|
// pipelines reuse a single record). That placeholder is shared by every unnamed ad in a
|
||||||
|
// city, so it must NEVER receive a single ad's fuzzy coords — that would mis-place dozens of
|
||||||
|
// unrelated listings on the map and in «near me». Mirrors Review.ResolveFacilityIdAsync.
|
||||||
|
bool unnamed = string.IsNullOrWhiteSpace(d?.FacilityName) && string.IsNullOrWhiteSpace(parsed.FacilityName);
|
||||||
var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
|
var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
|
||||||
: !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
|
: !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
|
||||||
: "مرکز درمانی (نامشخص)";
|
: UnknownFacilityName;
|
||||||
// Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
|
// Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
|
||||||
var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
|
var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
|
||||||
if (facility is null)
|
if (facility is null)
|
||||||
@@ -230,10 +257,17 @@ public class IngestionService
|
|||||||
{
|
{
|
||||||
Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
|
Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
|
||||||
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false,
|
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false,
|
||||||
|
Lat = unnamed ? null : raw.Lat, Lng = unnamed ? null : raw.Lng, // approx. Divar map center
|
||||||
};
|
};
|
||||||
_db.Facilities.Add(facility);
|
_db.Facilities.Add(facility);
|
||||||
facilities.Add(facility); // so later listings in this run match it too
|
facilities.Add(facility); // so later listings in this run match it too
|
||||||
}
|
}
|
||||||
|
else if (!unnamed && facility.Lat is null && facility.Lng is null && raw.Lat is not null)
|
||||||
|
{
|
||||||
|
// Backfill coords only when the matched (real, named) facility has none — never overwrite a
|
||||||
|
// real (employer-set or verified) location with Divar's fuzzy point.
|
||||||
|
facility.Lat = raw.Lat; facility.Lng = raw.Lng;
|
||||||
|
}
|
||||||
|
|
||||||
if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
|
if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
|
||||||
{
|
{
|
||||||
@@ -278,24 +312,33 @@ public class IngestionService
|
|||||||
return string.Join(" ", tags.Where(t => !string.IsNullOrWhiteSpace(t)).Distinct());
|
return string.Join(" ", tags.Where(t => !string.IsNullOrWhiteSpace(t)).Distinct());
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>Find an existing role by Persian-normalized name; if none, create a new Role (dynamic
|
/// <summary>Resolve a role name to an existing Role; if it's genuinely new, create it (dynamic
|
||||||
/// taxonomy) using the AI's suggested category — reusing an existing category when one normalizes
|
/// taxonomy). Matching is layered so a differently-worded-but-same-meaning role maps to the
|
||||||
/// to the same text — and add it to the in-run list so later items reuse it instead of duplicating.</summary>
|
/// canonical one instead of forking: (1) exact normalized name, (2) synonym/abbreviation alias
|
||||||
|
/// → canonical (دکتر→پزشک عمومی، نرس→پرستار…), (3) create. Only TRUE synonyms collapse — real
|
||||||
|
/// sub-specialties («پرستار ICU») stay distinct on purpose.</summary>
|
||||||
private Role ResolveOrCreateRole(List<Role> roles, string name, string? category)
|
private Role ResolveOrCreateRole(List<Role> roles, string name, string? category)
|
||||||
{
|
{
|
||||||
var norm = NormalizeFa(name);
|
var norm = NormalizeFa(name);
|
||||||
|
|
||||||
|
// (1) Already a known role (same word or spelling variant).
|
||||||
var match = roles.FirstOrDefault(r => NormalizeFa(r.Name) == norm);
|
var match = roles.FirstOrDefault(r => NormalizeFa(r.Name) == norm);
|
||||||
if (match is not null) return match;
|
if (match is not null) return match;
|
||||||
|
|
||||||
var wantCat = string.IsNullOrWhiteSpace(category) ? "سایر" : category!.Trim();
|
// (2) A synonym of a canonical role → use that role; don't create a duplicate.
|
||||||
// Collapse onto an existing category that normalizes the same, so «تکنسین» != «تکنسين» doesn't fork.
|
if (RoleAliases.TryGetValue(norm, out var canonical))
|
||||||
var existingCat = roles.Select(r => r.Category)
|
{
|
||||||
.FirstOrDefault(c => !string.IsNullOrWhiteSpace(c) && NormalizeFa(c) == NormalizeFa(wantCat));
|
var canonNorm = NormalizeFa(canonical);
|
||||||
|
var aliased = roles.FirstOrDefault(r => NormalizeFa(r.Name) == canonNorm);
|
||||||
|
if (aliased is not null) return aliased;
|
||||||
|
name = canonical; norm = canonNorm; // canonical not seeded yet → create under its proper name
|
||||||
|
}
|
||||||
|
|
||||||
|
// (3) Genuinely new role — create it under a canonical-resolved category.
|
||||||
var created = new Role
|
var created = new Role
|
||||||
{
|
{
|
||||||
Name = Clamp(name.Trim(), 100), // respect Role.Name MaxLength(100)
|
Name = Clamp(name.Trim(), 100), // respect Role.Name MaxLength(100)
|
||||||
Category = Clamp(existingCat ?? wantCat, 50), // respect Role.Category MaxLength(50)
|
Category = Clamp(ResolveCategory(roles, category), 50), // respect Role.Category MaxLength(50)
|
||||||
IsActive = true,
|
IsActive = true,
|
||||||
SortOrder = (roles.Count == 0 ? 0 : roles.Max(r => r.SortOrder)) + 1,
|
SortOrder = (roles.Count == 0 ? 0 : roles.Max(r => r.SortOrder)) + 1,
|
||||||
};
|
};
|
||||||
@@ -306,6 +349,58 @@ public class IngestionService
|
|||||||
return created;
|
return created;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Turns an AI-suggested category into the category string that should be stored on a role.
/// Resolution order: (1) blank input becomes the default "سایر" ("other"); (2) a synonym listed in
/// <c>CategoryAliases</c> is swapped for its canonical category; (3) if any role in
/// <paramref name="roles"/> already carries a category that normalizes to the same text, that
/// stored spelling is returned so no second variant of the same group is created; (4) otherwise
/// the resolved text is returned unchanged.
/// </summary>
/// <param name="roles">Roles known so far; their <c>Category</c> values are the reuse candidates.</param>
/// <param name="category">Raw category suggestion; may be null or whitespace.</param>
/// <returns>An already-stored category spelling when one matches, else the alias-resolved (or trimmed raw) text.</returns>
private static string ResolveCategory(List<Role> roles, string? category)
{
    string requested;
    if (string.IsNullOrWhiteSpace(category)) requested = "سایر";
    else requested = category!.Trim();

    // Step 1: synonym → canonical. A miss leaves the trimmed input as the matching target.
    if (!CategoryAliases.TryGetValue(NormalizeFa(requested), out var resolved))
        resolved = requested;

    // Step 2: always prefer a spelling that is already stored on some role, even after an alias
    // hit, so the same group never exists under two slightly different strings.
    var wanted = NormalizeFa(resolved);
    foreach (var role in roles)
    {
        var stored = role.Category;
        if (!string.IsNullOrWhiteSpace(stored) && NormalizeFa(stored) == wanted) return stored;
    }

    return resolved;
}
|
||||||
|
|
||||||
|
// Alias table for ROLE names. Each entry is a canonical role name followed by the synonyms and
// abbreviations that should resolve to it, so a differently-worded name lands on the existing
// role instead of creating a new one. BuildAliasMap flattens this into a lookup keyed by the
// NormalizeFa form of every term. New synonyms can be appended freely.
private static readonly Dictionary<string, string> RoleAliases = BuildAliasMap(new Dictionary<string, string[]>
{
    { "پزشک عمومی", new[] { "دکتر", "طبیب", "پزشک", "جی پی", "gp", "general practitioner" } },
    { "پزشک متخصص", new[] { "متخصص", "فوق تخصص", "اسپشالیست", "specialist" } },
    { "پرستار", new[] { "نرس", "nurse", "پرستاری", "کارشناس پرستاری" } },
    { "پرستار سالمندان", new[] { "مراقب سالمند", "مراقب سالمندان", "پرستار سالمند", "نگهدار سالمند", "مراقبت سالمند" } },
    { "ماما", new[] { "مامایی", "کارشناس مامایی", "midwife" } },
    { "تکنسین اتاق عمل", new[] { "اتاق عمل", "اسکراب", "scrub", "تکنولوژیست اتاق عمل" } },
    { "تکنسین فوریتهای پزشکی", new[] { "فوریت پزشکی", "تکنسین اورژانس", "پارامدیک", "paramedic", "emt", "اورژانس ۱۱۵" } },
    { "کارشناس آزمایشگاه", new[] { "علوم آزمایشگاهی", "تکنسین آزمایشگاه", "آزمایشگاهی", "لابراتوار", "lab", "laboratory" } },
    { "دندانپزشک", new[] { "دندان پزشک", "دندون پزشک", "dentist" } },
});
|
||||||
|
|
||||||
|
// Alias table for CATEGORY names (the role group used for filters/chips). Each entry is a
// canonical category followed by the synonyms that should collapse onto it. BuildAliasMap
// flattens this into a lookup keyed by the NormalizeFa form of every term.
private static readonly Dictionary<string, string> CategoryAliases = BuildAliasMap(new Dictionary<string, string[]>
{
    { "پزشک", new[] { "دکتر", "طبیب", "doctor", "پزشکی" } },
    { "پرستار", new[] { "پرستاری", "nurse", "nursing" } },
    { "ماما", new[] { "مامایی", "midwifery" } },
    { "تکنسین", new[] { "تکنیسین", "تکنولوژیست", "technician", "کاردان فنی" } },
    { "دندانپزشک", new[] { "دندان پزشک", "دندانپزشکی", "dental" } },
});
|
||||||
|
|
||||||
|
/// <summary>
/// Inverts a {canonical → synonyms} table into a flat {normalized term → canonical} lookup.
/// Every canonical name is also registered under its own normalized form, so looking up the
/// canonical itself succeeds too.
/// </summary>
/// <param name="src">Canonical names mapped to the synonyms that should resolve to them.</param>
/// <returns>A dictionary keyed by the <c>NormalizeFa</c> form of each canonical and synonym.</returns>
private static Dictionary<string, string> BuildAliasMap(Dictionary<string, string[]> src)
{
    var lookup = new Dictionary<string, string>();
    foreach (var entry in src)
    {
        var canonicalName = entry.Key;
        // Canonical first, then its synonyms. The indexer assignment means a later write wins
        // if two terms normalize to the same key.
        foreach (var term in entry.Value.Prepend(canonicalName))
            lookup[NormalizeFa(term)] = canonicalName;
    }
    return lookup;
}
|
||||||
|
|
||||||
/// <summary>Normalize a Persian string for dedupe: unify Arabic/Persian ي→ی and ك→ک, drop ZWNJ,
|
/// <summary>Normalize a Persian string for dedupe: unify Arabic/Persian ي→ی and ك→ک, drop ZWNJ,
|
||||||
/// collapse whitespace, trim, lowercase (so Latin tags like "ICU"/"icu" also match).</summary>
|
/// collapse whitespace, trim, lowercase (so Latin tags like "ICU"/"icu" also match).</summary>
|
||||||
private static string NormalizeFa(string? s) => Regex.Replace(
|
private static string NormalizeFa(string? s) => Regex.Replace(
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ using JobsMedical.Web.Models;
|
|||||||
|
|
||||||
namespace JobsMedical.Web.Services.Scraping;
|
namespace JobsMedical.Web.Services.Scraping;
|
||||||
|
|
||||||
public record ValidationResult(bool IsValid, bool IsSpam, int Confidence, List<string> Issues);
|
/// <summary>Outcome of validating one parsed listing.</summary>
/// <param name="IsValid">True when the listing is not spam, looks medical, and its score is at least 50.</param>
/// <param name="IsSpam">True when the listing was judged spam; promotional/training ads are returned with this set so they are auto-discarded.</param>
/// <param name="Confidence">The completeness score the validator computed (0 for promotional ads; clamped to 0–100 on the talent path).</param>
/// <param name="Issues">Human-readable problems found while scoring.</param>
/// <param name="LooksMedical">The validator's "looks medical" flag, exposed to callers; defaults to false when not supplied.</param>
public record ValidationResult(bool IsValid, bool IsSpam, int Confidence, List<string> Issues, bool LooksMedical = false);
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Scores a parsed listing for completeness and screens out spam. A listing must look like a
|
/// Scores a parsed listing for completeness and screens out spam. A listing must look like a
|
||||||
@@ -64,7 +64,7 @@ public class ListingValidator
|
|||||||
if (isPromo)
|
if (isPromo)
|
||||||
{
|
{
|
||||||
issues.Add("آگهی تبلیغاتی/آموزشی است، نه استخدام/شیفت");
|
issues.Add("آگهی تبلیغاتی/آموزشی است، نه استخدام/شیفت");
|
||||||
return new ValidationResult(false, true, 0, issues); // IsSpam → auto-discard
|
return new ValidationResult(false, true, 0, issues, looksMedical); // IsSpam → auto-discard
|
||||||
}
|
}
|
||||||
|
|
||||||
// «آماده به کار»: a worker offering themselves. No facility/shift-date expected; the role
|
// «آماده به کار»: a worker offering themselves. No facility/shift-date expected; the role
|
||||||
@@ -84,7 +84,7 @@ public class ListingValidator
|
|||||||
if (tlen < 20) { ts -= 20; issues.Add("متن خیلی کوتاه است"); }
|
if (tlen < 20) { ts -= 20; issues.Add("متن خیلی کوتاه است"); }
|
||||||
ts = Math.Clamp(ts, 0, 100);
|
ts = Math.Clamp(ts, 0, 100);
|
||||||
bool tValid = !isSpam && looksMedical && ts >= 50; // role(40)+medical(10) passes w/o phone
|
bool tValid = !isSpam && looksMedical && ts >= 50; // role(40)+medical(10) passes w/o phone
|
||||||
return new ValidationResult(tValid, isSpam, ts, issues);
|
return new ValidationResult(tValid, isSpam, ts, issues, looksMedical);
|
||||||
}
|
}
|
||||||
|
|
||||||
int score = 0;
|
int score = 0;
|
||||||
@@ -107,6 +107,6 @@ public class ListingValidator
|
|||||||
|
|
||||||
// Valid enough for the queue if it's medical, not spam, and reasonably complete.
|
// Valid enough for the queue if it's medical, not spam, and reasonably complete.
|
||||||
bool isValid = !isSpam && looksMedical && score >= 50;
|
bool isValid = !isSpam && looksMedical && score >= 50;
|
||||||
return new ValidationResult(isValid, isSpam, score, issues);
|
return new ValidationResult(isValid, isSpam, score, issues, looksMedical);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user