Divar geo-coords to facility map + medical gate + RawListing FK/geo migrations
This commit is contained in:
@@ -171,5 +171,14 @@ public class AppDbContext : DbContext, IDataProtectionKeyContext
|
||||
// Dedupe ingested listings by content hash.
|
||||
b.Entity<RawListing>().HasIndex(r => r.ContentHash);
|
||||
b.Entity<RawListing>().HasIndex(r => r.Status);
|
||||
// A RawListing only LINKS to the post it produced — it must outlive that post (it's the
|
||||
// dedupe cache). So deleting a Shift/Talent NULLs the back-reference rather than orphaning a
|
||||
// dangling FK or blocking the delete. LinkedTalentId previously had no FK at all (orphan risk).
|
||||
b.Entity<RawListing>()
|
||||
.HasOne(r => r.LinkedShift).WithMany()
|
||||
.HasForeignKey(r => r.LinkedShiftId).OnDelete(DeleteBehavior.SetNull);
|
||||
b.Entity<RawListing>()
|
||||
.HasOne(r => r.LinkedTalent).WithMany()
|
||||
.HasForeignKey(r => r.LinkedTalentId).OnDelete(DeleteBehavior.SetNull);
|
||||
}
|
||||
}
|
||||
|
||||
+1581
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,69 @@
|
||||
using Microsoft.EntityFrameworkCore.Migrations;
|
||||
|
||||
#nullable disable
|
||||
|
||||
namespace JobsMedical.Web.Migrations
|
||||
{
|
||||
/// <summary>
/// Schema migration for the RawListings link columns: the existing LinkedShiftId foreign key is
/// recreated with ON DELETE SET NULL, and LinkedTalentId gets an index plus a foreign key to
/// TalentListings (also ON DELETE SET NULL).
/// </summary>
public partial class RawListingLinkFks : Migration
{
    /// <summary>Applies the migration (drop old FK → null orphans → index → add both FKs).</summary>
    protected override void Up(MigrationBuilder migrationBuilder)
    {
        const string table = "RawListings";
        const string shiftFk = "FK_RawListings_Shifts_LinkedShiftId";
        const string talentFk = "FK_RawListings_TalentListings_LinkedTalentId";

        // The shift FK is dropped here and re-added below with SET NULL delete behavior.
        migrationBuilder.DropForeignKey(name: shiftFk, table: table);

        // LinkedTalentId had no constraint until now, so rows may reference talent listings that
        // no longer exist. Clear those references first; otherwise adding the FK below would fail
        // on a database that already holds such rows.
        migrationBuilder.Sql(
            "UPDATE \"RawListings\" r SET \"LinkedTalentId\" = NULL " +
            "WHERE r.\"LinkedTalentId\" IS NOT NULL " +
            "AND NOT EXISTS (SELECT 1 FROM \"TalentListings\" t WHERE t.\"Id\" = r.\"LinkedTalentId\");");

        migrationBuilder.CreateIndex(
            name: "IX_RawListings_LinkedTalentId",
            table: table,
            column: "LinkedTalentId");

        migrationBuilder.AddForeignKey(
            name: shiftFk,
            table: table,
            column: "LinkedShiftId",
            principalTable: "Shifts",
            principalColumn: "Id",
            onDelete: ReferentialAction.SetNull);

        migrationBuilder.AddForeignKey(
            name: talentFk,
            table: table,
            column: "LinkedTalentId",
            principalTable: "TalentListings",
            principalColumn: "Id",
            onDelete: ReferentialAction.SetNull);
    }

    /// <summary>
    /// Reverts the migration: removes both foreign keys and the LinkedTalentId index, then
    /// re-adds the shift foreign key without an explicit delete behavior.
    /// </summary>
    protected override void Down(MigrationBuilder migrationBuilder)
    {
        const string table = "RawListings";
        const string shiftFk = "FK_RawListings_Shifts_LinkedShiftId";
        const string talentFk = "FK_RawListings_TalentListings_LinkedTalentId";

        migrationBuilder.DropForeignKey(name: shiftFk, table: table);

        migrationBuilder.DropForeignKey(name: talentFk, table: table);

        migrationBuilder.DropIndex(name: "IX_RawListings_LinkedTalentId", table: table);

        migrationBuilder.AddForeignKey(
            name: shiftFk,
            table: table,
            column: "LinkedShiftId",
            principalTable: "Shifts",
            principalColumn: "Id");
    }
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,38 @@
|
||||
using Microsoft.EntityFrameworkCore.Migrations;
|
||||
|
||||
#nullable disable
|
||||
|
||||
namespace JobsMedical.Web.Migrations
|
||||
{
|
||||
/// <summary>
/// Schema migration that adds two nullable <c>double precision</c> columns, Lat and Lng, to the
/// RawListings table.
/// </summary>
public partial class RawListingGeo : Migration
{
    /// <summary>Adds the Lat column, then the Lng column.</summary>
    protected override void Up(MigrationBuilder migrationBuilder)
    {
        foreach (var column in new[] { "Lat", "Lng" })
        {
            migrationBuilder.AddColumn<double>(
                name: column,
                table: "RawListings",
                type: "double precision",
                nullable: true);
        }
    }

    /// <summary>Drops the Lat column, then the Lng column.</summary>
    protected override void Down(MigrationBuilder migrationBuilder)
    {
        foreach (var column in new[] { "Lat", "Lng" })
        {
            migrationBuilder.DropColumn(
                name: column,
                table: "RawListings");
        }
    }
}
|
||||
}
|
||||
@@ -748,12 +748,18 @@ namespace JobsMedical.Web.Migrations
|
||||
b.Property<DateTime>("FetchedAt")
|
||||
.HasColumnType("timestamp with time zone");
|
||||
|
||||
b.Property<double?>("Lat")
|
||||
.HasColumnType("double precision");
|
||||
|
||||
b.Property<int?>("LinkedShiftId")
|
||||
.HasColumnType("integer");
|
||||
|
||||
b.Property<int?>("LinkedTalentId")
|
||||
.HasColumnType("integer");
|
||||
|
||||
b.Property<double?>("Lng")
|
||||
.HasColumnType("double precision");
|
||||
|
||||
b.Property<string>("ParsedJson")
|
||||
.HasColumnType("text");
|
||||
|
||||
@@ -783,6 +789,8 @@ namespace JobsMedical.Web.Migrations
|
||||
|
||||
b.HasIndex("LinkedShiftId");
|
||||
|
||||
b.HasIndex("LinkedTalentId");
|
||||
|
||||
b.HasIndex("Status");
|
||||
|
||||
b.ToTable("RawListings");
|
||||
@@ -1415,9 +1423,17 @@ namespace JobsMedical.Web.Migrations
|
||||
{
|
||||
b.HasOne("JobsMedical.Web.Models.Shift", "LinkedShift")
|
||||
.WithMany()
|
||||
.HasForeignKey("LinkedShiftId");
|
||||
.HasForeignKey("LinkedShiftId")
|
||||
.OnDelete(DeleteBehavior.SetNull);
|
||||
|
||||
b.HasOne("JobsMedical.Web.Models.TalentListing", "LinkedTalent")
|
||||
.WithMany()
|
||||
.HasForeignKey("LinkedTalentId")
|
||||
.OnDelete(DeleteBehavior.SetNull);
|
||||
|
||||
b.Navigation("LinkedShift");
|
||||
|
||||
b.Navigation("LinkedTalent");
|
||||
});
|
||||
|
||||
modelBuilder.Entity("JobsMedical.Web.Models.Review", b =>
|
||||
|
||||
@@ -25,10 +25,16 @@ public class RawListing
|
||||
public Shift? LinkedShift { get; set; }
|
||||
|
||||
public int? LinkedTalentId { get; set; } // آگهی «آماده به کار» ساختهشده از این متن
|
||||
public TalentListing? LinkedTalent { get; set; }
|
||||
|
||||
[MaxLength(500)]
|
||||
public string? SourceUrl { get; set; }
|
||||
|
||||
/// <summary>Approximate coordinates harvested from the source (e.g. Divar's fuzzed map center).
|
||||
/// Carried through the review queue so a manual publish can still place the facility on the map.</summary>
|
||||
public double? Lat { get; set; }
|
||||
public double? Lng { get; set; }
|
||||
|
||||
/// <summary>SHA-256 of the normalized text — used to dedupe across ingestion runs.</summary>
|
||||
[MaxLength(64)]
|
||||
public string? ContentHash { get; set; }
|
||||
|
||||
@@ -40,6 +40,14 @@
|
||||
<p class="muted" style="font-size:11px; margin:8px 0 0;">
|
||||
موتور: واکشی ← حذف تکراری ← تجزیه ← اعتبارسنجی ← صف بررسی.
|
||||
</p>
|
||||
<form method="post" onsubmit="return confirm('⚠ همهی آیتمهای جمعآوریشده (کش) و همهی آگهیهای منتشرشده از جمعآوری حذف میشوند (آگهیهای ثبتشده توسط مراکز دستنخورده میمانند)، سپس همهچیز با هوش مصنوعی دوباره جمعآوری و افزوده میشود. این کار بازگشتناپذیر است. ادامه میدهی؟');">
|
||||
<button type="submit" asp-page-handler="PurgeAndReingest" class="btn btn-outline btn-block" style="margin-top:8px; color:var(--danger); border-color:var(--danger);">
|
||||
🔄 پاکسازی کش و جمعآوری مجدد با هوش مصنوعی
|
||||
</button>
|
||||
</form>
|
||||
<p class="muted" style="font-size:11px; margin:6px 0 0;">
|
||||
کش حذف تکراری و آگهیهای جمعآوریشده پاک و از نو با AI پردازش میشوند. (آگهیهای مراکز حذف نمیشوند.)
|
||||
</p>
|
||||
|
||||
<hr style="border:none; border-top:1px solid var(--line); margin:16px 0;" />
|
||||
|
||||
|
||||
@@ -65,6 +65,35 @@ public class IndexModel : PageModel
|
||||
return RedirectToPage();
|
||||
}
|
||||
|
||||
/// <summary>
/// DESTRUCTIVE rebuild of everything produced by ingestion, followed by one fresh ingest run.
/// Inside a single transaction it deletes, in this order:
///   1. every RawListing — the crawl/staging rows whose ContentHash serves as the dedupe cache —
///      so ads that were already seen can be fetched and judged again;
///   2. Shifts, JobOpenings and TalentListings whose Source is Aggregated. Rows with any other
///      Source value are not touched.
/// RawListings go first so no raw row still references a post that is about to be deleted.
/// After the commit the ingestion pipeline runs once, and the purge and re-ingest counts are
/// written to <c>IngestMessage</c>.
/// NOTE(review): child rows of the deleted posts are presumably removed by database cascade
/// rules — those are not visible in this method; confirm against the model configuration.
/// </summary>
/// <returns>A redirect back to the current page.</returns>
public async Task<IActionResult> OnPostPurgeAndReingestAsync()
{
    var (rawCount, shifts, jobs, talent) = await PurgeAsync();

    // Fresh fetch through the ingestion pipeline, run only after the purge has been committed.
    var s = await _ingest.RunAsync();

    var purgedPart = $"پاکسازی شد (حذف: {rawCount} آیتم کش، {shifts} شیفت، {jobs} استخدام، {talent} آمادهبهکارِ جمعآوریشده). ";
    var reingestedPart = $"جمعآوری مجدد: {s.TotalPublished} منتشر، {s.TotalQueued} در صف، {s.TotalFlagged} پرچم، {s.TotalSpam} اسپم، {s.TotalDuplicates} تکراری.";
    IngestMessage = purgedPart + reingestedPart;

    return RedirectToPage();

    // Runs the four bulk deletes in one transaction and reports how many rows each removed.
    async Task<(int Raw, int Shifts, int Jobs, int Talent)> PurgeAsync()
    {
        await using var tx = await _db.Database.BeginTransactionAsync();

        var deletedRaw = await _db.RawListings.ExecuteDeleteAsync(); // clear dedupe cache
        var deletedShifts = await _db.Shifts
            .Where(shift => shift.Source == ShiftSource.Aggregated)
            .ExecuteDeleteAsync();
        var deletedJobs = await _db.JobOpenings
            .Where(job => job.Source == ShiftSource.Aggregated)
            .ExecuteDeleteAsync();
        var deletedTalent = await _db.TalentListings
            .Where(listing => listing.Source == ShiftSource.Aggregated)
            .ExecuteDeleteAsync();

        await tx.CommitAsync();
        return (deletedRaw, deletedShifts, deletedJobs, deletedTalent);
    }
}
|
||||
|
||||
private async Task LoadAsync()
|
||||
{
|
||||
Queue = await _db.RawListings
|
||||
|
||||
@@ -282,13 +282,26 @@ public class ReviewModel : PageModel
|
||||
if (cityId is null) return null; // no cities seeded — cannot create a facility
|
||||
|
||||
// No facility named in the ad → use/create the shared placeholder.
|
||||
var name = string.IsNullOrWhiteSpace(NewFacilityName) ? UnknownFacilityName : NewFacilityName.Trim();
|
||||
var isPlaceholder = string.IsNullOrWhiteSpace(NewFacilityName);
|
||||
var name = isPlaceholder ? UnknownFacilityName : NewFacilityName.Trim();
|
||||
|
||||
// Approximate coords carried from the crawl (e.g. Divar). NEVER apply them to the shared
|
||||
// «نامشخص» placeholder — it's reused across many ads, so a single ad's point would mislead.
|
||||
bool HasGeo() => !isPlaceholder && Raw?.Lat is not null;
|
||||
|
||||
// Reuse an existing facility that's exactly or closely the same (Persian-aware fuzzy
|
||||
// match), so we don't create duplicates like «بیمارستان میلاد» vs «میلاد».
|
||||
var all = await _db.Facilities.ToListAsync();
|
||||
var match = FacilityMatcher.FindBest(all, name, cityId);
|
||||
if (match is not null) return match.Id;
|
||||
if (match is not null)
|
||||
{
|
||||
if (HasGeo() && match.Lat is null && match.Lng is null) // backfill only, never overwrite
|
||||
{
|
||||
match.Lat = Raw!.Lat; match.Lng = Raw.Lng;
|
||||
await _db.SaveChangesAsync();
|
||||
}
|
||||
return match.Id;
|
||||
}
|
||||
|
||||
var facility = new Facility
|
||||
{
|
||||
@@ -297,6 +310,8 @@ public class ReviewModel : PageModel
|
||||
Type = FacilityType.Hospital,
|
||||
Verification = VerificationStatus.Unverified,
|
||||
IsVerified = false,
|
||||
Lat = HasGeo() ? Raw!.Lat : null,
|
||||
Lng = HasGeo() ? Raw!.Lng : null,
|
||||
};
|
||||
_db.Facilities.Add(facility);
|
||||
await _db.SaveChangesAsync();
|
||||
|
||||
@@ -59,17 +59,25 @@ public class DivarListingSource : IListingSource
|
||||
continue;
|
||||
}
|
||||
using var doc = JsonDocument.Parse(body);
|
||||
var cityLabel = CityLabel(s.DivarCity); // every result is from the city we searched
|
||||
foreach (var (text, token) in Harvest(doc.RootElement).Take(25))
|
||||
{
|
||||
var url = token is not null ? $"https://divar.ir/v/{token}" : "https://divar.ir";
|
||||
var withPhone = text;
|
||||
var itemText = text;
|
||||
// Stamp the city so the parser/AI always resolve a location (Divar's own location
|
||||
// line isn't always in the search row; the searched city is authoritative).
|
||||
if (!string.IsNullOrWhiteSpace(cityLabel) && !text.Contains(cityLabel))
|
||||
itemText += $"\n📍 {cityLabel}";
|
||||
double? lat = null, lng = null;
|
||||
if (token is not null)
|
||||
{
|
||||
var phones = await RevealPhonesAsync(client, token, s, ct);
|
||||
if (phones.Count > 0 && !phones.Any(text.Contains))
|
||||
withPhone = text + "\nشماره تماس: " + string.Join("، ", phones);
|
||||
// One detail fetch yields BOTH the phone and the map coordinates.
|
||||
var (phones, gLat, gLng) = await FetchDetailAsync(client, token, ct);
|
||||
if (phones.Count > 0 && !phones.Any(itemText.Contains))
|
||||
itemText += "\nشماره تماس: " + string.Join("، ", phones);
|
||||
lat = gLat; lng = gLng;
|
||||
}
|
||||
items.Add(new ScrapedItem("دیوار", withPhone, url));
|
||||
items.Add(new ScrapedItem("دیوار", itemText, url, lat, lng));
|
||||
}
|
||||
}
|
||||
catch (Exception ex) { _log.LogWarning(ex, "Divar fetch failed for query {Query}", q); }
|
||||
@@ -95,16 +103,31 @@ public class DivarListingSource : IListingSource
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
/// Maps the searched Divar city — given as a number, a Latin slug, or a Persian name — to the
/// Persian display name used to stamp each result with its location. Matching ignores
/// surrounding whitespace and letter case. A value that is not in the table is returned
/// trimmed but otherwise unchanged; <c>null</c> yields the empty string.
/// </summary>
private static string CityLabel(string? city)
{
    var trimmed = (city ?? "").Trim();

    switch (trimmed.ToLowerInvariant())
    {
        case "1":
        case "tehran":
        case "تهران":
            return "تهران";

        case "3":
        case "isfahan":
        case "esfahan":
        case "اصفهان":
            return "اصفهان";

        case "4":
        case "mashhad":
        case "مشهد":
            return "مشهد";

        case "5":
        case "shiraz":
        case "شیراز":
            return "شیراز";

        case "6":
        case "tabriz":
        case "تبریز":
            return "تبریز";

        case "1745":
        case "karaj":
        case "کرج":
            return "کرج";

        default:
            return trimmed;
    }
}
|
||||
|
||||
// The post detail endpoint returns the FULL description — many Divar job ads write the phone
|
||||
// straight into the body, so we can harvest it without Divar's (login-gated) contact reveal.
|
||||
private const string PostDetailUrl = "https://api.divar.ir/v8/posts-v2/web/";
|
||||
|
||||
/// <summary>
|
||||
/// Fetch a post's detail JSON and harvest any contact number it contains (mostly numbers the
|
||||
/// poster wrote into the description). Divar's true "نمایش شماره" reveal is auth-gated; this
|
||||
/// covers the common case where the number is in the ad text. Fails soft.
|
||||
/// Fetch a post's detail JSON ONCE and harvest both (a) any contact number it contains (mostly
|
||||
/// numbers the poster wrote into the description; Divar's true "نمایش شماره" reveal is auth-gated)
|
||||
/// and (b) the post's APPROXIMATE map coordinates (the privacy-fuzzed center Divar shows as a
|
||||
/// circle). Fails soft — returns whatever it could extract.
|
||||
/// </summary>
|
||||
private async Task<List<string>> RevealPhonesAsync(HttpClient client, string token, AppSetting s, CancellationToken ct)
|
||||
private async Task<(List<string> phones, double? lat, double? lng)> FetchDetailAsync(
|
||||
HttpClient client, string token, CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
@@ -112,18 +135,68 @@ public class DivarListingSource : IListingSource
|
||||
req.Headers.TryAddWithoutValidation("User-Agent", Ua);
|
||||
req.Headers.TryAddWithoutValidation("Accept", "application/json");
|
||||
using var resp = await client.SendAsync(req, ct);
|
||||
if (!resp.IsSuccessStatusCode) return new();
|
||||
if (!resp.IsSuccessStatusCode) return (new(), null, null);
|
||||
var body = await resp.Content.ReadAsStringAsync(ct);
|
||||
if (body.Contains("BLOCKING_VIEW")) return new();
|
||||
return HtmlUtil.HarvestPhones(body);
|
||||
if (body.Contains("BLOCKING_VIEW")) return (new(), null, null);
|
||||
var phones = HtmlUtil.HarvestPhones(body);
|
||||
double? lat = null, lng = null;
|
||||
try { using var doc = JsonDocument.Parse(body); if (FindLatLng(doc.RootElement) is { } g) { lat = g.lat; lng = g.lng; } }
|
||||
catch (JsonException) { /* detail wasn't JSON — phones still harvested from text */ }
|
||||
return (phones, lat, lng);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token);
|
||||
return new();
|
||||
return (new(), null, null);
|
||||
}
|
||||
}
|
||||
|
||||
// Bounding box for Iran, in degrees. A harvested coordinate pair is only accepted when it lies
// inside this box, which filters out unrelated number pairs (timestamps, ids, …).
private const double MinLat = 24;
private const double MaxLat = 40;
private const double MinLng = 44;
private const double MaxLng = 64;
|
||||
|
||||
/// <summary>
/// Depth-first search of a JSON tree for an approximate coordinate pair. An object qualifies
/// when it directly holds both a latitude-named and a longitude-named property (see
/// <c>IsLatKey</c>/<c>IsLngKey</c>) with numeric values inside the
/// <c>MinLat..MaxLat</c> / <c>MinLng..MaxLng</c> box. Requiring both keys on the same object
/// keeps a latitude from being paired with an unrelated longitude elsewhere in the document.
/// Objects that do not qualify, and arrays, are searched recursively in document order.
/// </summary>
/// <param name="el">Root of the (sub)tree to search.</param>
/// <returns>The first qualifying pair, or <c>null</c> when none is found.</returns>
private static (double lat, double lng)? FindLatLng(JsonElement el)
{
    switch (el.ValueKind)
    {
        case JsonValueKind.Object:
        {
            double? latitude = null;
            double? longitude = null;

            // Only the first parseable value per axis is considered for this object.
            foreach (var prop in el.EnumerateObject())
            {
                if (latitude is null && IsLatKey(prop.Name) && TryNum(prop.Value, out var parsedLat))
                {
                    latitude = parsedLat;
                    continue;
                }

                if (longitude is null && IsLngKey(prop.Name) && TryNum(prop.Value, out var parsedLng))
                {
                    longitude = parsedLng;
                }
            }

            if (latitude.HasValue && longitude.HasValue
                && latitude.Value >= MinLat && latitude.Value <= MaxLat
                && longitude.Value >= MinLng && longitude.Value <= MaxLng)
            {
                return (latitude.Value, longitude.Value);
            }

            // No usable pair directly on this object — descend into its property values.
            foreach (var prop in el.EnumerateObject())
            {
                var nested = FindLatLng(prop.Value);
                if (nested is not null)
                {
                    return nested;
                }
            }

            return null;
        }

        case JsonValueKind.Array:
        {
            foreach (var item in el.EnumerateArray())
            {
                var nested = FindLatLng(item);
                if (nested is not null)
                {
                    return nested;
                }
            }

            return null;
        }

        default:
            return null;
    }
}
|
||||
|
||||
/// <summary>True when <paramref name="k"/> is "latitude" or "lat", ignoring case.</summary>
private static bool IsLatKey(string k)
{
    const StringComparison ignoreCase = StringComparison.OrdinalIgnoreCase;
    return k.Equals("latitude", ignoreCase) || k.Equals("lat", ignoreCase);
}
|
||||
/// <summary>
/// True when <paramref name="k"/> is "longitude", "lng", "lon" or "long", ignoring case.
/// </summary>
private static bool IsLngKey(string k)
{
    const StringComparison ignoreCase = StringComparison.OrdinalIgnoreCase;

    if (k.Equals("longitude", ignoreCase)) return true;
    if (k.Equals("lng", ignoreCase)) return true;
    if (k.Equals("lon", ignoreCase)) return true;
    return k.Equals("long", ignoreCase);
}
|
||||
|
||||
/// <summary>
/// Reads a double from a JSON value that is either a JSON number or a string holding a number
/// (for example "35.7"). Strings are parsed with the invariant culture.
/// </summary>
/// <param name="v">The JSON value to read.</param>
/// <param name="d">The parsed number on success.</param>
/// <returns><c>true</c> when a number was read; <c>false</c> for any other kind of value or an
/// unparseable string.</returns>
private static bool TryNum(JsonElement v, out double d)
{
    switch (v.ValueKind)
    {
        case JsonValueKind.Number:
            return v.TryGetDouble(out d);

        case JsonValueKind.String:
            return double.TryParse(
                v.GetString(),
                System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture,
                out d);

        default:
            d = 0;
            return false;
    }
}
|
||||
|
||||
// JSON property names that are probed, in this order, for an ad's descriptive text.
private static readonly string[] DescKeys = new[]
{
    "description",
    "middle_description_text",
    "subtitle",
    "bottom_description_text",
    "normal_text",
};
|
||||
|
||||
@@ -134,9 +207,11 @@ public class DivarListingSource : IListingSource
|
||||
if (el.TryGetProperty("title", out var t) && t.ValueKind == JsonValueKind.String)
|
||||
{
|
||||
var sb = new StringBuilder(t.GetString());
|
||||
// Append ALL present description fields — the location/time line («… در تهران، جنتآباد»)
|
||||
// is usually in bottom_description_text, so don't stop at the first match.
|
||||
foreach (var k in DescKeys)
|
||||
if (el.TryGetProperty(k, out var d) && d.ValueKind == JsonValueKind.String)
|
||||
{ sb.Append(" — ").Append(d.GetString()); break; }
|
||||
if (el.TryGetProperty(k, out var d) && d.ValueKind == JsonValueKind.String && d.GetString() is { Length: > 0 } v)
|
||||
sb.Append(" — ").Append(v);
|
||||
var text = sb.ToString().Trim();
|
||||
if (text.Length >= 15) yield return (text, FindToken(el));
|
||||
}
|
||||
|
||||
@@ -2,8 +2,11 @@ using JobsMedical.Web.Models;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>One raw post pulled from a source (a Telegram message, a Divar ad, etc.).</summary>
|
||||
public record ScrapedItem(string Source, string RawText, string? SourceUrl = null);
|
||||
/// <summary>One raw post pulled from a source (a Telegram message, a Divar ad, etc.).</summary>
/// <param name="Source">Name of the source the post was pulled from.</param>
/// <param name="RawText">Text of the post as harvested.</param>
/// <param name="SourceUrl">Link to the original post, when one is known.</param>
/// <param name="Lat">APPROXIMATE latitude, when the source exposes a location (e.g. Divar's
/// privacy-fuzzed map center); used to place an aggregated facility on the map.</param>
/// <param name="Lng">APPROXIMATE longitude; see <paramref name="Lat"/>.</param>
public record ScrapedItem(
    string Source,
    string RawText,
    string? SourceUrl = null,
    double? Lat = null,
    double? Lng = null);
|
||||
|
||||
/// <summary>
|
||||
/// A pluggable source the ingestion engine pulls from. Configuration (enabled, channels, tokens)
|
||||
|
||||
@@ -46,6 +46,10 @@ public class IngestionService
|
||||
|
||||
public IReadOnlyList<string> SourceNames => _sources.Select(s => s.Name).ToList();
|
||||
|
||||
/// <summary>Shared placeholder facility name for unnamed ads — kept identical to
|
||||
/// Review.ResolveFacilityIdAsync so the auto-publish and manual-review flows reuse ONE record.</summary>
|
||||
private const string UnknownFacilityName = "نامشخص / ثبت نشده";
|
||||
|
||||
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
|
||||
{
|
||||
var settings = await _settings.GetAsync();
|
||||
@@ -71,7 +75,17 @@ public class IngestionService
|
||||
{
|
||||
fetched++;
|
||||
var hash = Hash(item.RawText);
|
||||
if (await _db.RawListings.AnyAsync(r => r.ContentHash == hash, ct)) { dupes++; continue; }
|
||||
var existing = await _db.RawListings.FirstOrDefaultAsync(r => r.ContentHash == hash, ct);
|
||||
if (existing is not null)
|
||||
{
|
||||
// Best-effort geo retry: coords are normally captured only on first ingest, but a
|
||||
// re-fetch may now expose a map center the first fetch lacked (Divar can fail-soft to
|
||||
// null on a bad response / out-of-bbox). Backfill the cached row when this fetch has
|
||||
// coords and the row has none, so an item still sitting in the queue can be placed on
|
||||
// the map when an admin publishes it. (A full refresh is the purge-and-reingest flow.)
|
||||
if (existing.Lat is null && item.Lat is not null) { existing.Lat = item.Lat; existing.Lng = item.Lng; }
|
||||
dupes++; continue;
|
||||
}
|
||||
|
||||
var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
|
||||
var val = _validator.Validate(item.RawText, parsed);
|
||||
@@ -91,6 +105,7 @@ public class IngestionService
|
||||
Confidence = confidence,
|
||||
ValidationNotes = reason,
|
||||
Status = status,
|
||||
Lat = item.Lat, Lng = item.Lng, // approx. map coords (Divar) → facility on publish
|
||||
};
|
||||
_db.RawListings.Add(raw);
|
||||
|
||||
@@ -146,8 +161,15 @@ public class IngestionService
|
||||
var aiNote = Join($"AI: {ai.Decision} ({ai.Confidence}٪)" + (ai.Reason is null ? "" : $" — {ai.Reason}"), notes);
|
||||
if (ai.Reject) return (RawListingStatus.Discarded, aiNote, ai.Confidence);
|
||||
if (ai.Approve)
|
||||
{
|
||||
// MEDICAL GATE: the rule-validator's medical signal vetoes an AI approval. The AI can
|
||||
// hallucinate (e.g. approved a GeekVape product ad 95% as a «پرستار» job) — when our
|
||||
// own keyword/role check sees nothing clinical, never auto-publish; send to review.
|
||||
if (!val.LooksMedical)
|
||||
return (RawListingStatus.Flagged, Join("هوش مصنوعی تأیید کرد ولی نشانهٔ کادر درمان یافت نشد — بررسی دستی", aiNote), ai.Confidence);
|
||||
return (s.Mode == IngestionMode.Automatic && s.AiAutoApprove
|
||||
? RawListingStatus.Normalized : RawListingStatus.New, aiNote, ai.Confidence);
|
||||
}
|
||||
return (RawListingStatus.Flagged, aiNote, ai.Confidence); // review
|
||||
}
|
||||
|
||||
@@ -218,10 +240,15 @@ public class IngestionService
|
||||
return;
|
||||
}
|
||||
|
||||
// Never surface the crawl source (e.g. «مدجابز») in a public facility name.
|
||||
// Never surface the crawl source (e.g. «مدجابز») in a public facility name. An unnamed ad
|
||||
// falls back to ONE shared placeholder (same string as the manual-review flow, so both
|
||||
// pipelines reuse a single record). That placeholder is shared by every unnamed ad in a
|
||||
// city, so it must NEVER receive a single ad's fuzzy coords — that would mis-place dozens of
|
||||
// unrelated listings on the map and in «near me». Mirrors Review.ResolveFacilityIdAsync.
|
||||
bool unnamed = string.IsNullOrWhiteSpace(d?.FacilityName) && string.IsNullOrWhiteSpace(parsed.FacilityName);
|
||||
var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
|
||||
: !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
|
||||
: "مرکز درمانی (نامشخص)";
|
||||
: UnknownFacilityName;
|
||||
// Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
|
||||
var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
|
||||
if (facility is null)
|
||||
@@ -230,10 +257,17 @@ public class IngestionService
|
||||
{
|
||||
Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
|
||||
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false,
|
||||
Lat = unnamed ? null : raw.Lat, Lng = unnamed ? null : raw.Lng, // approx. Divar map center
|
||||
};
|
||||
_db.Facilities.Add(facility);
|
||||
facilities.Add(facility); // so later listings in this run match it too
|
||||
}
|
||||
else if (!unnamed && facility.Lat is null && facility.Lng is null && raw.Lat is not null)
|
||||
{
|
||||
// Backfill coords only when the matched (real, named) facility has none — never overwrite a
|
||||
// real (employer-set or verified) location with Divar's fuzzy point.
|
||||
facility.Lat = raw.Lat; facility.Lng = raw.Lng;
|
||||
}
|
||||
|
||||
if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
|
||||
{
|
||||
@@ -278,24 +312,33 @@ public class IngestionService
|
||||
return string.Join(" ", tags.Where(t => !string.IsNullOrWhiteSpace(t)).Distinct());
|
||||
}
|
||||
|
||||
/// <summary>Find an existing role by Persian-normalized name; if none, create a new Role (dynamic
|
||||
/// taxonomy) using the AI's suggested category — reusing an existing category when one normalizes
|
||||
/// to the same text — and add it to the in-run list so later items reuse it instead of duplicating.</summary>
|
||||
/// <summary>Resolve a role name to an existing Role; if it's genuinely new, create it (dynamic
|
||||
/// taxonomy). Matching is layered so a differently-worded-but-same-meaning role maps to the
|
||||
/// canonical one instead of forking: (1) exact normalized name, (2) synonym/abbreviation alias
|
||||
/// → canonical (دکتر→پزشک عمومی، نرس→پرستار…), (3) create. Only TRUE synonyms collapse — real
|
||||
/// sub-specialties («پرستار ICU») stay distinct on purpose.</summary>
|
||||
private Role ResolveOrCreateRole(List<Role> roles, string name, string? category)
|
||||
{
|
||||
var norm = NormalizeFa(name);
|
||||
|
||||
// (1) Already a known role (same word or spelling variant).
|
||||
var match = roles.FirstOrDefault(r => NormalizeFa(r.Name) == norm);
|
||||
if (match is not null) return match;
|
||||
|
||||
var wantCat = string.IsNullOrWhiteSpace(category) ? "سایر" : category!.Trim();
|
||||
// Collapse onto an existing category that normalizes the same, so «تکنسین» != «تکنسين» doesn't fork.
|
||||
var existingCat = roles.Select(r => r.Category)
|
||||
.FirstOrDefault(c => !string.IsNullOrWhiteSpace(c) && NormalizeFa(c) == NormalizeFa(wantCat));
|
||||
// (2) A synonym of a canonical role → use that role; don't create a duplicate.
|
||||
if (RoleAliases.TryGetValue(norm, out var canonical))
|
||||
{
|
||||
var canonNorm = NormalizeFa(canonical);
|
||||
var aliased = roles.FirstOrDefault(r => NormalizeFa(r.Name) == canonNorm);
|
||||
if (aliased is not null) return aliased;
|
||||
name = canonical; norm = canonNorm; // canonical not seeded yet → create under its proper name
|
||||
}
|
||||
|
||||
// (3) Genuinely new role — create it under a canonical-resolved category.
|
||||
var created = new Role
|
||||
{
|
||||
Name = Clamp(name.Trim(), 100), // respect Role.Name MaxLength(100)
|
||||
Category = Clamp(existingCat ?? wantCat, 50), // respect Role.Category MaxLength(50)
|
||||
Category = Clamp(ResolveCategory(roles, category), 50), // respect Role.Category MaxLength(50)
|
||||
IsActive = true,
|
||||
SortOrder = (roles.Count == 0 ? 0 : roles.Max(r => r.SortOrder)) + 1,
|
||||
};
|
||||
@@ -306,6 +349,58 @@ public class IngestionService
|
||||
return created;
|
||||
}
|
||||
|
||||
/// <summary>
/// Resolves a suggested category to the string that should be stored on a new role.
/// A blank suggestion becomes "سایر". The suggestion is first mapped through
/// <c>CategoryAliases</c> (keyed by normalized text); then, if some existing role already
/// carries a category that normalizes to the same text, that stored string is returned so a
/// second spelling of the same group is never introduced. Otherwise the alias-resolved
/// suggestion is returned as-is.
/// </summary>
/// <param name="roles">Roles known so far; their categories are the reuse candidates.</param>
/// <param name="category">The suggested category; may be null or blank.</param>
/// <returns>The category string to use.</returns>
private static string ResolveCategory(List<Role> roles, string? category)
{
    var requested = string.IsNullOrWhiteSpace(category) ? "سایر" : category!.Trim();

    // Synonym → canonical; fall back to the suggestion itself when no alias is registered.
    if (!CategoryAliases.TryGetValue(NormalizeFa(requested), out var target))
    {
        target = requested;
    }

    var wanted = NormalizeFa(target);

    // Always prefer a category string that is already stored on a role.
    foreach (var role in roles)
    {
        var stored = role.Category;
        if (!string.IsNullOrWhiteSpace(stored) && NormalizeFa(stored) == wanted)
        {
            return stored;
        }
    }

    return target;
}
|
||||
|
||||
// Synonyms/abbreviations → canonical ROLE name, so the AI naming a role differently maps onto an
|
||||
// existing role instead of forking the taxonomy. Keys are matched after NormalizeFa. Add freely.
|
||||
private static readonly Dictionary<string, string> RoleAliases = BuildAliasMap(new()
|
||||
{
|
||||
["پزشک عمومی"] = new[] { "دکتر", "طبیب", "پزشک", "جی پی", "gp", "general practitioner" },
|
||||
["پزشک متخصص"] = new[] { "متخصص", "فوق تخصص", "اسپشالیست", "specialist" },
|
||||
["پرستار"] = new[] { "نرس", "nurse", "پرستاری", "کارشناس پرستاری" },
|
||||
["پرستار سالمندان"] = new[] { "مراقب سالمند", "مراقب سالمندان", "پرستار سالمند", "نگهدار سالمند", "مراقبت سالمند" },
|
||||
["ماما"] = new[] { "مامایی", "کارشناس مامایی", "midwife" },
|
||||
["تکنسین اتاق عمل"] = new[] { "اتاق عمل", "اسکراب", "scrub", "تکنولوژیست اتاق عمل" },
|
||||
["تکنسین فوریتهای پزشکی"] = new[] { "فوریت پزشکی", "تکنسین اورژانس", "پارامدیک", "paramedic", "emt", "اورژانس ۱۱۵" },
|
||||
["کارشناس آزمایشگاه"] = new[] { "علوم آزمایشگاهی", "تکنسین آزمایشگاه", "آزمایشگاهی", "لابراتوار", "lab", "laboratory" },
|
||||
["دندانپزشک"] = new[] { "دندان پزشک", "دندون پزشک", "dentist" },
|
||||
});
|
||||
|
||||
// Synonyms → canonical CATEGORY (the role-group used for filters/chips).
|
||||
private static readonly Dictionary<string, string> CategoryAliases = BuildAliasMap(new()
|
||||
{
|
||||
["پزشک"] = new[] { "دکتر", "طبیب", "doctor", "پزشکی" },
|
||||
["پرستار"] = new[] { "پرستاری", "nurse", "nursing" },
|
||||
["ماما"] = new[] { "مامایی", "midwifery" },
|
||||
["تکنسین"] = new[] { "تکنیسین", "تکنولوژیست", "technician", "کاردان فنی" },
|
||||
["دندانپزشک"] = new[] { "دندان پزشک", "دندانپزشکی", "dental" },
|
||||
});
|
||||
|
||||
/// <summary>
/// Turns a {canonical → synonyms} table into a flat lookup keyed by normalized text. Every
/// synonym, and each canonical name itself, is normalized with <c>NormalizeFa</c> and mapped
/// to the canonical name. When two entries normalize to the same key, the one processed later
/// wins.
/// </summary>
/// <param name="src">Canonical names with their synonyms.</param>
/// <returns>A dictionary from normalized name to canonical name.</returns>
private static Dictionary<string, string> BuildAliasMap(Dictionary<string, string[]> src)
{
    var lookup = new Dictionary<string, string>();

    foreach (var entry in src)
    {
        var canonical = entry.Key;
        lookup[NormalizeFa(canonical)] = canonical;

        foreach (var synonym in entry.Value)
        {
            lookup[NormalizeFa(synonym)] = canonical;
        }
    }

    return lookup;
}
|
||||
|
||||
/// <summary>Normalize a Persian string for dedupe: unify Arabic/Persian ي→ی and ك→ک, drop ZWNJ,
|
||||
/// collapse whitespace, trim, lowercase (so Latin tags like "ICU"/"icu" also match).</summary>
|
||||
private static string NormalizeFa(string? s) => Regex.Replace(
|
||||
|
||||
@@ -3,7 +3,7 @@ using JobsMedical.Web.Models;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
public record ValidationResult(bool IsValid, bool IsSpam, int Confidence, List<string> Issues);
|
||||
public record ValidationResult(bool IsValid, bool IsSpam, int Confidence, List<string> Issues, bool LooksMedical = false);
|
||||
|
||||
/// <summary>
|
||||
/// Scores a parsed listing for completeness and screens out spam. A listing must look like a
|
||||
@@ -64,7 +64,7 @@ public class ListingValidator
|
||||
if (isPromo)
|
||||
{
|
||||
issues.Add("آگهی تبلیغاتی/آموزشی است، نه استخدام/شیفت");
|
||||
return new ValidationResult(false, true, 0, issues); // IsSpam → auto-discard
|
||||
return new ValidationResult(false, true, 0, issues, looksMedical); // IsSpam → auto-discard
|
||||
}
|
||||
|
||||
// «آماده به کار»: a worker offering themselves. No facility/shift-date expected; the role
|
||||
@@ -84,7 +84,7 @@ public class ListingValidator
|
||||
if (tlen < 20) { ts -= 20; issues.Add("متن خیلی کوتاه است"); }
|
||||
ts = Math.Clamp(ts, 0, 100);
|
||||
bool tValid = !isSpam && looksMedical && ts >= 50; // role(40)+medical(10) passes w/o phone
|
||||
return new ValidationResult(tValid, isSpam, ts, issues);
|
||||
return new ValidationResult(tValid, isSpam, ts, issues, looksMedical);
|
||||
}
|
||||
|
||||
int score = 0;
|
||||
@@ -107,6 +107,6 @@ public class ListingValidator
|
||||
|
||||
// Valid enough for the queue if it's medical, not spam, and reasonably complete.
|
||||
bool isValid = !isSpam && looksMedical && score >= 50;
|
||||
return new ValidationResult(isValid, isSpam, score, issues);
|
||||
return new ValidationResult(isValid, isSpam, score, issues, looksMedical);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user