Divar geo-coords to facility map + medical gate + RawListing FK/geo migrations
CI/CD / CI · dotnet build (push) Successful in 2m6s
CI/CD / Deploy · hamkadr (push) Successful in 2m3s

This commit is contained in:
soroush.asadi
2026-06-09 21:38:55 +03:30
parent cf5e0011c4
commit 380243b669
14 changed files with 3567 additions and 36 deletions
+9
View File
@@ -171,5 +171,14 @@ public class AppDbContext : DbContext, IDataProtectionKeyContext
// Dedupe ingested listings by content hash.
b.Entity<RawListing>().HasIndex(r => r.ContentHash);
b.Entity<RawListing>().HasIndex(r => r.Status);
// A RawListing only LINKS to the post it produced — it must outlive that post (it's the
// dedupe cache). So deleting a Shift/Talent NULLs the back-reference rather than orphaning a
// dangling FK or blocking the delete. LinkedTalentId previously had no FK at all (orphan risk).
b.Entity<RawListing>()
.HasOne(r => r.LinkedShift).WithMany()
.HasForeignKey(r => r.LinkedShiftId).OnDelete(DeleteBehavior.SetNull);
b.Entity<RawListing>()
.HasOne(r => r.LinkedTalent).WithMany()
.HasForeignKey(r => r.LinkedTalentId).OnDelete(DeleteBehavior.SetNull);
}
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,69 @@
using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable
namespace JobsMedical.Web.Migrations
{
/// <summary>
/// Reworks the two back-references a RawListing keeps to the posts produced from it:
/// the existing <c>LinkedShiftId</c> foreign key is re-created with ON DELETE SET NULL, and
/// <c>LinkedTalentId</c> gets an index plus a brand-new foreign key (also SET NULL) to
/// <c>TalentListings</c>.
/// </summary>
public partial class RawListingLinkFks : Migration
{
/// <inheritdoc />
protected override void Up(MigrationBuilder migrationBuilder)
{
// The shift FK is dropped here and re-added further down with an explicit SetNull action.
migrationBuilder.DropForeignKey(
name: "FK_RawListings_Shifts_LinkedShiftId",
table: "RawListings");
// LinkedTalentId never had an FK before, so existing rows may point at deleted talent.
// Null those orphans first, otherwise AddForeignKey below fails on a populated DB.
// The table/column names are double-quoted so their PascalCase spelling is preserved.
migrationBuilder.Sql(
"UPDATE \"RawListings\" r SET \"LinkedTalentId\" = NULL " +
"WHERE r.\"LinkedTalentId\" IS NOT NULL " +
"AND NOT EXISTS (SELECT 1 FROM \"TalentListings\" t WHERE t.\"Id\" = r.\"LinkedTalentId\");");
// Index on the column that becomes a foreign key below.
migrationBuilder.CreateIndex(
name: "IX_RawListings_LinkedTalentId",
table: "RawListings",
column: "LinkedTalentId");
// Same constraint name as the one dropped above, now with ON DELETE SET NULL.
migrationBuilder.AddForeignKey(
name: "FK_RawListings_Shifts_LinkedShiftId",
table: "RawListings",
column: "LinkedShiftId",
principalTable: "Shifts",
principalColumn: "Id",
onDelete: ReferentialAction.SetNull);
// New constraint: RawListings.LinkedTalentId -> TalentListings.Id, ON DELETE SET NULL.
migrationBuilder.AddForeignKey(
name: "FK_RawListings_TalentListings_LinkedTalentId",
table: "RawListings",
column: "LinkedTalentId",
principalTable: "TalentListings",
principalColumn: "Id",
onDelete: ReferentialAction.SetNull);
}
/// <inheritdoc />
protected override void Down(MigrationBuilder migrationBuilder)
{
// Reverse of Up: remove both SetNull constraints and the talent index, then put the shift FK
// back. NOTE: LinkedTalentId values nulled by the orphan clean-up in Up are not restored.
migrationBuilder.DropForeignKey(
name: "FK_RawListings_Shifts_LinkedShiftId",
table: "RawListings");
migrationBuilder.DropForeignKey(
name: "FK_RawListings_TalentListings_LinkedTalentId",
table: "RawListings");
migrationBuilder.DropIndex(
name: "IX_RawListings_LinkedTalentId",
table: "RawListings");
// No onDelete argument is passed, so the constraint gets AddForeignKey's default action.
migrationBuilder.AddForeignKey(
name: "FK_RawListings_Shifts_LinkedShiftId",
table: "RawListings",
column: "LinkedShiftId",
principalTable: "Shifts",
principalColumn: "Id");
}
}
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,38 @@
using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable
namespace JobsMedical.Web.Migrations
{
/// <summary>
/// Adds two optional coordinate columns, <c>Lat</c> and <c>Lng</c>, to the
/// <c>RawListings</c> table.
/// </summary>
public partial class RawListingGeo : Migration
{
/// <inheritdoc />
protected override void Up(MigrationBuilder migrationBuilder)
{
// Both columns are nullable "double precision", so existing rows need no default value.
migrationBuilder.AddColumn<double>(
name: "Lat",
table: "RawListings",
type: "double precision",
nullable: true);
migrationBuilder.AddColumn<double>(
name: "Lng",
table: "RawListings",
type: "double precision",
nullable: true);
}
/// <inheritdoc />
protected override void Down(MigrationBuilder migrationBuilder)
{
// Reverting drops both columns together with any coordinates stored in them.
migrationBuilder.DropColumn(
name: "Lat",
table: "RawListings");
migrationBuilder.DropColumn(
name: "Lng",
table: "RawListings");
}
}
}
@@ -748,12 +748,18 @@ namespace JobsMedical.Web.Migrations
b.Property<DateTime>("FetchedAt")
.HasColumnType("timestamp with time zone");
b.Property<double?>("Lat")
.HasColumnType("double precision");
b.Property<int?>("LinkedShiftId")
.HasColumnType("integer");
b.Property<int?>("LinkedTalentId")
.HasColumnType("integer");
b.Property<double?>("Lng")
.HasColumnType("double precision");
b.Property<string>("ParsedJson")
.HasColumnType("text");
@@ -783,6 +789,8 @@ namespace JobsMedical.Web.Migrations
b.HasIndex("LinkedShiftId");
b.HasIndex("LinkedTalentId");
b.HasIndex("Status");
b.ToTable("RawListings");
@@ -1415,9 +1423,17 @@ namespace JobsMedical.Web.Migrations
{
b.HasOne("JobsMedical.Web.Models.Shift", "LinkedShift")
.WithMany()
.HasForeignKey("LinkedShiftId");
.HasForeignKey("LinkedShiftId")
.OnDelete(DeleteBehavior.SetNull);
b.HasOne("JobsMedical.Web.Models.TalentListing", "LinkedTalent")
.WithMany()
.HasForeignKey("LinkedTalentId")
.OnDelete(DeleteBehavior.SetNull);
b.Navigation("LinkedShift");
b.Navigation("LinkedTalent");
});
modelBuilder.Entity("JobsMedical.Web.Models.Review", b =>
+6
View File
@@ -25,10 +25,16 @@ public class RawListing
public Shift? LinkedShift { get; set; }
public int? LinkedTalentId { get; set; } // آگهی «آماده به کار» ساخته‌شده از این متن
public TalentListing? LinkedTalent { get; set; }
[MaxLength(500)]
public string? SourceUrl { get; set; }
/// <summary>Approximate coordinates harvested from the source (e.g. Divar's fuzzed map center).
/// Carried through the review queue so a manual publish can still place the facility on the map.</summary>
public double? Lat { get; set; }
public double? Lng { get; set; }
/// <summary>SHA-256 of the normalized text — used to dedupe across ingestion runs.</summary>
[MaxLength(64)]
public string? ContentHash { get; set; }
@@ -40,6 +40,14 @@
<p class="muted" style="font-size:11px; margin:8px 0 0;">
موتور: واکشی ← حذف تکراری ← تجزیه ← اعتبارسنجی ← صف بررسی.
</p>
<form method="post" onsubmit="return confirm('⚠ همه‌ی آیتم‌های جمع‌آوری‌شده (کش) و همه‌ی آگهی‌های منتشرشده از جمع‌آوری حذف می‌شوند (آگهی‌های ثبت‌شده توسط مراکز دست‌نخورده می‌مانند)، سپس همه‌چیز با هوش مصنوعی دوباره جمع‌آوری و افزوده می‌شود. این کار بازگشت‌ناپذیر است. ادامه می‌دهی؟');">
<button type="submit" asp-page-handler="PurgeAndReingest" class="btn btn-outline btn-block" style="margin-top:8px; color:var(--danger); border-color:var(--danger);">
🔄 پاک‌سازی کش و جمع‌آوری مجدد با هوش مصنوعی
</button>
</form>
<p class="muted" style="font-size:11px; margin:6px 0 0;">
کش حذف تکراری و آگهی‌های جمع‌آوری‌شده پاک و از نو با AI پردازش می‌شوند. (آگهی‌های مراکز حذف نمی‌شوند.)
</p>
<hr style="border:none; border-top:1px solid var(--line); margin:16px 0;" />
@@ -65,6 +65,35 @@ public class IndexModel : PageModel
return RedirectToPage();
}
/// <summary>
/// DESTRUCTIVE rebuild handler. Inside one transaction it deletes:
/// (1) every RawListing row — the dedupe cache whose ContentHash blocks re-ingesting the same
///     ad, including rows added through the manual-add form; and
/// (2) only the AGGREGATED Shifts / JobOpenings / TalentListings (Source == Aggregated);
///     listings with any other source are not touched.
/// RawListings go first so their LinkedShift/LinkedTalent references are gone before the posts
/// they point at are deleted. After the transaction commits, a full ingestion run is started and
/// its counters are reported through <c>IngestMessage</c>.
/// </summary>
/// <returns>A redirect back to this page.</returns>
public async Task<IActionResult> OnPostPurgeAndReingestAsync()
{
    var (rawCount, shifts, jobs, talent) = await PurgeAsync();

    // Fresh fetch → AI audit → publish/queue. Runs outside the (already committed) transaction.
    var summary = await _ingest.RunAsync();

    IngestMessage =
        $"پاک‌سازی شد (حذف: {rawCount} آیتم کش، {shifts} شیفت، {jobs} استخدام، {talent} آماده‌به‌کارِ جمع‌آوری‌شده). " +
        $"جمع‌آوری مجدد: {summary.TotalPublished} منتشر، {summary.TotalQueued} در صف، {summary.TotalFlagged} پرچم، {summary.TotalSpam} اسپم، {summary.TotalDuplicates} تکراری.";
    return RedirectToPage();

    // Runs the four bulk deletes atomically and returns how many rows each one removed.
    async Task<(int Raw, int Shifts, int Jobs, int Talent)> PurgeAsync()
    {
        await using var tx = await _db.Database.BeginTransactionAsync();

        var deletedRaw = await _db.RawListings.ExecuteDeleteAsync(); // clear dedupe cache
        var deletedShifts = await _db.Shifts
            .Where(shift => shift.Source == ShiftSource.Aggregated)
            .ExecuteDeleteAsync();
        var deletedJobs = await _db.JobOpenings
            .Where(job => job.Source == ShiftSource.Aggregated)
            .ExecuteDeleteAsync();
        var deletedTalent = await _db.TalentListings
            .Where(listing => listing.Source == ShiftSource.Aggregated)
            .ExecuteDeleteAsync();

        await tx.CommitAsync();
        return (deletedRaw, deletedShifts, deletedJobs, deletedTalent);
    }
}
private async Task LoadAsync()
{
Queue = await _db.RawListings
@@ -282,13 +282,26 @@ public class ReviewModel : PageModel
if (cityId is null) return null; // no cities seeded — cannot create a facility
// No facility named in the ad → use/create the shared placeholder.
var name = string.IsNullOrWhiteSpace(NewFacilityName) ? UnknownFacilityName : NewFacilityName.Trim();
var isPlaceholder = string.IsNullOrWhiteSpace(NewFacilityName);
var name = isPlaceholder ? UnknownFacilityName : NewFacilityName.Trim();
// Approximate coords carried from the crawl (e.g. Divar). NEVER apply them to the shared
// «نامشخص» placeholder — it's reused across many ads, so a single ad's point would mislead.
bool HasGeo() => !isPlaceholder && Raw?.Lat is not null;
// Reuse an existing facility that's exactly or closely the same (Persian-aware fuzzy
// match), so we don't create duplicates like «بیمارستان میلاد» vs «میلاد».
var all = await _db.Facilities.ToListAsync();
var match = FacilityMatcher.FindBest(all, name, cityId);
if (match is not null) return match.Id;
if (match is not null)
{
if (HasGeo() && match.Lat is null && match.Lng is null) // backfill only, never overwrite
{
match.Lat = Raw!.Lat; match.Lng = Raw.Lng;
await _db.SaveChangesAsync();
}
return match.Id;
}
var facility = new Facility
{
@@ -297,6 +310,8 @@ public class ReviewModel : PageModel
Type = FacilityType.Hospital,
Verification = VerificationStatus.Unverified,
IsVerified = false,
Lat = HasGeo() ? Raw!.Lat : null,
Lng = HasGeo() ? Raw!.Lng : null,
};
_db.Facilities.Add(facility);
await _db.SaveChangesAsync();
@@ -59,17 +59,25 @@ public class DivarListingSource : IListingSource
continue;
}
using var doc = JsonDocument.Parse(body);
var cityLabel = CityLabel(s.DivarCity); // every result is from the city we searched
foreach (var (text, token) in Harvest(doc.RootElement).Take(25))
{
var url = token is not null ? $"https://divar.ir/v/{token}" : "https://divar.ir";
var withPhone = text;
var itemText = text;
// Stamp the city so the parser/AI always resolve a location (Divar's own location
// line isn't always in the search row; the searched city is authoritative).
if (!string.IsNullOrWhiteSpace(cityLabel) && !text.Contains(cityLabel))
itemText += $"\n📍 {cityLabel}";
double? lat = null, lng = null;
if (token is not null)
{
var phones = await RevealPhonesAsync(client, token, s, ct);
if (phones.Count > 0 && !phones.Any(text.Contains))
withPhone = text + "\nشماره تماس: " + string.Join("، ", phones);
// One detail fetch yields BOTH the phone and the map coordinates.
var (phones, gLat, gLng) = await FetchDetailAsync(client, token, ct);
if (phones.Count > 0 && !phones.Any(itemText.Contains))
itemText += "\nشماره تماس: " + string.Join("، ", phones);
lat = gLat; lng = gLng;
}
items.Add(new ScrapedItem("دیوار", withPhone, url));
items.Add(new ScrapedItem("دیوار", itemText, url, lat, lng));
}
}
catch (Exception ex) { _log.LogWarning(ex, "Divar fetch failed for query {Query}", q); }
@@ -95,16 +103,31 @@ public class DivarListingSource : IListingSource
};
}
/// <summary>
/// Maps the configured Divar city (a numeric id, a Latin slug, or the Persian name) to the
/// Persian display name that is stamped onto every result of that search.
/// </summary>
/// <param name="city">Configured city value; null is treated as empty.</param>
/// <returns>The Persian city name for a known value; otherwise the trimmed input unchanged.</returns>
private static string CityLabel(string? city)
{
    var trimmed = (city ?? "").Trim();

    // Matching is case-insensitive for the Latin slugs; ids and Persian names are unaffected.
    switch (trimmed.ToLowerInvariant())
    {
        case "1":
        case "tehran":
        case "تهران":
            return "تهران";
        case "3":
        case "isfahan":
        case "esfahan":
        case "اصفهان":
            return "اصفهان";
        case "4":
        case "mashhad":
        case "مشهد":
            return "مشهد";
        case "5":
        case "shiraz":
        case "شیراز":
            return "شیراز";
        case "6":
        case "tabriz":
        case "تبریز":
            return "تبریز";
        case "1745":
        case "karaj":
        case "کرج":
            return "کرج";
        default:
            // Unknown city: hand back what was configured (original casing, trimmed).
            return trimmed;
    }
}
// The post detail endpoint returns the FULL description — many Divar job ads write the phone
// straight into the body, so we can harvest it without Divar's (login-gated) contact reveal.
private const string PostDetailUrl = "https://api.divar.ir/v8/posts-v2/web/";
/// <summary>
/// Fetch a post's detail JSON and harvest any contact number it contains (mostly numbers the
/// poster wrote into the description). Divar's true "نمایش شماره" reveal is auth-gated; this
/// covers the common case where the number is in the ad text. Fails soft.
/// Fetch a post's detail JSON ONCE and harvest both (a) any contact number it contains (mostly
/// numbers the poster wrote into the description; Divar's true "نمایش شماره" reveal is auth-gated)
/// and (b) the post's APPROXIMATE map coordinates (the privacy-fuzzed center Divar shows as a
/// circle). Fails soft — returns whatever it could extract.
/// </summary>
private async Task<List<string>> RevealPhonesAsync(HttpClient client, string token, AppSetting s, CancellationToken ct)
private async Task<(List<string> phones, double? lat, double? lng)> FetchDetailAsync(
HttpClient client, string token, CancellationToken ct)
{
try
{
@@ -112,18 +135,68 @@ public class DivarListingSource : IListingSource
req.Headers.TryAddWithoutValidation("User-Agent", Ua);
req.Headers.TryAddWithoutValidation("Accept", "application/json");
using var resp = await client.SendAsync(req, ct);
if (!resp.IsSuccessStatusCode) return new();
if (!resp.IsSuccessStatusCode) return (new(), null, null);
var body = await resp.Content.ReadAsStringAsync(ct);
if (body.Contains("BLOCKING_VIEW")) return new();
return HtmlUtil.HarvestPhones(body);
if (body.Contains("BLOCKING_VIEW")) return (new(), null, null);
var phones = HtmlUtil.HarvestPhones(body);
double? lat = null, lng = null;
try { using var doc = JsonDocument.Parse(body); if (FindLatLng(doc.RootElement) is { } g) { lat = g.lat; lng = g.lng; } }
catch (JsonException) { /* detail wasn't JSON — phones still harvested from text */ }
return (phones, lat, lng);
}
catch (Exception ex)
{
_log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token);
return new();
return (new(), null, null);
}
}
// Iran's bounding box — guards against picking up an unrelated number pair (timestamps, ids…).
private const double MinLat = 24, MaxLat = 40, MinLng = 44, MaxLng = 64;
/// <summary>
/// Depth-first search of a JSON tree for an approximate coordinate pair. An object qualifies
/// when it directly holds BOTH a latitude-like and a longitude-like property (see
/// <c>IsLatKey</c> / <c>IsLngKey</c>) whose numeric values fall inside the
/// MinLat..MaxLat / MinLng..MaxLng box; pairing inside one object avoids combining a stray
/// latitude with an unrelated longitude. Objects that do not qualify are searched through
/// their property values, arrays through their items, in document order.
/// </summary>
/// <param name="el">Root (or any sub-node) of the JSON to search.</param>
/// <returns>The first qualifying (lat, lng) pair, or null when none is found.</returns>
private static (double lat, double lng)? FindLatLng(JsonElement el)
{
    switch (el.ValueKind)
    {
        case JsonValueKind.Object:
        {
            double? latitude = null, longitude = null;

            // Only the FIRST parsable latitude-like / longitude-like property of this object counts.
            foreach (var prop in el.EnumerateObject())
            {
                if (latitude is null && IsLatKey(prop.Name) && TryNum(prop.Value, out var parsedLat))
                {
                    latitude = parsedLat;
                }
                else if (longitude is null && IsLngKey(prop.Name) && TryNum(prop.Value, out var parsedLng))
                {
                    longitude = parsedLng;
                }
            }

            if (latitude is double la && longitude is double lo
                && la >= MinLat && la <= MaxLat
                && lo >= MinLng && lo <= MaxLng)
            {
                return (la, lo);
            }

            // No plausible pair at this level — descend into each property value.
            foreach (var prop in el.EnumerateObject())
            {
                var nested = FindLatLng(prop.Value);
                if (nested is not null)
                {
                    return nested;
                }
            }

            return null;
        }

        case JsonValueKind.Array:
        {
            foreach (var entry in el.EnumerateArray())
            {
                var nested = FindLatLng(entry);
                if (nested is not null)
                {
                    return nested;
                }
            }

            return null;
        }

        default:
            // Scalars cannot contain a coordinate pair.
            return null;
    }
}
/// <summary>True when <paramref name="k"/> is "latitude" or "lat", ignoring case (ordinal).</summary>
private static bool IsLatKey(string k)
{
    const StringComparison IgnoreCase = StringComparison.OrdinalIgnoreCase;
    return k.Equals("latitude", IgnoreCase) || k.Equals("lat", IgnoreCase);
}
/// <summary>True when <paramref name="k"/> is "longitude", "lng", "lon" or "long", ignoring case (ordinal).</summary>
private static bool IsLngKey(string k)
{
    foreach (var candidate in new[] { "longitude", "lng", "lon", "long" })
    {
        if (k.Equals(candidate, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Reads a coordinate that is either a JSON number or a numeric JSON string such as "35.7".
/// Strings are parsed with the invariant culture and float number style.
/// </summary>
/// <param name="v">The JSON value to read.</param>
/// <param name="d">The parsed value on success; 0 otherwise.</param>
/// <returns>True when <paramref name="v"/> held a readable number.</returns>
private static bool TryNum(JsonElement v, out double d)
{
    switch (v.ValueKind)
    {
        case JsonValueKind.Number:
            return v.TryGetDouble(out d);

        case JsonValueKind.String:
            return double.TryParse(
                v.GetString(),
                System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture,
                out d);

        default:
            // Objects, arrays, booleans and null are never coordinates.
            d = 0;
            return false;
    }
}
private static readonly string[] DescKeys =
{ "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };
@@ -134,9 +207,11 @@ public class DivarListingSource : IListingSource
if (el.TryGetProperty("title", out var t) && t.ValueKind == JsonValueKind.String)
{
var sb = new StringBuilder(t.GetString());
// Append ALL present description fields — the location/time line («… در تهران، جنت‌آباد»)
// is usually in bottom_description_text, so don't stop at the first match.
foreach (var k in DescKeys)
if (el.TryGetProperty(k, out var d) && d.ValueKind == JsonValueKind.String)
{ sb.Append(" — ").Append(d.GetString()); break; }
if (el.TryGetProperty(k, out var d) && d.ValueKind == JsonValueKind.String && d.GetString() is { Length: > 0 } v)
sb.Append(" — ").Append(v);
var text = sb.ToString().Trim();
if (text.Length >= 15) yield return (text, FindToken(el));
}
@@ -2,8 +2,11 @@ using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>One raw post pulled from a source (a Telegram message, a Divar ad, etc.).</summary>
public record ScrapedItem(string Source, string RawText, string? SourceUrl = null);
/// <summary>One raw post pulled from a source (a Telegram message, a Divar ad, etc.).
/// Lat/Lng are an APPROXIMATE location when the source exposes one (e.g. Divar's privacy-fuzzed
/// map center) — used to place an aggregated facility on the map / enable «near me».</summary>
public record ScrapedItem(string Source, string RawText, string? SourceUrl = null,
double? Lat = null, double? Lng = null);
/// <summary>
/// A pluggable source the ingestion engine pulls from. Configuration (enabled, channels, tokens)
@@ -46,6 +46,10 @@ public class IngestionService
public IReadOnlyList<string> SourceNames => _sources.Select(s => s.Name).ToList();
/// <summary>Shared placeholder facility name for unnamed ads — kept identical to
/// Review.ResolveFacilityIdAsync so the auto-publish and manual-review flows reuse ONE record.</summary>
private const string UnknownFacilityName = "نامشخص / ثبت نشده";
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
{
var settings = await _settings.GetAsync();
@@ -71,7 +75,17 @@ public class IngestionService
{
fetched++;
var hash = Hash(item.RawText);
if (await _db.RawListings.AnyAsync(r => r.ContentHash == hash, ct)) { dupes++; continue; }
var existing = await _db.RawListings.FirstOrDefaultAsync(r => r.ContentHash == hash, ct);
if (existing is not null)
{
// Best-effort geo retry: coords are normally captured only on first ingest, but a
// re-fetch may now expose a map center the first fetch lacked (Divar can fail-soft to
// null on a bad response / out-of-bbox). Backfill the cached row when this fetch has
// coords and the row has none, so an item still sitting in the queue can be placed on
// the map when an admin publishes it. (A full refresh is the purge-and-reingest flow.)
if (existing.Lat is null && item.Lat is not null) { existing.Lat = item.Lat; existing.Lng = item.Lng; }
dupes++; continue;
}
var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
var val = _validator.Validate(item.RawText, parsed);
@@ -91,6 +105,7 @@ public class IngestionService
Confidence = confidence,
ValidationNotes = reason,
Status = status,
Lat = item.Lat, Lng = item.Lng, // approx. map coords (Divar) → facility on publish
};
_db.RawListings.Add(raw);
@@ -146,8 +161,15 @@ public class IngestionService
var aiNote = Join($"AI: {ai.Decision} ({ai.Confidence}٪)" + (ai.Reason is null ? "" : $" — {ai.Reason}"), notes);
if (ai.Reject) return (RawListingStatus.Discarded, aiNote, ai.Confidence);
if (ai.Approve)
{
// MEDICAL GATE: the rule-validator's medical signal vetoes an AI approval. The AI can
// hallucinate (e.g. approved a GeekVape product ad 95% as a «پرستار» job) — when our
// own keyword/role check sees nothing clinical, never auto-publish; send to review.
if (!val.LooksMedical)
return (RawListingStatus.Flagged, Join("هوش مصنوعی تأیید کرد ولی نشانهٔ کادر درمان یافت نشد — بررسی دستی", aiNote), ai.Confidence);
return (s.Mode == IngestionMode.Automatic && s.AiAutoApprove
? RawListingStatus.Normalized : RawListingStatus.New, aiNote, ai.Confidence);
}
return (RawListingStatus.Flagged, aiNote, ai.Confidence); // review
}
@@ -218,10 +240,15 @@ public class IngestionService
return;
}
// Never surface the crawl source (e.g. «مدجابز») in a public facility name.
// Never surface the crawl source (e.g. «مدجابز») in a public facility name. An unnamed ad
// falls back to ONE shared placeholder (same string as the manual-review flow, so both
// pipelines reuse a single record). That placeholder is shared by every unnamed ad in a
// city, so it must NEVER receive a single ad's fuzzy coords — that would mis-place dozens of
// unrelated listings on the map and in «near me». Mirrors Review.ResolveFacilityIdAsync.
bool unnamed = string.IsNullOrWhiteSpace(d?.FacilityName) && string.IsNullOrWhiteSpace(parsed.FacilityName);
var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
: !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
: "مرکز درمانی (نامشخص)";
: UnknownFacilityName;
// Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
if (facility is null)
@@ -230,10 +257,17 @@ public class IngestionService
{
Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false,
Lat = unnamed ? null : raw.Lat, Lng = unnamed ? null : raw.Lng, // approx. Divar map center
};
_db.Facilities.Add(facility);
facilities.Add(facility); // so later listings in this run match it too
}
else if (!unnamed && facility.Lat is null && facility.Lng is null && raw.Lat is not null)
{
// Backfill coords only when the matched (real, named) facility has none — never overwrite a
// real (employer-set or verified) location with Divar's fuzzy point.
facility.Lat = raw.Lat; facility.Lng = raw.Lng;
}
if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
{
@@ -278,24 +312,33 @@ public class IngestionService
return string.Join(" ", tags.Where(t => !string.IsNullOrWhiteSpace(t)).Distinct());
}
/// <summary>Find an existing role by Persian-normalized name; if none, create a new Role (dynamic
/// taxonomy) using the AI's suggested category — reusing an existing category when one normalizes
/// to the same text — and add it to the in-run list so later items reuse it instead of duplicating.</summary>
/// <summary>Resolve a role name to an existing Role; if it's genuinely new, create it (dynamic
/// taxonomy). Matching is layered so a differently-worded-but-same-meaning role maps to the
/// canonical one instead of forking: (1) exact normalized name, (2) synonym/abbreviation alias
/// → canonical (دکتر→پزشک عمومی، نرس→پرستار…), (3) create. Only TRUE synonyms collapse — real
/// sub-specialties («پرستار ICU») stay distinct on purpose.</summary>
private Role ResolveOrCreateRole(List<Role> roles, string name, string? category)
{
var norm = NormalizeFa(name);
// (1) Already a known role (same word or spelling variant).
var match = roles.FirstOrDefault(r => NormalizeFa(r.Name) == norm);
if (match is not null) return match;
var wantCat = string.IsNullOrWhiteSpace(category) ? "سایر" : category!.Trim();
// Collapse onto an existing category that normalizes the same, so «تکنسین» != «تکنسين» doesn't fork.
var existingCat = roles.Select(r => r.Category)
.FirstOrDefault(c => !string.IsNullOrWhiteSpace(c) && NormalizeFa(c) == NormalizeFa(wantCat));
// (2) A synonym of a canonical role → use that role; don't create a duplicate.
if (RoleAliases.TryGetValue(norm, out var canonical))
{
var canonNorm = NormalizeFa(canonical);
var aliased = roles.FirstOrDefault(r => NormalizeFa(r.Name) == canonNorm);
if (aliased is not null) return aliased;
name = canonical; norm = canonNorm; // canonical not seeded yet → create under its proper name
}
// (3) Genuinely new role — create it under a canonical-resolved category.
var created = new Role
{
Name = Clamp(name.Trim(), 100), // respect Role.Name MaxLength(100)
Category = Clamp(existingCat ?? wantCat, 50), // respect Role.Category MaxLength(50)
Category = Clamp(ResolveCategory(roles, category), 50), // respect Role.Category MaxLength(50)
IsActive = true,
SortOrder = (roles.Count == 0 ? 0 : roles.Max(r => r.SortOrder)) + 1,
};
@@ -306,6 +349,58 @@ public class IngestionService
return created;
}
/// <summary>
/// Turns a suggested category into the string that should be stored on a new role.
/// Resolution order: a blank suggestion becomes «سایر»; a known synonym (looked up in
/// <c>CategoryAliases</c> by normalized text) becomes its canonical category; finally, if any
/// existing role already stores a category that normalizes to the same text, THAT stored string
/// is returned so a second spelling of the same group is never introduced.
/// </summary>
/// <param name="roles">Roles known so far; their Category values are the reuse candidates.</param>
/// <param name="category">The suggested category; may be null or blank.</param>
/// <returns>An already-stored category string when one matches, otherwise the canonical/raw target.</returns>
private static string ResolveCategory(List<Role> roles, string? category)
{
    var requested = string.IsNullOrWhiteSpace(category) ? "سایر" : category!.Trim();

    // Synonym → canonical; anything that is not a known synonym is kept as typed.
    if (!CategoryAliases.TryGetValue(NormalizeFa(requested), out var target))
    {
        target = requested;
    }

    // Prefer the exact spelling already stored on a role, even after alias resolution.
    var wanted = NormalizeFa(target);
    foreach (var role in roles)
    {
        var stored = role.Category;
        if (!string.IsNullOrWhiteSpace(stored) && NormalizeFa(stored) == wanted)
        {
            return stored;
        }
    }

    return target;
}
// Synonyms/abbreviations → canonical ROLE name, so the AI naming a role differently maps onto an
// existing role instead of forking the taxonomy. Keys are matched after NormalizeFa. Add freely.
private static readonly Dictionary<string, string> RoleAliases = BuildAliasMap(new()
{
["پزشک عمومی"] = new[] { "دکتر", "طبیب", "پزشک", "جی پی", "gp", "general practitioner" },
["پزشک متخصص"] = new[] { "متخصص", "فوق تخصص", "اسپشالیست", "specialist" },
["پرستار"] = new[] { "نرس", "nurse", "پرستاری", "کارشناس پرستاری" },
["پرستار سالمندان"] = new[] { "مراقب سالمند", "مراقب سالمندان", "پرستار سالمند", "نگهدار سالمند", "مراقبت سالمند" },
["ماما"] = new[] { "مامایی", "کارشناس مامایی", "midwife" },
["تکنسین اتاق عمل"] = new[] { "اتاق عمل", "اسکراب", "scrub", "تکنولوژیست اتاق عمل" },
["تکنسین فوریت‌های پزشکی"] = new[] { "فوریت پزشکی", "تکنسین اورژانس", "پارامدیک", "paramedic", "emt", "اورژانس ۱۱۵" },
["کارشناس آزمایشگاه"] = new[] { "علوم آزمایشگاهی", "تکنسین آزمایشگاه", "آزمایشگاهی", "لابراتوار", "lab", "laboratory" },
["دندانپزشک"] = new[] { "دندان پزشک", "دندون پزشک", "dentist" },
});
// Synonyms → canonical CATEGORY (the role-group used for filters/chips).
private static readonly Dictionary<string, string> CategoryAliases = BuildAliasMap(new()
{
["پزشک"] = new[] { "دکتر", "طبیب", "doctor", "پزشکی" },
["پرستار"] = new[] { "پرستاری", "nurse", "nursing" },
["ماما"] = new[] { "مامایی", "midwifery" },
["تکنسین"] = new[] { "تکنیسین", "تکنولوژیست", "technician", "کاردان فنی" },
["دندانپزشک"] = new[] { "دندان پزشک", "دندانپزشکی", "dental" },
});
/// <summary>
/// Inverts a {canonical → synonyms} table into a {normalized term → canonical} lookup.
/// Each canonical name is also registered under its own normalized form. Entries are written
/// in enumeration order, so when two terms normalize to the same key the later one wins.
/// </summary>
/// <param name="src">Canonical names mapped to their synonym lists.</param>
/// <returns>A lookup keyed by the NormalizeFa form of every canonical name and synonym.</returns>
private static Dictionary<string, string> BuildAliasMap(Dictionary<string, string[]> src)
{
    var lookup = new Dictionary<string, string>();
    foreach (var entry in src)
    {
        var canonical = entry.Key;

        // The canonical name goes first, then its synonyms — same write order as a nested loop.
        foreach (var term in entry.Value.Prepend(canonical))
        {
            lookup[NormalizeFa(term)] = canonical;
        }
    }

    return lookup;
}
/// <summary>Normalize a Persian string for dedupe: unify Arabic/Persian ي→ی and ك→ک, drop ZWNJ,
/// collapse whitespace, trim, lowercase (so Latin tags like "ICU"/"icu" also match).</summary>
private static string NormalizeFa(string? s) => Regex.Replace(
@@ -3,7 +3,7 @@ using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
public record ValidationResult(bool IsValid, bool IsSpam, int Confidence, List<string> Issues);
public record ValidationResult(bool IsValid, bool IsSpam, int Confidence, List<string> Issues, bool LooksMedical = false);
/// <summary>
/// Scores a parsed listing for completeness and screens out spam. A listing must look like a
@@ -64,7 +64,7 @@ public class ListingValidator
if (isPromo)
{
issues.Add("آگهی تبلیغاتی/آموزشی است، نه استخدام/شیفت");
return new ValidationResult(false, true, 0, issues); // IsSpam → auto-discard
return new ValidationResult(false, true, 0, issues, looksMedical); // IsSpam → auto-discard
}
// «آماده به کار»: a worker offering themselves. No facility/shift-date expected; the role
@@ -84,7 +84,7 @@ public class ListingValidator
if (tlen < 20) { ts -= 20; issues.Add("متن خیلی کوتاه است"); }
ts = Math.Clamp(ts, 0, 100);
bool tValid = !isSpam && looksMedical && ts >= 50; // role(40)+medical(10) passes w/o phone
return new ValidationResult(tValid, isSpam, ts, issues);
return new ValidationResult(tValid, isSpam, ts, issues, looksMedical);
}
int score = 0;
@@ -107,6 +107,6 @@ public class ListingValidator
// Valid enough for the queue if it's medical, not spam, and reasonably complete.
bool isValid = !isSpam && looksMedical && score >= 50;
return new ValidationResult(isValid, isSpam, score, issues);
return new ValidationResult(isValid, isSpam, score, issues, looksMedical);
}
}