Divar geo-coords to facility map + medical gate + RawListing FK/geo migrations
CI/CD / CI · dotnet build (push) Successful in 2m6s
CI/CD / Deploy · hamkadr (push) Successful in 2m3s

This commit is contained in:
soroush.asadi
2026-06-09 21:38:55 +03:30
parent cf5e0011c4
commit 380243b669
14 changed files with 3567 additions and 36 deletions
@@ -46,6 +46,10 @@ public class IngestionService
/// <summary>Names of every entry in <c>_sources</c>, in enumeration order. A fresh list is
/// materialized on each read, so callers get a snapshot rather than a live view.</summary>
public IReadOnlyList<string> SourceNames
{
    get
    {
        var names = from source in _sources select source.Name;
        return names.ToList();
    }
}
/// <summary>Shared placeholder facility name for unnamed ads (Persian for "unspecified / not
/// registered") — kept identical to Review.ResolveFacilityIdAsync so the auto-publish and
/// manual-review flows reuse ONE record instead of each creating its own placeholder.</summary>
/// <remarks>Used as the facility-name fallback when an ad yields no facility name of its own.
/// Because many unrelated ads resolve to this one name, a facility created under it is given no
/// coordinates. Do not edit the literal without changing the manual-review side to match.</remarks>
private const string UnknownFacilityName = "نامشخص / ثبت نشده";
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
{
var settings = await _settings.GetAsync();
@@ -71,7 +75,17 @@ public class IngestionService
{
fetched++;
var hash = Hash(item.RawText);
if (await _db.RawListings.AnyAsync(r => r.ContentHash == hash, ct)) { dupes++; continue; }
var existing = await _db.RawListings.FirstOrDefaultAsync(r => r.ContentHash == hash, ct);
if (existing is not null)
{
// Best-effort geo retry: coords are normally captured only on first ingest, but a
// re-fetch may now expose a map center the first fetch lacked (Divar can fail-soft to
// null on a bad response / out-of-bbox). Backfill the cached row when this fetch has
// coords and the row has none, so an item still sitting in the queue can be placed on
// the map when an admin publishes it. (A full refresh is the purge-and-reingest flow.)
if (existing.Lat is null && item.Lat is not null) { existing.Lat = item.Lat; existing.Lng = item.Lng; }
dupes++; continue;
}
var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
var val = _validator.Validate(item.RawText, parsed);
@@ -91,6 +105,7 @@ public class IngestionService
Confidence = confidence,
ValidationNotes = reason,
Status = status,
Lat = item.Lat, Lng = item.Lng, // approx. map coords (Divar) → facility on publish
};
_db.RawListings.Add(raw);
@@ -146,8 +161,15 @@ public class IngestionService
var aiNote = Join($"AI: {ai.Decision} ({ai.Confidence}٪)" + (ai.Reason is null ? "" : $" — {ai.Reason}"), notes);
if (ai.Reject) return (RawListingStatus.Discarded, aiNote, ai.Confidence);
if (ai.Approve)
{
// MEDICAL GATE: the rule-validator's medical signal vetoes an AI approval. The AI can
// hallucinate (e.g. approved a GeekVape product ad 95% as a «پرستار» job) — when our
// own keyword/role check sees nothing clinical, never auto-publish; send to review.
if (!val.LooksMedical)
return (RawListingStatus.Flagged, Join("هوش مصنوعی تأیید کرد ولی نشانهٔ کادر درمان یافت نشد — بررسی دستی", aiNote), ai.Confidence);
return (s.Mode == IngestionMode.Automatic && s.AiAutoApprove
? RawListingStatus.Normalized : RawListingStatus.New, aiNote, ai.Confidence);
}
return (RawListingStatus.Flagged, aiNote, ai.Confidence); // review
}
@@ -218,10 +240,15 @@ public class IngestionService
return;
}
// Never surface the crawl source (e.g. «مدجابز») in a public facility name.
// Never surface the crawl source (e.g. «مدجابز») in a public facility name. An unnamed ad
// falls back to ONE shared placeholder (same string as the manual-review flow, so both
// pipelines reuse a single record). That placeholder is shared by every unnamed ad in a
// city, so it must NEVER receive a single ad's fuzzy coords — that would mis-place dozens of
// unrelated listings on the map and in «near me». Mirrors Review.ResolveFacilityIdAsync.
bool unnamed = string.IsNullOrWhiteSpace(d?.FacilityName) && string.IsNullOrWhiteSpace(parsed.FacilityName);
var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
: !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
: "مرکز درمانی (نامشخص)";
: UnknownFacilityName;
// Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
if (facility is null)
@@ -230,10 +257,17 @@ public class IngestionService
{
Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false,
Lat = unnamed ? null : raw.Lat, Lng = unnamed ? null : raw.Lng, // approx. Divar map center
};
_db.Facilities.Add(facility);
facilities.Add(facility); // so later listings in this run match it too
}
else if (!unnamed && facility.Lat is null && facility.Lng is null && raw.Lat is not null)
{
// Backfill coords only when the matched (real, named) facility has none — never overwrite a
// real (employer-set or verified) location with Divar's fuzzy point.
facility.Lat = raw.Lat; facility.Lng = raw.Lng;
}
if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
{
@@ -278,24 +312,33 @@ public class IngestionService
return string.Join(" ", tags.Where(t => !string.IsNullOrWhiteSpace(t)).Distinct());
}
/// <summary>Find an existing role by Persian-normalized name; if none, create a new Role (dynamic
/// taxonomy) using the AI's suggested category — reusing an existing category when one normalizes
/// to the same text — and add it to the in-run list so later items reuse it instead of duplicating.</summary>
/// <summary>Resolve a role name to an existing Role; if it's genuinely new, create it (dynamic
/// taxonomy). Matching is layered so a differently-worded-but-same-meaning role maps to the
/// canonical one instead of forking: (1) exact normalized name, (2) synonym/abbreviation alias
/// → canonical (دکتر→پزشک عمومی، نرس→پرستار…), (3) create. Only TRUE synonyms collapse — real
/// sub-specialties («پرستار ICU») stay distinct on purpose.</summary>
private Role ResolveOrCreateRole(List<Role> roles, string name, string? category)
{
var norm = NormalizeFa(name);
// (1) Already a known role (same word or spelling variant).
var match = roles.FirstOrDefault(r => NormalizeFa(r.Name) == norm);
if (match is not null) return match;
var wantCat = string.IsNullOrWhiteSpace(category) ? "سایر" : category!.Trim();
// Collapse onto an existing category that normalizes the same, so «تکنسین» != «تکنسين» doesn't fork.
var existingCat = roles.Select(r => r.Category)
.FirstOrDefault(c => !string.IsNullOrWhiteSpace(c) && NormalizeFa(c) == NormalizeFa(wantCat));
// (2) A synonym of a canonical role → use that role; don't create a duplicate.
if (RoleAliases.TryGetValue(norm, out var canonical))
{
var canonNorm = NormalizeFa(canonical);
var aliased = roles.FirstOrDefault(r => NormalizeFa(r.Name) == canonNorm);
if (aliased is not null) return aliased;
name = canonical; norm = canonNorm; // canonical not seeded yet → create under its proper name
}
// (3) Genuinely new role — create it under a canonical-resolved category.
var created = new Role
{
Name = Clamp(name.Trim(), 100), // respect Role.Name MaxLength(100)
Category = Clamp(existingCat ?? wantCat, 50), // respect Role.Category MaxLength(50)
Name = Clamp(name.Trim(), 100), // respect Role.Name MaxLength(100)
Category = Clamp(ResolveCategory(roles, category), 50), // respect Role.Category MaxLength(50)
IsActive = true,
SortOrder = (roles.Count == 0 ? 0 : roles.Max(r => r.SortOrder)) + 1,
};
@@ -306,6 +349,58 @@ public class IngestionService
return created;
}
/// <summary>Turn an AI-suggested category into the category string that should be stored on a
/// new role.</summary>
/// <param name="roles">Roles known so far; their <c>Category</c> values are the preferred spellings.</param>
/// <param name="category">Suggested category; null/blank falls back to «سایر».</param>
/// <returns>In priority order: a category already stored on some role whose normalized form equals
/// the (alias-resolved) request, otherwise the alias-resolved request itself.</returns>
private static string ResolveCategory(List<Role> roles, string? category)
{
    // Step 1 — default a missing suggestion, otherwise just trim it.
    string requested;
    if (string.IsNullOrWhiteSpace(category)) requested = "سایر";
    else requested = category!.Trim();

    // Step 2 — a known synonym is swapped for its canonical category.
    var wanted = requested;
    if (CategoryAliases.TryGetValue(NormalizeFa(requested), out var aliasTarget)) wanted = aliasTarget;

    // Step 3 — always prefer a spelling that is already stored on a role (even after the alias
    // step), so a second variant of the same group is never forked. First match wins.
    var wantedKey = NormalizeFa(wanted);
    foreach (var role in roles)
    {
        var stored = role.Category;
        if (string.IsNullOrWhiteSpace(stored)) continue;
        if (NormalizeFa(stored) == wantedKey) return stored;
    }

    // Nothing stored matches → use the resolved text as-is.
    return wanted;
}
// Synonyms/abbreviations → canonical ROLE name, so the AI naming a role differently maps onto an
// existing role instead of forking the taxonomy.
// Layout: each entry is  [canonical role name] = { synonyms… }. BuildAliasMap flattens this into a
// lookup keyed by the NormalizeFa form of every synonym and of the canonical itself, so callers must
// normalize a name with NormalizeFa before looking it up.
// New synonyms can be added freely, but keep every synonym unique across entries: a key that
// appears twice is resolved silently by BuildAliasMap, not reported.
private static readonly Dictionary<string, string> RoleAliases = BuildAliasMap(new()
{
["پزشک عمومی"] = new[] { "دکتر", "طبیب", "پزشک", "جی پی", "gp", "general practitioner" },
["پزشک متخصص"] = new[] { "متخصص", "فوق تخصص", "اسپشالیست", "specialist" },
["پرستار"] = new[] { "نرس", "nurse", "پرستاری", "کارشناس پرستاری" },
["پرستار سالمندان"] = new[] { "مراقب سالمند", "مراقب سالمندان", "پرستار سالمند", "نگهدار سالمند", "مراقبت سالمند" },
["ماما"] = new[] { "مامایی", "کارشناس مامایی", "midwife" },
["تکنسین اتاق عمل"] = new[] { "اتاق عمل", "اسکراب", "scrub", "تکنولوژیست اتاق عمل" },
["تکنسین فوریت‌های پزشکی"] = new[] { "فوریت پزشکی", "تکنسین اورژانس", "پارامدیک", "paramedic", "emt", "اورژانس ۱۱۵" },
["کارشناس آزمایشگاه"] = new[] { "علوم آزمایشگاهی", "تکنسین آزمایشگاه", "آزمایشگاهی", "لابراتوار", "lab", "laboratory" },
["دندانپزشک"] = new[] { "دندان پزشک", "دندون پزشک", "dentist" },
});
// Synonyms → canonical CATEGORY (the role-group used for filters/chips).
// Same layout as the role table: [canonical category] = { synonyms… }, flattened by BuildAliasMap
// into a lookup keyed by the NormalizeFa form of each synonym and of the canonical itself.
// ResolveCategory consults this lookup first and then still prefers a category spelling that is
// already stored on a role. Keep synonyms unique across entries.
private static readonly Dictionary<string, string> CategoryAliases = BuildAliasMap(new()
{
["پزشک"] = new[] { "دکتر", "طبیب", "doctor", "پزشکی" },
["پرستار"] = new[] { "پرستاری", "nurse", "nursing" },
["ماما"] = new[] { "مامایی", "midwifery" },
["تکنسین"] = new[] { "تکنیسین", "تکنولوژیست", "technician", "کاردان فنی" },
["دندانپزشک"] = new[] { "دندان پزشک", "دندانپزشکی", "dental" },
});
/// <summary>Flatten {canonical → [synonyms]} into a {normalized synonym → canonical} lookup,
/// also mapping each canonical's own normalized form to itself.</summary>
/// <param name="src">Canonical name → its synonyms/abbreviations, all un-normalized.</param>
/// <returns>A lookup keyed by the <c>NormalizeFa</c> form of each text; every value is one of the
/// canonical names (keys) of <paramref name="src"/>.</returns>
/// <remarks>Canonicals are written AFTER all synonyms. With a single interleaved pass, a synonym
/// of one entry that normalizes to another entry's canonical name would overwrite that canonical's
/// self-mapping (and whether it did depended on entry order), silently redirecting a real role or
/// category onto a different one. Writing canonicals last guarantees the documented
/// "canonical maps to itself" contract regardless of what synonyms are added later.</remarks>
private static Dictionary<string, string> BuildAliasMap(Dictionary<string, string[]> src)
{
    var map = new Dictionary<string, string>();

    // Pass 1 — synonyms. When two entries share a synonym the later one wins, as before.
    foreach (var (canonical, aliases) in src)
        foreach (var alias in aliases)
        {
            var key = NormalizeFa(alias);
            // A blank synonym would register an empty key and make empty input resolve to a
            // canonical; skip it instead.
            if (key.Length > 0) map[key] = canonical;
        }

    // Pass 2 — canonicals, written last so no synonym can displace a canonical's self-mapping.
    foreach (var canonical in src.Keys) map[NormalizeFa(canonical)] = canonical;

    return map;
}
/// <summary>Normalize a Persian string for dedupe: unify Arabic/Persian ي→ی and ك→ک, drop ZWNJ,
/// collapse whitespace, trim, lowercase (so Latin tags like "ICU"/"icu" also match).</summary>
private static string NormalizeFa(string? s) => Regex.Replace(