Divar geo-coords to facility map + medical gate + RawListing FK/geo migrations
This commit is contained in:
@@ -46,6 +46,10 @@ public class IngestionService
|
||||
|
||||
public IReadOnlyList<string> SourceNames => _sources.Select(s => s.Name).ToList();
|
||||
|
||||
/// <summary>Shared placeholder facility name for unnamed ads — kept identical to
|
||||
/// Review.ResolveFacilityIdAsync so the auto-publish and manual-review flows reuse ONE record.</summary>
|
||||
private const string UnknownFacilityName = "نامشخص / ثبت نشده";
|
||||
|
||||
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
|
||||
{
|
||||
var settings = await _settings.GetAsync();
|
||||
@@ -71,7 +75,17 @@ public class IngestionService
|
||||
{
|
||||
fetched++;
|
||||
var hash = Hash(item.RawText);
|
||||
if (await _db.RawListings.AnyAsync(r => r.ContentHash == hash, ct)) { dupes++; continue; }
|
||||
var existing = await _db.RawListings.FirstOrDefaultAsync(r => r.ContentHash == hash, ct);
|
||||
if (existing is not null)
|
||||
{
|
||||
// Best-effort geo retry: coords are normally captured only on first ingest, but a
|
||||
// re-fetch may now expose a map center the first fetch lacked (Divar can fail-soft to
|
||||
// null on a bad response / out-of-bbox). Backfill the cached row when this fetch has
|
||||
// coords and the row has none, so an item still sitting in the queue can be placed on
|
||||
// the map when an admin publishes it. (A full refresh is the purge-and-reingest flow.)
|
||||
if (existing.Lat is null && item.Lat is not null) { existing.Lat = item.Lat; existing.Lng = item.Lng; }
|
||||
dupes++; continue;
|
||||
}
|
||||
|
||||
var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
|
||||
var val = _validator.Validate(item.RawText, parsed);
|
||||
@@ -91,6 +105,7 @@ public class IngestionService
|
||||
Confidence = confidence,
|
||||
ValidationNotes = reason,
|
||||
Status = status,
|
||||
Lat = item.Lat, Lng = item.Lng, // approx. map coords (Divar) → facility on publish
|
||||
};
|
||||
_db.RawListings.Add(raw);
|
||||
|
||||
@@ -146,8 +161,15 @@ public class IngestionService
|
||||
var aiNote = Join($"AI: {ai.Decision} ({ai.Confidence}٪)" + (ai.Reason is null ? "" : $" — {ai.Reason}"), notes);
|
||||
if (ai.Reject) return (RawListingStatus.Discarded, aiNote, ai.Confidence);
|
||||
if (ai.Approve)
|
||||
{
|
||||
// MEDICAL GATE: the rule-validator's medical signal vetoes an AI approval. The AI can
|
||||
// hallucinate (e.g. approved a GeekVape product ad 95% as a «پرستار» job) — when our
|
||||
// own keyword/role check sees nothing clinical, never auto-publish; send to review.
|
||||
if (!val.LooksMedical)
|
||||
return (RawListingStatus.Flagged, Join("هوش مصنوعی تأیید کرد ولی نشانهٔ کادر درمان یافت نشد — بررسی دستی", aiNote), ai.Confidence);
|
||||
return (s.Mode == IngestionMode.Automatic && s.AiAutoApprove
|
||||
? RawListingStatus.Normalized : RawListingStatus.New, aiNote, ai.Confidence);
|
||||
}
|
||||
return (RawListingStatus.Flagged, aiNote, ai.Confidence); // review
|
||||
}
|
||||
|
||||
@@ -218,10 +240,15 @@ public class IngestionService
|
||||
return;
|
||||
}
|
||||
|
||||
// Never surface the crawl source (e.g. «مدجابز») in a public facility name.
|
||||
// Never surface the crawl source (e.g. «مدجابز») in a public facility name. An unnamed ad
|
||||
// falls back to ONE shared placeholder (same string as the manual-review flow, so both
|
||||
// pipelines reuse a single record). That placeholder is shared by every unnamed ad in a
|
||||
// city, so it must NEVER receive a single ad's fuzzy coords — that would mis-place dozens of
|
||||
// unrelated listings on the map and in «near me». Mirrors Review.ResolveFacilityIdAsync.
|
||||
bool unnamed = string.IsNullOrWhiteSpace(d?.FacilityName) && string.IsNullOrWhiteSpace(parsed.FacilityName);
|
||||
var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
|
||||
: !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
|
||||
: "مرکز درمانی (نامشخص)";
|
||||
: UnknownFacilityName;
|
||||
// Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
|
||||
var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
|
||||
if (facility is null)
|
||||
@@ -230,10 +257,17 @@ public class IngestionService
|
||||
{
|
||||
Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
|
||||
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false,
|
||||
Lat = unnamed ? null : raw.Lat, Lng = unnamed ? null : raw.Lng, // approx. Divar map center
|
||||
};
|
||||
_db.Facilities.Add(facility);
|
||||
facilities.Add(facility); // so later listings in this run match it too
|
||||
}
|
||||
else if (!unnamed && facility.Lat is null && facility.Lng is null && raw.Lat is not null)
|
||||
{
|
||||
// Backfill coords only when the matched (real, named) facility has none — never overwrite a
|
||||
// real (employer-set or verified) location with Divar's fuzzy point.
|
||||
facility.Lat = raw.Lat; facility.Lng = raw.Lng;
|
||||
}
|
||||
|
||||
if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
|
||||
{
|
||||
@@ -278,24 +312,33 @@ public class IngestionService
|
||||
return string.Join(" ", tags.Where(t => !string.IsNullOrWhiteSpace(t)).Distinct());
|
||||
}
|
||||
|
||||
/// <summary>Find an existing role by Persian-normalized name; if none, create a new Role (dynamic
|
||||
/// taxonomy) using the AI's suggested category — reusing an existing category when one normalizes
|
||||
/// to the same text — and add it to the in-run list so later items reuse it instead of duplicating.</summary>
|
||||
/// <summary>Resolve a role name to an existing Role; if it's genuinely new, create it (dynamic
|
||||
/// taxonomy). Matching is layered so a differently-worded-but-same-meaning role maps to the
|
||||
/// canonical one instead of forking: (1) exact normalized name, (2) synonym/abbreviation alias
|
||||
/// → canonical (دکتر→پزشک عمومی، نرس→پرستار…), (3) create. Only TRUE synonyms collapse — real
|
||||
/// sub-specialties («پرستار ICU») stay distinct on purpose.</summary>
|
||||
private Role ResolveOrCreateRole(List<Role> roles, string name, string? category)
|
||||
{
|
||||
var norm = NormalizeFa(name);
|
||||
|
||||
// (1) Already a known role (same word or spelling variant).
|
||||
var match = roles.FirstOrDefault(r => NormalizeFa(r.Name) == norm);
|
||||
if (match is not null) return match;
|
||||
|
||||
var wantCat = string.IsNullOrWhiteSpace(category) ? "سایر" : category!.Trim();
|
||||
// Collapse onto an existing category that normalizes the same, so «تکنسین» != «تکنسين» doesn't fork.
|
||||
var existingCat = roles.Select(r => r.Category)
|
||||
.FirstOrDefault(c => !string.IsNullOrWhiteSpace(c) && NormalizeFa(c) == NormalizeFa(wantCat));
|
||||
// (2) A synonym of a canonical role → use that role; don't create a duplicate.
|
||||
if (RoleAliases.TryGetValue(norm, out var canonical))
|
||||
{
|
||||
var canonNorm = NormalizeFa(canonical);
|
||||
var aliased = roles.FirstOrDefault(r => NormalizeFa(r.Name) == canonNorm);
|
||||
if (aliased is not null) return aliased;
|
||||
name = canonical; norm = canonNorm; // canonical not seeded yet → create under its proper name
|
||||
}
|
||||
|
||||
// (3) Genuinely new role — create it under a canonical-resolved category.
|
||||
var created = new Role
|
||||
{
|
||||
Name = Clamp(name.Trim(), 100), // respect Role.Name MaxLength(100)
|
||||
Category = Clamp(existingCat ?? wantCat, 50), // respect Role.Category MaxLength(50)
|
||||
Name = Clamp(name.Trim(), 100), // respect Role.Name MaxLength(100)
|
||||
Category = Clamp(ResolveCategory(roles, category), 50), // respect Role.Category MaxLength(50)
|
||||
IsActive = true,
|
||||
SortOrder = (roles.Count == 0 ? 0 : roles.Max(r => r.SortOrder)) + 1,
|
||||
};
|
||||
@@ -306,6 +349,58 @@ public class IngestionService
|
||||
return created;
|
||||
}
|
||||
|
||||
/// <summary>
/// Picks the category string to store for a role, given the category the AI suggested.
/// A blank suggestion becomes the catch-all «سایر». The suggestion is first translated through
/// <c>CategoryAliases</c> (looked up by its <c>NormalizeFa</c> form); the result is then compared,
/// again by normalized form, against the categories already present on <paramref name="roles"/>.
/// </summary>
/// <param name="roles">Roles whose stored <c>Category</c> strings are candidates for reuse.</param>
/// <param name="category">Category text suggested for the role; may be null or blank.</param>
/// <returns>
/// The first already-stored category that normalizes to the same text as the alias-resolved
/// suggestion, or the alias-resolved suggestion itself when no stored category matches.
/// </returns>
private static string ResolveCategory(List<Role> roles, string? category)
{
    // Blank input falls back to the catch-all group; anything else is only trimmed.
    string requested = "سایر";
    if (!string.IsNullOrWhiteSpace(category))
    {
        requested = category!.Trim();
    }

    // Synonym step: a known alias is replaced by its canonical category name.
    string wanted = requested;
    if (CategoryAliases.TryGetValue(NormalizeFa(requested), out var aliasTarget))
    {
        wanted = aliasTarget;
    }

    // Reuse step: a spelling already stored on some role wins over the computed one,
    // so a second variant of the same group is never introduced.
    var wantedKey = NormalizeFa(wanted);
    foreach (var role in roles)
    {
        var stored = role.Category;
        if (!string.IsNullOrWhiteSpace(stored) && NormalizeFa(stored) == wantedKey)
        {
            return stored;
        }
    }

    return wanted;
}
|
||||
|
||||
// Role-name synonym table. Each entry lists a canonical ROLE name followed by the alternative
// spellings/abbreviations that should resolve to it; BuildAliasMap flattens the table into a
// lookup whose keys are the NormalizeFa form of every listed name. New entries can be added freely.
private static readonly Dictionary<string, string> RoleAliases = BuildAliasMap(
    new Dictionary<string, string[]>
    {
        ["پزشک عمومی"] = new string[] { "دکتر", "طبیب", "پزشک", "جی پی", "gp", "general practitioner" },
        ["پزشک متخصص"] = new string[] { "متخصص", "فوق تخصص", "اسپشالیست", "specialist" },
        ["پرستار"] = new string[] { "نرس", "nurse", "پرستاری", "کارشناس پرستاری" },
        ["پرستار سالمندان"] = new string[] { "مراقب سالمند", "مراقب سالمندان", "پرستار سالمند", "نگهدار سالمند", "مراقبت سالمند" },
        ["ماما"] = new string[] { "مامایی", "کارشناس مامایی", "midwife" },
        ["تکنسین اتاق عمل"] = new string[] { "اتاق عمل", "اسکراب", "scrub", "تکنولوژیست اتاق عمل" },
        ["تکنسین فوریتهای پزشکی"] = new string[] { "فوریت پزشکی", "تکنسین اورژانس", "پارامدیک", "paramedic", "emt", "اورژانس ۱۱۵" },
        ["کارشناس آزمایشگاه"] = new string[] { "علوم آزمایشگاهی", "تکنسین آزمایشگاه", "آزمایشگاهی", "لابراتوار", "lab", "laboratory" },
        ["دندانپزشک"] = new string[] { "دندان پزشک", "دندون پزشک", "dentist" },
    });
|
||||
|
||||
// Category synonym table. Each entry lists a canonical CATEGORY (the role-group used for
// filters/chips) followed by the alternative names that should resolve to it; BuildAliasMap
// flattens the table into a lookup keyed by the NormalizeFa form of every listed name.
private static readonly Dictionary<string, string> CategoryAliases = BuildAliasMap(
    new Dictionary<string, string[]>
    {
        ["پزشک"] = new string[] { "دکتر", "طبیب", "doctor", "پزشکی" },
        ["پرستار"] = new string[] { "پرستاری", "nurse", "nursing" },
        ["ماما"] = new string[] { "مامایی", "midwifery" },
        ["تکنسین"] = new string[] { "تکنیسین", "تکنولوژیست", "technician", "کاردان فنی" },
        ["دندانپزشک"] = new string[] { "دندان پزشک", "دندانپزشکی", "dental" },
    });
|
||||
|
||||
/// <summary>
/// Inverts a {canonical → synonyms} table into a flat {normalized name → canonical} lookup.
/// Every canonical name is registered under its own <c>NormalizeFa</c> form as well, so looking
/// up the canonical spelling also succeeds.
/// </summary>
/// <param name="src">Table whose keys are canonical names and whose values are their synonyms.</param>
/// <returns>
/// A new dictionary keyed by the normalized form of each canonical name and synonym. When two
/// names normalize to the same key, the one written last replaces the earlier one.
/// </returns>
private static Dictionary<string, string> BuildAliasMap(Dictionary<string, string[]> src)
{
    Dictionary<string, string> lookup = new();

    foreach (KeyValuePair<string, string[]> entry in src)
    {
        string target = entry.Key;

        // The canonical spelling resolves to itself.
        lookup[NormalizeFa(target)] = target;

        // Each synonym resolves to the canonical spelling.
        foreach (string synonym in entry.Value)
        {
            lookup[NormalizeFa(synonym)] = target;
        }
    }

    return lookup;
}
|
||||
|
||||
/// <summary>Normalize a Persian string for dedupe: unify Arabic/Persian ي→ی and ك→ک, drop ZWNJ,
|
||||
/// collapse whitespace, trim, lowercase (so Latin tags like "ICU"/"icu" also match).</summary>
|
||||
private static string NormalizeFa(string? s) => Regex.Replace(
|
||||
|
||||
Reference in New Issue
Block a user