Match crawled listings to existing facilities (fuzzy) before creating new ones
When publishing a scraped listing, we now look for an existing facility whose name matches exactly or closely, and create a new one only when there is no match — avoiding duplicates like «بیمارستان میلاد» vs «میلاد».
- ListingParser: extract a facility name (keyword + distinctive words) from the post and surface it in the parser notes.
- FacilityMatcher: Persian-aware normalization (ي/ك, ZWNJ, punctuation), type-word stripping for a "core" name, contains + Levenshtein similarity, and FindBest (same-city exact → any-city exact → same-city fuzzy → fuzzy).
- Review (manual publish): auto-select a matching facility or prefill the new-facility name; resolve-or-create uses the fuzzy match; the dropdown preselects the match.
- IngestionService (auto-publish): reuse FacilityMatcher against a run-wide facility list (which grows as new facilities are created) instead of matching by exact name only.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -52,6 +52,7 @@ public class IngestionService
|
||||
var roles = await _db.Roles.ToListAsync(ct);
|
||||
var cities = await _db.Cities.ToListAsync(ct);
|
||||
var districts = await _db.Districts.ToListAsync(ct);
|
||||
var facilities = await _db.Facilities.ToListAsync(ct); // fuzzy-matched + grown as we create
|
||||
var roleNames = roles.Select(r => r.Name).ToList();
|
||||
var cityNames = cities.Select(c => c.Name).ToList();
|
||||
var districtNames = districts.Select(d => d.Name).ToList();
|
||||
@@ -95,7 +96,7 @@ public class IngestionService
|
||||
|
||||
if (status == RawListingStatus.Normalized)
|
||||
{
|
||||
try { Publish(parsed, ai, raw, roles, cities, districts); published++; }
|
||||
try { Publish(parsed, ai, raw, roles, cities, districts, facilities); published++; }
|
||||
catch (Exception ex) { _log.LogWarning(ex, "Auto-publish failed; queueing instead"); raw.Status = RawListingStatus.New; queued++; }
|
||||
}
|
||||
else if (status == RawListingStatus.New) queued++;
|
||||
@@ -157,7 +158,7 @@ public class IngestionService
|
||||
}
|
||||
|
||||
private void Publish(ParsedListing parsed, AiAuditResult? ai, RawListing raw,
|
||||
List<Role> roles, List<City> cities, List<District> districts)
|
||||
List<Role> roles, List<City> cities, List<District> districts, List<Facility> facilities)
|
||||
{
|
||||
var d = ai?.Data;
|
||||
var roleName = d?.Role ?? parsed.RoleName;
|
||||
@@ -170,9 +171,10 @@ public class IngestionService
|
||||
var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id);
|
||||
|
||||
var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
|
||||
: !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
|
||||
: $"مرکز درمانی (از {raw.SourceChannel})";
|
||||
var facility = _db.Facilities.Local.FirstOrDefault(f => f.Name == facilityName && f.CityId == city.Id)
|
||||
?? _db.Facilities.FirstOrDefault(f => f.Name == facilityName && f.CityId == city.Id);
|
||||
// Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
|
||||
var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
|
||||
if (facility is null)
|
||||
{
|
||||
facility = new Facility
|
||||
@@ -181,6 +183,7 @@ public class IngestionService
|
||||
Phone = parsed.Phone, IsVerified = false,
|
||||
};
|
||||
_db.Facilities.Add(facility);
|
||||
facilities.Add(facility); // so later listings in this run match it too
|
||||
}
|
||||
|
||||
var kind = (d?.Kind ?? parsed.Kind.ToString()).ToLowerInvariant();
|
||||
|
||||
Reference in New Issue
Block a user