Fix facility junk-fold: match the real placeholder by «نامشخص» marker
CI/CD / CI · dotnet build (push) Successful in 30s
CI/CD / Deploy · hamkadr (push) Successful in 1m0s

The junk-removal half of the facility cleanup was silently a no-op: it looked up the
shared placeholder by the exact UnknownFacilityName constant («نامشخص / ثبت نشده»), but
production data uses an older wording («مرکز درمانی (نامشخص)»). The lookup therefore returned
null and the whole junk pass was skipped; only the duplicate-merge half ran.

Now resolve the placeholder by the «نامشخص» marker; when several facilities match, pick the
one with the most listings (the real bucket). Exclude it from the merge pass by id rather than
by name. Re-running the cleanup will fold «بیمارستان هستم», «... از مدجابز», bare type-word
facilities, etc. into it.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 07:17:24 +03:30
parent 88eca92333
commit da55f82c6c
@@ -430,7 +430,6 @@ public class IngestionService
public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default) public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default)
{ {
var facilities = await _db.Facilities.ToListAsync(ct); var facilities = await _db.Facilities.ToListAsync(ct);
var placeholder = facilities.FirstOrDefault(f => f.Name == UnknownFacilityName);
var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId) var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId)
.Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct); .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
@@ -438,6 +437,15 @@ public class IngestionService
.Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct); .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id); int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id);
// The shared "unknown" placeholder is worded differently in older data
// («مرکز درمانی (نامشخص)») than the current constant, so an exact-name lookup found nothing and
// the junk-fold step silently no-op'd. Match by the «نامشخص» marker and pick the bucket actually
// used by the most listings — that's the real placeholder junk should fold into.
var placeholder = facilities
.Where(f => f.Name == UnknownFacilityName || FacilityMatcher.Normalize(f.Name).Contains("نامشخص"))
.OrderByDescending(f => Listings(f.Id)).FirstOrDefault();
var placeholderId = placeholder?.Id ?? -1;
// Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a // Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a
// verified facility (those carry real employer data / verification). // verified facility (those carry real employer data / verification).
bool Removable(Facility f) => f.OwnerUserId is null && !f.IsVerified bool Removable(Facility f) => f.OwnerUserId is null && !f.IsVerified
@@ -463,8 +471,8 @@ public class IngestionService
cleaned++; cleaned++;
} }
// 2) Merge same-city Persian-fuzzy duplicates into the best keeper. // 2) Merge same-city Persian-fuzzy duplicates into the best keeper (never the placeholder).
var remaining = await _db.Facilities.Where(f => f.Name != UnknownFacilityName).ToListAsync(ct); var remaining = (await _db.Facilities.ToListAsync(ct)).Where(f => f.Id != placeholderId).ToList();
var done = new HashSet<int>(); var done = new HashSet<int>();
foreach (var f in remaining) foreach (var f in remaining)
{ {