Fix facility junk-fold: match the real placeholder by «نامشخص» marker
The junk-removal half of the facility cleanup was silently a no-op: it looked up the shared placeholder by the exact UnknownFacilityName constant («نامشخص / ثبت نشده»), but production data uses an older wording («مرکز درمانی (نامشخص)»), so the lookup returned null and the entire junk pass was skipped (only the duplicate-merge half ran). The placeholder is now resolved by the «نامشخص» marker, choosing the matching facility with the most listings (the real placeholder), and that facility is excluded from the merge pass by id. Re-running the cleanup will now fold «بیمارستان هستم», «... از مدجابز», bare type-word facilities, etc. into the placeholder. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -430,7 +430,6 @@ public class IngestionService
|
|||||||
public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default)
|
public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default)
|
||||||
{
|
{
|
||||||
var facilities = await _db.Facilities.ToListAsync(ct);
|
var facilities = await _db.Facilities.ToListAsync(ct);
|
||||||
var placeholder = facilities.FirstOrDefault(f => f.Name == UnknownFacilityName);
|
|
||||||
|
|
||||||
var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId)
|
var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId)
|
||||||
.Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
|
.Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
|
||||||
@@ -438,6 +437,15 @@ public class IngestionService
|
|||||||
.Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
|
.Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
|
||||||
int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id);
|
int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id);
|
||||||
|
|
||||||
|
// The shared "unknown" placeholder is worded differently in older data
|
||||||
|
// («مرکز درمانی (نامشخص)») than the current constant, so an exact-name lookup found nothing and
|
||||||
|
// the junk-fold step silently no-op'd. Match by the «نامشخص» marker and pick the bucket actually
|
||||||
|
// used by the most listings — that's the real placeholder junk should fold into.
|
||||||
|
var placeholder = facilities
|
||||||
|
.Where(f => f.Name == UnknownFacilityName || FacilityMatcher.Normalize(f.Name).Contains("نامشخص"))
|
||||||
|
.OrderByDescending(f => Listings(f.Id)).FirstOrDefault();
|
||||||
|
var placeholderId = placeholder?.Id ?? -1;
|
||||||
|
|
||||||
// Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a
|
// Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a
|
||||||
// verified facility (those carry real employer data / verification).
|
// verified facility (those carry real employer data / verification).
|
||||||
bool Removable(Facility f) => f.OwnerUserId is null && !f.IsVerified
|
bool Removable(Facility f) => f.OwnerUserId is null && !f.IsVerified
|
||||||
@@ -463,8 +471,8 @@ public class IngestionService
|
|||||||
cleaned++;
|
cleaned++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2) Merge same-city Persian-fuzzy duplicates into the best keeper.
|
// 2) Merge same-city Persian-fuzzy duplicates into the best keeper (never the placeholder).
|
||||||
var remaining = await _db.Facilities.Where(f => f.Name != UnknownFacilityName).ToListAsync(ct);
|
var remaining = (await _db.Facilities.ToListAsync(ct)).Where(f => f.Id != placeholderId).ToList();
|
||||||
var done = new HashSet<int>();
|
var done = new HashSet<int>();
|
||||||
foreach (var f in remaining)
|
foreach (var f in remaining)
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user