Facility data hygiene: merge duplicates, drop junk-named facilities
CI/CD / CI · dotnet build (push) Successful in 1m51s
CI/CD / Deploy · hamkadr (push) Successful in 2m17s

Cleans up the crawl-generated facility table that surfaced garbage on /Facilities
(«بیمارستان هستم», «... از مدجابز», bare «کلینیک», «سازمان برنامه جنوبی» x3):

- FacilityMatcher.IsJunkName: shared detector for non-names — bare type words, names whose
  core consists only of filler/verb tokens, and leaked crawl-source/placeholder text. Added
  داروخانه/آسایشگاه to the generic type words so that bare occurrences are caught and deduplicate better.
- HeuristicListingParser.ExtractFacilityName now rejects junk candidates (and emoji), so
  new ingests fall back to the shared placeholder instead of forging a fake facility.
- IngestionService.MergeAndCleanFacilitiesAsync (+ admin button): folds junk facilities
  into the placeholder and merges Persian-fuzzy duplicates into one keeper, repointing
  their shifts/jobs first. Hard guard: only purely crawl-generated, unmanaged facilities
  are removed — employer-owned and verified facilities are never touched.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 05:40:29 +03:30
parent 8be275596b
commit 88eca92333
5 changed files with 137 additions and 2 deletions
@@ -418,6 +418,79 @@ public class IngestionService
return archived;
}
/// <summary>
/// Cleans up the crawl-generated facility table in two passes:
/// (1) listings of junk-named facilities («بیمارستان هستم», «... از مدجابز», bare «کلینیک») are folded
/// into the shared placeholder and the junk record is deleted; (2) same-city Persian-fuzzy
/// duplicates («سازمان برنامه جنوبی» ×3) are merged into one keeper, repointing their shifts/jobs.
/// HARD GUARD: only facilities that are purely crawl-generated (no owner, not verified,
/// <see cref="VerificationStatus.Unverified"/>) are ever removed, and never the placeholder —
/// employer- and admin-managed facilities are untouched. Listings are always repointed before the
/// facility row is deleted, so no ad is lost.
/// </summary>
/// <remarks>
/// Pass 1 is skipped entirely when no facility named <c>UnknownFacilityName</c> exists, because
/// there is nowhere to fold the junk listings into.
/// NOTE(review): the repoint/repoint/delete statements of one absorb are not wrapped in a
/// transaction; consider adding one if it is compatible with the configured execution strategy.
/// </remarks>
/// <param name="ct">Token flowed into every database call.</param>
/// <returns>
/// <c>merged</c>: number of duplicate facilities folded into a keeper;
/// <c>cleaned</c>: number of junk-named facilities folded into the placeholder.
/// </returns>
public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default)
{
    // Read-only snapshot. Rows are removed below with ExecuteDeleteAsync, which bypasses the
    // change tracker, so tracking these entities would leave stale instances in the context.
    var facilities = await _db.Facilities.AsNoTracking().ToListAsync(ct);
    var placeholder = facilities.FirstOrDefault(f => f.Name == UnknownFacilityName);

    // Listing counts per facility, used only to rank keeper candidates in pass 2.
    var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId)
        .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
    var shiftCounts = await _db.Shifts.GroupBy(s => s.FacilityId)
        .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);

    int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id);

    // Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a
    // verified facility (those carry real employer data / verification).
    bool Removable(Facility f) => f.OwnerUserId is null && !f.IsVerified
        && f.Verification == VerificationStatus.Unverified
        && (placeholder is null || f.Id != placeholder.Id);

    // Moves every shift and job opening from one facility to another, then deletes the source row.
    async Task AbsorbAsync(int fromId, int toId)
    {
        await _db.Shifts.Where(s => s.FacilityId == fromId)
            .ExecuteUpdateAsync(u => u.SetProperty(s => s.FacilityId, toId), ct);
        await _db.JobOpenings.Where(j => j.FacilityId == fromId)
            .ExecuteUpdateAsync(u => u.SetProperty(j => j.FacilityId, toId), ct);
        // Per the original note this delete is expected to cascade to stray docs/reviews;
        // the cascade configuration is not visible here — TODO confirm.
        await _db.Facilities.Where(f => f.Id == fromId).ExecuteDeleteAsync(ct);

        // Keep the in-memory counts in step with the database: a keeper that was not the cluster
        // seed can seed a later cluster, and must then be ranked on its post-merge listing count.
        jobCounts[toId] = jobCounts.GetValueOrDefault(toId) + jobCounts.GetValueOrDefault(fromId);
        jobCounts.Remove(fromId);
        shiftCounts[toId] = shiftCounts.GetValueOrDefault(toId) + shiftCounts.GetValueOrDefault(fromId);
        shiftCounts.Remove(fromId);
    }

    int merged = 0, cleaned = 0;

    // 1) Junk-named crawl facilities → fold into the shared placeholder.
    if (placeholder is not null)
    {
        foreach (var f in facilities.Where(f => Removable(f) && FacilityMatcher.IsJunkName(f.Name)).ToList())
        {
            await AbsorbAsync(f.Id, placeholder.Id);
            cleaned++;
        }
    }

    // 2) Merge same-city Persian-fuzzy duplicates into the best keeper.
    // Re-queried so facilities deleted in pass 1 are no longer candidates.
    var remaining = await _db.Facilities.AsNoTracking()
        .Where(f => f.Name != UnknownFacilityName).ToListAsync(ct);
    var done = new HashSet<int>();
    foreach (var f in remaining)
    {
        // Skip anything already handled as a duplicate in an earlier cluster.
        if (!done.Add(f.Id)) continue;

        var cluster = remaining.Where(o => o.Id != f.Id && !done.Contains(o.Id)
            && o.CityId == f.CityId && FacilityMatcher.IsSame(o.Name, f.Name)).ToList();
        if (cluster.Count == 0) continue;
        cluster.Add(f);

        // keeper: verified > owned > most listings > lowest id (oldest).
        var keeper = cluster.OrderByDescending(x => x.IsVerified)
            .ThenByDescending(x => x.OwnerUserId.HasValue)
            .ThenByDescending(x => Listings(x.Id)).ThenBy(x => x.Id).First();

        foreach (var dup in cluster.Where(x => x.Id != keeper.Id))
        {
            done.Add(dup.Id);
            if (!Removable(dup)) continue; // never delete an employer/verified facility
            await AbsorbAsync(dup.Id, keeper.Id);
            merged++;
        }
    }

    _log.LogInformation("Facility cleanup: merged {M} duplicates, removed {C} junk facilities.", merged, cleaned);
    return (merged, cleaned);
}
/// <summary>
/// Passes <paramref name="s"/> through <c>HtmlUtil.ToLatinDigits</c> and returns a string made of
/// only those resulting characters for which <see cref="char.IsDigit(char)"/> is true, in order.
/// </summary>
private static string DigitsOnly(string s)
{
    var latinised = HtmlUtil.ToLatinDigits(s);
    char[] digitChars = latinised.Where(ch => char.IsDigit(ch)).ToArray();
    return new string(digitChars);
}
private static (RawListingStatus status, string? reason, int confidence) Decide(