Facility data hygiene: merge duplicates, drop junk-named facilities
Cleans up the crawl-generated facility table that surfaced garbage on /Facilities («بیمارستان هستم», «... از مدجابز», bare «کلینیک», «سازمان برنامه جنوبی» x3):
- FacilityMatcher.IsJunkName: shared detector for non-names — bare type words, cores made only of filler/verb tokens, and leaked crawl-source/placeholder text. Added داروخانه/آسایشگاه to the generic type words so bare ones are caught and deduplicate better.
- HeuristicListingParser.ExtractFacilityName now rejects junk candidates (and emoji), so new ingests fall back to the shared placeholder instead of forging a fake facility.
- IngestionService.MergeAndCleanFacilitiesAsync (+ admin button): folds junk facilities into the placeholder and merges Persian-fuzzy duplicates into one keeper, repointing their shifts/jobs first.
Hard guard: only purely crawl-generated, unmanaged facilities are removed — employer-owned and verified facilities are never touched.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -418,6 +418,79 @@ public class IngestionService
|
||||
return archived;
|
||||
}
|
||||
|
||||
/// <summary>
/// Cleans up the crawl-generated facility table in two passes:
/// (1) folds the listings of junk-named facilities («بیمارستان هستم», «... از مدجابز», bare «کلینیک»)
/// into the shared placeholder and deletes the junk record;
/// (2) merges same-city Persian-fuzzy duplicates («سازمان برنامه جنوبی» ×3) into one keeper,
/// repointing their shifts/jobs.
/// HARD GUARD: only ever removes facilities that are purely crawl-generated (no owner, not verified,
/// <see cref="VerificationStatus.Unverified"/>) and never the placeholder — employer- and
/// admin-managed facilities are untouched. Listings are always repointed before the facility row is
/// deleted, so no ad is lost.
/// </summary>
/// <param name="ct">Token passed to every database call and checked once per facility in pass 2.</param>
/// <returns>
/// <c>merged</c>: number of duplicate facilities folded into a keeper;
/// <c>cleaned</c>: number of junk-named facilities folded into the placeholder.
/// </returns>
public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default)
{
    // Read-only snapshot. AbsorbAsync uses ExecuteUpdate/ExecuteDelete, which bypass the change
    // tracker; loading with tracking would leave deleted rows behind as stale tracked entities.
    var facilities = await _db.Facilities.AsNoTracking().ToListAsync(ct);
    var placeholder = facilities.FirstOrDefault(f => f.Name == UnknownFacilityName);

    // Listing counts per facility, used only to rank keeper candidates in pass 2.
    var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId)
        .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
    var shiftCounts = await _db.Shifts.GroupBy(s => s.FacilityId)
        .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
    int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id);

    // Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a
    // verified facility (those carry real employer data / verification).
    bool Removable(Facility f) => f.OwnerUserId is null && !f.IsVerified
        && f.Verification == VerificationStatus.Unverified
        && (placeholder is null || f.Id != placeholder.Id);

    // Repoint every shift and job opening from one facility to another, then delete the source row.
    async Task AbsorbAsync(int fromId, int toId)
    {
        await _db.Shifts.Where(s => s.FacilityId == fromId)
            .ExecuteUpdateAsync(u => u.SetProperty(s => s.FacilityId, toId), ct);
        await _db.JobOpenings.Where(j => j.FacilityId == fromId)
            .ExecuteUpdateAsync(u => u.SetProperty(j => j.FacilityId, toId), ct);
        await _db.Facilities.Where(f => f.Id == fromId).ExecuteDeleteAsync(ct); // cascades stray docs/reviews
    }

    int merged = 0, cleaned = 0;

    // 1) Junk-named crawl facilities → fold into the shared placeholder.
    if (placeholder is null)
    {
        // Without a placeholder there is nowhere to move the listings; say so instead of
        // silently reporting zero cleaned facilities.
        _log.LogWarning("Facility cleanup: placeholder facility {Name} not found; junk-named facilities were left in place.", UnknownFacilityName);
    }
    else
    {
        foreach (var junk in facilities.Where(f => Removable(f) && FacilityMatcher.IsJunkName(f.Name)))
        {
            await AbsorbAsync(junk.Id, placeholder.Id);
            cleaned++;
        }
    }

    // 2) Merge same-city Persian-fuzzy duplicates into the best keeper.
    // Re-read after pass 1 so facilities deleted there are not considered again.
    var remaining = await _db.Facilities.AsNoTracking()
        .Where(f => f.Name != UnknownFacilityName).ToListAsync(ct);
    var done = new HashSet<int>();

    // Duplicates can only exist within one city, so compare inside each city partition instead of
    // against the whole table. GroupBy keeps the source order inside each group, so the seeds,
    // clusters and keepers are the same as with a full pairwise scan.
    foreach (var cityGroup in remaining.GroupBy(f => f.CityId))
    {
        var sameCity = cityGroup.ToList();
        foreach (var f in sameCity)
        {
            ct.ThrowIfCancellationRequested();
            if (!done.Add(f.Id)) continue; // already a seed or a member of an earlier cluster

            var cluster = sameCity
                .Where(o => !done.Contains(o.Id) && FacilityMatcher.IsSame(o.Name, f.Name))
                .ToList();
            if (cluster.Count == 0) continue;
            cluster.Add(f);

            // keeper: verified > owned > most listings > lowest id (oldest).
            var keeper = cluster.OrderByDescending(x => x.IsVerified)
                .ThenByDescending(x => x.OwnerUserId.HasValue)
                .ThenByDescending(x => Listings(x.Id)).ThenBy(x => x.Id).First();

            foreach (var dup in cluster.Where(x => x.Id != keeper.Id))
            {
                done.Add(dup.Id);
                if (!Removable(dup)) continue; // never delete an employer/verified facility
                await AbsorbAsync(dup.Id, keeper.Id);
                merged++;
            }
        }
    }

    _log.LogInformation("Facility cleanup: merged {M} duplicates, removed {C} junk facilities.", merged, cleaned);
    return (merged, cleaned);
}
|
||||
|
||||
/// <summary>
/// Passes <paramref name="s"/> through <c>HtmlUtil.ToLatinDigits</c> and returns only the characters
/// of the result that <see cref="char.IsDigit(char)"/> accepts, in their original order.
/// </summary>
private static string DigitsOnly(string s)
{
    var digits = HtmlUtil.ToLatinDigits(s).Where(char.IsDigit).ToArray();
    return new string(digits);
}
|
||||
|
||||
private static (RawListingStatus status, string? reason, int confidence) Decide(
|
||||
|
||||
Reference in New Issue
Block a user