From da55f82c6c4c33fe2e10c622d8a4c7040ba052a2 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sun, 21 Jun 2026 07:17:24 +0330 Subject: [PATCH] =?UTF-8?q?Fix=20facility=20junk-fold:=20match=20the=20rea?= =?UTF-8?q?l=20placeholder=20by=20=C2=AB=D9=86=D8=A7=D9=85=D8=B4=D8=AE?= =?UTF-8?q?=D8=B5=C2=BB=20marker?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The junk-removal half of the facility cleanup silently no-op'd because it looked up the shared placeholder by the exact UnknownFacilityName constant («نامشخص / ثبت نشده»), but production data uses an older wording («مرکز درمانی (نامشخص)»), so the lookup returned null and the whole junk pass was skipped (only the duplicate-merge half ran). Now resolve the placeholder by the «نامشخص» marker and pick the bucket with the most listings (the real one), and exclude it from the merge pass by id. Re-running the cleanup will fold «بیمارستان هستم», «... از مدجابز», bare type-word facilities, etc. into it. Co-Authored-By: Claude Opus 4.8 --- .../Services/Scraping/IngestionService.cs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs index 8483649..d70e945 100644 --- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs +++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs @@ -430,7 +430,6 @@ public class IngestionService public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default) { var facilities = await _db.Facilities.ToListAsync(ct); - var placeholder = facilities.FirstOrDefault(f => f.Name == UnknownFacilityName); var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId) .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct); @@ -438,6 +437,15 @@ public class IngestionService .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct); int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id); + // The shared "unknown" placeholder is worded differently in older data + // («مرکز درمانی (نامشخص)») than the current constant, so an exact-name lookup found nothing and + // the junk-fold step silently no-op'd. Match by the «نامشخص» marker and pick the bucket actually + // used by the most listings — that's the real placeholder junk should fold into. + var placeholder = facilities + .Where(f => f.Name == UnknownFacilityName || FacilityMatcher.Normalize(f.Name).Contains("نامشخص")) + .OrderByDescending(f => Listings(f.Id)).FirstOrDefault(); + var placeholderId = placeholder?.Id ?? -1; + // Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a // verified facility (those carry real employer data / verification). bool Removable(Facility f) => f.OwnerUserId is null && !f.IsVerified @@ -463,8 +471,8 @@ public class IngestionService cleaned++; } - // 2) Merge same-city Persian-fuzzy duplicates into the best keeper. - var remaining = await _db.Facilities.Where(f => f.Name != UnknownFacilityName).ToListAsync(ct); + // 2) Merge same-city Persian-fuzzy duplicates into the best keeper (never the placeholder). + var remaining = (await _db.Facilities.ToListAsync(ct)).Where(f => f.Id != placeholderId).ToList(); var done = new HashSet(); foreach (var f in remaining) {