Facility data hygiene: merge duplicates, drop junk-named facilities
CI/CD / CI · dotnet build (push) Successful in 1m51s
CI/CD / Deploy · hamkadr (push) Successful in 2m17s

Cleans up the crawl-generated facility table that surfaced garbage on /Facilities
(«بیمارستان هستم», «... از مدجابز», bare «کلینیک», «سازمان برنامه جنوبی» x3):

- FacilityMatcher.IsJunkName: shared detector for non-names — bare type words, cores
  made only of filler/verb tokens, and leaked crawl-source/placeholder text. Added
  داروخانه/آسایشگاه to the generic type words so bare ones are caught and dedupe better.
- HeuristicListingParser.ExtractFacilityName now rejects junk candidates (and emoji), so
  new ingests fall back to the shared placeholder instead of forging a fake facility.
- IngestionService.MergeAndCleanFacilitiesAsync (+ admin button): folds junk facilities
  into the placeholder and merges Persian-fuzzy duplicates into one keeper, repointing
  their shifts/jobs first. Hard guard: only purely crawl-generated, unmanaged facilities
  are removed — employer-owned and verified facilities are never touched.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 05:40:29 +03:30
parent 8be275596b
commit 88eca92333
5 changed files with 137 additions and 2 deletions
@@ -76,6 +76,15 @@
فقط آگهی‌هایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده می‌شوند (نه صرفاً ناقص) و استخدام‌های تکراری بایگانی می‌شوند (وضعیت «بایگانی»، نه حذف). آگهی‌های معتبر دست‌نخورده‌اند، پس آدرسِ ایندکس‌شده‌شان تغییر نمی‌کند؛ صفحهٔ موارد بایگانی‌شده ۴۱۰ Gone می‌دهد تا گوگل تمیز حذفشان کند.
</p>
@* Admin action: posts to the CleanFacilities page handler after a JavaScript confirm() prompt.
   The paragraph below the button is the explanatory help text for the same action. *@
<form method="post" onsubmit="return confirm('مراکز درمانیِ تکراری ادغام و مراکزِ بی‌نام/نامعتبر (مثل «بیمارستان هستم» یا «از مدجابز») حذف می‌شوند؛ آگهی‌هایشان به مرکزِ معتبر یا «نامشخص» منتقل می‌شود. مراکزِ ثبت‌شده توسط کارفرما یا تأییدشده دست‌نخورده می‌مانند. ادامه؟');">
<button type="submit" asp-page-handler="CleanFacilities" class="btn btn-primary btn-block" style="margin-top:10px;">
🏥 ادغام مراکز تکراری و حذف مراکز بی‌نام
</button>
</form>
<p class="muted" style="font-size:11px; margin:6px 0 0;">
مراکز تکراری (با تطبیقِ فارسی) در یک رکورد ادغام و مراکزِ بدونِ نامِ واقعی به «نامشخص» منتقل می‌شوند. آگهی‌ها حفظ می‌شوند؛ فقط مراکزِ جمع‌آوری‌شده و مدیریت‌نشده پاک می‌شوند.
</p>
<hr style="border:none; border-top:1px solid var(--line); margin:16px 0;" />
<h3>افزودن دستی</h3>
@@ -145,6 +145,18 @@ public class IndexModel : PageModel
return RedirectToPage();
}
/// <summary>
/// POST handler for the "CleanFacilities" admin button. Delegates the whole cleanup to the ingestion
/// service (duplicate merge + junk-name removal), stores a summary of the two counts in
/// <c>IngestMessage</c>, and redirects back to this page.
/// </summary>
/// <returns>A redirect to the current page.</returns>
public async Task<IActionResult> OnPostCleanFacilitiesAsync()
{
    // The service returns a named tuple: (merged, cleaned).
    var outcome = await _ingest.MergeAndCleanFacilitiesAsync();

    IngestMessage = $"پاک‌سازی مراکز: {outcome.merged} مرکزِ تکراری ادغام و {outcome.cleaned} مرکزِ بی‌نام/نامعتبر حذف شد (آگهی‌هایشان به مرکزِ معتبر یا «نامشخص» منتقل شد). مراکز ثبت‌شده توسط کارفرما/تأییدشده دست‌نخورده ماند.";

    return RedirectToPage();
}
private async Task LoadAsync()
{
Queue = await _db.RawListings
@@ -218,12 +218,17 @@ public class HeuristicListingParser : IListingParser
{
if (NameStops.Contains(w)) break;
if (Regex.IsMatch(w, @"\d")) break; // numbers/phones aren't names
if (!w.Any(char.IsLetter)) break; // emoji / punctuation («📍») isn't a name
if (w.Length == 1) break; // stray letters
picked.Add(w);
if (picked.Count >= 3) break;
}
if (picked.Count == 0) continue; // bare keyword (e.g. just «بیمارستان») isn't useful
return (kw + " " + string.Join(" ", picked)).Trim();
var candidate = (kw + " " + string.Join(" ", picked)).Trim();
// Reject names that are only filler/verb/source noise («بیمارستان هستم», «... از مدجابز») —
// a real name couldn't be extracted, so fall back to the shared placeholder downstream.
if (Scraping.FacilityMatcher.IsJunkName(candidate)) continue;
return candidate;
}
return null;
}
@@ -17,7 +17,7 @@ public static class FacilityMatcher
{
"بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
"مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
"آزمایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
"آزمایشگاه", "داروخانه", "آسایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
};
/// <summary>Lower-cased, Arabic→Persian folded, punctuation-stripped, whitespace-collapsed.</summary>
@@ -47,6 +47,42 @@ public static class FacilityMatcher
return Regex.Replace(n, @"\s+", " ").Trim();
}
// Filler/verb/locator tokens that are never a real facility name — the parser sweeps these in
// when an ad has no named facility («بیمارستان هستم», «مطب نیازمندیم سه», «کلینیک های فقط منطقه»).
// IsJunkName compares each token of a name's core against these entries for exact equality, so an
// entry only matches if it is spelled exactly as the token appears in the core.
// NOTE(review): entries appear to be hand-written in already-normalized form (no half-spaces) — confirm
// against Normalize/Core when adding new words.
private static readonly string[] JunkCoreWords =
{
"هستم", "هستیم", "هستش", "میشوم", "میشم", "بشوم", "میباشد", "باشد", "میباشم",
"نیازمندیم", "نیازمند", "نیازمندم", "داریم", "دارم", "میخواهیم", "میخوام",
"حتی", "تعدادی", "فقط", "منطقه", "واقع", "های", "مبتدی", "محترم", "خوب",
"سه", "دو", "یک", "چند", "این", "آن", "همکار", "نیرو",
};
// Crawl-source names that must never appear as a public facility («مرکز درمانی (از مدجابز)»),
// plus the shared placeholder text.
// IsJunkName normalizes each marker and tests it as a SUBSTRING of the normalized name, so any
// name that merely contains one of these fragments is treated as junk — keep entries specific.
private static readonly string[] SourceMarkers =
{
"مدجابز", "مدجاب", "از تلگرام", "از دیوار", "از بله", "از کانال", "ثبت نشده", "نامشخص",
};
// Lazily-built lookup tables derived from the constant word lists, so they are normalized once
// instead of on every call. A racing first call may build a table twice; both results are
// identical, so the race is harmless.
private static string[]? _normalizedSourceMarkers;
private static string[]? _junkCoreTokens;

/// <summary>
/// True when a name is NOT a usable facility name: a bare type word («بیمارستان»), a name whose
/// distinctive core is only filler/verb tokens («بیمارستان هستم» → «هستم»), or a leaked crawl
/// source / placeholder («... از مدجابز», «نامشخص»). Such an ad has no real named facility and
/// should fall back to the shared placeholder instead of forging a fake one.
/// </summary>
/// <param name="name">Raw facility name; it is passed to <c>Normalize</c> and <c>Core</c> as-is.</param>
/// <returns>
/// <c>true</c> when the normalized name is empty, contains a source marker, has an empty core, or
/// has a core made only of single-character tokens and junk words; otherwise <c>false</c>.
/// </returns>
public static bool IsJunkName(string? name)
{
    var normalized = Normalize(name);
    if (normalized.Length == 0) return true;

    // Markers are matched as substrings of the normalized name. Empty results are dropped because
    // string.Contains("") is always true and would flag every name as junk.
    var markers = _normalizedSourceMarkers ??= SourceMarkers
        .Select(m => Normalize(m))
        .Where(m => m.Length > 0)
        .ToArray();
    if (markers.Any(m => normalized.Contains(m))) return true;

    var core = Core(name);
    if (core.Length == 0) return true; // bare type word only («بیمارستان»، «کلینیک»)

    // Junk words are kept both as written and in normalized form, so an entry still matches if
    // normalization would spell it differently from the literal in the table.
    var junk = _junkCoreTokens ??= JunkCoreWords
        .Concat(JunkCoreWords.Select(w => Normalize(w)))
        .Distinct()
        .ToArray();

    var tokens = core.Split(' ', StringSplitOptions.RemoveEmptyEntries);
    return tokens.All(t => t.Length <= 1 || junk.Contains(t));
}
/// <summary>True when two names almost certainly denote the same facility.</summary>
public static bool IsSame(string? a, string? b)
{
@@ -418,6 +418,79 @@ public class IngestionService
return archived;
}
/// <summary>
/// Clean up the crawl-generated facility table: (1) fold listings of junk-named facilities
/// («بیمارستان هستم», «... از مدجابز», bare «کلینیک») into the shared placeholder and delete the
/// junk record; (2) merge Persian-fuzzy duplicates («سازمان برنامه جنوبی» ×3) into one keeper,
/// repointing their shifts/jobs. HARD GUARD: only ever removes facilities that are purely
/// crawl-generated (no owner, not verified, Unverified) and never the placeholder — employer- and
/// admin-managed facilities are untouched. Listings are always repointed before the facility row
/// is deleted.
/// </summary>
/// <param name="ct">Cancellation token forwarded to every database call.</param>
/// <returns>
/// <c>merged</c>: duplicate facilities folded into a keeper; <c>cleaned</c>: junk-named facilities
/// folded into the placeholder. <c>cleaned</c> stays 0 when no placeholder facility exists.
/// </returns>
public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default)
{
    // Loaded without tracking: rows are changed below through ExecuteUpdate/ExecuteDelete, which
    // bypass the change tracker, so tracked copies would go stale in this DbContext.
    var facilities = await _db.Facilities.AsNoTracking().ToListAsync(ct);
    var placeholder = facilities.FirstOrDefault(f => f.Name == UnknownFacilityName);

    // Listing counts per facility, used only to rank keeper candidates in phase 2.
    var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId)
        .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
    var shiftCounts = await _db.Shifts.GroupBy(s => s.FacilityId)
        .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
    int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id);

    // Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a
    // verified facility (those carry real employer data / verification).
    bool Removable(Facility f) => f.OwnerUserId is null && !f.IsVerified
        && f.Verification == VerificationStatus.Unverified
        && (placeholder is null || f.Id != placeholder.Id);

    // Repoint every shift and job from one facility to another, then delete the emptied facility.
    async Task AbsorbAsync(int fromId, int toId)
    {
        await _db.Shifts.Where(s => s.FacilityId == fromId)
            .ExecuteUpdateAsync(u => u.SetProperty(s => s.FacilityId, toId), ct);
        await _db.JobOpenings.Where(j => j.FacilityId == fromId)
            .ExecuteUpdateAsync(u => u.SetProperty(j => j.FacilityId, toId), ct);
        await _db.Facilities.Where(f => f.Id == fromId).ExecuteDeleteAsync(ct); // cascades stray docs/reviews
    }

    int merged = 0, cleaned = 0;

    // 1) Junk-named crawl facilities → fold into the shared placeholder.
    if (placeholder is null)
    {
        // Without a target there is nowhere to move the listings, so the phase is skipped —
        // say so, otherwise "0 removed" is indistinguishable from "nothing to clean".
        _log.LogWarning("Facility cleanup: placeholder facility {Name} not found; junk-named facilities were left in place.", UnknownFacilityName);
    }
    else
    {
        foreach (var f in facilities.Where(f => Removable(f) && FacilityMatcher.IsJunkName(f.Name)).ToList())
        {
            await AbsorbAsync(f.Id, placeholder.Id);
            cleaned++;
        }
    }

    // 2) Merge same-city Persian-fuzzy duplicates into the best keeper.
    //    Re-queried so facilities deleted in phase 1 are no longer candidates.
    var remaining = await _db.Facilities.AsNoTracking()
        .Where(f => f.Name != UnknownFacilityName).ToListAsync(ct);
    var done = new HashSet<int>();
    foreach (var f in remaining)
    {
        if (done.Contains(f.Id)) continue;
        done.Add(f.Id);

        // Cluster = not-yet-handled facilities in the same city whose name matches this one.
        var cluster = remaining.Where(o => o.Id != f.Id && !done.Contains(o.Id)
            && o.CityId == f.CityId && FacilityMatcher.IsSame(o.Name, f.Name)).ToList();
        if (cluster.Count == 0) continue;
        cluster.Add(f);

        // keeper: verified > owned > most listings > lowest id (oldest).
        var keeper = cluster.OrderByDescending(x => x.IsVerified)
            .ThenByDescending(x => x.OwnerUserId.HasValue)
            .ThenByDescending(x => Listings(x.Id)).ThenBy(x => x.Id).First();

        foreach (var dup in cluster.Where(x => x.Id != keeper.Id))
        {
            done.Add(dup.Id); // handled either way, so it never seeds or joins another cluster
            if (!Removable(dup)) continue; // never delete an employer/verified facility
            await AbsorbAsync(dup.Id, keeper.Id);
            merged++;
        }
    }

    _log.LogInformation("Facility cleanup: merged {M} duplicates, removed {C} junk facilities.", merged, cleaned);
    return (merged, cleaned);
}
/// <summary>
/// Passes <paramref name="s"/> through <c>HtmlUtil.ToLatinDigits</c> and returns only the characters
/// of the result for which <see cref="char.IsDigit(char)"/> is true, in their original order.
/// </summary>
private static string DigitsOnly(string s)
{
    var digitChars = HtmlUtil.ToLatinDigits(s).Where(char.IsDigit);
    return string.Concat(digitChars);
}
private static (RawListingStatus status, string? reason, int confidence) Decide(