Facility data hygiene: merge duplicates, drop junk-named facilities
Cleans up the crawl-generated facility table that surfaced garbage on /Facilities («بیمارستان هستم», «... از مدجابز», bare «کلینیک», «سازمان برنامه جنوبی» x3):

- FacilityMatcher.IsJunkName: shared detector for non-names — bare type words, cores made only of filler/verb tokens, and leaked crawl-source/placeholder text. Added داروخانه/آسایشگاه to the generic type words so that bare ones are caught and duplicates are matched more reliably.
- HeuristicListingParser.ExtractFacilityName now rejects junk candidates (and emoji), so new ingests fall back to the shared placeholder instead of forging a fake facility.
- IngestionService.MergeAndCleanFacilitiesAsync (+ admin button): folds junk facilities into the placeholder and merges Persian-fuzzy duplicates into one keeper, repointing their shifts/jobs first.

Hard guard: only purely crawl-generated, unmanaged facilities are removed — employer-owned and verified facilities are never touched.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -76,6 +76,15 @@
|
||||
فقط آگهی‌هایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده می‌شوند (نه صرفاً ناقص) و استخدام‌های تکراری بایگانی می‌شوند (وضعیت «بایگانی»، نه حذف). آگهی‌های معتبر دست‌نخورده‌اند، پس آدرسِ ایندکس‌شده‌شان تغییر نمی‌کند؛ صفحهٔ موارد بایگانی‌شده ۴۱۰ Gone می‌دهد تا گوگل تمیز حذفشان کند.
|
||||
</p>
|
||||
|
||||
<form method="post" onsubmit="return confirm('مراکز درمانیِ تکراری ادغام و مراکزِ بینام/نامعتبر (مثل «بیمارستان هستم» یا «از مدجابز») حذف میشوند؛ آگهیهایشان به مرکزِ معتبر یا «نامشخص» منتقل میشود. مراکزِ ثبتشده توسط کارفرما یا تأییدشده دستنخورده میمانند. ادامه؟');">
|
||||
<button type="submit" asp-page-handler="CleanFacilities" class="btn btn-primary btn-block" style="margin-top:10px;">
|
||||
🏥 ادغام مراکز تکراری و حذف مراکز بی‌نام
|
||||
</button>
|
||||
</form>
|
||||
<p class="muted" style="font-size:11px; margin:6px 0 0;">
|
||||
مراکز تکراری (با تطبیقِ فارسی) در یک رکورد ادغام و مراکزِ بدونِ نامِ واقعی به «نامشخص» منتقل می‌شوند. آگهی‌ها حفظ می‌شوند؛ فقط مراکزِ جمع‌آوری‌شده و مدیریت‌نشده پاک می‌شوند.
|
||||
</p>
|
||||
|
||||
<hr style="border:none; border-top:1px solid var(--line); margin:16px 0;" />
|
||||
|
||||
<h3>افزودن دستی</h3>
|
||||
|
||||
@@ -145,6 +145,18 @@ public class IndexModel : PageModel
|
||||
return RedirectToPage();
|
||||
}
|
||||
|
||||
/// <summary>
/// Admin POST handler for the "CleanFacilities" button. Delegates to
/// <c>MergeAndCleanFacilitiesAsync</c> on the ingestion service, stores a Persian summary of the
/// two returned counters in <c>IngestMessage</c>, and redirects back to this page.
/// </summary>
/// <returns>A redirect to the current page.</returns>
public async Task<IActionResult> OnPostCleanFacilitiesAsync()
{
    var outcome = await _ingest.MergeAndCleanFacilitiesAsync();

    // Unpack the named tuple so the message template below reads the same counters by name.
    int merged = outcome.merged;
    int cleaned = outcome.cleaned;

    IngestMessage = $"پاکسازی مراکز: {merged} مرکزِ تکراری ادغام و {cleaned} مرکزِ بینام/نامعتبر حذف شد (آگهیهایشان به مرکزِ معتبر یا «نامشخص» منتقل شد). مراکز ثبتشده توسط کارفرما/تأییدشده دستنخورده ماند.";

    return RedirectToPage();
}
|
||||
|
||||
private async Task LoadAsync()
|
||||
{
|
||||
Queue = await _db.RawListings
|
||||
|
||||
@@ -218,12 +218,17 @@ public class HeuristicListingParser : IListingParser
|
||||
{
|
||||
if (NameStops.Contains(w)) break;
|
||||
if (Regex.IsMatch(w, @"\d")) break; // numbers/phones aren't names
|
||||
if (!w.Any(char.IsLetter)) break; // emoji / punctuation («📍») isn't a name
|
||||
if (w.Length == 1) break; // stray letters
|
||||
picked.Add(w);
|
||||
if (picked.Count >= 3) break;
|
||||
}
|
||||
if (picked.Count == 0) continue; // bare keyword (e.g. just «بیمارستان») isn't useful
|
||||
return (kw + " " + string.Join(" ", picked)).Trim();
|
||||
var candidate = (kw + " " + string.Join(" ", picked)).Trim();
|
||||
// Reject names that are only filler/verb/source noise («بیمارستان هستم», «... از مدجابز») —
|
||||
// a real name couldn't be extracted, so fall back to the shared placeholder downstream.
|
||||
if (Scraping.FacilityMatcher.IsJunkName(candidate)) continue;
|
||||
return candidate;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ public static class FacilityMatcher
|
||||
{
|
||||
"بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
|
||||
"مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
|
||||
"آزمایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
|
||||
"آزمایشگاه", "داروخانه", "آسایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
|
||||
};
|
||||
|
||||
/// <summary>Lower-cased, Arabic→Persian folded, punctuation-stripped, whitespace-collapsed.</summary>
|
||||
@@ -47,6 +47,42 @@ public static class FacilityMatcher
|
||||
return Regex.Replace(n, @"\s+", " ").Trim();
|
||||
}
|
||||
|
||||
// Tokens that can never be a facility name on their own. IsJunkName treats a name as junk when
// every token of its core is one of these (or a single character). They typically get swept in
// by the parser when an ad names no facility («بیمارستان هستم», «مطب نیازمندیم سه»,
// «کلینیک های فقط منطقه»).
private static readonly string[] JunkCoreWords =
{
    // "to be" / "to become" verb forms
    "هستم", "هستیم", "هستش", "میشوم", "میشم", "بشوم", "میباشد", "باشد", "میباشم",

    // "need" / "have" / "want" verb forms
    "نیازمندیم", "نیازمند", "نیازمندم", "داریم", "دارم", "میخواهیم", "میخوام",

    // adverbs, locators, plural suffix and generic adjectives
    "حتی", "تعدادی", "فقط", "منطقه", "واقع", "های", "مبتدی", "محترم", "خوب",

    // numerals, demonstratives and generic staff nouns
    "سه", "دو", "یک", "چند", "این", "آن", "همکار", "نیرو",
};
|
||||
|
||||
// Substrings that mark a name as leaked crawl-source text or placeholder text rather than a real
// facility, e.g. «مرکز درمانی (از مدجابز)». IsJunkName normalizes each entry and rejects any
// name whose normalized form contains it.
private static readonly string[] SourceMarkers =
{
    // crawl source names
    "مدجابز", "مدجاب",

    // "from <channel>" phrases
    "از تلگرام", "از دیوار", "از بله", "از کانال",

    // placeholder texts ("not registered", "unknown")
    "ثبت نشده", "نامشخص",
};
|
||||
|
||||
/// <summary>
/// Reports whether <paramref name="name"/> is NOT usable as a facility name. A name is junk when:
/// its normalized form is empty; its normalized form contains any (normalized) entry of
/// <c>SourceMarkers</c> («... از مدجابز», «نامشخص»); its <c>Core</c> is empty (a bare type word
/// such as «بیمارستان»); or every token of its core is a single character or a junk core word
/// («بیمارستان هستم» → «هستم»). Callers are expected to fall back to the shared placeholder
/// facility for such names instead of creating a fake one.
/// </summary>
/// <param name="name">Candidate facility name; passed to <c>Normalize</c> and <c>Core</c> as-is
/// (null is presumably mapped to an empty string there — TODO confirm).</param>
/// <returns><c>true</c> when the name should be rejected; otherwise <c>false</c>.</returns>
public static bool IsJunkName(string? name)
{
    var normalized = Normalize(name);
    if (normalized.Length == 0) return true;
    if (SourceMarkers.Any(m => normalized.Contains(Normalize(m)))) return true;

    var core = Core(name);
    if (core.Length == 0) return true; // bare type word only («بیمارستان», «کلینیک»)

    // The source markers above are normalized before comparison, but the junk core words used to
    // be compared raw. Core tokens presumably come out of the same normalization (TODO confirm),
    // so an entry that Normalize folds to another spelling could never match. Compare against the
    // normalized spelling as well; the raw spelling is kept so everything that matched before
    // still matches.
    var junkWords = JunkCoreWords
        .Concat(JunkCoreWords.Select(w => Normalize(w)))
        .ToArray();

    var tokens = core.Split(' ', StringSplitOptions.RemoveEmptyEntries);
    return tokens.All(t => t.Length <= 1 || junkWords.Contains(t));
}
|
||||
|
||||
/// <summary>True when two names almost certainly denote the same facility.</summary>
|
||||
public static bool IsSame(string? a, string? b)
|
||||
{
|
||||
|
||||
@@ -418,6 +418,79 @@ public class IngestionService
|
||||
return archived;
|
||||
}
|
||||
|
||||
/// <summary>
/// Cleans up the crawl-generated facility table in two passes:
/// (1) every removable facility whose name <c>FacilityMatcher.IsJunkName</c> rejects
/// («بیمارستان هستم», «... از مدجابز», bare «کلینیک») has its shifts and job openings repointed
/// to the shared placeholder facility and is then deleted;
/// (2) facilities in the same city whose names <c>FacilityMatcher.IsSame</c> considers equal are
/// clustered, one keeper is chosen (verified, then owned, then most listings, then lowest id) and
/// every other removable member is absorbed into it.
/// HARD GUARD: a facility is only ever deleted when it has no owner, is not verified, has
/// <c>VerificationStatus.Unverified</c> and is not the placeholder itself. Shifts and job openings
/// are always repointed before the facility row is deleted.
/// </summary>
/// <param name="ct">Cancellation token flowed into every database call.</param>
/// <returns>
/// <c>merged</c>: number of duplicate facilities absorbed into a keeper;
/// <c>cleaned</c>: number of junk-named facilities folded into the placeholder.
/// </returns>
/// <remarks>
/// NOTE(review): the bulk statements are not wrapped in a transaction, so a failure part-way
/// leaves the facilities processed so far merged and the rest untouched.
/// </remarks>
public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default)
{
    // Read-only snapshot. All writes below go through ExecuteUpdate/ExecuteDelete, which bypass
    // the change tracker; tracking these rows would only leave stale, already-deleted entities
    // attached to the context.
    var facilities = await _db.Facilities.AsNoTracking().ToListAsync(ct);
    var placeholder = facilities.FirstOrDefault(f => f.Name == UnknownFacilityName);

    // Listing counts per facility, taken once up front; used only to rank keeper candidates.
    var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId)
        .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
    var shiftCounts = await _db.Shifts.GroupBy(s => s.FacilityId)
        .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
    int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id);

    // Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a
    // verified facility (those carry real employer data / verification).
    bool Removable(Facility f) => f.OwnerUserId is null && !f.IsVerified
        && f.Verification == VerificationStatus.Unverified
        && (placeholder is null || f.Id != placeholder.Id);

    // Repoint every shift and job opening from one facility to another, then delete the source.
    async Task AbsorbAsync(int fromId, int toId)
    {
        await _db.Shifts.Where(s => s.FacilityId == fromId)
            .ExecuteUpdateAsync(u => u.SetProperty(s => s.FacilityId, toId), ct);
        await _db.JobOpenings.Where(j => j.FacilityId == fromId)
            .ExecuteUpdateAsync(u => u.SetProperty(j => j.FacilityId, toId), ct);
        // Per the original author's note, remaining dependents (documents/reviews) go away via
        // cascade delete — presumably configured at the database level; verify against the model.
        await _db.Facilities.Where(f => f.Id == fromId).ExecuteDeleteAsync(ct);
    }

    int merged = 0, cleaned = 0;

    // 1) Junk-named crawl facilities → fold into the shared placeholder.
    if (placeholder is null)
    {
        // Without a placeholder there is nowhere to move the listings, so the junk pass cannot
        // run. Say so instead of silently reporting "0 cleaned".
        _log.LogWarning(
            "Facility cleanup: placeholder facility {Name} not found; junk-named facilities were left in place.",
            UnknownFacilityName);
    }
    else
    {
        foreach (var junk in facilities.Where(f => Removable(f) && FacilityMatcher.IsJunkName(f.Name)))
        {
            await AbsorbAsync(junk.Id, placeholder.Id);
            cleaned++;
        }
    }

    // 2) Merge same-city Persian-fuzzy duplicates into the best keeper. Re-query so facilities
    //    deleted in pass 1 are gone; placeholder-named rows never take part.
    var remaining = await _db.Facilities.AsNoTracking()
        .Where(f => f.Name != UnknownFacilityName).ToListAsync(ct);
    var done = new HashSet<int>();
    foreach (var f in remaining)
    {
        if (done.Contains(f.Id)) continue;
        done.Add(f.Id);

        var cluster = remaining.Where(o => o.Id != f.Id && !done.Contains(o.Id)
            && o.CityId == f.CityId && FacilityMatcher.IsSame(o.Name, f.Name)).ToList();
        if (cluster.Count == 0) continue;
        cluster.Add(f);

        // keeper: verified > owned > most listings > lowest id (oldest).
        var keeper = cluster.OrderByDescending(x => x.IsVerified)
            .ThenByDescending(x => x.OwnerUserId.HasValue)
            .ThenByDescending(x => Listings(x.Id)).ThenBy(x => x.Id).First();

        foreach (var dup in cluster.Where(x => x.Id != keeper.Id))
        {
            // Mark as handled even when it is skipped below, so it does not seed its own cluster.
            done.Add(dup.Id);
            if (!Removable(dup)) continue; // never delete an employer/verified facility
            await AbsorbAsync(dup.Id, keeper.Id);
            merged++;
        }
    }

    _log.LogInformation("Facility cleanup: merged {M} duplicates, removed {C} junk facilities.", merged, cleaned);
    return (merged, cleaned);
}
|
||||
|
||||
/// <summary>
/// Passes <paramref name="s"/> through <c>HtmlUtil.ToLatinDigits</c> and returns a string made of
/// only those resulting characters for which <see cref="char.IsDigit(char)"/> is true.
/// </summary>
private static string DigitsOnly(string s)
{
    var latin = HtmlUtil.ToLatinDigits(s);
    char[] digits = latin.Where(char.IsDigit).ToArray();
    return new string(digits);
}
|
||||
|
||||
private static (RawListingStatus status, string? reason, int confidence) Decide(
|
||||
|
||||
Reference in New Issue
Block a user