diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml b/src/JobsMedical.Web/Pages/Admin/Index.cshtml
index c52986e..68c3c19 100644
--- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml
+++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml
@@ -76,6 +76,15 @@
فقط آگهیهایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده میشوند (نه صرفاً ناقص) و استخدامهای تکراری بایگانی میشوند (وضعیت «بایگانی»، نه حذف). آگهیهای معتبر دستنخوردهاند، پس آدرسِ ایندکسشدهشان تغییر نمیکند؛ صفحهٔ موارد بایگانیشده ۴۱۰ Gone میدهد تا گوگل تمیز حذفشان کند.
+
+
+ مراکز تکراری (با تطبیقِ فارسی) در یک رکورد ادغام و مراکزِ بدونِ نامِ واقعی به «نامشخص» منتقل میشوند. آگهیها حفظ میشوند؛ فقط مراکزِ جمعآوریشده و مدیریتنشده پاک میشوند.
+
+
افزودن دستی
diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs
index 35c8c8f..a099017 100644
--- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs
+++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs
@@ -145,6 +145,18 @@ public class IndexModel : PageModel
return RedirectToPage();
}
+ /// <summary>
+ /// POST handler for the admin "clean facilities" action. Awaits
+ /// <c>_ingest.MergeAndCleanFacilitiesAsync()</c>, formats the two returned counts (merged
+ /// duplicates, cleaned junk-named facilities) into a Persian status text stored in
+ /// <c>IngestMessage</c>, then redirects back to this page.
+ /// </summary>
+ /// <returns>The redirect result produced by <c>RedirectToPage()</c>.</returns>
+ public async Task<IActionResult> OnPostCleanFacilitiesAsync()
+ {
+     // The handler returns a redirect result, so the return type must be Task<IActionResult>;
+     // a plain Task cannot carry the value returned below.
+     var (merged, cleaned) = await _ingest.MergeAndCleanFacilitiesAsync();
+     IngestMessage = $"پاکسازی مراکز: {merged} مرکزِ تکراری ادغام و {cleaned} مرکزِ بینام/نامعتبر حذف شد (آگهیهایشان به مرکزِ معتبر یا «نامشخص» منتقل شد). مراکز ثبتشده توسط کارفرما/تأییدشده دستنخورده ماند.";
+     return RedirectToPage();
+ }
+
private async Task LoadAsync()
{
Queue = await _db.RawListings
diff --git a/src/JobsMedical.Web/Services/ListingParser.cs b/src/JobsMedical.Web/Services/ListingParser.cs
index 9854c38..a13c445 100644
--- a/src/JobsMedical.Web/Services/ListingParser.cs
+++ b/src/JobsMedical.Web/Services/ListingParser.cs
@@ -218,12 +218,17 @@ public class HeuristicListingParser : IListingParser
{
if (NameStops.Contains(w)) break;
if (Regex.IsMatch(w, @"\d")) break; // numbers/phones aren't names
+ if (!w.Any(char.IsLetter)) break; // emoji / punctuation («📍») isn't a name
if (w.Length == 1) break; // stray letters
picked.Add(w);
if (picked.Count >= 3) break;
}
if (picked.Count == 0) continue; // bare keyword (e.g. just «بیمارستان») isn't useful
- return (kw + " " + string.Join(" ", picked)).Trim();
+ var candidate = (kw + " " + string.Join(" ", picked)).Trim();
+ // Reject names that are only filler/verb/source noise («بیمارستان هستم», «... از مدجابز») —
+ // a real name couldn't be extracted, so fall back to the shared placeholder downstream.
+ if (Scraping.FacilityMatcher.IsJunkName(candidate)) continue;
+ return candidate;
}
return null;
}
diff --git a/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs b/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs
index 5b97f2e..4eca8cd 100644
--- a/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs
+++ b/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs
@@ -17,7 +17,7 @@ public static class FacilityMatcher
{
"بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
"مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
- "آزمایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
+ "آزمایشگاه", "داروخانه", "آسایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
};
/// Lower-cased, Arabic→Persian folded, punctuation-stripped, whitespace-collapsed.
@@ -47,6 +47,42 @@ public static class FacilityMatcher
return Regex.Replace(n, @"\s+", " ").Trim();
}
+ // Filler/verb/locator tokens that are never a real facility name — the parser sweeps these in
+ // when an ad has no named facility («بیمارستان هستم», «مطب نیازمندیم سه», «کلینیک های فقط منطقه»).
+ // IsJunkName checks each space-separated token of Core(name) for exact membership in this list,
+ // so every entry must be a single word; a multi-word entry could never match.
+ // NOTE(review): unlike SourceMarkers, these entries are not passed through Normalize before the
+ // comparison — presumably they are already spelled the way Core() emits tokens; confirm.
+ private static readonly string[] JunkCoreWords =
+ {
+ "هستم", "هستیم", "هستش", "میشوم", "میشم", "بشوم", "میباشد", "باشد", "میباشم",
+ "نیازمندیم", "نیازمند", "نیازمندم", "داریم", "دارم", "میخواهیم", "میخوام",
+ "حتی", "تعدادی", "فقط", "منطقه", "واقع", "های", "مبتدی", "محترم", "خوب",
+ "سه", "دو", "یک", "چند", "این", "آن", "همکار", "نیرو",
+ };
+
+ // Crawl-source names that must never appear as a public facility («مرکز درمانی (از مدجابز)»),
+ // plus the shared placeholder text.
+ // IsJunkName normalizes each marker and tests it as a substring of the normalized name, so a
+ // marker also matches when it occurs inside a longer word or phrase.
+ // NOTE(review): with substring matching «مدجابز» looks already covered by «مدجاب» — presumably
+ // kept for readability; confirm before pruning.
+ private static readonly string[] SourceMarkers =
+ {
+ "مدجابز", "مدجاب", "از تلگرام", "از دیوار", "از بله", "از کانال", "ثبت نشده", "نامشخص",
+ };
+
+ /// <summary>
+ /// Reports whether <paramref name="name"/> is unusable as a facility name. A name is rejected when
+ /// its normalized form is empty, when it contains any normalized entry of <c>SourceMarkers</c>
+ /// (leaked crawl source or placeholder text), when <c>Core</c> leaves nothing of it (a bare type
+ /// word such as «بیمارستان»), or when every remaining core token is a single character or one of
+ /// <c>JunkCoreWords</c> («بیمارستان هستم» → «هستم»). Callers are expected to fall back to the
+ /// shared placeholder instead of creating a facility from such a name.
+ /// </summary>
+ /// <param name="name">Candidate facility name; may be <c>null</c>.</param>
+ /// <returns><c>true</c> when the name should not be used as a facility name.</returns>
+ public static bool IsJunkName(string? name)
+ {
+     var folded = Normalize(name);
+     if (folded.Length == 0)
+     {
+         return true;
+     }
+
+     // Any crawl-source / placeholder marker anywhere in the name disqualifies it.
+     foreach (var marker in SourceMarkers)
+     {
+         if (folded.Contains(Normalize(marker)))
+         {
+             return true;
+         }
+     }
+
+     var distinctive = Core(name);
+     if (distinctive.Length == 0)
+     {
+         return true; // nothing left besides the facility-type word(s)
+     }
+
+     // One token that is longer than a single character and not filler makes the name usable.
+     foreach (var word in distinctive.Split(' ', StringSplitOptions.RemoveEmptyEntries))
+     {
+         if (word.Length > 1 && !JunkCoreWords.Contains(word))
+         {
+             return false;
+         }
+     }
+
+     return true;
+ }
+
/// True when two names almost certainly denote the same facility.
public static bool IsSame(string? a, string? b)
{
diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
index 21a7742..8483649 100644
--- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
+++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
@@ -418,6 +418,79 @@ public class IngestionService
return archived;
}
+ /// <summary>
+ /// Cleans up crawl-generated facilities in two passes and reports how many rows each pass removed.
+ /// Pass 1 repoints the shifts and job openings of junk-named facilities (as judged by
+ /// <c>FacilityMatcher.IsJunkName</c>: «بیمارستان هستم», «... از مدجابز», bare «کلینیک») to the shared
+ /// placeholder facility and deletes the junk row; the pass is skipped when no placeholder row exists.
+ /// Pass 2 groups the remaining facilities that share a city and whose names match under
+ /// <c>FacilityMatcher.IsSame</c>, picks one keeper per group and folds the removable duplicates
+ /// into it. HARD GUARD: a facility is only ever deleted when it has no owner, is not verified, has
+ /// <c>Unverified</c> status and is not the placeholder. Listings are always repointed before the
+ /// facility row is deleted.
+ /// </summary>
+ /// <param name="ct">Cancellation token forwarded to every database call.</param>
+ /// <returns>
+ /// <c>merged</c>: duplicates folded into a keeper; <c>cleaned</c>: junk-named facilities folded into
+ /// the placeholder.
+ /// </returns>
+ public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default)
+ {
+     var facilities = await _db.Facilities.ToListAsync(ct);
+     var placeholder = facilities.FirstOrDefault(f => f.Name == UnknownFacilityName);
+
+     // Listing totals per facility, taken once up front; only used as a keeper tie-breaker.
+     var jobCounts = await _db.JobOpenings.GroupBy(j => j.FacilityId)
+         .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
+     var shiftCounts = await _db.Shifts.GroupBy(s => s.FacilityId)
+         .Select(g => new { g.Key, C = g.Count() }).ToDictionaryAsync(x => x.Key, x => x.C, ct);
+     int Listings(int id) => jobCounts.GetValueOrDefault(id) + shiftCounts.GetValueOrDefault(id);
+
+     // Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a
+     // verified facility.
+     bool Removable(Facility f) => f.OwnerUserId is null && !f.IsVerified
+         && f.Verification == VerificationStatus.Unverified
+         && (placeholder is null || f.Id != placeholder.Id);
+
+     // Repoint every shift and job opening from one facility to another, then delete the source row.
+     // NOTE(review): these are three separate set-based statements and no explicit transaction is
+     // opened here; a failure between them leaves listings repointed but the source row in place.
+     async Task AbsorbAsync(int fromId, int toId)
+     {
+         await _db.Shifts.Where(s => s.FacilityId == fromId)
+             .ExecuteUpdateAsync(u => u.SetProperty(s => s.FacilityId, toId), ct);
+         await _db.JobOpenings.Where(j => j.FacilityId == fromId)
+             .ExecuteUpdateAsync(u => u.SetProperty(j => j.FacilityId, toId), ct);
+         // Presumably relies on cascade delete for any remaining dependents (docs/reviews) — confirm.
+         await _db.Facilities.Where(f => f.Id == fromId).ExecuteDeleteAsync(ct);
+     }
+
+     int merged = 0, cleaned = 0;
+
+     // 1) Junk-named crawl facilities → fold into the shared placeholder.
+     if (placeholder is not null)
+     {
+         foreach (var f in facilities.Where(f => Removable(f) && FacilityMatcher.IsJunkName(f.Name)).ToList())
+         {
+             await AbsorbAsync(f.Id, placeholder.Id);
+             cleaned++;
+         }
+     }
+
+     // 2) Merge same-city Persian-fuzzy duplicates into the best keeper. Re-query so rows deleted
+     //    in pass 1 are no longer considered.
+     var remaining = await _db.Facilities.Where(f => f.Name != UnknownFacilityName).ToListAsync(ct);
+     // Ids already assigned to a cluster (as seed, keeper or duplicate). Must be HashSet<int>:
+     // System.Collections.Generic has no non-generic HashSet.
+     var done = new HashSet<int>();
+     foreach (var f in remaining)
+     {
+         if (done.Contains(f.Id)) continue;
+         done.Add(f.Id);
+         var cluster = remaining.Where(o => o.Id != f.Id && !done.Contains(o.Id)
+             && o.CityId == f.CityId && FacilityMatcher.IsSame(o.Name, f.Name)).ToList();
+         if (cluster.Count == 0) continue;
+         cluster.Add(f);
+         // keeper: verified > owned > most listings > lowest id (oldest).
+         var keeper = cluster.OrderByDescending(x => x.IsVerified)
+             .ThenByDescending(x => x.OwnerUserId.HasValue)
+             .ThenByDescending(x => Listings(x.Id)).ThenBy(x => x.Id).First();
+         foreach (var dup in cluster.Where(x => x.Id != keeper.Id))
+         {
+             done.Add(dup.Id);
+             if (!Removable(dup)) continue; // never delete an employer/verified facility
+             await AbsorbAsync(dup.Id, keeper.Id);
+             merged++;
+         }
+     }
+
+     _log.LogInformation("Facility cleanup: merged {M} duplicates, removed {C} junk facilities.", merged, cleaned);
+     return (merged, cleaned);
+ }
+
private static string DigitsOnly(string s) => new(HtmlUtil.ToLatinDigits(s).Where(char.IsDigit).ToArray());
private static (RawListingStatus status, string? reason, int confidence) Decide(