diff --git a/src/JobsMedical.Web/Pages/Admin/Review.cshtml b/src/JobsMedical.Web/Pages/Admin/Review.cshtml
index 6ae6115..6824f2a 100644
--- a/src/JobsMedical.Web/Pages/Admin/Review.cshtml
+++ b/src/JobsMedical.Web/Pages/Admin/Review.cshtml
@@ -51,10 +51,10 @@
diff --git a/src/JobsMedical.Web/Pages/Admin/Review.cshtml.cs b/src/JobsMedical.Web/Pages/Admin/Review.cshtml.cs
index 1832c05..0923a1b 100644
--- a/src/JobsMedical.Web/Pages/Admin/Review.cshtml.cs
+++ b/src/JobsMedical.Web/Pages/Admin/Review.cshtml.cs
@@ -1,6 +1,7 @@
using JobsMedical.Web.Data;
using JobsMedical.Web.Models;
using JobsMedical.Web.Services;
+using JobsMedical.Web.Services.Scraping;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.Mvc.RazorPages;
@@ -72,6 +73,25 @@ public class ReviewModel : PageModel
if (Parsed.PayAmount is not null) { PayAmount = Parsed.PayAmount; SalaryMin = Parsed.PayAmount; }
Description = Raw.RawText;
Title = Parsed.RoleName is not null ? $"استخدام {Parsed.RoleName}" : "موقعیت استخدامی";
+
+ // Facility: try to match the listing's facility to one we already have; otherwise
+ // prefill the "new facility" box so publishing creates it.
+ if (!string.IsNullOrWhiteSpace(Parsed.FacilityName))
+ {
+ var cityId = await _db.Cities.Where(c => c.Name == Parsed.CityName)
+ .Select(c => (int?)c.Id).FirstOrDefaultAsync();
+ var match = FacilityMatcher.FindBest(Facilities, Parsed.FacilityName, cityId);
+ if (match is not null)
+ {
+ FacilityId = match.Id;
+ Parsed.Notes.Add($"مرکز منطبق در سیستم: «{match.Name}» — همین انتخاب شد.");
+ }
+ else
+ {
+ NewFacilityName = Parsed.FacilityName;
+ Parsed.Notes.Add($"مرکز جدید پیشنهادی: «{Parsed.FacilityName}» — هنگام انتشار ساخته میشود.");
+ }
+ }
return Page();
}
@@ -181,15 +201,17 @@ public class ReviewModel : PageModel
if (string.IsNullOrWhiteSpace(NewFacilityName))
return null;
- // Reuse a same-named facility if one already exists, else create it.
var name = NewFacilityName.Trim();
- var existing = await _db.Facilities.FirstOrDefaultAsync(f => f.Name == name);
- if (existing is not null) return existing.Id;
-
var cityId = await _db.Cities.OrderByDescending(c => c.IsActive)
.Select(c => (int?)c.Id).FirstOrDefaultAsync();
if (cityId is null) return null; // no cities seeded — cannot create a facility
+ // Reuse an existing facility that's exactly or closely the same (Persian-aware fuzzy
+ // match), so we don't create duplicates like «بیمارستان میلاد» vs «میلاد».
+ var all = await _db.Facilities.ToListAsync();
+ var match = FacilityMatcher.FindBest(all, name, cityId);
+ if (match is not null) return match.Id;
+
var facility = new Facility
{
Name = name,
diff --git a/src/JobsMedical.Web/Services/ListingParser.cs b/src/JobsMedical.Web/Services/ListingParser.cs
index 03a90cc..1ffc506 100644
--- a/src/JobsMedical.Web/Services/ListingParser.cs
+++ b/src/JobsMedical.Web/Services/ListingParser.cs
@@ -16,6 +16,7 @@ public class ParsedListing
public Gender Gender { get; set; } = Gender.Any; // جنسیت مورد نیاز
public string? CityName { get; set; }
public string? DistrictName { get; set; }
+ public string? FacilityName { get; set; } // hospital/clinic name guessed from the text
public string? Phone { get; set; }
public List
Notes { get; set; } = new(); // what was/wasn't detected (shown to admin)
}
@@ -107,6 +108,10 @@ public class HeuristicListingParser : IListingParser
else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
}
+ // --- Facility name (بیمارستان/درمانگاه/کلینیک ... + the distinctive name) ---
+ p.FacilityName = ExtractFacilityName(text);
+ if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");
+
// --- Phone ---
var phone = Regex.Match(ToLatinDigits(text), @"0?9\d{9}");
if (phone.Success) p.Phone = phone.Value;
@@ -114,6 +119,48 @@ public class HeuristicListingParser : IListingParser
return p;
}
+ // Phrases that introduce a facility name. ExtractFacilityName tries them in array order and returns on the first one that yields a name, so longer/more specific phrases must precede their shorter forms.
+ private static readonly string[] FacilityKeywords =
+ {
+ "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
+ "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
+ "آزمایشگاه", "مطب", "خانه سالمندان", "سرای سالمندان",
+ };
+
+ // Words that clearly aren't part of a facility's name: ExtractFacilityName stops collecting name words at the first token that exactly equals one of these.
+ private static readonly string[] NameStops =
+ {
+ "جهت", "برای", "به", "با", "در", "از", "که", "نیاز", "نیازمند", "استخدام", "جذب",
+ "دعوت", "همکاری", "واقع", "آدرس", "تلفن", "شماره", "شیفت", "ساعت", "حقوق", "روز",
+ "شب", "صبح", "عصر", "می", "ها", "این", "یک", "محترم",
+ };
+
+ /// <summary>
+ /// Best-effort hospital/clinic name: a facility keyword plus up to three name words.
+ /// </summary>
+ /// <param name="text">Raw listing text to scan.</param>
+ /// <returns>
+ /// The keyword followed by the collected name words, or <c>null</c> when no keyword in
+ /// <see cref="FacilityKeywords"/> is followed by at least one usable word.
+ /// </returns>
+ private static string? ExtractFacilityName(string text)
+ {
+     if (string.IsNullOrWhiteSpace(text)) return null;
+
+     var separators = new[] { ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/', '«', '»', '"' };
+
+     foreach (var kw in FacilityKeywords)
+     {
+         // Try every occurrence of the keyword, not only the first: an early mention with
+         // no usable name after it must not hide a later «keyword + name».
+         for (var idx = text.IndexOf(kw, StringComparison.Ordinal);
+              idx >= 0;
+              idx = text.IndexOf(kw, idx + kw.Length, StringComparison.Ordinal))
+         {
+             var end = idx + kw.Length;
+
+             // Whole-word matches only. Without this, «تمرکز» is read as the keyword «مرکز»
+             // and «بیمارستانهای» leaks «های» into the name.
+             if (idx > 0 && char.IsLetterOrDigit(text[idx - 1])) continue;
+             if (end < text.Length && char.IsLetterOrDigit(text[end])) continue;
+
+             var picked = new List<string>(3);
+             foreach (var w in text[end..].Split(separators, StringSplitOptions.RemoveEmptyEntries))
+             {
+                 if (NameStops.Contains(w)) break;
+                 if (w.Any(char.IsDigit)) break; // numbers/phones aren't names
+                 if (w.Length == 1) break;       // stray letters
+                 picked.Add(w);
+                 if (picked.Count >= 3) break;
+             }
+
+             // A bare keyword (e.g. just «بیمارستان») isn't useful — keep looking.
+             if (picked.Count > 0)
+                 return (kw + " " + string.Join(" ", picked)).Trim();
+         }
+     }
+     return null;
+ }
+
/// Pull a Toman figure out of free text, handling «میلیون» and Persian digits.
private static long? ExtractAmount(string text)
{
diff --git a/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs b/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs
new file mode 100644
index 0000000..5b97f2e
--- /dev/null
+++ b/src/JobsMedical.Web/Services/Scraping/FacilityMatcher.cs
@@ -0,0 +1,109 @@
+using System.Text;
+using System.Text.RegularExpressions;
+using JobsMedical.Web.Models;
+
+namespace JobsMedical.Web.Services.Scraping;
+
+/// <summary>
+/// Persian-aware fuzzy matching for facility names, so the same hospital written slightly
+/// differently — spacing, ي/ك vs ی/ک, ZWNJ, with or without «بیمارستان» — resolves to one
+/// record instead of creating a duplicate. Used by both the manual review/publish flow and
+/// the auto-publish ingestion pipeline.
+/// </summary>
+public static class FacilityMatcher
+{
+ // Generic type words stripped to compare the distinctive core of a name.
+ // Core removes them in array order, so a multi-word phrase must come before any shorter
+ // entry it contains («دی کلینیک» before «کلینیک», «فوق تخصصی» before «تخصصی»); otherwise
+ // the shorter word is removed first and an orphan («دی», «فوق») is left in the core.
+ private static readonly string[] TypeWords =
+ {
+     "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "دی کلینیک", "کلینیک",
+     "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
+     "آزمایشگاه", "مطب", "فوق تخصصی", "فوقتخصصی", "تخصصی", "عمومی", "دکتر",
+ };
+
+ /// <summary>
+ /// Canonical comparison form of a name: Arabic letter variants folded to their Persian
+ /// forms, lower-cased, every character that is not a letter or digit turned into a space,
+ /// and whitespace runs collapsed and trimmed.
+ /// </summary>
+ /// <param name="s">Name to normalize; may be null.</param>
+ /// <returns>The normalized name, or an empty string for null/blank input.</returns>
+ public static string Normalize(string? s)
+ {
+     if (string.IsNullOrWhiteSpace(s)) return "";
+     var t = s.Replace('ي', 'ی').Replace('ك', 'ک').Replace('ۀ', 'ه').Replace('ة', 'ه')
+         .Replace('أ', 'ا').Replace('إ', 'ا').Replace('آ', 'ا').Replace('ئ', 'ی')
+         // Zero-width non-joiner, written as an escape: the literal character is invisible
+         // and easily lost by editors/tools, which leaves an invalid empty char literal.
+         .Replace('\u200c', ' ')
+         .ToLowerInvariant();
+     var sb = new StringBuilder(t.Length);
+     foreach (var ch in t)
+         sb.Append(char.IsLetterOrDigit(ch) || ch == ' ' ? ch : ' ');
+     return Regex.Replace(sb.ToString(), @"\s+", " ").Trim();
+ }
+
+ /// Normalized name with generic type words removed — the distinctive part.
+ public static string Core(string? s)
+ {
+ var n = Normalize(s);
+ if (n.Length == 0) return "";
+ foreach (var w in TypeWords)
+ {
+ var nw = Normalize(w);
+ if (nw.Length == 0) continue;
+ n = Regex.Replace(n, $@"(?True when two names almost certainly denote the same facility.
+ /// <summary>
+ /// True when two names almost certainly denote the same facility.
+ /// </summary>
+ /// <remarks>
+ /// Checked in order: identical normalized names; identical cores; one core appearing as a
+ /// whole-word run inside the other; finally an edit-distance similarity of at least 0.86.
+ /// </remarks>
+ /// <param name="a">First name; null/blank never matches.</param>
+ /// <param name="b">Second name; null/blank never matches.</param>
+ public static bool IsSame(string? a, string? b)
+ {
+     var na = Normalize(a);
+     var nb = Normalize(b);
+     if (na.Length == 0 || nb.Length == 0) return false;
+     if (na == nb) return true;
+
+     var ca = Core(a);
+     var cb = Core(b);
+     if (ca.Length >= 2 && ca == cb) return true;
+
+     // One core contains the other as whole words (e.g. «میلاد» vs «میلاد ۱»). Padding both
+     // sides with spaces makes this a word-level test; a raw substring test would also merge
+     // unrelated names such as «نور» and «نورافشار».
+     if (ca.Length >= 3 && cb.Length >= 3)
+     {
+         var pa = " " + ca + " ";
+         var pb = " " + cb + " ";
+         if (pa.Contains(pb, StringComparison.Ordinal) || pb.Contains(pa, StringComparison.Ordinal))
+             return true;
+     }
+
+     // Edit-distance similarity on the most informative basis: the cores when both are long
+     // enough to be meaningful, otherwise the full normalized names.
+     var (x, y) = ca.Length >= 3 && cb.Length >= 3 ? (ca, cb) : (na, nb);
+     return Similarity(x, y) >= 0.86;
+ }
+
+ /// <summary>
+ /// Best existing facility for <paramref name="name"/>: same-city exact match first, then
+ /// any-city exact, then same-city fuzzy, then any-city fuzzy. Null when nothing matches.
+ /// </summary>
+ /// <param name="facilities">Candidate facilities; materialized once if not already a list.</param>
+ /// <param name="name">Name to look up; null, blank or punctuation-only yields null.</param>
+ /// <param name="cityId">Preferred city; when null the same-city passes match nothing.</param>
+ /// <returns>The first facility satisfying the highest-priority pass, or null.</returns>
+ public static Facility? FindBest(IEnumerable<Facility> facilities, string? name, int? cityId)
+ {
+     if (string.IsNullOrWhiteSpace(name)) return null;
+     var target = Normalize(name);
+     // A name made only of punctuation normalizes to "", which would "exactly" equal any
+     // facility whose own name also normalizes to "" — treat it as no name at all.
+     if (target.Length == 0) return null;
+
+     var list = facilities as IList<Facility> ?? facilities.ToList();
+
+     return list.FirstOrDefault(f => cityId.HasValue && f.CityId == cityId && Normalize(f.Name) == target)
+         ?? list.FirstOrDefault(f => Normalize(f.Name) == target)
+         ?? list.FirstOrDefault(f => cityId.HasValue && f.CityId == cityId && IsSame(f.Name, name))
+         ?? list.FirstOrDefault(f => IsSame(f.Name, name));
+ }
+
+ // Similarity score: 1 for identical strings, otherwise 1 minus the edit distance divided
+ // by the length of the longer string.
+ private static double Similarity(string a, string b)
+ {
+     if (a == b)
+     {
+         return 1;
+     }
+
+     var longest = a.Length > b.Length ? a.Length : b.Length;
+     if (longest == 0)
+     {
+         return 1;
+     }
+
+     return 1.0 - (double)Levenshtein(a, b) / longest;
+ }
+
+ // Edit distance (insert / delete / substitute, each costing 1) computed with a single
+ // rolling row: row[k] holds the distance between the processed prefix of a and b[..k].
+ private static int Levenshtein(string a, string b)
+ {
+     var row = new int[b.Length + 1];
+     for (var k = 0; k < row.Length; k++)
+     {
+         row[k] = k;
+     }
+
+     for (var i = 0; i < a.Length; i++)
+     {
+         // diagonal = value from the previous row, one column to the left.
+         var diagonal = row[0];
+         row[0] = i + 1;
+
+         for (var j = 0; j < b.Length; j++)
+         {
+             var above = row[j + 1];
+             var deletion = above + 1;
+             var insertion = row[j] + 1;
+             var substitution = diagonal + (a[i] == b[j] ? 0 : 1);
+
+             row[j + 1] = Math.Min(Math.Min(deletion, insertion), substitution);
+             diagonal = above;
+         }
+     }
+
+     return row[b.Length];
+ }
+}
diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
index 09bd795..7c79eae 100644
--- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
+++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
@@ -52,6 +52,7 @@ public class IngestionService
var roles = await _db.Roles.ToListAsync(ct);
var cities = await _db.Cities.ToListAsync(ct);
var districts = await _db.Districts.ToListAsync(ct);
+ var facilities = await _db.Facilities.ToListAsync(ct); // fuzzy-matched + grown as we create
var roleNames = roles.Select(r => r.Name).ToList();
var cityNames = cities.Select(c => c.Name).ToList();
var districtNames = districts.Select(d => d.Name).ToList();
@@ -95,7 +96,7 @@ public class IngestionService
if (status == RawListingStatus.Normalized)
{
- try { Publish(parsed, ai, raw, roles, cities, districts); published++; }
+ try { Publish(parsed, ai, raw, roles, cities, districts, facilities); published++; }
catch (Exception ex) { _log.LogWarning(ex, "Auto-publish failed; queueing instead"); raw.Status = RawListingStatus.New; queued++; }
}
else if (status == RawListingStatus.New) queued++;
@@ -157,7 +158,7 @@ public class IngestionService
}
private void Publish(ParsedListing parsed, AiAuditResult? ai, RawListing raw,
- List roles, List cities, List districts)
+ List roles, List cities, List districts, List facilities)
{
var d = ai?.Data;
var roleName = d?.Role ?? parsed.RoleName;
@@ -170,9 +171,10 @@ public class IngestionService
var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id);
var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
+ : !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
: $"مرکز درمانی (از {raw.SourceChannel})";
- var facility = _db.Facilities.Local.FirstOrDefault(f => f.Name == facilityName && f.CityId == city.Id)
- ?? _db.Facilities.FirstOrDefault(f => f.Name == facilityName && f.CityId == city.Id);
+ // Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
+ var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
if (facility is null)
{
facility = new Facility
@@ -181,6 +183,7 @@ public class IngestionService
Phone = parsed.Phone, IsVerified = false,
};
_db.Facilities.Add(facility);
+ facilities.Add(facility); // so later listings in this run match it too
}
var kind = (d?.Kind ?? parsed.Kind.ToString()).ToLowerInvariant();