diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs index 86aa9fd..e998e5b 100644 --- a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs +++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs @@ -414,24 +414,31 @@ public class IngestionService /// public async Task DedupeJobsAsync(CancellationToken ct = default) { + var gpId = await _db.Roles.Where(r => r.Name == "پزشک عمومی").Select(r => (int?)r.Id).FirstOrDefaultAsync(ct); var rows = await _db.JobOpenings .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated) .Select(j => new { j.Id, j.RoleId, j.FacilityId, j.Description, j.CreatedAt }) .ToListAsync(ct); - string? Sig(int roleId, int facId, string? desc) + // Signature = facility + normalized description core (digits/«… پیش» stripped). RoleId is + // deliberately NOT in the key, so the old role fan-out — the SAME ad published once per + // extracted/typo role (پزشک عمومی، پرستار، بهیار، «بیهیار»…) — collapses into one. + string? Sig(int facId, string? desc) { var core = NormalizeFa(Regex.Replace(desc ?? "", @"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim(); if (core.Length < 15) return null; // too little to call it a dup safely - return $"j:{roleId}:{facId}:{(core.Length > 120 ? core[..120] : core)}"; + return $"j:{facId}:{(core.Length > 120 ? core[..120] : core)}"; } + // Keep one per group — prefer a non-«پزشک عمومی» role (the fan-out's GP copy is the usual + // mislabel), then the newest. var toArchive = rows - .Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) }) + .Select(r => new { r.Id, r.RoleId, r.CreatedAt, Key = Sig(r.FacilityId, r.Description) }) .Where(x => x.Key is not null) .GroupBy(x => x.Key) - .SelectMany(g => g.OrderByDescending(x => x.CreatedAt).Skip(1).Select(x => x.Id)) + .SelectMany(g => g.OrderBy(x => x.RoleId == gpId ? 1 : 0).ThenByDescending(x => x.CreatedAt) + .Skip(1).Select(x => x.Id)) .ToList(); if (toArchive.Count == 0) return 0; @@ -729,42 +736,43 @@ public class IngestionService facility.Lat = raw.Lat; facility.Lng = raw.Lng; } + // ONE ad = ONE listing. Do NOT fan out across roles. A single ad naming a few role-ish words + // («استخدام بهیار جهت دستیار پزشک و تزریقات») was exploding into 5 near-duplicate listings — + // one per extracted/typo role (پزشک عمومی، پرستار، دستیار پزشک، بهیار، «بیهیار»). Publish only + // the primary (guard-corrected) role; the rest stay findable via the full description text. + var primaryRole = pubRoles[0]; if (kindStr.Contains("job") || kindStr.Contains("استخدام")) { - foreach (var role in pubRoles) - _db.JobOpenings.Add(new JobOpening - { - Facility = facility, Role = role, - Title = !string.IsNullOrWhiteSpace(d?.Title) && pubRoles.Count == 1 ? d!.Title!.Trim() : $"استخدام {role.Name}", - EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType), - // Prefer the AI-extracted salary, falling back to the parser's — matching the talent - // path. (Jobs previously used only parsed.PayAmount, silently dropping the AI figure, - // so every aggregated opening showed «توافقی» even when the ad stated a number.) - SalaryMin = d?.PayAmount ?? parsed.PayAmount, - Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, - SourceUrl = raw.SourceUrl, - Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center - Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing - }); + _db.JobOpenings.Add(new JobOpening + { + Facility = facility, Role = primaryRole, + Title = $"استخدام {primaryRole.Name}", + EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType), + // Prefer the AI-extracted salary, falling back to the parser's (matching the talent path). + SalaryMin = d?.PayAmount ?? parsed.PayAmount, + Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, + SourceUrl = raw.SourceUrl, + Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center + Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) + }); } else { var st = MapShiftType(d?.ShiftType, parsed.ShiftType); var (start, end) = DefaultTimes(st); - foreach (var role in pubRoles) - _db.Shifts.Add(new Shift - { - Facility = facility, Role = role, - Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1), - StartTime = start, EndTime = end, ShiftType = st, - SpecialtyRequired = role.Name, Description = raw.RawText, - PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage - : parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift, - PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent, - Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl, - Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center - Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing - }); + _db.Shifts.Add(new Shift + { + Facility = facility, Role = primaryRole, + Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1), + StartTime = start, EndTime = end, ShiftType = st, + SpecialtyRequired = primaryRole.Name, Description = raw.RawText, + PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage + : parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift, + PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent, + Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl, + Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center + Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) + }); } raw.Status = RawListingStatus.Normalized; }