Stop job/shift role fan-out: one aggregated ad = one listing
A single ad naming several role-ish words («استخدام بهیار جهت دستیار پزشک و تزریقات») was fanning out into one listing PER extracted role — 5 near-duplicate cards with different, and in some cases misspelled, roles (پزشک عمومی، پرستار، دستیار پزشک، بهیار، «بیهیار»).

Publish now creates ONE listing with the primary (guard-corrected) role; the other role words stay findable via the full description. DedupeJobsAsync no longer keys on role, so existing fan-out copies collapse into one — preferring to keep a non-«پزشک عمومی» copy, then the newest.

Run the «حذفِ تکراری» + «اصلاح نقش» buttons to clean up the already-published fan-out.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -414,24 +414,31 @@ public class IngestionService
|
||||
/// </summary>
|
||||
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
|
||||
{
|
||||
var gpId = await _db.Roles.Where(r => r.Name == "پزشک عمومی").Select(r => (int?)r.Id).FirstOrDefaultAsync(ct);
|
||||
var rows = await _db.JobOpenings
|
||||
.Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
|
||||
.Select(j => new { j.Id, j.RoleId, j.FacilityId, j.Description, j.CreatedAt })
|
||||
.ToListAsync(ct);
|
||||
|
||||
string? Sig(int roleId, int facId, string? desc)
|
||||
// Signature = facility + normalized description core (digits/«… پیش» stripped). RoleId is
|
||||
// deliberately NOT in the key, so the old role fan-out — the SAME ad published once per
|
||||
// extracted/typo role (پزشک عمومی، پرستار، بهیار، «بیهیار»…) — collapses into one.
|
||||
string? Sig(int facId, string? desc)
|
||||
{
|
||||
var core = NormalizeFa(Regex.Replace(desc ?? "",
|
||||
@"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim();
|
||||
if (core.Length < 15) return null; // too little to call it a dup safely
|
||||
return $"j:{roleId}:{facId}:{(core.Length > 120 ? core[..120] : core)}";
|
||||
return $"j:{facId}:{(core.Length > 120 ? core[..120] : core)}";
|
||||
}
|
||||
|
||||
// Keep one per group — prefer a non-«پزشک عمومی» role (the fan-out's GP copy is the usual
|
||||
// mislabel), then the newest.
|
||||
var toArchive = rows
|
||||
.Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) })
|
||||
.Select(r => new { r.Id, r.RoleId, r.CreatedAt, Key = Sig(r.FacilityId, r.Description) })
|
||||
.Where(x => x.Key is not null)
|
||||
.GroupBy(x => x.Key)
|
||||
.SelectMany(g => g.OrderByDescending(x => x.CreatedAt).Skip(1).Select(x => x.Id))
|
||||
.SelectMany(g => g.OrderBy(x => x.RoleId == gpId ? 1 : 0).ThenByDescending(x => x.CreatedAt)
|
||||
.Skip(1).Select(x => x.Id))
|
||||
.ToList();
|
||||
|
||||
if (toArchive.Count == 0) return 0;
|
||||
@@ -729,42 +736,43 @@ public class IngestionService
|
||||
facility.Lat = raw.Lat; facility.Lng = raw.Lng;
|
||||
}
|
||||
|
||||
// ONE ad = ONE listing. Do NOT fan out across roles. A single ad naming a few role-ish words
|
||||
// («استخدام بهیار جهت دستیار پزشک و تزریقات») was exploding into 5 near-duplicate listings —
|
||||
// one per extracted/typo role (پزشک عمومی، پرستار، دستیار پزشک، بهیار، «بیهیار»). Publish only
|
||||
// the primary (guard-corrected) role; the rest stay findable via the full description text.
|
||||
var primaryRole = pubRoles[0];
|
||||
if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
|
||||
{
|
||||
foreach (var role in pubRoles)
|
||||
_db.JobOpenings.Add(new JobOpening
|
||||
{
|
||||
Facility = facility, Role = role,
|
||||
Title = !string.IsNullOrWhiteSpace(d?.Title) && pubRoles.Count == 1 ? d!.Title!.Trim() : $"استخدام {role.Name}",
|
||||
EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
|
||||
// Prefer the AI-extracted salary, falling back to the parser's — matching the talent
|
||||
// path. (Jobs previously used only parsed.PayAmount, silently dropping the AI figure,
|
||||
// so every aggregated opening showed «توافقی» even when the ad stated a number.)
|
||||
SalaryMin = d?.PayAmount ?? parsed.PayAmount,
|
||||
Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
|
||||
SourceUrl = raw.SourceUrl,
|
||||
Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
|
||||
Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
|
||||
});
|
||||
_db.JobOpenings.Add(new JobOpening
|
||||
{
|
||||
Facility = facility, Role = primaryRole,
|
||||
Title = $"استخدام {primaryRole.Name}",
|
||||
EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
|
||||
// Prefer the AI-extracted salary, falling back to the parser's (matching the talent path).
|
||||
SalaryMin = d?.PayAmount ?? parsed.PayAmount,
|
||||
Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
|
||||
SourceUrl = raw.SourceUrl,
|
||||
Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
|
||||
Contacts = BuildContacts(d, parsed), // the ad's OWN number(s)
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
var st = MapShiftType(d?.ShiftType, parsed.ShiftType);
|
||||
var (start, end) = DefaultTimes(st);
|
||||
foreach (var role in pubRoles)
|
||||
_db.Shifts.Add(new Shift
|
||||
{
|
||||
Facility = facility, Role = role,
|
||||
Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1),
|
||||
StartTime = start, EndTime = end, ShiftType = st,
|
||||
SpecialtyRequired = role.Name, Description = raw.RawText,
|
||||
PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage
|
||||
: parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift,
|
||||
PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent,
|
||||
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
|
||||
Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
|
||||
Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
|
||||
});
|
||||
_db.Shifts.Add(new Shift
|
||||
{
|
||||
Facility = facility, Role = primaryRole,
|
||||
Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1),
|
||||
StartTime = start, EndTime = end, ShiftType = st,
|
||||
SpecialtyRequired = primaryRole.Name, Description = raw.RawText,
|
||||
PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage
|
||||
: parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift,
|
||||
PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent,
|
||||
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
|
||||
Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
|
||||
Contacts = BuildContacts(d, parsed), // the ad's OWN number(s)
|
||||
});
|
||||
}
|
||||
raw.Status = RawListingStatus.Normalized;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user