Stop job/shift role fan-out: one aggregated ad = one listing
A single ad naming several role-ish words («استخدام بهیار جهت دستیار پزشک و تزریقات») was fanning out into one listing PER extracted role — 5 near-duplicate cards with different, and even misspelled, roles (پزشک عمومی، پرستار، دستیار پزشک، بهیار، «بیهیار»).

Publish now creates ONE listing with the primary (guard-corrected) role; the other role words stay findable via the full description. DedupeJobsAsync no longer keys on role, so existing fan-out copies collapse into one — keeping the newest non-«پزشک عمومی» copy when there is one, otherwise the newest copy.

Run the «حذفِ تکراری» + «اصلاح نقش» buttons to clean up the already-published fan-out.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -414,24 +414,31 @@ public class IngestionService
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
|
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
|
||||||
{
|
{
|
||||||
|
var gpId = await _db.Roles.Where(r => r.Name == "پزشک عمومی").Select(r => (int?)r.Id).FirstOrDefaultAsync(ct);
|
||||||
var rows = await _db.JobOpenings
|
var rows = await _db.JobOpenings
|
||||||
.Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
|
.Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
|
||||||
.Select(j => new { j.Id, j.RoleId, j.FacilityId, j.Description, j.CreatedAt })
|
.Select(j => new { j.Id, j.RoleId, j.FacilityId, j.Description, j.CreatedAt })
|
||||||
.ToListAsync(ct);
|
.ToListAsync(ct);
|
||||||
|
|
||||||
string? Sig(int roleId, int facId, string? desc)
|
// Signature = facility + normalized description core (digits/«… پیش» stripped). RoleId is
|
||||||
|
// deliberately NOT in the key, so the old role fan-out — the SAME ad published once per
|
||||||
|
// extracted/typo role (پزشک عمومی، پرستار، بهیار، «بیهیار»…) — collapses into one.
|
||||||
|
string? Sig(int facId, string? desc)
|
||||||
{
|
{
|
||||||
var core = NormalizeFa(Regex.Replace(desc ?? "",
|
var core = NormalizeFa(Regex.Replace(desc ?? "",
|
||||||
@"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim();
|
@"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim();
|
||||||
if (core.Length < 15) return null; // too little to call it a dup safely
|
if (core.Length < 15) return null; // too little to call it a dup safely
|
||||||
return $"j:{roleId}:{facId}:{(core.Length > 120 ? core[..120] : core)}";
|
return $"j:{facId}:{(core.Length > 120 ? core[..120] : core)}";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Keep one per group — prefer a non-«پزشک عمومی» role (the fan-out's GP copy is the usual
|
||||||
|
// mislabel), then the newest.
|
||||||
var toArchive = rows
|
var toArchive = rows
|
||||||
.Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) })
|
.Select(r => new { r.Id, r.RoleId, r.CreatedAt, Key = Sig(r.FacilityId, r.Description) })
|
||||||
.Where(x => x.Key is not null)
|
.Where(x => x.Key is not null)
|
||||||
.GroupBy(x => x.Key)
|
.GroupBy(x => x.Key)
|
||||||
.SelectMany(g => g.OrderByDescending(x => x.CreatedAt).Skip(1).Select(x => x.Id))
|
.SelectMany(g => g.OrderBy(x => x.RoleId == gpId ? 1 : 0).ThenByDescending(x => x.CreatedAt)
|
||||||
|
.Skip(1).Select(x => x.Id))
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
if (toArchive.Count == 0) return 0;
|
if (toArchive.Count == 0) return 0;
|
||||||
@@ -729,41 +736,42 @@ public class IngestionService
|
|||||||
facility.Lat = raw.Lat; facility.Lng = raw.Lng;
|
facility.Lat = raw.Lat; facility.Lng = raw.Lng;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ONE ad = ONE listing. Do NOT fan out across roles. A single ad naming a few role-ish words
|
||||||
|
// («استخدام بهیار جهت دستیار پزشک و تزریقات») was exploding into 5 near-duplicate listings —
|
||||||
|
// one per extracted/typo role (پزشک عمومی، پرستار، دستیار پزشک، بهیار، «بیهیار»). Publish only
|
||||||
|
// the primary (guard-corrected) role; the rest stay findable via the full description text.
|
||||||
|
var primaryRole = pubRoles[0];
|
||||||
if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
|
if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
|
||||||
{
|
{
|
||||||
foreach (var role in pubRoles)
|
|
||||||
_db.JobOpenings.Add(new JobOpening
|
_db.JobOpenings.Add(new JobOpening
|
||||||
{
|
{
|
||||||
Facility = facility, Role = role,
|
Facility = facility, Role = primaryRole,
|
||||||
Title = !string.IsNullOrWhiteSpace(d?.Title) && pubRoles.Count == 1 ? d!.Title!.Trim() : $"استخدام {role.Name}",
|
Title = $"استخدام {primaryRole.Name}",
|
||||||
EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
|
EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
|
||||||
// Prefer the AI-extracted salary, falling back to the parser's — matching the talent
|
// Prefer the AI-extracted salary, falling back to the parser's (matching the talent path).
|
||||||
// path. (Jobs previously used only parsed.PayAmount, silently dropping the AI figure,
|
|
||||||
// so every aggregated opening showed «توافقی» even when the ad stated a number.)
|
|
||||||
SalaryMin = d?.PayAmount ?? parsed.PayAmount,
|
SalaryMin = d?.PayAmount ?? parsed.PayAmount,
|
||||||
Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
|
Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
|
||||||
SourceUrl = raw.SourceUrl,
|
SourceUrl = raw.SourceUrl,
|
||||||
Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
|
Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
|
||||||
Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
|
Contacts = BuildContacts(d, parsed), // the ad's OWN number(s)
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
var st = MapShiftType(d?.ShiftType, parsed.ShiftType);
|
var st = MapShiftType(d?.ShiftType, parsed.ShiftType);
|
||||||
var (start, end) = DefaultTimes(st);
|
var (start, end) = DefaultTimes(st);
|
||||||
foreach (var role in pubRoles)
|
|
||||||
_db.Shifts.Add(new Shift
|
_db.Shifts.Add(new Shift
|
||||||
{
|
{
|
||||||
Facility = facility, Role = role,
|
Facility = facility, Role = primaryRole,
|
||||||
Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1),
|
Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1),
|
||||||
StartTime = start, EndTime = end, ShiftType = st,
|
StartTime = start, EndTime = end, ShiftType = st,
|
||||||
SpecialtyRequired = role.Name, Description = raw.RawText,
|
SpecialtyRequired = primaryRole.Name, Description = raw.RawText,
|
||||||
PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage
|
PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage
|
||||||
: parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift,
|
: parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift,
|
||||||
PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent,
|
PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent,
|
||||||
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
|
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
|
||||||
Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
|
Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
|
||||||
Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
|
Contacts = BuildContacts(d, parsed), // the ad's OWN number(s)
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
raw.Status = RawListingStatus.Normalized;
|
raw.Status = RawListingStatus.Normalized;
|
||||||
|
|||||||
Reference in New Issue
Block a user