Default aggregated ads to Job, not Shift (stop fabricating shift dates/times)
A generic hiring ad like «پرستار درمانگاه» was published as a dated SHIFT with an invented date («فردا») and default hours («۰۸:۰۰–۱۴:۰۰») the source never stated — because classification defaulted to Shift. Now a dated Shift is only produced when the text carries an explicit shift signal (شیفت/آنکال/کشیک/نوبت); everything else is an ongoing hiring post → Job (no date to invent). Fixed in both the parser default and the Publish branch (so an AI mislabel can't force a shift either). ReclassifyMisclassifiedShiftsAsync (in the post-ingest auto-cleanup) converts the existing signal-less aggregated shifts into jobs in place — copies the content to a JobOpening and archives the old shift (its URL 410s). After one pass it's a no-op since new ads no longer become shifts. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -69,8 +69,11 @@ public class HeuristicListingParser : IListingParser
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
p.Kind = (jobSignals && !shiftSignals) ? ListingKind.Job : ListingKind.Shift;
|
// A dated SHIFT requires an explicit shift signal («شیفت/آنکال/کشیک/نوبت»). Otherwise the ad
|
||||||
p.Notes.Add(p.Kind == ListingKind.Job ? "نوع: استخدام (تشخیص خودکار)" : "نوع: شیفت (تشخیص خودکار)");
|
// is an ongoing hiring post → Job. (Defaulting to Shift forced a fabricated date/time onto
|
||||||
|
// generic ads like «پرستار درمانگاه», which the source never stated.)
|
||||||
|
p.Kind = shiftSignals ? ListingKind.Shift : ListingKind.Job;
|
||||||
|
p.Notes.Add(p.Kind == ListingKind.Shift ? "نوع: شیفت (تشخیص خودکار)" : "نوع: استخدام (تشخیص خودکار)");
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Roles (an ad can name several at once: «پرستار سالمند و کودک و همراه بیمار») ---
|
// --- Roles (an ad can name several at once: «پرستار سالمند و کودک و همراه بیمار») ---
|
||||||
|
|||||||
@@ -373,6 +373,41 @@ public class IngestionService
|
|||||||
return filled;
|
return filled;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static readonly string[] ShiftSignals = { "شیفت", "آنکال", "انکال", "کشیک", "نوبت" };
|
||||||
|
|
||||||
|
/// <summary>
/// Converts open, aggregated shifts whose description contains none of the <c>ShiftSignals</c>
/// keywords into <see cref="JobOpening"/> rows. Such shifts were generic hiring ads
/// («پرستار درمانگاه») that had been defaulted to a Shift with a fabricated date/time.
/// The content is copied into a new job and the old shift is archived (so its URL 410s).
/// New ingests no longer produce these (Job is now the default), so after one pass this
/// is expected to be a no-op.
/// </summary>
/// <param name="ct">Token forwarded to the database query and to the save.</param>
/// <returns>The number of shifts converted into jobs; 0 when nothing matched.</returns>
public async Task<int> ReclassifyMisclassifiedShiftsAsync(CancellationToken ct = default)
{
    // The keyword test below runs in memory, so every open aggregated shift is loaded first
    // (with the role and contacts that are copied onto the replacement job).
    var openAggregated = await _db.Shifts
        .Include(s => s.Role)
        .Include(s => s.Contacts)
        .Where(s => s.Status == ShiftStatus.Open && s.Source == ShiftSource.Aggregated)
        .ToListAsync(ct);

    // A shift is "signal-less" when its description (null treated as empty) mentions none of
    // the shift keywords.
    var signalless = openAggregated
        .Where(s => !ShiftSignals.Any(w => (s.Description ?? "").Contains(w)))
        .ToList();

    if (signalless.Count == 0)
    {
        return 0;
    }

    foreach (var shift in signalless)
    {
        // Avoid a dangling trailing space in the title when the role (or its name) is missing.
        var roleName = shift.Role?.Name;
        var title = string.IsNullOrWhiteSpace(roleName) ? "استخدام" : $"استخدام {roleName}";

        _db.JobOpenings.Add(new JobOpening
        {
            FacilityId = shift.FacilityId,
            RoleId = shift.RoleId,
            Title = title,
            // Every converted ad is recorded as full-time; no employment type is read from the shift.
            EmploymentType = EmploymentType.FullTime,
            // The shift's pay amount becomes the job's minimum salary.
            SalaryMin = shift.PayAmount,
            Description = shift.Description,
            Status = ShiftStatus.Open,
            Source = ShiftSource.Aggregated,
            SourceUrl = shift.SourceUrl,
            Lat = shift.Lat,
            Lng = shift.Lng,
            // Fresh ContactMethod instances: the job gets its own copies instead of sharing the shift's.
            Contacts = shift.Contacts
                .Select(c => new ContactMethod { Type = c.Type, Value = c.Value, SortOrder = c.SortOrder })
                .ToList(),
        });

        shift.Status = ShiftStatus.Archived;
    }

    // One save covers both the inserted jobs and the archived shifts.
    await _db.SaveChangesAsync(ct);
    _log.LogInformation("Reclassified {N} signal-less aggregated shifts into jobs.", signalless.Count);
    return signalless.Count;
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The self-cleaning pass run automatically at the end of every crawl (and available on demand):
|
/// The self-cleaning pass run automatically at the end of every crawl (and available on demand):
|
||||||
/// archive out-of-scope/duplicate listings, merge duplicate + fold junk facilities, and backfill
|
/// archive out-of-scope/duplicate listings, merge duplicate + fold junk facilities, and backfill
|
||||||
@@ -387,8 +422,9 @@ public class IngestionService
|
|||||||
var (mergedFac, cleanedFac) = await MergeAndCleanFacilitiesAsync(ct);
|
var (mergedFac, cleanedFac) = await MergeAndCleanFacilitiesAsync(ct);
|
||||||
var coords = await BackfillCoordsAsync(ct);
|
var coords = await BackfillCoordsAsync(ct);
|
||||||
var pay = await BackfillPayAsync(ct);
|
var pay = await BackfillPayAsync(ct);
|
||||||
_log.LogInformation("Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C} pay={P}",
|
var reclassified = await ReclassifyMisclassifiedShiftsAsync(ct);
|
||||||
archived, dedupedJobs, mergedFac, cleanedFac, coords, pay);
|
_log.LogInformation("Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C} pay={P} reclassified={R}",
|
||||||
|
archived, dedupedJobs, mergedFac, cleanedFac, coords, pay, reclassified);
|
||||||
return (archived, dedupedJobs, mergedFac, cleanedFac, coords);
|
return (archived, dedupedJobs, mergedFac, cleanedFac, coords);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -821,7 +857,13 @@ public class IngestionService
|
|||||||
// one per extracted/typo role (پزشک عمومی، پرستار، دستیار پزشک، بهیار، «بیهیار»). Publish only
|
// one per extracted/typo role (پزشک عمومی، پرستار، دستیار پزشک، بهیار، «بیهیار»). Publish only
|
||||||
// the primary (guard-corrected) role; the rest stay findable via the full description text.
|
// the primary (guard-corrected) role; the rest stay findable via the full description text.
|
||||||
var primaryRole = pubRoles[0];
|
var primaryRole = pubRoles[0];
|
||||||
if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
|
// A dated SHIFT is created ONLY when the ad is explicitly shift-based (the kind says shift AND
|
||||||
|
// the text actually carries a shift signal). Otherwise it's an ongoing hiring post → Job, so we
|
||||||
|
// never fabricate a date/time the source never stated (the «پرستار درمانگاه as فردا ۰۸:۰۰ شیفت»
|
||||||
|
// bug). Defends against the AI mislabeling a generic ad as a shift, too.
|
||||||
|
bool isShift = (kindStr.Contains("shift") || kindStr.Contains("شیفت"))
|
||||||
|
&& new[] { "شیفت", "آنکال", "انکال", "کشیک", "نوبت" }.Any(raw.RawText.Contains);
|
||||||
|
if (!isShift)
|
||||||
{
|
{
|
||||||
_db.JobOpenings.Add(new JobOpening
|
_db.JobOpenings.Add(new JobOpening
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user