Default aggregated ads to Job, not Shift (stop fabricating shift dates/times)
A generic hiring ad like «پرستار درمانگاه» was published as a dated SHIFT with an invented date («فردا») and default hours («۰۸:۰۰–۱۴:۰۰») the source never stated — because classification defaulted to Shift. Now a dated Shift is only produced when the text carries an explicit shift signal (شیفت/آنکال/کشیک/نوبت); everything else is an ongoing hiring post → Job (no date to invent). Fixed in both the parser default and the Publish branch (so an AI mislabel can't force a shift either). ReclassifyMisclassifiedShiftsAsync (in the post-ingest auto-cleanup) converts the existing signal-less aggregated shifts into jobs in place — copies the content to a JobOpening and archives the old shift (its URL 410s). After one pass it's a no-op since new ads no longer become shifts. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -69,8 +69,11 @@ public class HeuristicListingParser : IListingParser
|
||||
}
|
||||
else
|
||||
{
|
||||
p.Kind = (jobSignals && !shiftSignals) ? ListingKind.Job : ListingKind.Shift;
|
||||
p.Notes.Add(p.Kind == ListingKind.Job ? "نوع: استخدام (تشخیص خودکار)" : "نوع: شیفت (تشخیص خودکار)");
|
||||
// A dated SHIFT requires an explicit shift signal («شیفت/آنکال/کشیک/نوبت»). Otherwise the ad
|
||||
// is an ongoing hiring post → Job. (Defaulting to Shift forced a fabricated date/time onto
|
||||
// generic ads like «پرستار درمانگاه», which the source never stated.)
|
||||
p.Kind = shiftSignals ? ListingKind.Shift : ListingKind.Job;
|
||||
p.Notes.Add(p.Kind == ListingKind.Shift ? "نوع: شیفت (تشخیص خودکار)" : "نوع: استخدام (تشخیص خودکار)");
|
||||
}
|
||||
|
||||
// --- Roles (an ad can name several at once: «پرستار سالمند و کودک و همراه بیمار») ---
|
||||
|
||||
@@ -373,6 +373,41 @@ public class IngestionService
|
||||
return filled;
|
||||
}
|
||||
|
||||
// Keywords whose presence in a listing's text marks it as explicitly shift-based.
// ReclassifyMisclassifiedShiftsAsync treats an aggregated shift whose description contains
// none of these as a misclassified hiring ad. «انکال» is the same word as «آنکال» typed
// without the madda, so both spellings are listed.
private static readonly string[] ShiftSignals = { "شیفت", "آنکال", "انکال", "کشیک", "نوبت" };
|
||||
|
||||
/// <summary>
/// Converts existing aggregated "shifts" that have NO shift signal in their description into
/// job openings. Such rows were generic hiring ads («پرستار درمانگاه») that had been defaulted
/// to a Shift. For each one, the content is copied into a new <c>JobOpening</c> and the
/// original shift is set to <c>ShiftStatus.Archived</c>; all changes are persisted with a
/// single <c>SaveChangesAsync</c> call.
/// </summary>
/// <param name="ct">Cancellation token forwarded to the database query and to the save.</param>
/// <returns>
/// The number of shifts that were archived and re-created as job openings; 0 when no open
/// aggregated shift lacks a shift signal (in which case nothing is written).
/// </returns>
public async Task<int> ReclassifyMisclassifiedShiftsAsync(CancellationToken ct = default)
{
    // Role and Contacts are eagerly loaded because both are copied onto the new job below.
    var candidates = await _db.Shifts.Include(s => s.Role).Include(s => s.Contacts)
        .Where(s => s.Status == ShiftStatus.Open && s.Source == ShiftSource.Aggregated)
        .ToListAsync(ct);

    // The keyword test runs in memory on the loaded rows. A null description counts as
    // "no signal". The comparison is explicitly ordinal (exact code-unit match).
    var bad = candidates
        .Where(s =>
        {
            var text = s.Description ?? "";
            return !ShiftSignals.Any(w => text.Contains(w, System.StringComparison.Ordinal));
        })
        .ToList();

    if (bad.Count == 0)
    {
        return 0;
    }

    foreach (var s in bad)
    {
        // Avoid a dangling trailing space in the title when the role (or its name) is missing.
        var title = s.Role?.Name is { Length: > 0 } roleName
            ? $"استخدام {roleName}"
            : "استخدام";

        _db.JobOpenings.Add(new JobOpening
        {
            FacilityId = s.FacilityId,
            RoleId = s.RoleId,
            Title = title,
            // The shift row carries no employment-type information, so full-time is used.
            EmploymentType = EmploymentType.FullTime,
            SalaryMin = s.PayAmount,
            Description = s.Description,
            Status = ShiftStatus.Open,
            Source = ShiftSource.Aggregated,
            SourceUrl = s.SourceUrl,
            Lat = s.Lat,
            Lng = s.Lng,
            // Contacts are cloned rather than re-parented so the archived shift keeps its own rows.
            Contacts = s.Contacts
                .Select(c => new ContactMethod { Type = c.Type, Value = c.Value, SortOrder = c.SortOrder })
                .ToList(),
        });

        s.Status = ShiftStatus.Archived;
    }

    await _db.SaveChangesAsync(ct);
    _log.LogInformation("Reclassified {N} signal-less aggregated shifts into jobs.", bad.Count);
    return bad.Count;
}
|
||||
|
||||
/// <summary>
|
||||
/// The self-cleaning pass run automatically at the end of every crawl (and available on demand):
|
||||
/// archive out-of-scope/duplicate listings, merge duplicate + fold junk facilities, and backfill
|
||||
@@ -387,8 +422,9 @@ public class IngestionService
|
||||
var (mergedFac, cleanedFac) = await MergeAndCleanFacilitiesAsync(ct);
|
||||
var coords = await BackfillCoordsAsync(ct);
|
||||
var pay = await BackfillPayAsync(ct);
|
||||
_log.LogInformation("Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C} pay={P}",
|
||||
archived, dedupedJobs, mergedFac, cleanedFac, coords, pay);
|
||||
var reclassified = await ReclassifyMisclassifiedShiftsAsync(ct);
|
||||
_log.LogInformation("Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C} pay={P} reclassified={R}",
|
||||
archived, dedupedJobs, mergedFac, cleanedFac, coords, pay, reclassified);
|
||||
return (archived, dedupedJobs, mergedFac, cleanedFac, coords);
|
||||
}
|
||||
|
||||
@@ -821,7 +857,13 @@ public class IngestionService
|
||||
// one per extracted/typo role (پزشک عمومی، پرستار، دستیار پزشک، بهیار، «بیهیار»). Publish only
|
||||
// the primary (guard-corrected) role; the rest stay findable via the full description text.
|
||||
var primaryRole = pubRoles[0];
|
||||
if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
|
||||
// A dated SHIFT is created ONLY when the ad is explicitly shift-based (the kind says shift AND
|
||||
// the text actually carries a shift signal). Otherwise it's an ongoing hiring post → Job, so we
|
||||
// never fabricate a date/time the source never stated (the «پرستار درمانگاه as فردا ۰۸:۰۰ شیفت»
|
||||
// bug). Defends against the AI mislabeling a generic ad as a shift, too.
|
||||
bool isShift = (kindStr.Contains("shift") || kindStr.Contains("شیفت"))
|
||||
&& new[] { "شیفت", "آنکال", "انکال", "کشیک", "نوبت" }.Any(raw.RawText.Contains);
|
||||
if (!isShift)
|
||||
{
|
||||
_db.JobOpenings.Add(new JobOpening
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user