Add scrape/ingestion engine + validation, and 24h shift hour-range visualization
Scrape engine (Services/Scraping/): pluggable IListingSource (working sample + Telegram/Divar credential-ready stubs) → IngestionService (content-hash dedupe → parse → validate → review queue) → ListingValidator (completeness score + spam screen) → IngestionWorker (config-gated hosted service). RawListing gains ContentHash/Confidence/ValidationNotes; RawListingStatus.Flagged. Admin /Admin gets run-now, source list, confidence + flagged queue. Hour-range viz: _HourBar 24h timeline bar (colored by type, overnight wrap) on shift cards, recommendation cards, and detail. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,107 @@
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using JobsMedical.Web.Data;
|
||||
using JobsMedical.Web.Models;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>Per-source tallies produced by a single ingestion run.</summary>
/// <param name="Source">Name of the listing source the tallies belong to.</param>
/// <param name="Fetched">Number of items the source returned.</param>
/// <param name="Queued">Items stored with status New (queued for review).</param>
/// <param name="Flagged">Items stored with status Flagged.</param>
/// <param name="Spam">Items stored with status Discarded (screened as spam).</param>
/// <param name="Duplicates">Items skipped because their content hash was already known.</param>
public record SourceResult(
    string Source,
    int Fetched,
    int Queued,
    int Flagged,
    int Spam,
    int Duplicates);
|
||||
|
||||
/// <summary>Outcome of an ingestion run: the per-source results plus totals across them.</summary>
/// <param name="Sources">The per-source results included in this summary.</param>
public record IngestionSummary(List<SourceResult> Sources)
{
    /// <summary>Sum of <see cref="SourceResult.Queued"/> over all entries in <see cref="Sources"/>.</summary>
    public int TotalQueued
    {
        get { return Sources.Select(result => result.Queued).Sum(); }
    }

    /// <summary>Sum of <see cref="SourceResult.Flagged"/> over all entries in <see cref="Sources"/>.</summary>
    public int TotalFlagged
    {
        get { return Sources.Select(result => result.Flagged).Sum(); }
    }

    /// <summary>Sum of <see cref="SourceResult.Spam"/> over all entries in <see cref="Sources"/>.</summary>
    public int TotalSpam
    {
        get { return Sources.Select(result => result.Spam).Sum(); }
    }

    /// <summary>Sum of <see cref="SourceResult.Duplicates"/> over all entries in <see cref="Sources"/>.</summary>
    public int TotalDuplicates
    {
        get { return Sources.Select(result => result.Duplicates).Sum(); }
    }
}
|
||||
|
||||
/// <summary>
/// The scrape engine. Pulls from every enabled <see cref="IListingSource"/>, dedupes by content
/// hash, parses with <see cref="IListingParser"/>, validates with <see cref="ListingValidator"/>,
/// and stores each as a <see cref="RawListing"/> with a status: New (queued for review),
/// Flagged (incomplete/suspicious), or Discarded (spam). Source-agnostic — add a source and it
/// flows through unchanged.
/// </summary>
public class IngestionService
{
    // Cached so the pattern is not re-resolved for every hashed item.
    private static readonly Regex WhitespaceRun = new(@"\s+", RegexOptions.Compiled);

    private readonly AppDbContext _db;
    private readonly IEnumerable<IListingSource> _sources;
    private readonly IListingParser _parser;
    private readonly ListingValidator _validator;
    private readonly ILogger<IngestionService> _log;

    /// <summary>Creates the service from its injected collaborators.</summary>
    /// <param name="db">Database context used to read lookup names and store raw listings.</param>
    /// <param name="sources">All registered listing sources; only enabled ones are run.</param>
    /// <param name="parser">Parser applied to each item's raw text.</param>
    /// <param name="validator">Validator that decides spam / valid / flagged and the confidence.</param>
    /// <param name="log">Logger for per-source results and fetch failures.</param>
    public IngestionService(AppDbContext db, IEnumerable<IListingSource> sources,
        IListingParser parser, ListingValidator validator, ILogger<IngestionService> log)
    {
        _db = db;
        _sources = sources;
        _parser = parser;
        _validator = validator;
        _log = log;
    }

    /// <summary>Snapshot of every registered source's name and enabled flag.</summary>
    public IReadOnlyList<(string Name, bool Enabled)> Sources =>
        _sources.Select(s => (s.Name, s.Enabled)).ToList();

    /// <summary>
    /// Runs one ingestion pass over every enabled source: fetch, dedupe by content hash,
    /// parse, validate, and store each new item as a <see cref="RawListing"/>.
    /// </summary>
    /// <param name="ct">Token used to cancel database calls and source fetches.</param>
    /// <returns>
    /// A summary holding one <see cref="SourceResult"/> per source whose fetch succeeded.
    /// Sources whose fetch throws are logged and left out of the summary.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// Thrown when <paramref name="ct"/> is cancelled; cancellation is not treated as a source failure.
    /// </exception>
    public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
    {
        // Lookup names handed to the parser; loaded once per run.
        var roles = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
        var cities = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
        var districts = await _db.Districts.Select(d => d.Name).ToListAsync(ct);

        var results = new List<SourceResult>();

        // Hashes already accepted in this run. The database check below cannot see rows that
        // were added to the context but not saved yet, so without this set two identical items
        // in the same fetch would both be inserted.
        var seenThisRun = new HashSet<string>(StringComparer.Ordinal);

        foreach (var source in _sources.Where(s => s.Enabled))
        {
            int fetched = 0, queued = 0, flagged = 0, spam = 0, dupes = 0;

            IReadOnlyList<ScrapedItem> items;
            try
            {
                items = await source.FetchAsync(ct);
            }
            // A failing source must not stop the others, but a requested cancellation has to
            // propagate instead of being logged as a fetch failure.
            catch (Exception ex) when (!(ex is OperationCanceledException && ct.IsCancellationRequested))
            {
                _log.LogError(ex, "Source {Source} fetch failed", source.Name);
                continue;
            }

            foreach (var item in items)
            {
                fetched++;

                var hash = Hash(item.RawText);

                // Duplicate if seen earlier in this run (Add returns false) or stored by a previous run.
                if (!seenThisRun.Add(hash)
                    || await _db.RawListings.AnyAsync(r => r.ContentHash == hash, ct))
                {
                    dupes++;
                    continue;
                }

                var parsed = _parser.Parse(item.RawText, roles, cities, districts);
                var val = _validator.Validate(item.RawText, parsed);

                // Spam wins over validity: a spam item is discarded even if otherwise complete.
                var status = val.IsSpam ? RawListingStatus.Discarded
                    : val.IsValid ? RawListingStatus.New
                    : RawListingStatus.Flagged;

                if (status == RawListingStatus.New)
                {
                    queued++;
                }
                else if (status == RawListingStatus.Flagged)
                {
                    flagged++;
                }
                else
                {
                    spam++;
                }

                _db.RawListings.Add(new RawListing
                {
                    SourceChannel = item.Source,
                    SourceUrl = item.SourceUrl,
                    RawText = item.RawText.Trim(),
                    ContentHash = hash,
                    Confidence = val.Confidence,
                    ValidationNotes = val.Issues.Count > 0 ? string.Join("؛ ", val.Issues) : null,
                    Status = status,
                });
            }

            // One save per source, so an earlier source's rows are persisted before the next one runs.
            await _db.SaveChangesAsync(ct);

            results.Add(new SourceResult(source.Name, fetched, queued, flagged, spam, dupes));
            _log.LogInformation("Ingestion {Source}: fetched={F} queued={Q} flagged={Fl} spam={S} dupes={D}",
                source.Name, fetched, queued, flagged, spam, dupes);
        }

        return new IngestionSummary(results);
    }

    /// <summary>SHA-256 hex of the whitespace-normalized text (for cross-run dedupe).</summary>
    /// <param name="text">Raw listing text; <c>null</c> is hashed as the empty string.</param>
    /// <returns>
    /// Lowercase hex digest of the UTF-8 bytes of <paramref name="text"/> after trimming and
    /// collapsing every whitespace run to a single space.
    /// </returns>
    private static string Hash(string text)
    {
        var normalized = WhitespaceRun.Replace((text ?? "").Trim(), " ");
        var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(normalized));
        return Convert.ToHexString(bytes).ToLowerInvariant();
    }
}
|
||||
Reference in New Issue
Block a user