Real channel fetch (Telegram/Bale/Divar) + AI-audited automation engine + CI/CD
- Fetch: Telegram via t.me/s, Bale via Bot API, Divar via web-search (HttpClient, config-gated, graceful)
- AI layer: DB-backed AppSetting (mode auto/manual, thresholds, AI endpoint/model/key/prompt/framework, auto-approve); OpenAI-compatible IAiAuditor (self-host/Iranian endpoints; fails safe to manual)
- Pipeline: fetch → dedupe(hash) → parse → validate → AI audit → Discard/Flag/Queue/auto-publish (resolve-or-create facility)
- Admin: /Admin/Settings automation+AI panel; queue shows confidence + AI verdict; flagged section
- CI/CD: Dockerfile, docker-compose.prod.yml, .gitea/workflows/ci-cd.yml, nginx vhost, DEPLOY.md; forwarded headers + /healthz + prod reference-only seed; ports 22/80/443 only

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
@@ -5,38 +7,80 @@ namespace JobsMedical.Web.Services.Scraping;
|
||||
public class DivarOptions
|
||||
{
|
||||
public bool Enabled { get; set; }
|
||||
public string? City { get; set; } // e.g. "tehran"
|
||||
public List<string> Queries { get; set; } = new(); // search terms, e.g. "استخدام پزشک"
|
||||
public string City { get; set; } = "tehran";
|
||||
public string Category { get; set; } = "jobs";
|
||||
public List<string> Queries { get; set; } = new(); // e.g. "پرستار", "پزشک عمومی", "درمانگاه"
|
||||
public string BaseUrl { get; set; } = "https://api.divar.ir/v8/web-search";
|
||||
public int PerQuery { get; set; } = 25;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Divar source. Credential-ready: configure city + queries in (Ingestion:Divar) and implement
|
||||
/// the fetch against Divar's listing API/HTML. Dormant until enabled.
|
||||
/// Best-effort Divar fetch: queries Divar's web-search JSON for each term and harvests post
|
||||
/// titles + descriptions. Divar's private API shifts shape over time, so we walk the JSON
|
||||
/// tolerantly for any object carrying a "title" plus a nearby description field, and fail soft.
|
||||
/// </summary>
|
||||
public class DivarListingSource : IListingSource
|
||||
{
|
||||
private readonly DivarOptions _opts;
|
||||
private readonly IHttpClientFactory _http;
|
||||
private readonly ILogger<DivarListingSource> _log;
|
||||
|
||||
public DivarListingSource(IOptions<DivarOptions> opts, ILogger<DivarListingSource> log)
|
||||
public DivarListingSource(IOptions<DivarOptions> opts, IHttpClientFactory http,
|
||||
ILogger<DivarListingSource> log)
|
||||
{
|
||||
_opts = opts.Value;
|
||||
_http = http;
|
||||
_log = log;
|
||||
}
|
||||
|
||||
public string Name => "دیوار";
|
||||
public bool Enabled => _opts.Enabled && _opts.Queries.Count > 0;
|
||||
|
||||
public Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
|
||||
public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
|
||||
{
|
||||
if (!Enabled)
|
||||
if (!Enabled) { _log.LogInformation("Divar source disabled/unconfigured."); return Array.Empty<ScrapedItem>(); }
|
||||
|
||||
var client = _http.CreateClient("scrape");
|
||||
var items = new List<ScrapedItem>();
|
||||
foreach (var q in _opts.Queries.Where(q => q.Trim().Length > 0))
|
||||
{
|
||||
_log.LogInformation("Divar source not configured — skipping.");
|
||||
return Task.FromResult<IReadOnlyList<ScrapedItem>>(Array.Empty<ScrapedItem>());
|
||||
try
|
||||
{
|
||||
var url = $"{_opts.BaseUrl.TrimEnd('/')}/{_opts.City}/{_opts.Category}?q={Uri.EscapeDataString(q)}";
|
||||
var body = await client.GetStringAsync(url, ct);
|
||||
using var doc = JsonDocument.Parse(body);
|
||||
foreach (var text in Harvest(doc.RootElement).Take(_opts.PerQuery))
|
||||
items.Add(new ScrapedItem("دیوار", text, "https://divar.ir"));
|
||||
}
|
||||
catch (Exception ex) { _log.LogWarning(ex, "Divar fetch failed for query {Query}", q); }
|
||||
}
|
||||
return items;
|
||||
}
|
||||
|
||||
private static readonly string[] DescKeys =
|
||||
{ "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };
|
||||
|
||||
/// <summary>Walk the JSON; for each object with a string "title", emit title + first description.</summary>
|
||||
private static IEnumerable<string> Harvest(JsonElement el)
|
||||
{
|
||||
if (el.ValueKind == JsonValueKind.Object)
|
||||
{
|
||||
if (el.TryGetProperty("title", out var t) && t.ValueKind == JsonValueKind.String)
|
||||
{
|
||||
var sb = new StringBuilder(t.GetString());
|
||||
foreach (var k in DescKeys)
|
||||
if (el.TryGetProperty(k, out var d) && d.ValueKind == JsonValueKind.String)
|
||||
{ sb.Append(" — ").Append(d.GetString()); break; }
|
||||
var text = sb.ToString().Trim();
|
||||
if (text.Length >= 15) yield return text;
|
||||
}
|
||||
foreach (var p in el.EnumerateObject())
|
||||
foreach (var s in Harvest(p.Value)) yield return s;
|
||||
}
|
||||
else if (el.ValueKind == JsonValueKind.Array)
|
||||
{
|
||||
foreach (var item in el.EnumerateArray())
|
||||
foreach (var s in Harvest(item)) yield return s;
|
||||
}
|
||||
// TODO(prod): query Divar for each term in the configured city, map each ad's
|
||||
// title+description to new ScrapedItem(Name, text, adUrl).
|
||||
_log.LogWarning("Divar fetch not yet implemented; returning empty.");
|
||||
return Task.FromResult<IReadOnlyList<ScrapedItem>>(Array.Empty<ScrapedItem>());
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user