[Ingest] Route scraping through an optional V2Ray/Xray proxy (Telegram in Iran)
Telegram and some sources are filtered in Iran. .NET cannot speak vmess/vless/trojan, so add an Xray sidecar (compose service 'xray', behind the 'proxy' profile) that converts the admin's config into a local SOCKS5 proxy (xray:10808). New ScrapeHttpClients provider builds a proxied or direct HttpClient (WebProxy supports socks5/socks4/http) cached per proxy URL; all five ingestion sources (Telegram/Bale/Divar/Medjobs/Websites) now use it. Admin settings gain IngestProxyEnabled + IngestProxyUrl (migration; UI under sources). Added deploy/xray/config.json template + README with vmess/vless/trojan examples. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -10,12 +10,12 @@ namespace JobsMedical.Web.Services.Scraping;
|
||||
public class BaleListingSource : IListingSource
|
||||
{
|
||||
private const string BaseUrl = "https://tapi.bale.ai";
|
||||
private readonly IHttpClientFactory _http;
|
||||
private readonly ScrapeHttpClients _clients;
|
||||
private readonly ILogger<BaleListingSource> _log;
|
||||
|
||||
public BaleListingSource(IHttpClientFactory http, ILogger<BaleListingSource> log)
|
||||
public BaleListingSource(ScrapeHttpClients clients, ILogger<BaleListingSource> log)
|
||||
{
|
||||
_http = http;
|
||||
_clients = clients;
|
||||
_log = log;
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ public class BaleListingSource : IListingSource
|
||||
|
||||
try
|
||||
{
|
||||
var client = _http.CreateClient("scrape");
|
||||
var client = _clients.For(s);
|
||||
var body = await client.GetStringAsync($"{BaseUrl}/bot{s.BaleBotToken}/getUpdates", ct);
|
||||
using var doc = JsonDocument.Parse(body);
|
||||
if (!doc.RootElement.TryGetProperty("result", out var result) || result.ValueKind != JsonValueKind.Array)
|
||||
|
||||
@@ -12,12 +12,12 @@ namespace JobsMedical.Web.Services.Scraping;
|
||||
public class DivarListingSource : IListingSource
|
||||
{
|
||||
private const string BaseUrl = "https://api.divar.ir/v8/web-search";
|
||||
private readonly IHttpClientFactory _http;
|
||||
private readonly ScrapeHttpClients _clients;
|
||||
private readonly ILogger<DivarListingSource> _log;
|
||||
|
||||
public DivarListingSource(IHttpClientFactory http, ILogger<DivarListingSource> log)
|
||||
public DivarListingSource(ScrapeHttpClients clients, ILogger<DivarListingSource> log)
|
||||
{
|
||||
_http = http;
|
||||
_clients = clients;
|
||||
_log = log;
|
||||
}
|
||||
|
||||
@@ -29,7 +29,7 @@ public class DivarListingSource : IListingSource
|
||||
if (!s.DivarEnabled || queries.Count == 0) return Array.Empty<ScrapedItem>();
|
||||
var city = string.IsNullOrWhiteSpace(s.DivarCity) ? "tehran" : s.DivarCity.Trim();
|
||||
|
||||
var client = _http.CreateClient("scrape");
|
||||
var client = _clients.For(s);
|
||||
var items = new List<ScrapedItem>();
|
||||
foreach (var q in queries)
|
||||
{
|
||||
|
||||
@@ -13,12 +13,12 @@ namespace JobsMedical.Web.Services.Scraping;
|
||||
public class MedjobsListingSource : IListingSource
|
||||
{
|
||||
private const string SitemapIndex = "https://medjobs.ir/sitemap_index.xml";
|
||||
private readonly IHttpClientFactory _http;
|
||||
private readonly ScrapeHttpClients _clients;
|
||||
private readonly ILogger<MedjobsListingSource> _log;
|
||||
|
||||
public MedjobsListingSource(IHttpClientFactory http, ILogger<MedjobsListingSource> log)
|
||||
public MedjobsListingSource(ScrapeHttpClients clients, ILogger<MedjobsListingSource> log)
|
||||
{
|
||||
_http = http;
|
||||
_clients = clients;
|
||||
_log = log;
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@ public class MedjobsListingSource : IListingSource
|
||||
{
|
||||
if (!s.MedjobsEnabled) return Array.Empty<ScrapedItem>();
|
||||
var max = Math.Clamp(s.MedjobsMaxAds, 1, 500);
|
||||
var client = _http.CreateClient("scrape");
|
||||
var client = _clients.For(s);
|
||||
|
||||
try
|
||||
{
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Net;
|
||||
using JobsMedical.Web.Models;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>
/// Supplies the HttpClient used by ingestion sources, optionally routed through a proxy.
///
/// Telegram (t.me) and some other sources are filtered in Iran, so the admin can point
/// ingestion at a local proxy that an Xray/V2Ray client sidecar exposes (e.g.
/// <c>socks5://xray:10808</c>). .NET's WebProxy understands <c>socks5://</c>, <c>socks4://</c>
/// and <c>http://</c> schemes, so the same code path covers all of them.
///
/// Clients are cached per proxy URL (plus one "direct" client). Each cache entry is a
/// <see cref="Lazy{T}"/> so that concurrent first calls for the same key build exactly one
/// client. Changing the proxy URL makes the next call build a new client and dispose the
/// client of the previous URL.
/// </summary>
public sealed class ScrapeHttpClients : IDisposable
{
    /// <summary>Cache key of the client that does not use a configured ingest proxy.</summary>
    private const string DirectKey = "direct";

    /// <summary>
    /// Upper bound on how long a pooled connection is reused. The clients here live as long
    /// as this object, so without a bound a connection (and the DNS answer it was opened
    /// with) could be reused indefinitely.
    /// </summary>
    private static readonly TimeSpan ConnectionLifetime = TimeSpan.FromMinutes(2);

    /// <summary>Per-request timeout applied to every client built here.</summary>
    private static readonly TimeSpan RequestTimeout = TimeSpan.FromSeconds(20);

    private readonly ConcurrentDictionary<string, Lazy<HttpClient>> _cache = new();

    /// <summary>The HttpClient for the given settings — proxied when enabled, direct otherwise.</summary>
    /// <param name="s">
    /// Settings to read <c>IngestProxyEnabled</c> and <c>IngestProxyUrl</c> from. The proxy is
    /// used only when it is enabled and the URL is not blank; the URL is trimmed before use.
    /// </param>
    /// <returns>A cached client; callers must not dispose it.</returns>
    /// <exception cref="UriFormatException">The configured proxy URL cannot be parsed.</exception>
    public HttpClient For(AppSetting s)
    {
        var key = (s.IngestProxyEnabled && !string.IsNullOrWhiteSpace(s.IngestProxyUrl))
            ? s.IngestProxyUrl.Trim()
            : DirectKey;

        // Drop stale clients if the proxy URL changed (keep only "direct" + the current proxy).
        // NOTE(review): a stale client is disposed immediately, so a caller still holding it
        // from an earlier For() call would see its in-flight requests fail — confirm that
        // settings cannot change in the middle of an ingestion run.
        foreach (var k in _cache.Keys)
        {
            if (k != DirectKey && k != key && _cache.TryRemove(k, out var stale))
            {
                DisposeIfCreated(stale);
            }
        }

        // GetOrAdd may invoke its factory more than once under contention; wrapping the client
        // in Lazy keeps the factory cheap and ensures Build runs once per stored entry.
        // A failed Build is cached by the Lazy and rethrown on later calls for the same key.
        return _cache.GetOrAdd(key, k => new Lazy<HttpClient>(() => Build(k))).Value;
    }

    /// <summary>Builds the client for one cache key: a proxy URL, or "direct" for no proxy.</summary>
    private static HttpClient Build(string key)
    {
        var handler = new SocketsHttpHandler
        {
            AutomaticDecompression = DecompressionMethods.All,
            PooledConnectionLifetime = ConnectionLifetime,
        };

        try
        {
            if (key != DirectKey)
            {
                handler.Proxy = new WebProxy(key); // socks5:// | socks4:// | http://
                handler.UseProxy = true;
            }
        }
        catch
        {
            // WebProxy rejects malformed URLs; do not leak the handler when it does.
            handler.Dispose();
            throw;
        }

        // The client owns the handler and disposes it together with itself.
        var client = new HttpClient(handler) { Timeout = RequestTimeout };
        client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (compatible; HamkadrBot/1.0)");
        return client;
    }

    /// <summary>Disposes every client that was actually built and empties the cache.</summary>
    public void Dispose()
    {
        foreach (var entry in _cache.Values)
        {
            DisposeIfCreated(entry);
        }

        _cache.Clear();
    }

    /// <summary>Disposes the entry's client without forcing a never-used entry to build one.</summary>
    private static void DisposeIfCreated(Lazy<HttpClient> entry)
    {
        if (entry.IsValueCreated)
        {
            entry.Value.Dispose();
        }
    }
}
|
||||
@@ -44,6 +44,8 @@ public class SettingsService
|
||||
s.DemoMode = incoming.DemoMode;
|
||||
s.WebsitesEnabled = incoming.WebsitesEnabled;
|
||||
s.WebsiteUrls = incoming.WebsiteUrls?.Trim();
|
||||
s.IngestProxyEnabled = incoming.IngestProxyEnabled;
|
||||
s.IngestProxyUrl = incoming.IngestProxyUrl?.Trim();
|
||||
s.DivarEnabled = incoming.DivarEnabled;
|
||||
s.DivarCity = string.IsNullOrWhiteSpace(incoming.DivarCity) ? "tehran" : incoming.DivarCity.Trim();
|
||||
s.DivarQueries = incoming.DivarQueries?.Trim();
|
||||
|
||||
@@ -10,12 +10,12 @@ namespace JobsMedical.Web.Services.Scraping;
|
||||
/// </summary>
|
||||
public class TelegramListingSource : IListingSource
|
||||
{
|
||||
private readonly IHttpClientFactory _http;
|
||||
private readonly ScrapeHttpClients _clients;
|
||||
private readonly ILogger<TelegramListingSource> _log;
|
||||
|
||||
public TelegramListingSource(IHttpClientFactory http, ILogger<TelegramListingSource> log)
|
||||
public TelegramListingSource(ScrapeHttpClients clients, ILogger<TelegramListingSource> log)
|
||||
{
|
||||
_http = http;
|
||||
_clients = clients;
|
||||
_log = log;
|
||||
}
|
||||
|
||||
@@ -26,7 +26,7 @@ public class TelegramListingSource : IListingSource
|
||||
var channels = AppSetting.SplitList(s.TelegramChannels);
|
||||
if (!s.TelegramEnabled || channels.Count == 0) return Array.Empty<ScrapedItem>();
|
||||
|
||||
var client = _http.CreateClient("scrape");
|
||||
var client = _clients.For(s);
|
||||
var items = new List<ScrapedItem>();
|
||||
foreach (var ch in channels.Select(c => c.TrimStart('@')).Where(c => c.Length > 0))
|
||||
{
|
||||
|
||||
@@ -11,12 +11,12 @@ namespace JobsMedical.Web.Services.Scraping;
|
||||
/// </summary>
|
||||
public class WebsiteListingSource : IListingSource
|
||||
{
|
||||
private readonly IHttpClientFactory _http;
|
||||
private readonly ScrapeHttpClients _clients;
|
||||
private readonly ILogger<WebsiteListingSource> _log;
|
||||
|
||||
public WebsiteListingSource(IHttpClientFactory http, ILogger<WebsiteListingSource> log)
|
||||
public WebsiteListingSource(ScrapeHttpClients clients, ILogger<WebsiteListingSource> log)
|
||||
{
|
||||
_http = http;
|
||||
_clients = clients;
|
||||
_log = log;
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ public class WebsiteListingSource : IListingSource
|
||||
var urls = AppSetting.SplitList(s.WebsiteUrls);
|
||||
if (!s.WebsitesEnabled || urls.Count == 0) return Array.Empty<ScrapedItem>();
|
||||
|
||||
var client = _http.CreateClient("scrape");
|
||||
var client = _clients.For(s);
|
||||
var items = new List<ScrapedItem>();
|
||||
foreach (var url in urls.Where(u => u.StartsWith("http")))
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user