255e8d25e5
deploy / deploy (push) Successful in 24s
Rewrite the FA strings sitewide in natural, human Persian (English unchanged), removing translation calques like «معمار راهکار», «هوش مصنوعی تولیدی», «موارد کاربری», «چرخههای هیجان», «استقرار در تولید», «محیط تولید». Covers: hero, services, pipeline, stack, expertise, portfolio, blog, contact (Index), nav/meta/footer (_Layout), the /blog list + per-post FA titles (BlogIndexModel, PostModel). Also removed two stray English em-dashes in the blog excerpts. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
217 lines
14 KiB
C#
217 lines
14 KiB
C#
using System.Text;
|
||
using SoroushAsadi.Services;
|
||
|
||
namespace SoroushAsadi.Pages.Blog;
|
||
|
||
/// <summary>
/// Page model for a single blog post.
/// Looks the post up by <see cref="Slug"/> in a built-in table of default
/// articles, lets a stored override replace the Markdown body, and renders
/// that body to HTML with a deliberately small Markdown subset.
/// </summary>
public class PostModel(ContentService content) : BasePageModel
{
    /// <summary>Post identifier, bound from the request (GET included).</summary>
    [Microsoft.AspNetCore.Mvc.BindProperty(SupportsGet = true)]
    public string Slug { get; set; } = "";

    /// <summary>Post title in the language selected by <c>IsFa</c>.</summary>
    public string Title { get; private set; } = "";

    /// <summary>Category label; the same value is used for both languages.</summary>
    public string Category { get; private set; } = "";

    /// <summary>Reading-time figure from the defaults table (presumably minutes; confirm against the view).</summary>
    public int ReadTime { get; private set; }

    /// <summary>Post body rendered to HTML by <see cref="SimpleMarkdown"/>.</summary>
    public string BodyHtml { get; private set; } = "";

    /// <summary>True when <see cref="Slug"/> does not match any known post.</summary>
    public bool PostNotFound { get; private set; }

    // Default posts keyed by slug. Bodies are Markdown-lite and rendered server-side.
    // Persian titles carry explicit zero-width non-joiners (\u200c) so that prefixes
    // and suffixes such as "می" / "ی" do not fuse with the neighbouring word.
    private static readonly Dictionary<string, (string Cat, string TitleEn, string TitleFa, int RT, string Body)> _defaults = new()
    {
        ["rag-eval-framework"] = ("LLM", "A RAG evaluation framework that holds up in production", "چارچوب ارزیابی RAG که در عمل جواب می\u200cدهد", 8, DefaultBodies.RagEval),
        ["agentic-n8n-patterns"] = ("Automation", "Agentic patterns with n8n for the enterprise", "الگوهای عامل\u200cمحور با n8n برای سازمان", 11, DefaultBodies.N8nPatterns),
        ["vertex-cost-control"] = ("Google Stack", "Vertex AI cost control at scale", "کنترل هزینه روی Vertex AI در مقیاس بالا", 6, DefaultBodies.VertexCost),
        ["k8s-llm-inference"] = ("Infra", "Sub-50ms LLM inference on Kubernetes", "اجرای LLM روی Kubernetes با تأخیر زیر ۵۰ میلی\u200cثانیه", 14, DefaultBodies.K8sInference),
        ["flutter-on-device-ai"] = ("Mobile", "On-device AI in Flutter", "هوش مصنوعی روی دستگاه در Flutter", 9, DefaultBodies.FlutterAI),
        ["enterprise-ai-roadmap"] = ("Strategy", "A 90-day enterprise AI roadmap", "نقشه\u200cی راه هوش مصنوعی سازمانی در ۹۰ روز", 7, DefaultBodies.EnterpriseRoadmap),
    };

    /// <summary>
    /// Resolves the requested post and fills the display properties.
    /// Sets <see cref="PostNotFound"/> and leaves the other properties at their
    /// defaults when the slug is missing or unknown.
    /// </summary>
    public void OnGet()
    {
        // A null key would make Dictionary.TryGetValue throw; model binding may
        // hand us null for an empty value, so treat null/empty as "not found".
        if (string.IsNullOrEmpty(Slug) || !_defaults.TryGetValue(Slug, out var def))
        {
            PostNotFound = true;
            return;
        }

        // A stored override (slug -> { body, ... }) replaces only the Markdown body;
        // title, category and read time always come from the defaults table.
        var overrides = content.GetPostOverrides();
        var body = def.Body;
        if (overrides.TryGetValue(Slug, out var node) && node["body"]?.GetValue<string>() is { } dbBody)
        {
            body = dbBody;
        }

        Title = IsFa ? def.TitleFa : def.TitleEn;
        Category = def.Cat;
        ReadTime = def.RT;
        BodyHtml = SimpleMarkdown(body);
    }

    /// <summary>
    /// Minimal line-based Markdown to HTML conversion.
    /// Supports <c>## </c> and <c>### </c> headings, <c>- </c> list items
    /// (consecutive items are wrapped in one <c>&lt;ul&gt;</c>), blank lines and
    /// paragraphs; inline formatting is delegated to <see cref="Inline"/>.
    /// </summary>
    /// <param name="md">Markdown source; null or whitespace yields an empty string.</param>
    /// <returns>The generated HTML fragment.</returns>
    private static string SimpleMarkdown(string md)
    {
        if (string.IsNullOrWhiteSpace(md))
        {
            return "";
        }

        var html = new StringBuilder();
        var inList = false;

        foreach (var rawLine in md.Split('\n'))
        {
            // TrimEnd also drops the '\r' left behind by CRLF input.
            var line = rawLine.TrimEnd();

            // Blank lines are passed through and do not end an open list,
            // so loosely spaced items stay inside a single <ul>.
            if (string.IsNullOrWhiteSpace(line))
            {
                html.Append('\n');
                continue;
            }

            if (line.StartsWith("- ", System.StringComparison.Ordinal))
            {
                if (!inList)
                {
                    html.Append("<ul>\n");
                    inList = true;
                }

                html.Append($"<li>{Inline(line[2..])}</li>\n");
                continue;
            }

            // Any other content ends the current list.
            if (inList)
            {
                html.Append("</ul>\n");
                inList = false;
            }

            if (line.StartsWith("## ", System.StringComparison.Ordinal))
            {
                html.Append($"<h2>{Inline(line[3..])}</h2>\n");
            }
            else if (line.StartsWith("### ", System.StringComparison.Ordinal))
            {
                html.Append($"<h3>{Inline(line[4..])}</h3>\n");
            }
            else
            {
                html.Append($"<p>{Inline(line)}</p>\n");
            }
        }

        if (inList)
        {
            html.Append("</ul>\n");
        }

        return html.ToString();
    }

    /// <summary>
    /// Renders inline Markdown for one line: <c>**bold**</c> and <c>`code`</c>.
    /// All text, inside or outside a span, is HTML-escaped. An unterminated
    /// marker is emitted literally.
    /// </summary>
    /// <param name="s">A single line of Markdown without its block prefix.</param>
    /// <returns>HTML-safe inline markup.</returns>
    private static string Inline(string s)
    {
        var sb = new StringBuilder(s.Length);
        var i = 0;

        while (i < s.Length)
        {
            if (s[i] == '*' && i + 1 < s.Length && s[i + 1] == '*')
            {
                var close = s.IndexOf("**", i + 2, System.StringComparison.Ordinal);
                if (close >= 0)
                {
                    sb.Append("<strong>").Append(Esc(s[(i + 2)..close])).Append("</strong>");
                    i = close + 2;
                    continue;
                }
            }

            if (s[i] == '`')
            {
                var close = s.IndexOf('`', i + 1);
                if (close >= 0)
                {
                    sb.Append("<code>").Append(Esc(s[(i + 1)..close])).Append("</code>");
                    i = close + 1;
                    continue;
                }
            }

            // Plain character: escape the three HTML-significant characters.
            sb.Append(s[i] switch
            {
                '&' => "&amp;",
                '<' => "&lt;",
                '>' => "&gt;",
                _ => s[i].ToString(),
            });
            i++;
        }

        return sb.ToString();
    }

    /// <summary>
    /// HTML-escapes <c>&amp;</c>, <c>&lt;</c> and <c>&gt;</c>.
    /// The ampersand is replaced first so the entities produced for the other
    /// two characters are not escaped a second time.
    /// </summary>
    private static string Esc(string s) =>
        s.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;");
}
|
||
|
||
/// <summary>
/// Default article bodies, written in the Markdown subset used by the blog
/// (<c>## </c>/<c>### </c> headings, <c>**bold**</c>, <c>`code`</c>, paragraphs).
/// Each constant is a raw string literal; the indentation of the closing
/// delimiter is stripped from every content line, so the values start at
/// column 0 and blocks are separated by a single blank line.
/// </summary>
internal static class DefaultBodies
{
    /// <summary>Body of the RAG evaluation framework article.</summary>
    public const string RagEval = """
        ## Why standard metrics fail for RAG

        BLEU and ROUGE measure n-gram overlap against a reference answer. In a RAG system, there is often no single correct reference — a question about company policy may have dozens of valid phrasings. High BLEU does not mean the system cited the right source; low BLEU does not mean it was wrong.

        ## The three metrics that actually matter

        **Faithfulness** measures whether every claim in the generated answer can be traced back to a retrieved passage. A faithfulness score of 1.0 means the model invented nothing. Tools like RAGAS implement this with an LLM judge.

        **Context Precision** asks: of the passages retrieved, how many were actually relevant to the question? Low precision wastes context window and increases hallucination risk.

        **Answer Relevancy** checks whether the final response actually addresses what was asked — not just whether it sounds good.

        ## Building an eval harness

        Start with a **golden dataset**: 100–200 question/answer pairs that domain experts have verified. Run your pipeline against them nightly. Track the three metrics above over time. A drop in Faithfulness after a model upgrade is a red flag; a drop in Context Precision after a chunking change means your retrieval is degrading.

        The harness does not have to be complex. A spreadsheet with automatic scoring via the OpenAI or Anthropic API is enough to start catching regressions before they reach production.
        """;

    /// <summary>Body of the agentic n8n patterns article.</summary>
    public const string N8nPatterns = """
        ## The problem with "just use n8n"

        n8n is excellent for integrating SaaS tools. It becomes fragile when you try to use it as an agent orchestrator — long-running loops, conditional retries, and LLM calls that can fail in non-obvious ways.

        ## Separating orchestration from integration

        The pattern that works: **n8n handles triggers and integrations; LangGraph handles agent logic**.

        An n8n workflow watches a Slack channel. When a message matches a pattern, it calls a LangGraph endpoint with the raw payload. LangGraph runs the multi-step reasoning loop, maintains state, and returns a structured result. n8n takes that result and routes it — posts to Jira, sends an email, updates a database row.

        ## Making agents auditable

        Every LangGraph state transition should emit an event to a structured log. We use a Postgres table with columns: `run_id`, `step`, `input`, `output`, `timestamp`. This table becomes the audit trail that compliance teams and on-call engineers both need.

        Add a `human_in_the_loop` node for any action that cannot be undone — deleting records, sending external emails, approving payments. The node pauses execution and posts to Slack; a human approves or rejects; execution resumes.

        ## Handling failures gracefully

        LLM calls fail. Build **retry with exponential backoff** into every LangGraph node that calls an LLM. Set a hard limit of 3 retries, then route to a dead-letter state that pages the on-call engineer. Never silently swallow errors in agentic pipelines — a swallowed error is an invisible outage.
        """;

    /// <summary>Body of the Vertex AI cost control article.</summary>
    public const string VertexCost = """
        ## Anti-pattern 1: calling Gemini Ultra for everything

        Gemini Ultra (or GPT-4-class models) costs 10–30× more per token than smaller models. Many teams default to the most capable model because it "just works" during prototyping, then never re-evaluate.

        **Fix**: build a **model router**. Classify each incoming request by complexity. Simple lookups, short summaries, and classification tasks go to Gemini Flash or Haiku. Only complex reasoning, multi-step synthesis, and long-context tasks go to Pro or Ultra. In most production systems, 60–80% of requests can be served by the cheaper tier.

        ## Anti-pattern 2: no context caching

        Vertex AI supports prompt caching (as does the Anthropic API). A system prompt that is 10k tokens, sent with every request at $3/M tokens, costs $30 for every million calls before the user has typed a single word.

        **Fix**: cache any context that is static or changes infrequently — system prompts, retrieved document sets, few-shot examples. Cache hits cost ~10% of full input price.

        ## Anti-pattern 3: synchronous batch jobs

        Teams run nightly document processing jobs synchronously — one document at a time, each blocked on the previous. This is slow and expensive because you pay for idle wait time between calls.

        **Fix**: use the Vertex AI batch prediction API for jobs over ~1,000 documents. Batch jobs run asynchronously, are eligible for spot discounts, and typically cost 50% less per token than online serving.
        """;

    /// <summary>Body of the Kubernetes LLM inference article.</summary>
    public const string K8sInference = """
        ## The baseline architecture

        A single Kubernetes `Deployment` behind a `ClusterIP` `Service`, fronted by an Ingress. Works fine up to ~50 RPS for a small model. Falls apart when traffic spikes, when GPU pods take 3 minutes to schedule, or when the model server has a 2-second cold-start.

        ## Autoscaling with KEDA

        HPA (Horizontal Pod Autoscaler) scales on CPU and memory. LLM inference is GPU-bound and queue-depth-bound — neither maps to CPU utilization well.

        KEDA (Kubernetes Event-Driven Autoscaling) scales on arbitrary metrics — queue depth, Pub/Sub lag, Redis list length. We publish inference request counts to a Redis stream; KEDA scales the model server pods when the stream depth exceeds a threshold. Scaling-up latency drops from minutes (cluster autoscaler cold start) to seconds (replica scale-up from 1 to N).

        ## GPU sharing with time-slicing

        For models that fit in 4–8 GB VRAM, full GPU dedication is wasteful. NVIDIA's time-slicing MIG (Multi-Instance GPU) lets multiple pods share one A100, each getting a guaranteed slice.

        Configure `nvidia.com/gpu: 1` and set the time-slice profile to `1g.10gb`. A single A100 80GB can serve 8 concurrent model instances at 10 GB each — 8× the throughput per GPU.

        ## Request hedging for tail latency

        p50 latency is 12ms. p99 is 280ms. The tail is dominated by KV-cache misses and occasional GC pauses. **Hedged requests**: after 40ms, send a duplicate request to a second replica. Take whichever response arrives first; cancel the other. This cuts p99 from 280ms to ~45ms with only ~15% increase in total compute.
        """;

    /// <summary>Body of the on-device AI in Flutter article.</summary>
    public const string FlutterAI = """
        ## Why on-device inference matters

        Cloud inference requires a network round-trip, exposes user data to a server, and fails in offline scenarios. For consumer apps — messaging, health, productivity — on-device inference is often a requirement, not a nice-to-have.

        ## Gemini Nano and LiteRT

        Google's Gemini Nano is a 1.8B parameter model quantized to run on mobile NPUs (Neural Processing Units). The Flutter integration uses the `google_ai_dart_sdk` package with `GeminiNanoModel`, falling back to cloud inference when the device model is unavailable.

        LiteRT (formerly TensorFlow Lite) handles vision and custom small models. For classification and embedding tasks, a 50MB quantized model runs in under 20ms on a mid-range Android device.

        ## Streaming UX without a network

        The key insight: users tolerate slightly slower responses if they can see text appearing token by token. Even on-device inference can stream — Gemini Nano's Dart SDK exposes a `generateContentStream` method. Pipe tokens directly to a Flutter `StreamBuilder` for a responsive feel regardless of total generation time.

        ## Battery and thermal management

        On-device inference heats the chip. Implement **thermal throttling**: check `DeviceInfo.thermalState` (iOS) or subscribe to the battery API on Android. Reduce `maxTokens` from 512 to 128 during sustained load. Schedule background inference tasks during charging. Users notice neither the throttling nor the scheduling — they notice when their phone gets too hot.
        """;

    /// <summary>Body of the 90-day enterprise AI roadmap article.</summary>
    public const string EnterpriseRoadmap = """
        ## Days 1–30: discovery

        The most expensive mistake in enterprise AI is building the wrong thing fast. Discovery is not a formality — it is the work.

        Interview 8–12 stakeholders across business units. For each, ask: what manual task takes more than 2 hours per week? What decision do you make with incomplete information? What report do you wish existed but is too expensive to build?

        Map the candidates on a 2×2: **impact** (revenue, cost, risk) vs **feasibility** (data quality, integration complexity, regulatory constraints). The top-right quadrant is your first sprint.

        ## Days 31–60: prototype and validate

        Pick one use case from the top-right. Build a prototype in 3 weeks. The prototype does not have to be production-grade — it has to be **testable by domain experts**.

        Run a structured eval: 100 questions, domain expert scores each answer 1–5. Set a threshold (e.g., ≥4.0 average) before the sprint begins. If the prototype clears it, proceed to production hardening. If it doesn't, investigate root cause — usually data quality or chunking strategy — before committing engineering resources.

        ## Days 61–90: first production deployment

        Scope the first deployment to a single team of 10–20 people. This limits blast radius and generates real usage data fast.

        Instrument everything: latency, cost per query, thumbs-up/thumbs-down from users, faithfulness score from the automated harness. Review metrics weekly with the business owner. Adjust chunking, retrieval strategy, or model tier based on what the data shows — not intuition.

        At day 90, you have a live system, a tuned eval harness, and a clear picture of what the second use case should be. That is the foundation for a credible 12-month roadmap.
        """;
}
|