// soroushasadi/Pages/Blog/Post.cshtml.cs

using System.Text;
using SoroushAsadi.Services;

namespace SoroushAsadi.Pages.Blog;

/// <summary>
/// Page model for a single blog post, addressed by its URL slug.
/// </summary>
/// <remarks>
/// Post metadata and default bodies are compiled in (see <c>_defaults</c>). A body returned by
/// <c>ContentService.GetPostOverrides()</c> for the same slug replaces the default body.
/// Bodies use a small Markdown subset that is converted to HTML here; all literal text is
/// HTML-escaped (<c>&amp;</c>, <c>&lt;</c>, <c>&gt;</c>) before it is emitted.
/// </remarks>
public class PostModel(ContentService content) : BasePageModel
{
    /// <summary>Slug of the requested post; bound from the request, GET requests included.</summary>
    [Microsoft.AspNetCore.Mvc.BindProperty(SupportsGet = true)]
    public string Slug { get; set; } = "";

    /// <summary>Post title in the language selected by <c>IsFa</c>.</summary>
    public string Title    { get; private set; } = "";

    /// <summary>Category label of the post.</summary>
    public string Category { get; private set; } = "";

    /// <summary>Read-time figure taken from the defaults table (presumably minutes — confirm against the view).</summary>
    public int    ReadTime { get; private set; }

    /// <summary>Post body rendered to HTML by <see cref="SimpleMarkdown"/>.</summary>
    public string BodyHtml { get; private set; } = "";

    /// <summary>True when <see cref="Slug"/> does not match any known post.</summary>
    public bool   PostNotFound { get; private set; }

    // Default bodies (Markdown-lite, rendered server-side)
    private static readonly Dictionary<string, (string Cat, string TitleEn, string TitleFa, int RT, string Body)> _defaults = new()
    {
        ["rag-eval-framework"]    = ("LLM",          "A RAG evaluation framework that holds up in production",     "چارچوب ارزیابی RAG که در عمل جواب می‌دهد",    8,  DefaultBodies.RagEval),
        ["agentic-n8n-patterns"]  = ("Automation",   "Agentic patterns with n8n for the enterprise",               "الگوهای عامل‌محور با n8n برای سازمان",          11, DefaultBodies.N8nPatterns),
        ["vertex-cost-control"]   = ("Google Stack", "Vertex AI cost control at scale",                            "کنترل هزینه روی Vertex AI در مقیاس بالا",       6,  DefaultBodies.VertexCost),
        ["k8s-llm-inference"]     = ("Infra",        "Sub-50ms LLM inference on Kubernetes",                       "اجرای LLM روی Kubernetes با تأخیر زیر ۵۰ میلی‌ثانیه",14, DefaultBodies.K8sInference),
        ["flutter-on-device-ai"]  = ("Mobile",       "On-device AI in Flutter",                                    "هوش مصنوعی روی دستگاه در Flutter",               9,  DefaultBodies.FlutterAI),
        ["enterprise-ai-roadmap"] = ("Strategy",     "A 90-day enterprise AI roadmap",                             "نقشه‌ی راه هوش مصنوعی سازمانی در ۹۰ روز",        7,  DefaultBodies.EnterpriseRoadmap),
    };

    /// <summary>
    /// Resolves <see cref="Slug"/> to a post and fills the display properties, or sets
    /// <see cref="PostNotFound"/> when the slug is missing or unknown.
    /// </summary>
    public void OnGet()
    {
        // Model binding can leave Slug null (an empty value is bound as null by default) and
        // Dictionary.TryGetValue throws on a null key, so guard before the lookup.
        // NOTE(review): the HTTP status is not changed here; confirm the page turns this flag into a 404.
        if (string.IsNullOrEmpty(Slug) || !_defaults.TryGetValue(Slug, out var def))
        {
            PostNotFound = true;
            return;
        }

        // Check for DB override (stored under "posts" key as slug→{body,...}).
        // Stored content is not trusted to have the expected shape: anything other than an
        // object with a string "body" falls back to the compiled-in default instead of throwing.
        var overrides = content.GetPostOverrides();
        string body = def.Body;
        if (overrides.TryGetValue(Slug, out var node)
            && node is System.Text.Json.Nodes.JsonObject entry
            && entry["body"] is System.Text.Json.Nodes.JsonValue jsonBody
            && jsonBody.TryGetValue<string>(out var dbBody))
        {
            body = dbBody;
        }

        Title    = IsFa ? def.TitleFa : def.TitleEn;
        Category = def.Cat;
        ReadTime = def.RT;
        BodyHtml = SimpleMarkdown(body);
    }

    /// <summary>
    /// Minimal Markdown → HTML conversion, one output element per input line:
    /// <c>## </c> → h2, <c>### </c> → h3, <c>- </c> → list item, anything else → paragraph.
    /// Consecutive list items are wrapped in a single <c>&lt;ul&gt;</c>.
    /// </summary>
    /// <param name="md">Markdown source; null, empty or whitespace-only yields an empty string.</param>
    /// <returns>HTML fragment with all literal text escaped.</returns>
    private static string SimpleMarkdown(string md)
    {
        if (string.IsNullOrWhiteSpace(md)) return "";

        var sb = new StringBuilder();
        bool inList = false;
        foreach (var rawLine in md.Split('\n'))
        {
            // TrimEnd also removes the '\r' left behind by CRLF line endings.
            var line = rawLine.TrimEnd();

            // Ordinal comparisons: culture-sensitive StartsWith can ignore zero-width characters
            // (common in Persian text), which would make the slices below cut at the wrong index.
            if (line.StartsWith("- ", StringComparison.Ordinal))
            {
                if (!inList) { sb.Append("<ul>\n"); inList = true; }
                sb.Append("<li>").Append(Inline(line[2..])).Append("</li>\n");
                continue;
            }

            // Blank lines are passed through and do not terminate an open list.
            if (line.Length == 0) { sb.Append('\n'); continue; }

            // Any other content ends the current list; <li> is only valid inside a list element.
            if (inList) { sb.Append("</ul>\n"); inList = false; }

            if (line.StartsWith("## ", StringComparison.Ordinal))
            {
                sb.Append("<h2>").Append(Inline(line[3..])).Append("</h2>\n");
            }
            else if (line.StartsWith("### ", StringComparison.Ordinal))
            {
                sb.Append("<h3>").Append(Inline(line[4..])).Append("</h3>\n");
            }
            else
            {
                sb.Append("<p>").Append(Inline(line)).Append("</p>\n");
            }
        }

        if (inList) { sb.Append("</ul>\n"); }
        return sb.ToString();
    }

    /// <summary>
    /// Renders inline markup within one line: <c>**bold**</c> → strong, <c>`code`</c> → code.
    /// Everything else, including the text inside bold/code spans, is HTML-escaped.
    /// An opening marker without a matching closing marker is emitted as literal text.
    /// </summary>
    private static string Inline(string s)
    {
        var sb = new StringBuilder(s.Length + 16);
        int i = 0;
        while (i < s.Length)
        {
            if (s[i] == '*' && i + 1 < s.Length && s[i + 1] == '*')
            {
                // Ordinal search so the returned index always points at the literal "**".
                int end = s.IndexOf("**", i + 2, StringComparison.Ordinal);
                if (end >= 0)
                {
                    sb.Append("<strong>");
                    AppendEscaped(sb, s.AsSpan(i + 2, end - i - 2));
                    sb.Append("</strong>");
                    i = end + 2;
                    continue;
                }
            }
            if (s[i] == '`')
            {
                int end = s.IndexOf('`', i + 1);
                if (end >= 0)
                {
                    sb.Append("<code>");
                    AppendEscaped(sb, s.AsSpan(i + 1, end - i - 1));
                    sb.Append("</code>");
                    i = end + 1;
                    continue;
                }
            }
            AppendEscaped(sb, s[i]);
            i++;
        }
        return sb.ToString();
    }

    /// <summary>Appends <paramref name="text"/> with <c>&amp;</c>, <c>&lt;</c> and <c>&gt;</c> escaped.</summary>
    private static void AppendEscaped(StringBuilder sb, ReadOnlySpan<char> text)
    {
        foreach (var ch in text)
        {
            AppendEscaped(sb, ch);
        }
    }

    /// <summary>Appends one character, replacing <c>&amp;</c>, <c>&lt;</c> and <c>&gt;</c> with their HTML entities.</summary>
    private static void AppendEscaped(StringBuilder sb, char ch)
    {
        switch (ch)
        {
            case '&': sb.Append("&amp;"); break;
            case '<': sb.Append("&lt;"); break;
            case '>': sb.Append("&gt;"); break;
            default:  sb.Append(ch); break;
        }
    }
}

/// <summary>
/// Default article bodies (Markdown), one constant per post.
/// </summary>
/// <remarks>
/// Each constant is a raw string literal whose content and closing delimiter start at
/// column 0, so the text is taken verbatim with no indentation removed. The bodies only
/// use <c>## </c> headings, <c>**bold**</c>, <c>`code`</c> and plain paragraphs.
/// </remarks>
internal static class DefaultBodies
{
    /// <summary>Default body for the "rag-eval-framework" post.</summary>
    public const string RagEval = """
## Why standard metrics fail for RAG

BLEU and ROUGE measure n-gram overlap against a reference answer. In a RAG system, there is often no single correct reference — a question about company policy may have dozens of valid phrasings. High BLEU does not mean the system cited the right source; low BLEU does not mean it was wrong.

## The three metrics that actually matter

**Faithfulness** measures whether every claim in the generated answer can be traced back to a retrieved passage. A faithfulness score of 1.0 means the model invented nothing. Tools like RAGAS implement this with an LLM judge.

**Context Precision** asks: of the passages retrieved, how many were actually relevant to the question? Low precision wastes context window and increases hallucination risk.

**Answer Relevancy** checks whether the final response actually addresses what was asked — not just whether it sounds good.

## Building an eval harness

Start with a **golden dataset**: 100–200 question/answer pairs that domain experts have verified. Run your pipeline against them nightly. Track the three metrics above over time. A drop in Faithfulness after a model upgrade is a red flag; a drop in Context Precision after a chunking change means your retrieval is degrading.

The harness does not have to be complex. A spreadsheet with automatic scoring via the OpenAI or Anthropic API is enough to start catching regressions before they reach production.
""";

    /// <summary>Default body for the "agentic-n8n-patterns" post.</summary>
    public const string N8nPatterns = """
## The problem with "just use n8n"

n8n is excellent for integrating SaaS tools. It becomes fragile when you try to use it as an agent orchestrator — long-running loops, conditional retries, and LLM calls that can fail in non-obvious ways.

## Separating orchestration from integration

The pattern that works: **n8n handles triggers and integrations; LangGraph handles agent logic**.

An n8n workflow watches a Slack channel. When a message matches a pattern, it calls a LangGraph endpoint with the raw payload. LangGraph runs the multi-step reasoning loop, maintains state, and returns a structured result. n8n takes that result and routes it — posts to Jira, sends an email, updates a database row.

## Making agents auditable

Every LangGraph state transition should emit an event to a structured log. We use a Postgres table with columns: `run_id`, `step`, `input`, `output`, `timestamp`. This table becomes the audit trail that compliance teams and on-call engineers both need.

Add a `human_in_the_loop` node for any action that cannot be undone — deleting records, sending external emails, approving payments. The node pauses execution and posts to Slack; a human approves or rejects; execution resumes.

## Handling failures gracefully

LLM calls fail. Build **retry with exponential backoff** into every LangGraph node that calls an LLM. Set a hard limit of 3 retries, then route to a dead-letter state that pages the on-call engineer. Never silently swallow errors in agentic pipelines — a swallowed error is an invisible outage.
""";

    /// <summary>Default body for the "vertex-cost-control" post.</summary>
    public const string VertexCost = """
## Anti-pattern 1: calling Gemini Ultra for everything

Gemini Ultra (or GPT-4-class models) costs 10–30× more per token than smaller models. Many teams default to the most capable model because it "just works" during prototyping, then never re-evaluate.

**Fix**: build a **model router**. Classify each incoming request by complexity. Simple lookups, short summaries, and classification tasks go to Gemini Flash or Haiku. Only complex reasoning, multi-step synthesis, and long-context tasks go to Pro or Ultra. In most production systems, 60–80% of requests can be served by the cheaper tier.

## Anti-pattern 2: no context caching

Vertex AI supports prompt caching (as does the Anthropic API). A system prompt that is 10k tokens, sent with every request at $3/M tokens, costs $30 for every million calls before the user has typed a single word.

**Fix**: cache any context that is static or changes infrequently — system prompts, retrieved document sets, few-shot examples. Cache hits cost ~10% of full input price.

## Anti-pattern 3: synchronous batch jobs

Teams run nightly document processing jobs synchronously — one document at a time, each blocked on the previous. This is slow and expensive because you pay for idle wait time between calls.

**Fix**: use the Vertex AI batch prediction API for jobs over ~1,000 documents. Batch jobs run asynchronously, are eligible for spot discounts, and typically cost 50% less per token than online serving.
""";

    /// <summary>Default body for the "k8s-llm-inference" post.</summary>
    public const string K8sInference = """
## The baseline architecture

A single Kubernetes `Deployment` behind a `ClusterIP` `Service`, fronted by an Ingress. Works fine up to ~50 RPS for a small model. Falls apart when traffic spikes, when GPU pods take 3 minutes to schedule, or when the model server has a 2-second cold-start.

## Autoscaling with KEDA

HPA (Horizontal Pod Autoscaler) scales on CPU and memory. LLM inference is GPU-bound and queue-depth-bound — neither maps to CPU utilization well.

KEDA (Kubernetes Event-Driven Autoscaling) scales on arbitrary metrics — queue depth, Pub/Sub lag, Redis list length. We publish inference request counts to a Redis stream; KEDA scales the model server pods when the stream depth exceeds a threshold. Scaling-up latency drops from minutes (cluster autoscaler cold start) to seconds (replica scale-up from 1 to N).

## GPU sharing with time-slicing

For models that fit in 4–8 GB VRAM, full GPU dedication is wasteful. NVIDIA's time-slicing MIG (Multi-Instance GPU) lets multiple pods share one A100, each getting a guaranteed slice.

Configure `nvidia.com/gpu: 1` and set the time-slice profile to `1g.10gb`. A single A100 80GB can serve 8 concurrent model instances at 10 GB each — 8× the throughput per GPU.

## Request hedging for tail latency

p50 latency is 12ms. p99 is 280ms. The tail is dominated by KV-cache misses and occasional GC pauses. **Hedged requests**: after 40ms, send a duplicate request to a second replica. Take whichever response arrives first; cancel the other. This cuts p99 from 280ms to ~45ms with only ~15% increase in total compute.
""";

    /// <summary>Default body for the "flutter-on-device-ai" post.</summary>
    public const string FlutterAI = """
## Why on-device inference matters

Cloud inference requires a network round-trip, exposes user data to a server, and fails in offline scenarios. For consumer apps — messaging, health, productivity — on-device inference is often a requirement, not a nice-to-have.

## Gemini Nano and LiteRT

Google's Gemini Nano is a 1.8B parameter model quantized to run on mobile NPUs (Neural Processing Units). The Flutter integration uses the `google_ai_dart_sdk` package with `GeminiNanoModel`, falling back to cloud inference when the device model is unavailable.

LiteRT (formerly TensorFlow Lite) handles vision and custom small models. For classification and embedding tasks, a 50MB quantized model runs in under 20ms on a mid-range Android device.

## Streaming UX without a network

The key insight: users tolerate slightly slower responses if they can see text appearing token by token. Even on-device inference can stream — Gemini Nano's Dart SDK exposes a `generateContentStream` method. Pipe tokens directly to a Flutter `StreamBuilder` for a responsive feel regardless of total generation time.

## Battery and thermal management

On-device inference heats the chip. Implement **thermal throttling**: check `DeviceInfo.thermalState` (iOS) or subscribe to the battery API on Android. Reduce `maxTokens` from 512 to 128 during sustained load. Schedule background inference tasks during charging. Users notice neither the throttling nor the scheduling — they notice when their phone gets too hot.
""";

    /// <summary>Default body for the "enterprise-ai-roadmap" post.</summary>
    public const string EnterpriseRoadmap = """
## Days 1–30: discovery

The most expensive mistake in enterprise AI is building the wrong thing fast. Discovery is not a formality — it is the work.

Interview 8–12 stakeholders across business units. For each, ask: what manual task takes more than 2 hours per week? What decision do you make with incomplete information? What report do you wish existed but is too expensive to build?

Map the candidates on a 2×2: **impact** (revenue, cost, risk) vs **feasibility** (data quality, integration complexity, regulatory constraints). The top-right quadrant is your first sprint.

## Days 31–60: prototype and validate

Pick one use case from the top-right. Build a prototype in 3 weeks. The prototype does not have to be production-grade — it has to be **testable by domain experts**.

Run a structured eval: 100 questions, domain expert scores each answer 1–5. Set a threshold (e.g., ≥4.0 average) before the sprint begins. If the prototype clears it, proceed to production hardening. If it doesn't, investigate root cause — usually data quality or chunking strategy — before committing engineering resources.

## Days 61–90: first production deployment

Scope the first deployment to a single team of 10–20 people. This limits blast radius and generates real usage data fast.

Instrument everything: latency, cost per query, thumbs-up/thumbs-down from users, faithfulness score from the automated harness. Review metrics weekly with the business owner. Adjust chunking, retrieval strategy, or model tier based on what the data shows — not intuition.

At day 90, you have a live system, a tuned eval harness, and a clear picture of what the second use case should be. That is the foundation for a credible 12-month roadmap.
""";
}