using System;
using System.Text;
using SoroushAsadi.Services;

namespace SoroushAsadi.Pages.Blog;

/// <summary>
/// Page model for a single blog post. The post is looked up by <see cref="Slug"/> in a
/// built-in table of default posts; its Markdown body may be replaced by an override
/// obtained from <c>content.GetPostOverrides()</c> and is then rendered to HTML.
/// </summary>
/// <param name="content">Service queried for per-slug post overrides.</param>
public class PostModel(ContentService content) : BasePageModel
{
    /// <summary>Post identifier, bound from the request (GET binding enabled).</summary>
    [Microsoft.AspNetCore.Mvc.BindProperty(SupportsGet = true)]
    public string Slug { get; set; } = "";

    /// <summary>Post title in the language selected by <c>IsFa</c>.</summary>
    public string Title { get; private set; } = "";

    /// <summary>Category label of the post.</summary>
    public string Category { get; private set; } = "";

    /// <summary>Read time taken from the defaults table (presumably minutes — confirm).</summary>
    public int ReadTime { get; private set; }

    /// <summary>Post body rendered to HTML by <see cref="SimpleMarkdown"/>.</summary>
    public string BodyHtml { get; private set; } = "";

    /// <summary>True when <see cref="Slug"/> does not match any known post.</summary>
    public bool PostNotFound { get; private set; }

    // Default bodies (Markdown-lite, rendered server-side)
    private static readonly Dictionary<string, (string Cat, string TitleEn, string TitleFa, int RT, string Body)> _defaults = new()
    {
        ["rag-eval-framework"] = ("LLM", "A RAG evaluation framework that holds up in production", "چارچوب ارزیابی RAG که در تولید کار می‌کند", 8, DefaultBodies.RagEval),
        ["agentic-n8n-patterns"] = ("Automation", "Agentic patterns with n8n for the enterprise", "الگوهای عامل‌محور با n8n برای سازمان", 11, DefaultBodies.N8nPatterns),
        ["vertex-cost-control"] = ("Google Stack", "Vertex AI cost control at scale", "کنترل هزینه روی Vertex AI در مقیاس بالا", 6, DefaultBodies.VertexCost),
        ["k8s-llm-inference"] = ("Infra", "Sub-50ms LLM inference on Kubernetes", "استنتاج LLM روی Kubernetes با تأخیر زیر ۵۰ ms", 14, DefaultBodies.K8sInference),
        ["flutter-on-device-ai"] = ("Mobile", "On-device AI in Flutter", "هوش مصنوعی on-device در Flutter", 9, DefaultBodies.FlutterAI),
        ["enterprise-ai-roadmap"] = ("Strategy", "A 90-day enterprise AI roadmap", "نقشه راه هوش مصنوعی سازمانی در ۹۰ روز", 7, DefaultBodies.EnterpriseRoadmap),
    };

    /// <summary>
    /// Resolves the post for <see cref="Slug"/>. Sets <see cref="PostNotFound"/> and returns
    /// when the slug is unknown; otherwise fills the title, category, read time and HTML body.
    /// </summary>
    public void OnGet()
    {
        // Dictionary lookups throw on a null key, so an absent slug is treated as "not found".
        if (string.IsNullOrEmpty(Slug) || !_defaults.TryGetValue(Slug, out var def))
        {
            PostNotFound = true;
            return;
        }

        // Check for DB override (stored under "posts" key as slug→{body,...})
        var overrides = content.GetPostOverrides();
        string body = def.Body;
        if (overrides.TryGetValue(Slug, out var node) && node["body"]?.GetValue<string>() is { } dbBody)
        {
            body = dbBody;
        }

        // IsFa is not declared in this file (presumably inherited from BasePageModel).
        Title = IsFa ? def.TitleFa : def.TitleEn;
        Category = def.Cat;
        ReadTime = def.RT;
        BodyHtml = SimpleMarkdown(body);
    }

    /// <summary>
    /// Minimal Markdown → HTML (headings, bold, code, paragraphs). Works line by line:
    /// <c>## </c> and <c>### </c> become headings, <c>- </c> becomes a list item, blank lines
    /// become a newline, and every other line becomes a paragraph.
    /// </summary>
    /// <param name="md">Markdown source; null, empty or whitespace yields an empty string.</param>
    /// <returns>The generated HTML fragment.</returns>
    // NOTE(review): the element names (h2, h3, li, p) follow the Markdown constructs handled
    // here — confirm against the site's stylesheet. List items are emitted without a wrapping
    // list element.
    private static string SimpleMarkdown(string md)
    {
        if (string.IsNullOrWhiteSpace(md))
        {
            return "";
        }

        var sb = new StringBuilder();
        foreach (var rawLine in md.Split('\n'))
        {
            // TrimEnd also drops the '\r' left behind by CRLF line endings.
            var line = rawLine.TrimEnd();

            // Ordinal comparisons: these are syntax markers, not linguistic text.
            if (line.StartsWith("## ", StringComparison.Ordinal))
            {
                sb.Append("<h2>").Append(Inline(line[3..])).Append("</h2>\n");
            }
            else if (line.StartsWith("### ", StringComparison.Ordinal))
            {
                sb.Append("<h3>").Append(Inline(line[4..])).Append("</h3>\n");
            }
            else if (line.StartsWith("- ", StringComparison.Ordinal))
            {
                sb.Append("<li>").Append(Inline(line[2..])).Append("</li>\n");
            }
            else if (string.IsNullOrWhiteSpace(line))
            {
                sb.Append('\n');
            }
            else
            {
                sb.Append("<p>").Append(Inline(line)).Append("</p>\n");
            }
        }

        return sb.ToString();
    }

    /// <summary>
    /// Renders inline markup for one line: <c>**bold**</c>, <c>`code`</c>, and escaping of
    /// <c>&amp;</c>, <c>&lt;</c>, <c>&gt;</c>. An opening marker without a matching closing
    /// marker is emitted as literal text.
    /// </summary>
    /// <param name="s">Line content without its block-level prefix.</param>
    /// <returns>HTML-escaped text with bold and code spans wrapped in elements.</returns>
    // NOTE(review): "strong" and "code" are the conventional elements for these spans —
    // confirm they match what the page styles.
    private static string Inline(string s)
    {
        var sb = new StringBuilder();
        int i = 0;
        while (i < s.Length)
        {
            if (i + 1 < s.Length && s[i] == '*' && s[i + 1] == '*')
            {
                int end = s.IndexOf("**", i + 2, StringComparison.Ordinal);
                if (end >= 0)
                {
                    sb.Append("<strong>").Append(Esc(s[(i + 2)..end])).Append("</strong>");
                    i = end + 2;
                    continue;
                }
            }

            if (s[i] == '`')
            {
                int end = s.IndexOf('`', i + 1);
                if (end >= 0)
                {
                    sb.Append("<code>").Append(Esc(s[(i + 1)..end])).Append("</code>");
                    i = end + 1;
                    continue;
                }
            }

            sb.Append(s[i] switch
            {
                '&' => "&amp;",
                '<' => "&lt;",
                '>' => "&gt;",
                _ => s[i].ToString(),
            });
            i++;
        }

        return sb.ToString();
    }

    /// <summary>Escapes <c>&amp;</c>, <c>&lt;</c> and <c>&gt;</c> for use as HTML text content.</summary>
    /// <remarks>The ampersand is replaced first so the entities produced for the angle brackets are not escaped again.</remarks>
    private static string Esc(string s) =>
        s.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;");
}

/// <summary>Default article bodies (Markdown).</summary>
// NOTE(review): heading lines are unambiguous, but the paragraph breaks inside each section
// were reconstructed at topic shifts — confirm against the published articles.
internal static class DefaultBodies
{
    public const string RagEval = """
        ## Why standard metrics fail for RAG

        BLEU and ROUGE measure n-gram overlap against a reference answer. In a RAG system, there is often no single correct reference — a question about company policy may have dozens of valid phrasings. High BLEU does not mean the system cited the right source; low BLEU does not mean it was wrong.

        ## The three metrics that actually matter

        **Faithfulness** measures whether every claim in the generated answer can be traced back to a retrieved passage. A faithfulness score of 1.0 means the model invented nothing. Tools like RAGAS implement this with an LLM judge.

        **Context Precision** asks: of the passages retrieved, how many were actually relevant to the question? Low precision wastes context window and increases hallucination risk.

        **Answer Relevancy** checks whether the final response actually addresses what was asked — not just whether it sounds good.

        ## Building an eval harness

        Start with a **golden dataset**: 100–200 question/answer pairs that domain experts have verified. Run your pipeline against them nightly. Track the three metrics above over time. A drop in Faithfulness after a model upgrade is a red flag; a drop in Context Precision after a chunking change means your retrieval is degrading.

        The harness does not have to be complex. A spreadsheet with automatic scoring via the OpenAI or Anthropic API is enough to start catching regressions before they reach production.
        """;

    public const string N8nPatterns = """
        ## The problem with "just use n8n"

        n8n is excellent for integrating SaaS tools. It becomes fragile when you try to use it as an agent orchestrator — long-running loops, conditional retries, and LLM calls that can fail in non-obvious ways.

        ## Separating orchestration from integration

        The pattern that works: **n8n handles triggers and integrations; LangGraph handles agent logic**.

        An n8n workflow watches a Slack channel. When a message matches a pattern, it calls a LangGraph endpoint with the raw payload. LangGraph runs the multi-step reasoning loop, maintains state, and returns a structured result. n8n takes that result and routes it — posts to Jira, sends an email, updates a database row.

        ## Making agents auditable

        Every LangGraph state transition should emit an event to a structured log. We use a Postgres table with columns: `run_id`, `step`, `input`, `output`, `timestamp`. This table becomes the audit trail that compliance teams and on-call engineers both need.

        Add a `human_in_the_loop` node for any action that cannot be undone — deleting records, sending external emails, approving payments. The node pauses execution and posts to Slack; a human approves or rejects; execution resumes.

        ## Handling failures gracefully

        LLM calls fail. Build **retry with exponential backoff** into every LangGraph node that calls an LLM. Set a hard limit of 3 retries, then route to a dead-letter state that pages the on-call engineer. Never silently swallow errors in agentic pipelines — a swallowed error is an invisible outage.
        """;

    public const string VertexCost = """
        ## Anti-pattern 1: calling Gemini Ultra for everything

        Gemini Ultra (or GPT-4-class models) costs 10–30× more per token than smaller models. Many teams default to the most capable model because it "just works" during prototyping, then never re-evaluate.

        **Fix**: build a **model router**. Classify each incoming request by complexity. Simple lookups, short summaries, and classification tasks go to Gemini Flash or Haiku. Only complex reasoning, multi-step synthesis, and long-context tasks go to Pro or Ultra. In most production systems, 60–80% of requests can be served by the cheaper tier.

        ## Anti-pattern 2: no context caching

        Vertex AI supports prompt caching (as does the Anthropic API). A system prompt that is 10k tokens, sent with every request at $3/M tokens, costs $30 for every million calls before the user has typed a single word.

        **Fix**: cache any context that is static or changes infrequently — system prompts, retrieved document sets, few-shot examples. Cache hits cost ~10% of full input price.

        ## Anti-pattern 3: synchronous batch jobs

        Teams run nightly document processing jobs synchronously — one document at a time, each blocked on the previous. This is slow and expensive because you pay for idle wait time between calls.

        **Fix**: use the Vertex AI batch prediction API for jobs over ~1,000 documents. Batch jobs run asynchronously, are eligible for spot discounts, and typically cost 50% less per token than online serving.
        """;

    public const string K8sInference = """
        ## The baseline architecture

        A single Kubernetes `Deployment` behind a `ClusterIP` `Service`, fronted by an Ingress. Works fine up to ~50 RPS for a small model. Falls apart when traffic spikes, when GPU pods take 3 minutes to schedule, or when the model server has a 2-second cold-start.

        ## Autoscaling with KEDA

        HPA (Horizontal Pod Autoscaler) scales on CPU and memory. LLM inference is GPU-bound and queue-depth-bound — neither maps to CPU utilization well.

        KEDA (Kubernetes Event-Driven Autoscaling) scales on arbitrary metrics — queue depth, Pub/Sub lag, Redis list length. We publish inference request counts to a Redis stream; KEDA scales the model server pods when the stream depth exceeds a threshold. Scaling-up latency drops from minutes (cluster autoscaler cold start) to seconds (replica scale-up from 1 to N).

        ## GPU sharing with time-slicing

        For models that fit in 4–8 GB VRAM, full GPU dedication is wasteful. NVIDIA's time-slicing MIG (Multi-Instance GPU) lets multiple pods share one A100, each getting a guaranteed slice.

        Configure `nvidia.com/gpu: 1` and set the time-slice profile to `1g.10gb`. A single A100 80GB can serve 8 concurrent model instances at 10 GB each — 8× the throughput per GPU.

        ## Request hedging for tail latency

        p50 latency is 12ms. p99 is 280ms. The tail is dominated by KV-cache misses and occasional GC pauses.

        **Hedged requests**: after 40ms, send a duplicate request to a second replica. Take whichever response arrives first; cancel the other. This cuts p99 from 280ms to ~45ms with only ~15% increase in total compute.
        """;

    public const string FlutterAI = """
        ## Why on-device inference matters

        Cloud inference requires a network round-trip, exposes user data to a server, and fails in offline scenarios. For consumer apps — messaging, health, productivity — on-device inference is often a requirement, not a nice-to-have.

        ## Gemini Nano and LiteRT

        Google's Gemini Nano is a 1.8B parameter model quantized to run on mobile NPUs (Neural Processing Units). The Flutter integration uses the `google_ai_dart_sdk` package with `GeminiNanoModel`, falling back to cloud inference when the device model is unavailable.

        LiteRT (formerly TensorFlow Lite) handles vision and custom small models. For classification and embedding tasks, a 50MB quantized model runs in under 20ms on a mid-range Android device.

        ## Streaming UX without a network

        The key insight: users tolerate slightly slower responses if they can see text appearing token by token. Even on-device inference can stream — Gemini Nano's Dart SDK exposes a `generateContentStream` method. Pipe tokens directly to a Flutter `StreamBuilder` for a responsive feel regardless of total generation time.

        ## Battery and thermal management

        On-device inference heats the chip. Implement **thermal throttling**: check `DeviceInfo.thermalState` (iOS) or subscribe to the battery API on Android. Reduce `maxTokens` from 512 to 128 during sustained load. Schedule background inference tasks during charging.

        Users notice neither the throttling nor the scheduling — they notice when their phone gets too hot.
        """;

    public const string EnterpriseRoadmap = """
        ## Days 1–30: discovery

        The most expensive mistake in enterprise AI is building the wrong thing fast. Discovery is not a formality — it is the work.

        Interview 8–12 stakeholders across business units. For each, ask: what manual task takes more than 2 hours per week? What decision do you make with incomplete information? What report do you wish existed but is too expensive to build?

        Map the candidates on a 2×2: **impact** (revenue, cost, risk) vs **feasibility** (data quality, integration complexity, regulatory constraints). The top-right quadrant is your first sprint.

        ## Days 31–60: prototype and validate

        Pick one use case from the top-right. Build a prototype in 3 weeks. The prototype does not have to be production-grade — it has to be **testable by domain experts**.

        Run a structured eval: 100 questions, domain expert scores each answer 1–5. Set a threshold (e.g., ≥4.0 average) before the sprint begins. If the prototype clears it, proceed to production hardening. If it doesn't, investigate root cause — usually data quality or chunking strategy — before committing engineering resources.

        ## Days 61–90: first production deployment

        Scope the first deployment to a single team of 10–20 people. This limits blast radius and generates real usage data fast.

        Instrument everything: latency, cost per query, thumbs-up/thumbs-down from users, faithfulness score from the automated harness. Review metrics weekly with the business owner. Adjust chunking, retrieval strategy, or model tier based on what the data shows — not intuition.

        At day 90, you have a live system, a tuned eval harness, and a clear picture of what the second use case should be. That is the foundation for a credible 12-month roadmap.
        """;
}