1b3a8b493e
deploy / deploy (push) Failing after 1m21s
Full rewrite of the portfolio site from Next.js 14 to .NET 10: - ASP.NET Core 10 Razor Pages, no Node.js dependency - EF Core 10 + SQLite (same schema as before — data survives upgrade) - Cookie authentication (same single-password model) - Resend contact form via HttpClient - Bilingual FA/EN via locale cookie + BasePageModel - All UI ported to Razor Pages with Tailwind CDN + custom CSS - Vanilla JS: particles, typewriter, cursor, animations, portfolio modal - Dockerfile: SDK 10.0-alpine → aspnet 10.0-alpine (no npm/Node needed) - CI/CD: dropped NPM_TOKEN, ADMIN_SESSION_SECRET — pure dotnet publish Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
217 lines
14 KiB
C#
217 lines
14 KiB
C#
using System.Text;
|
||
using SoroushAsadi.Services;
|
||
|
||
namespace SoroushAsadi.Pages.Blog;
|
||
|
||
/// <summary>
/// Page model for a single blog post, selected by <see cref="Slug"/>.
/// </summary>
/// <remarks>
/// Metadata and the default body come from the built-in <c>_defaults</c> table. The body can be
/// replaced by an entry returned from <c>content.GetPostOverrides()</c>. The resulting Markdown is
/// converted to HTML by <c>SimpleMarkdown</c>, which HTML-escapes all text content.
/// </remarks>
public class PostModel(ContentService content) : BasePageModel
{
    /// <summary>Post identifier bound from the request (GET binding enabled).</summary>
    [Microsoft.AspNetCore.Mvc.BindProperty(SupportsGet = true)]
    public string Slug { get; set; } = "";

    /// <summary>Post title in the active language (Persian when <c>IsFa</c> is true, otherwise English).</summary>
    public string Title { get; private set; } = "";

    /// <summary>Category label taken from the defaults table.</summary>
    public string Category { get; private set; } = "";

    /// <summary>Read-time figure taken from the defaults table (presumably minutes; confirm against the view).</summary>
    public int ReadTime { get; private set; }

    /// <summary>Rendered HTML for the post body; text content is escaped, only generated tags are markup.</summary>
    public string BodyHtml { get; private set; } = "";

    /// <summary>True when <see cref="Slug"/> does not match any known post.</summary>
    public bool PostNotFound { get; private set; }

    // Default bodies (Markdown-lite, rendered server-side)
    private static readonly Dictionary<string, (string Cat, string TitleEn, string TitleFa, int RT, string Body)> _defaults = new()
    {
        ["rag-eval-framework"] = ("LLM", "A RAG evaluation framework that holds up in production", "چارچوب ارزیابی RAG که در تولید کار میکند", 8, DefaultBodies.RagEval),
        ["agentic-n8n-patterns"] = ("Automation", "Agentic patterns with n8n for the enterprise", "الگوهای عاملمحور با n8n برای سازمان", 11, DefaultBodies.N8nPatterns),
        ["vertex-cost-control"] = ("Google Stack", "Vertex AI cost control at scale", "کنترل هزینه روی Vertex AI در مقیاس بالا", 6, DefaultBodies.VertexCost),
        ["k8s-llm-inference"] = ("Infra", "Sub-50ms LLM inference on Kubernetes", "استنتاج LLM روی Kubernetes با تأخیر زیر ۵۰ ms", 14, DefaultBodies.K8sInference),
        ["flutter-on-device-ai"] = ("Mobile", "On-device AI in Flutter", "هوش مصنوعی on-device در Flutter", 9, DefaultBodies.FlutterAI),
        ["enterprise-ai-roadmap"] = ("Strategy", "A 90-day enterprise AI roadmap", "نقشه راه هوش مصنوعی سازمانی در ۹۰ روز", 7, DefaultBodies.EnterpriseRoadmap),
    };

    /// <summary>
    /// Resolves the post for <see cref="Slug"/> and fills the display properties.
    /// Sets <see cref="PostNotFound"/> and returns early when the slug is empty or unknown.
    /// </summary>
    public void OnGet()
    {
        // A null/empty slug must read as "not found" instead of throwing from TryGetValue(null).
        if (string.IsNullOrEmpty(Slug) || !_defaults.TryGetValue(Slug, out var def))
        {
            PostNotFound = true;
            return;
        }

        // Check for DB override (stored under "posts" key as slug→{body,...})
        var overrides = content.GetPostOverrides();
        string body = def.Body;
        if (overrides.TryGetValue(Slug, out var node) && node["body"]?.GetValue<string>() is { } dbBody)
        {
            body = dbBody;
        }

        Title = IsFa ? def.TitleFa : def.TitleEn;
        Category = def.Cat;
        ReadTime = def.RT;
        BodyHtml = SimpleMarkdown(body);
    }

    /// <summary>
    /// Minimal Markdown → HTML converter working line by line: <c>## </c> → h2, <c>### </c> → h3,
    /// <c>- </c> → li, blank line → newline, anything else → paragraph.
    /// </summary>
    /// <param name="md">Markdown source; null-or-whitespace input yields an empty string.</param>
    /// <returns>HTML with one element per source line, each followed by a newline.</returns>
    private static string SimpleMarkdown(string md)
    {
        if (string.IsNullOrWhiteSpace(md))
        {
            return "";
        }

        var html = new StringBuilder();
        foreach (var rawLine in md.Split('\n'))
        {
            // TrimEnd also drops a trailing '\r' from CRLF input.
            var line = rawLine.TrimEnd();

            // Ordinal comparisons: these are syntax markers, not linguistic text.
            if (line.StartsWith("## ", StringComparison.Ordinal))
            {
                html.Append("<h2>").Append(Inline(line[3..])).Append("</h2>\n");
            }
            else if (line.StartsWith("### ", StringComparison.Ordinal))
            {
                html.Append("<h3>").Append(Inline(line[4..])).Append("</h3>\n");
            }
            else if (line.StartsWith("- ", StringComparison.Ordinal))
            {
                // NOTE(review): list items are emitted without a surrounding <ul>; kept as-is
                // because wrapping them could change how the site's CSS renders lists.
                html.Append("<li>").Append(Inline(line[2..])).Append("</li>\n");
            }
            else if (string.IsNullOrWhiteSpace(line))
            {
                html.Append('\n');
            }
            else
            {
                html.Append("<p>").Append(Inline(line)).Append("</p>\n");
            }
        }

        return html.ToString();
    }

    /// <summary>
    /// Renders inline markup for one line: <c>**bold**</c> → strong, <c>`code`</c> → code.
    /// All other text, including the contents of bold/code spans, is HTML-escaped.
    /// An opening marker without a matching closer is emitted as literal text.
    /// </summary>
    private static string Inline(string s)
    {
        var sb = new StringBuilder(s.Length + 16);
        int i = 0;
        while (i < s.Length)
        {
            if (i + 1 < s.Length && s[i] == '*' && s[i + 1] == '*')
            {
                int end = s.IndexOf("**", i + 2, StringComparison.Ordinal);
                if (end >= 0)
                {
                    sb.Append("<strong>").Append(Esc(s[(i + 2)..end])).Append("</strong>");
                    i = end + 2;
                    continue;
                }
            }

            if (s[i] == '`')
            {
                int end = s.IndexOf('`', i + 1);
                if (end >= 0)
                {
                    sb.Append("<code>").Append(Esc(s[(i + 1)..end])).Append("</code>");
                    i = end + 1;
                    continue;
                }
            }

            AppendEscaped(sb, s[i]);
            i++;
        }

        return sb.ToString();
    }

    /// <summary>Appends <paramref name="c"/> to <paramref name="sb"/>, replacing &amp;, &lt; and &gt; with HTML entities.</summary>
    private static void AppendEscaped(StringBuilder sb, char c)
    {
        switch (c)
        {
            case '&': sb.Append("&amp;"); break;
            case '<': sb.Append("&lt;"); break;
            case '>': sb.Append("&gt;"); break;
            default: sb.Append(c); break;
        }
    }

    /// <summary>HTML-escapes &amp;, &lt; and &gt; in <paramref name="s"/>.</summary>
    private static string Esc(string s)
    {
        var sb = new StringBuilder(s.Length + 16);
        foreach (char c in s)
        {
            AppendEscaped(sb, c);
        }

        return sb.ToString();
    }
}
|
||
|
||
/// <summary>
/// Default article bodies, one constant per post, written in the small Markdown subset that
/// <c>PostModel.SimpleMarkdown</c> renders (<c>## </c> headings, <c>**bold**</c>, <c>`code`</c>, paragraphs).
/// Each constant is referenced by slug from the <c>_defaults</c> table in <c>PostModel</c>.
/// </summary>
|
||
internal static class DefaultBodies
|
||
{
|
||
/// <summary>Body for the "rag-eval-framework" post: RAG evaluation metrics and an eval harness.</summary>
public const string RagEval = """
|
||
## Why standard metrics fail for RAG
|
||
|
||
BLEU and ROUGE measure n-gram overlap against a reference answer. In a RAG system, there is often no single correct reference — a question about company policy may have dozens of valid phrasings. High BLEU does not mean the system cited the right source; low BLEU does not mean it was wrong.
|
||
|
||
## The three metrics that actually matter
|
||
|
||
**Faithfulness** measures whether every claim in the generated answer can be traced back to a retrieved passage. A faithfulness score of 1.0 means the model invented nothing. Tools like RAGAS implement this with an LLM judge.
|
||
|
||
**Context Precision** asks: of the passages retrieved, how many were actually relevant to the question? Low precision wastes context window and increases hallucination risk.
|
||
|
||
**Answer Relevancy** checks whether the final response actually addresses what was asked — not just whether it sounds good.
|
||
|
||
## Building an eval harness
|
||
|
||
Start with a **golden dataset**: 100–200 question/answer pairs that domain experts have verified. Run your pipeline against them nightly. Track the three metrics above over time. A drop in Faithfulness after a model upgrade is a red flag; a drop in Context Precision after a chunking change means your retrieval is degrading.
|
||
|
||
The harness does not have to be complex. A spreadsheet with automatic scoring via the OpenAI or Anthropic API is enough to start catching regressions before they reach production.
|
||
""";
|
||
|
||
/// <summary>Body for the "agentic-n8n-patterns" post: splitting n8n integration from LangGraph agent logic.</summary>
public const string N8nPatterns = """
|
||
## The problem with "just use n8n"
|
||
|
||
n8n is excellent for integrating SaaS tools. It becomes fragile when you try to use it as an agent orchestrator — long-running loops, conditional retries, and LLM calls that can fail in non-obvious ways.
|
||
|
||
## Separating orchestration from integration
|
||
|
||
The pattern that works: **n8n handles triggers and integrations; LangGraph handles agent logic**.
|
||
|
||
An n8n workflow watches a Slack channel. When a message matches a pattern, it calls a LangGraph endpoint with the raw payload. LangGraph runs the multi-step reasoning loop, maintains state, and returns a structured result. n8n takes that result and routes it — posts to Jira, sends an email, updates a database row.
|
||
|
||
## Making agents auditable
|
||
|
||
Every LangGraph state transition should emit an event to a structured log. We use a Postgres table with columns: `run_id`, `step`, `input`, `output`, `timestamp`. This table becomes the audit trail that compliance teams and on-call engineers both need.
|
||
|
||
Add a `human_in_the_loop` node for any action that cannot be undone — deleting records, sending external emails, approving payments. The node pauses execution and posts to Slack; a human approves or rejects; execution resumes.
|
||
|
||
## Handling failures gracefully
|
||
|
||
LLM calls fail. Build **retry with exponential backoff** into every LangGraph node that calls an LLM. Set a hard limit of 3 retries, then route to a dead-letter state that pages the on-call engineer. Never silently swallow errors in agentic pipelines — a swallowed error is an invisible outage.
|
||
""";
|
||
|
||
/// <summary>Body for the "vertex-cost-control" post: three cost anti-patterns and their fixes.</summary>
public const string VertexCost = """
|
||
## Anti-pattern 1: calling Gemini Ultra for everything
|
||
|
||
Gemini Ultra (or GPT-4-class models) costs 10–30× more per token than smaller models. Many teams default to the most capable model because it "just works" during prototyping, then never re-evaluate.
|
||
|
||
**Fix**: build a **model router**. Classify each incoming request by complexity. Simple lookups, short summaries, and classification tasks go to Gemini Flash or Haiku. Only complex reasoning, multi-step synthesis, and long-context tasks go to Pro or Ultra. In most production systems, 60–80% of requests can be served by the cheaper tier.
|
||
|
||
## Anti-pattern 2: no context caching
|
||
|
||
Vertex AI supports prompt caching (as does the Anthropic API). A system prompt that is 10k tokens, sent with every request at $3/M tokens, costs $30 for every million calls before the user has typed a single word.
|
||
|
||
**Fix**: cache any context that is static or changes infrequently — system prompts, retrieved document sets, few-shot examples. Cache hits cost ~10% of full input price.
|
||
|
||
## Anti-pattern 3: synchronous batch jobs
|
||
|
||
Teams run nightly document processing jobs synchronously — one document at a time, each blocked on the previous. This is slow and expensive because you pay for idle wait time between calls.
|
||
|
||
**Fix**: use the Vertex AI batch prediction API for jobs over ~1,000 documents. Batch jobs run asynchronously, are eligible for spot discounts, and typically cost 50% less per token than online serving.
|
||
""";
|
||
|
||
/// <summary>Body for the "k8s-llm-inference" post: autoscaling, GPU sharing and request hedging.</summary>
// NOTE(review): the GPU-sharing section calls `1g.10gb` a time-slice profile and claims 8 instances
// per A100 80GB; `1g.10gb` looks like a MIG profile name and MIG is separate from time-slicing.
// Confirm the wording and the instance count against NVIDIA's MIG documentation.
public const string K8sInference = """
|
||
## The baseline architecture
|
||
|
||
A single Kubernetes `Deployment` behind a `ClusterIP` `Service`, fronted by an Ingress. Works fine up to ~50 RPS for a small model. Falls apart when traffic spikes, when GPU pods take 3 minutes to schedule, or when the model server has a 2-second cold-start.
|
||
|
||
## Autoscaling with KEDA
|
||
|
||
HPA (Horizontal Pod Autoscaler) scales on CPU and memory. LLM inference is GPU-bound and queue-depth-bound — neither maps to CPU utilization well.
|
||
|
||
KEDA (Kubernetes Event-Driven Autoscaling) scales on arbitrary metrics — queue depth, Pub/Sub lag, Redis list length. We publish inference request counts to a Redis stream; KEDA scales the model server pods when the stream depth exceeds a threshold. Scaling-up latency drops from minutes (cluster autoscaler cold start) to seconds (replica scale-up from 1 to N).
|
||
|
||
## GPU sharing with time-slicing
|
||
|
||
For models that fit in 4–8 GB VRAM, full GPU dedication is wasteful. NVIDIA's time-slicing MIG (Multi-Instance GPU) lets multiple pods share one A100, each getting a guaranteed slice.
|
||
|
||
Configure `nvidia.com/gpu: 1` and set the time-slice profile to `1g.10gb`. A single A100 80GB can serve 8 concurrent model instances at 10 GB each — 8× the throughput per GPU.
|
||
|
||
## Request hedging for tail latency
|
||
|
||
p50 latency is 12ms. p99 is 280ms. The tail is dominated by KV-cache misses and occasional GC pauses. **Hedged requests**: after 40ms, send a duplicate request to a second replica. Take whichever response arrives first; cancel the other. This cuts p99 from 280ms to ~45ms with only ~15% increase in total compute.
|
||
""";
|
||
|
||
/// <summary>Body for the "flutter-on-device-ai" post: on-device inference, streaming UX and thermal management.</summary>
public const string FlutterAI = """
|
||
## Why on-device inference matters
|
||
|
||
Cloud inference requires a network round-trip, exposes user data to a server, and fails in offline scenarios. For consumer apps — messaging, health, productivity — on-device inference is often a requirement, not a nice-to-have.
|
||
|
||
## Gemini Nano and LiteRT
|
||
|
||
Google's Gemini Nano is a 1.8B parameter model quantized to run on mobile NPUs (Neural Processing Units). The Flutter integration uses the `google_ai_dart_sdk` package with `GeminiNanoModel`, falling back to cloud inference when the device model is unavailable.
|
||
|
||
LiteRT (formerly TensorFlow Lite) handles vision and custom small models. For classification and embedding tasks, a 50MB quantized model runs in under 20ms on a mid-range Android device.
|
||
|
||
## Streaming UX without a network
|
||
|
||
The key insight: users tolerate slightly slower responses if they can see text appearing token by token. Even on-device inference can stream — Gemini Nano's Dart SDK exposes a `generateContentStream` method. Pipe tokens directly to a Flutter `StreamBuilder` for a responsive feel regardless of total generation time.
|
||
|
||
## Battery and thermal management
|
||
|
||
On-device inference heats the chip. Implement **thermal throttling**: check `DeviceInfo.thermalState` (iOS) or subscribe to the battery API on Android. Reduce `maxTokens` from 512 to 128 during sustained load. Schedule background inference tasks during charging. Users notice neither the throttling nor the scheduling — they notice when their phone gets too hot.
|
||
""";
|
||
|
||
/// <summary>Body for the "enterprise-ai-roadmap" post: a 90-day plan in three 30-day phases.</summary>
public const string EnterpriseRoadmap = """
|
||
## Days 1–30: discovery
|
||
|
||
The most expensive mistake in enterprise AI is building the wrong thing fast. Discovery is not a formality — it is the work.
|
||
|
||
Interview 8–12 stakeholders across business units. For each, ask: what manual task takes more than 2 hours per week? What decision do you make with incomplete information? What report do you wish existed but is too expensive to build?
|
||
|
||
Map the candidates on a 2×2: **impact** (revenue, cost, risk) vs **feasibility** (data quality, integration complexity, regulatory constraints). The top-right quadrant is your first sprint.
|
||
|
||
## Days 31–60: prototype and validate
|
||
|
||
Pick one use case from the top-right. Build a prototype in 3 weeks. The prototype does not have to be production-grade — it has to be **testable by domain experts**.
|
||
|
||
Run a structured eval: 100 questions, domain expert scores each answer 1–5. Set a threshold (e.g., ≥4.0 average) before the sprint begins. If the prototype clears it, proceed to production hardening. If it doesn't, investigate root cause — usually data quality or chunking strategy — before committing engineering resources.
|
||
|
||
## Days 61–90: first production deployment
|
||
|
||
Scope the first deployment to a single team of 10–20 people. This limits blast radius and generates real usage data fast.
|
||
|
||
Instrument everything: latency, cost per query, thumbs-up/thumbs-down from users, faithfulness score from the automated harness. Review metrics weekly with the business owner. Adjust chunking, retrieval strategy, or model tier based on what the data shows — not intuition.
|
||
|
||
At day 90, you have a live system, a tuned eval harness, and a clear picture of what the second use case should be. That is the foundation for a credible 12-month roadmap.
|
||
""";
|
||
}
|