// soroushasadi/lib/content/posts.ts

/**
 * Full article bodies for the blog, seeded for production.
 * Metadata (title, excerpt, category, readTime) lives in the i18n dict;
 * this module holds the long-form body in both locales.
 *
 * When the admin panel / CMS lands, this file becomes the seed source —
 * the shape maps 1:1 to a `posts` table.
 */

/**
 * A single unit of an article body, discriminated on the `k` tag.
 *
 * Every variant carries its text in `t`, except the list variant, which
 * carries an array of `items` instead. The kind names resemble the HTML
 * elements `p`, `h2` and `ul`; how each kind is rendered is decided outside
 * this file.
 */
export type Block =
  | {
      /** Paragraph-style text. */
      k: 'p';
      t: string;
    }
  | {
      /** Section heading. */
      k: 'h2';
      t: string;
    }
  | {
      /** List; one string per entry. */
      k: 'ul';
      items: string[];
    }
  | {
      /** Quotation. */
      k: 'quote';
      t: string;
    }
  | {
      /** Code sample. */
      k: 'code';
      /** Optional language tag — presumably a highlighting hint; confirm against the renderer. */
      lang?: string;
      t: string;
    };

/** The long-form body of a post in one locale. */
export type Article = {
  /** Lead text of the article, kept separate from the block list. */
  lead: string;
  /** Body content as an ordered list of blocks. */
  blocks: Array<Block>;
};

/**
 * Everything this module stores for one post: a date, a cover accent, and
 * the article body in each supported locale.
 */
export type PostContent = {
  /** ISO date string. */
  date: string;
  /** Accent key used for the cover gradient. */
  accent:
    | 'electric'
    | 'violet'
    | 'magenta'
    | 'emerald'
    | 'cyan';
  /** Article body for the `en` locale. */
  en: Article;
  /** Article body for the `fa` locale. */
  fa: Article;
};

/**
 * Seeded long-form post bodies, keyed by post slug.
 *
 * Each entry carries the post's `date` and cover `accent` plus one `Article`
 * per locale (`en`, `fa`). Per the file header, display metadata such as title
 * and excerpt is kept in the i18n dict rather than here.
 *
 * Conventions visible in the current entries (not enforced by the types):
 * - entries are listed newest-first by `date`;
 * - the `fa` article mirrors the `en` article block-for-block (same block
 *   kinds in the same order).
 *
 * NOTE(review): because of the `Record<string, PostContent>` annotation, a
 * lookup with an unknown slug type-checks as `PostContent` (unless
 * `noUncheckedIndexedAccess` is enabled) but yields `undefined` at runtime —
 * callers should guard.
 */
export const POSTS: Record<string, PostContent> = {
  'rag-eval-framework': {
    date: '2026-04-22',
    accent: 'magenta',
    en: {
      lead: 'Most RAG systems are shipped on a vibe. A demo answers three questions well, everyone nods, and it goes to production untested. Here is the evaluation framework I install before a single user touches it.',
      blocks: [
        { k: 'h2', t: 'Why BLEU and ROUGE fail you' },
        { k: 'p', t: 'BLEU and ROUGE measure n-gram overlap with a reference answer. For translation that is fine. For retrieval-augmented generation it is misleading: a correct answer phrased differently scores low, and a fluent hallucination that happens to reuse words scores high. You end up optimizing for surface similarity instead of truth.' },
        { k: 'p', t: 'The fix is to split evaluation into two independent layers — retrieval quality and answer quality — and never average them into a single vanity number.' },
        { k: 'h2', t: 'Layer one: retrieval' },
        { k: 'p', t: 'Before the model writes anything, ask whether the right context was even fetched. Build a labelled set of question → gold-chunk pairs and track these:' },
        { k: 'ul', items: [
          'Recall@k — did the gold chunk appear in the top k results?',
          'MRR — how high did it rank when it did appear?',
          'Context precision — what fraction of retrieved chunks were actually relevant?',
        ] },
        { k: 'p', t: 'If recall@5 is below 0.9, no amount of prompt engineering will save the answer. Fix retrieval first — chunking, embeddings, hybrid search — before you touch the generation prompt.' },
        { k: 'h2', t: 'Layer two: answer faithfulness' },
        { k: 'p', t: 'For generation, the metric that actually moves the needle is groundedness: is every claim in the answer supported by the retrieved context? I use an LLM-as-judge with a strict rubric and a small human-graded calibration set to keep the judge honest.' },
        { k: 'quote', t: 'A RAG system you cannot measure is a RAG system you cannot improve. Eval is not a phase — it is the control loop.' },
        { k: 'h2', t: 'Wire it into CI' },
        { k: 'p', t: 'The framework only pays off when it runs on every change. I gate deploys on a regression suite: if faithfulness drops more than two points or recall@5 falls below threshold, the pipeline blocks. That single gate has caught more silent regressions than any manual QA pass.' },
      ],
    },
    fa: {
      lead: 'بیشتر سامانه‌های RAG بر اساس حس‌وحال منتشر می‌شوند. یک دموی سه‌سؤالی خوب جواب می‌دهد، همه سر تکان می‌دهند و بدون آزمون به تولید می‌رود. این چارچوب ارزیابی‌ای است که پیش از آنکه حتی یک کاربر آن را لمس کند، نصب می‌کنم.',
      blocks: [
        { k: 'h2', t: 'چرا BLEU و ROUGE ناکافی‌اند' },
        { k: 'p', t: 'BLEU و ROUGE هم‌پوشانی n-gram با پاسخ مرجع را می‌سنجند. برای ترجمه قابل قبول است، اما برای RAG گمراه‌کننده: پاسخ درستی که با عبارت متفاوت بیان شود امتیاز پایین می‌گیرد و توهمی روان که اتفاقاً واژه‌ها را تکرار کند امتیاز بالا. در نهایت به جای حقیقت، شباهت سطحی را بهینه می‌کنید.' },
        { k: 'p', t: 'راه‌حل، تفکیک ارزیابی به دو لایه‌ی مستقل است — کیفیت بازیابی و کیفیت پاسخ — و هرگز میانگین‌گرفتن آن‌ها در یک عدد تزئینی.' },
        { k: 'h2', t: 'لایه‌ی اول: بازیابی' },
        { k: 'p', t: 'پیش از آنکه مدل چیزی بنویسد، بپرسید آیا اصلاً متن درست بازیابی شده است؟ یک مجموعه‌ی برچسب‌خورده از جفت‌های پرسش ← قطعه‌ی طلایی بسازید و این‌ها را پایش کنید:' },
        { k: 'ul', items: [
          'Recall@k — آیا قطعه‌ی طلایی در k نتیجه‌ی برتر ظاهر شد؟',
          'MRR — وقتی ظاهر شد، چه رتبه‌ای داشت؟',
          'دقت متن — چه کسری از قطعات بازیابی‌شده واقعاً مرتبط بودند؟',
        ] },
        { k: 'p', t: 'اگر recall@5 زیر ۰٫۹ باشد، هیچ مقدار مهندسی پرامپت پاسخ را نجات نمی‌دهد. اول بازیابی را درست کنید — قطعه‌بندی، embedding، جست‌وجوی ترکیبی — بعد سراغ پرامپت تولید بروید.' },
        { k: 'h2', t: 'لایه‌ی دوم: وفاداری پاسخ' },
        { k: 'p', t: 'برای تولید، معیاری که واقعاً تأثیر دارد groundedness است: آیا هر ادعای پاسخ توسط متن بازیابی‌شده پشتیبانی می‌شود؟ من از LLM به‌عنوان داور با یک rubric سخت‌گیرانه و یک مجموعه‌ی کالیبراسیون انسانی کوچک استفاده می‌کنم تا داور صادق بماند.' },
        { k: 'quote', t: 'سامانه‌ی RAG‌ای که نتوانید اندازه بگیرید، سامانه‌ای است که نمی‌توانید بهبودش دهید. ارزیابی یک فاز نیست — حلقه‌ی کنترل است.' },
        { k: 'h2', t: 'آن را در CI ببندید' },
        { k: 'p', t: 'این چارچوب تنها وقتی ارزش دارد که روی هر تغییر اجرا شود. من استقرارها را به یک مجموعه‌ی regression مشروط می‌کنم: اگر وفاداری بیش از دو واحد افت کند یا recall@5 از آستانه پایین‌تر بیاید، خط لوله مسدود می‌شود. همین یک دروازه بیش از هر QA دستی، افت‌های خاموش را گرفته است.' },
      ],
    },
  },

  'agentic-n8n-patterns': {
    date: '2026-04-09',
    accent: 'violet',
    en: {
      lead: 'n8n is the most underrated tool in the enterprise AI stack. Not because it is clever, but because it is boring in exactly the right places — and boring is what you want around an LLM agent.',
      blocks: [
        { k: 'h2', t: 'Agents need a substrate, not a framework' },
        { k: 'p', t: 'The mistake teams make is reaching for an agent framework first. Frameworks hide the control flow inside the model. In production you want the opposite: deterministic orchestration around a non-deterministic core. n8n gives you that substrate — visible nodes, retries, error branches, and a durable execution log.' },
        { k: 'h2', t: 'The pattern: LLM as a node, not the conductor' },
        { k: 'p', t: 'Treat the model as one step that proposes an action, then let n8n decide whether to execute it. The agent suggests; the workflow disposes. This keeps every side effect — an API call, a database write, an email — gated behind a node you can inspect, rate-limit, and roll back.' },
        { k: 'ul', items: [
          'Planner node — the LLM returns a structured action, never raw text.',
          'Router node — n8n validates the action against an allow-list.',
          'Tool nodes — real integrations, each with their own retry policy.',
          'Audit node — every step is appended to an execution store.',
        ] },
        { k: 'h2', t: 'Where LangGraph fits' },
        { k: 'p', t: 'For loops that need real state — multi-turn reasoning, reflection, tool retries with memory — I drop LangGraph inside a single n8n node. n8n owns the macro workflow and the durability; LangGraph owns the micro reasoning loop. The boundary is clean and each tool does what it is good at.' },
        { k: 'quote', t: 'Make the deterministic parts boring and the boring parts auditable. The intelligence belongs in exactly one node.' },
        { k: 'h2', t: 'Observability is the whole game' },
        { k: 'p', t: 'Because every execution is a record, you can replay a failed run, diff two runs, and answer the question every stakeholder eventually asks: "why did it do that?" An agent you can explain is an agent you can ship.' },
      ],
    },
    fa: {
      lead: 'n8n کم‌ارزش‌گذاری‌شده‌ترین ابزار استک هوش مصنوعی سازمانی است. نه به این دلیل که باهوش است، بلکه چون دقیقاً در جای درست «خسته‌کننده» است — و خسته‌کننده دقیقاً همان چیزی است که گرداگرد یک عامل LLM می‌خواهید.',
      blocks: [
        { k: 'h2', t: 'عامل‌ها به بستر نیاز دارند، نه فریم‌ورک' },
        { k: 'p', t: 'اشتباه تیم‌ها این است که اول سراغ فریم‌ورک عامل می‌روند. فریم‌ورک‌ها جریان کنترل را داخل مدل پنهان می‌کنند. در تولید عکسش را می‌خواهید: ارکستراسیون قطعی گرداگرد یک هسته‌ی نامعین. n8n همان بستر را می‌دهد — گره‌های قابل‌مشاهده، تلاش مجدد، شاخه‌های خطا و یک گزارش اجرای پایدار.' },
        { k: 'h2', t: 'الگو: LLM به‌عنوان یک گره، نه رهبر ارکستر' },
        { k: 'p', t: 'مدل را یک گام بدانید که کنشی را پیشنهاد می‌دهد، سپس بگذارید n8n تصمیم بگیرد آن را اجرا کند یا نه. عامل پیشنهاد می‌دهد؛ گردش‌کار تصمیم می‌گیرد. این کار هر اثر جانبی — فراخوان API، نوشتن در پایگاه‌داده، ایمیل — را پشت گره‌ای نگه می‌دارد که می‌توانید بازرسی، محدود و بازگردانی‌اش کنید.' },
        { k: 'ul', items: [
          'گره برنامه‌ریز — LLM یک کنش ساختارمند برمی‌گرداند، نه متن خام.',
          'گره مسیریاب — n8n کنش را در برابر فهرست مجاز اعتبارسنجی می‌کند.',
          'گره‌های ابزار — یکپارچه‌سازی‌های واقعی، هرکدام با سیاست تلاش مجدد خود.',
          'گره ممیزی — هر گام به یک انبار اجرا افزوده می‌شود.',
        ] },
        { k: 'h2', t: 'جای LangGraph کجاست' },
        { k: 'p', t: 'برای حلقه‌هایی که به حالت واقعی نیاز دارند — استدلال چندمرحله‌ای، بازتاب، تلاش مجدد ابزار با حافظه — LangGraph را داخل یک گره‌ی n8n می‌گذارم. n8n مالک گردش‌کار کلان و پایداری است؛ LangGraph مالک حلقه‌ی استدلال خرد. مرز تمیز است و هر ابزار کاری را می‌کند که در آن خوب است.' },
        { k: 'quote', t: 'بخش‌های قطعی را خسته‌کننده کنید و بخش‌های خسته‌کننده را قابل‌ممیزی. هوش دقیقاً به یک گره تعلق دارد.' },
        { k: 'h2', t: 'مشاهده‌پذیری همه‌ی بازی است' },
        { k: 'p', t: 'چون هر اجرا یک رکورد است، می‌توانید اجرای ناموفق را بازپخش کنید، دو اجرا را مقایسه کنید و به پرسشی پاسخ دهید که هر ذی‌نفعی سرانجام می‌پرسد: «چرا این کار را کرد؟» عاملی که بتوانید توضیحش دهید، عاملی است که می‌توانید منتشرش کنید.' },
      ],
    },
  },

  'vertex-cost-control': {
    date: '2026-03-28',
    accent: 'cyan',
    en: {
      lead: 'I have reviewed dozens of Vertex AI bills. The same three anti-patterns show up in roughly 80% of them — and removing them routinely cuts monthly spend by half without touching quality.',
      blocks: [
        { k: 'h2', t: 'Anti-pattern 1: the always-on endpoint' },
        { k: 'p', t: 'Teams deploy a model to a dedicated endpoint with a minimum replica count of one and then forget about it. For bursty internal traffic that is a machine billing 24/7 to serve a few hundred requests a day. Set min replicas to zero where the latency budget allows, or batch the workload.' },
        { k: 'h2', t: 'Anti-pattern 2: the wrong model for the job' },
        { k: 'p', t: 'Not every call needs the frontier model. A cascade — cheap model first, escalate to the expensive one only when confidence is low — keeps quality high where it matters and spend low everywhere else.' },
        { k: 'ul', items: [
          'Route by task complexity, not by habit.',
          'Cache embeddings aggressively — they rarely change.',
          'Use context caching for stable system prompts and long shared documents.',
        ] },
        { k: 'h2', t: 'Anti-pattern 3: no unit economics' },
        { k: 'p', t: 'If you cannot state the cost per request, you cannot control it. I instrument every call with token counts and model id, then roll it up to cost-per-feature. The moment a feature has a dollar figure attached, the optimization conversation changes from abstract to obvious.' },
        { k: 'quote', t: 'You do not cut cloud cost with a spreadsheet at month-end. You cut it with a label on every request.' },
        { k: 'h2', t: 'The result' },
        { k: 'p', t: 'On the last engagement, those three fixes plus context caching took a $40k/month Vertex bill to under $16k — and p95 latency improved, because the cascade kept most traffic on a faster, smaller model.' },
      ],
    },
    fa: {
      lead: 'ده‌ها صورتحساب Vertex AI را بررسی کرده‌ام. همان سه ضدالگو در حدود ۸۰٪ آن‌ها دیده می‌شود — و حذف‌شان معمولاً هزینه‌ی ماهانه را بدون دست‌زدن به کیفیت نصف می‌کند.',
      blocks: [
        { k: 'h2', t: 'ضدالگوی ۱: endpoint همیشه‌روشن' },
        { k: 'p', t: 'تیم‌ها مدلی را روی یک endpoint اختصاصی با حداقل یک replica مستقر می‌کنند و فراموشش می‌کنند. برای ترافیک داخلی پرنوسان، این یعنی ماشینی که ۲۴ ساعته صورتحساب می‌دهد تا چند صد درخواست در روز را پاسخ دهد. جایی که بودجه‌ی تأخیر اجازه می‌دهد حداقل replica را صفر کنید، یا بار کاری را batch کنید.' },
        { k: 'h2', t: 'ضدالگوی ۲: مدل نامناسب برای کار' },
        { k: 'p', t: 'هر فراخوان به مدل مرزی نیاز ندارد. یک cascade — اول مدل ارزان، فقط وقتی اطمینان پایین است به مدل گران ارتقا — کیفیت را جایی که مهم است بالا و هزینه را همه‌جا پایین نگه می‌دارد.' },
        { k: 'ul', items: [
          'مسیریابی بر اساس پیچیدگی کار، نه عادت.',
          'embeddingها را پرحجم cache کنید — به‌ندرت تغییر می‌کنند.',
          'برای پرامپت‌های سیستمی پایدار و اسناد مشترک طولانی از context caching استفاده کنید.',
        ] },
        { k: 'h2', t: 'ضدالگوی ۳: نبود اقتصاد واحد' },
        { k: 'p', t: 'اگر نتوانید هزینه‌ی هر درخواست را بگویید، نمی‌توانید کنترلش کنید. من هر فراخوان را با شمار توکن و شناسه‌ی مدل ابزارگذاری می‌کنم و سپس به هزینه‌به‌ازای‌قابلیت تجمیع می‌کنم. لحظه‌ای که یک قابلیت رقم دلاری پیدا کند، گفت‌وگوی بهینه‌سازی از انتزاعی به بدیهی تبدیل می‌شود.' },
        { k: 'quote', t: 'هزینه‌ی ابر را با یک صفحه‌گسترده در پایان ماه کم نمی‌کنید. با یک برچسب روی هر درخواست کم می‌کنید.' },
        { k: 'h2', t: 'نتیجه' },
        { k: 'p', t: 'در آخرین پروژه، همین سه اصلاح به‌علاوه‌ی context caching صورتحساب ۴۰هزاردلاری ماهانه‌ی Vertex را به زیر ۱۶هزار دلار رساند — و تأخیر p95 هم بهتر شد، چون cascade بیشتر ترافیک را روی مدلی کوچک‌تر و سریع‌تر نگه داشت.' },
      ],
    },
  },

  'k8s-llm-inference': {
    date: '2026-03-11',
    accent: 'emerald',
    en: {
      lead: 'Sub-50ms LLM inference on commodity Kubernetes is achievable — but not by throwing GPUs at the problem. It comes from removing the three places latency actually hides.',
      blocks: [
        { k: 'h2', t: 'Latency hides in cold starts' },
        { k: 'p', t: 'A pod that scales from zero pays a model-load tax of tens of seconds. The answer is KEDA scaling on a queue depth signal, with a warm pool sized to your p50 traffic. You autoscale for the spikes, but you never serve a request from a cold replica.' },
        { k: 'h2', t: 'Latency hides in GPU contention' },
        { k: 'p', t: 'One model per GPU is wasteful; ten models fighting for one GPU is slow. The middle path is time-slicing or MIG partitions with explicit memory budgets, plus a scheduler that is GPU-topology aware so chatty replicas land on the same node.' },
        { k: 'ul', items: [
          'Pin the model in GPU memory — never reload per request.',
          'Use continuous batching so concurrent requests share a forward pass.',
          'Hedge slow requests: fire a second attempt at p95 and take the first to finish.',
        ] },
        { k: 'h2', t: 'Latency hides in the network' },
        { k: 'p', t: 'Cross-AZ hops, TLS renegotiation, and an over-eager service mesh quietly add milliseconds. Keep inference traffic in-zone, reuse connections, and measure the mesh overhead before you assume it is free.' },
        { k: 'quote', t: 'You do not buy latency with bigger GPUs. You earn it by deleting the waits nobody is looking at.' },
        { k: 'h2', t: 'Prove it with a budget' },
        { k: 'p', t: 'I define an explicit latency budget per stage — queue, batch, forward pass, serialization, network — and alert when any stage drifts. When p95 regresses, the budget tells you exactly which stage to open, instead of guessing.' },
      ],
    },
    fa: {
      lead: 'استنتاج LLM با تأخیر زیر ۵۰ میلی‌ثانیه روی Kubernetes معمولی دست‌یافتنی است — اما نه با ریختن GPU روی مسئله. از حذف سه جایی می‌آید که تأخیر واقعاً پنهان می‌شود.',
      blocks: [
        { k: 'h2', t: 'تأخیر در cold start پنهان است' },
        { k: 'p', t: 'پادی که از صفر مقیاس می‌گیرد، مالیات بارگذاری مدل به‌اندازه‌ی ده‌ها ثانیه می‌پردازد. پاسخ، مقیاس‌دهی KEDA بر اساس عمق صف است، با یک استخر گرم به‌اندازه‌ی ترافیک p50. برای جهش‌ها autoscale می‌کنید، اما هرگز درخواستی را از replica سرد پاسخ نمی‌دهید.' },
        { k: 'h2', t: 'تأخیر در رقابت GPU پنهان است' },
        { k: 'p', t: 'یک مدل به‌ازای هر GPU اسراف است؛ ده مدل در رقابت بر سر یک GPU کند است. راه میانه، time-slicing یا پارتیشن‌های MIG با بودجه‌ی حافظه‌ی صریح است، به‌علاوه‌ی زمان‌بندی‌ای که از توپولوژی GPU آگاه باشد تا replicaهای پرگفت‌وگو روی یک گره بنشینند.' },
        { k: 'ul', items: [
          'مدل را در حافظه‌ی GPU پین کنید — هرگز به‌ازای هر درخواست بارگذاری نکنید.',
          'از continuous batching استفاده کنید تا درخواست‌های هم‌زمان یک forward pass را به اشتراک بگذارند.',
          'درخواست‌های کند را hedge کنید: در p95 تلاش دوم را بفرستید و اولی که تمام شد را بردارید.',
        ] },
        { k: 'h2', t: 'تأخیر در شبکه پنهان است' },
        { k: 'p', t: 'پرش‌های بین‌AZ، مذاکره‌ی مجدد TLS و یک service mesh بیش‌ازحد مشتاق بی‌سروصدا میلی‌ثانیه اضافه می‌کنند. ترافیک استنتاج را درون‌ناحیه نگه دارید، اتصال‌ها را بازاستفاده کنید و پیش از آنکه فرض کنید mesh رایگان است، سربارش را اندازه بگیرید.' },
        { k: 'quote', t: 'تأخیر را با GPUهای بزرگ‌تر نمی‌خرید. با حذف انتظارهایی که کسی نگاهشان نمی‌کند به دستش می‌آورید.' },
        { k: 'h2', t: 'با یک بودجه اثباتش کنید' },
        { k: 'p', t: 'برای هر مرحله بودجه‌ی تأخیر صریح تعریف می‌کنم — صف، batch، forward pass، سریال‌سازی، شبکه — و وقتی هر مرحله منحرف شد هشدار می‌دهم. وقتی p95 پسرفت می‌کند، بودجه دقیقاً می‌گوید کدام مرحله را باز کنید، به‌جای حدس‌زدن.' },
      ],
    },
  },

  'flutter-on-device-ai': {
    date: '2026-02-19',
    accent: 'electric',
    en: {
      lead: 'On-device AI is not a smaller version of cloud AI. It is a different engineering problem with a different reward: privacy, offline capability, and zero per-inference cost.',
      blocks: [
        { k: 'h2', t: 'Pick the right tier' },
        { k: 'p', t: 'Not everything belongs on the device. The decision tree is simple: if the task is latency-critical, privacy-sensitive, or must work offline, it runs on-device. Everything else can call the cloud. Most real apps end up hybrid — a small local model for the common case, a cloud fallback for the hard one.' },
        { k: 'h2', t: 'Gemini Nano and LiteRT in Flutter' },
        { k: 'p', t: 'On Android, Gemini Nano gives you a capable on-device model through AICore. For custom models, LiteRT (formerly TFLite) runs quantized weights with hardware delegation. From Flutter you bridge to both through a thin platform channel — keep the inference on the native side and pass only structured results across.' },
        { k: 'ul', items: [
          'Quantize to int8 — the quality loss is usually negligible, the speedup is not.',
          'Warm the interpreter at app start, not on first use.',
          'Stream tokens to the UI so perceived latency stays low even when total latency is not.',
        ] },
        { k: 'h2', t: 'The UX is the hard part' },
        { k: 'p', t: 'On-device models are smaller, so the product has to be honest about their limits. Constrain the task, give the model structure, and design graceful fallbacks. A focused local model that does one thing reliably beats a general one that occasionally embarrasses you.' },
        { k: 'quote', t: 'On-device AI rewards narrow scope. Ship the model that nails one job, not the one that attempts ten.' },
        { k: 'h2', t: 'Battery and binary size are product decisions' },
        { k: 'p', t: 'A 200MB model and a hot CPU are features your users feel. Measure energy per inference and ship the model on demand rather than in the initial bundle. The right size is the smallest one that clears your quality bar.' },
      ],
    },
    fa: {
      lead: 'هوش مصنوعی on-device نسخه‌ی کوچک‌تر هوش مصنوعی ابری نیست. مسئله‌ی مهندسی متفاوتی با پاداش متفاوت است: حریم خصوصی، توان آفلاین و هزینه‌ی صفر به‌ازای هر استنتاج.',
      blocks: [
        { k: 'h2', t: 'لایه‌ی درست را انتخاب کنید' },
        { k: 'p', t: 'همه‌چیز به دستگاه تعلق ندارد. درخت تصمیم ساده است: اگر کار حساس‌به‌تأخیر، حساس‌به‌حریم‌خصوصی یا نیازمند کار آفلاین است، روی دستگاه اجرا می‌شود. بقیه می‌توانند ابر را فرابخوانند. بیشتر اپ‌های واقعی ترکیبی می‌شوند — یک مدل محلی کوچک برای حالت رایج، یک fallback ابری برای حالت سخت.' },
        { k: 'h2', t: 'Gemini Nano و LiteRT در Flutter' },
        { k: 'p', t: 'در اندروید، Gemini Nano از طریق AICore یک مدل on-device توانمند می‌دهد. برای مدل‌های سفارشی، LiteRT (همان TFLite سابق) وزن‌های کوانتیزه را با واگذاری سخت‌افزاری اجرا می‌کند. از Flutter از طریق یک platform channel نازک به هردو پل می‌زنید — استنتاج را سمت native نگه دارید و فقط نتایج ساختارمند را عبور دهید.' },
        { k: 'ul', items: [
          'به int8 کوانتیزه کنید — افت کیفیت معمولاً ناچیز است، شتاب نه.',
          'مفسر را در شروع اپ گرم کنید، نه در اولین استفاده.',
          'توکن‌ها را به UI استریم کنید تا تأخیر ادراک‌شده پایین بماند حتی اگر تأخیر کل نباشد.',
        ] },
        { k: 'h2', t: 'بخش سخت، UX است' },
        { k: 'p', t: 'مدل‌های on-device کوچک‌ترند، پس محصول باید درباره‌ی محدودیت‌هایشان صادق باشد. کار را محدود کنید، به مدل ساختار بدهید و fallbackهای مودبانه طراحی کنید. یک مدل محلی متمرکز که یک کار را قابل‌اتکا انجام دهد، از مدلی عمومی که گاهی شرمنده‌تان می‌کند بهتر است.' },
        { k: 'quote', t: 'هوش مصنوعی on-device به دامنه‌ی باریک پاداش می‌دهد. مدلی را منتشر کنید که یک کار را بی‌نقص انجام دهد، نه آنکه ده کار را امتحان کند.' },
        { k: 'h2', t: 'باتری و حجم باینری تصمیم‌های محصول‌اند' },
        { k: 'p', t: 'یک مدل ۲۰۰مگابایتی و CPU داغ، قابلیت‌هایی‌اند که کاربرانتان حس می‌کنند. انرژی به‌ازای هر استنتاج را اندازه بگیرید و مدل را به‌صورت on-demand منتشر کنید نه در بسته‌ی اولیه. اندازه‌ی درست، کوچک‌ترین اندازه‌ای است که از خط کیفیت شما رد شود.' },
      ],
    },
  },

  'enterprise-ai-roadmap': {
    date: '2026-01-30',
    accent: 'electric',
    en: {
      lead: 'Most enterprise AI initiatives die in the gap between a board mandate and a shipped feature. This is the 90-day roadmap I build to cross it — discovery to first production deployment.',
      blocks: [
        { k: 'h2', t: 'Days 0–30: discovery, not deck-building' },
        { k: 'p', t: 'The first month is spent finding the use cases that are both valuable and feasible. I interview the people doing the work, map the data that actually exists (not the data the org wishes it had), and score candidates on impact versus effort. The output is a shortlist of three, not a 40-slide strategy.' },
        { k: 'h2', t: 'Days 30–60: one thin slice to production' },
        { k: 'p', t: 'We pick the single highest-leverage use case and ship it end-to-end for a small group of real users. Not a pilot in a sandbox — a thin slice in production, with monitoring, evaluation, and a rollback path. The goal is to learn what breaks when reality arrives.' },
        { k: 'ul', items: [
          'Define success metrics before writing code.',
          'Instrument cost and quality from request one.',
          'Ship behind a flag to a controlled cohort.',
        ] },
        { k: 'h2', t: 'Days 60–90: harden and templatize' },
        { k: 'p', t: 'With one real workload live, the last month turns the bespoke build into a repeatable pattern: shared eval harness, a reference architecture, and the platform pieces the next three use cases will reuse. The second project should take half the time of the first.' },
        { k: 'quote', t: 'A roadmap is not a list of features. It is the order in which you reduce uncertainty.' },
        { k: 'h2', t: 'What kills roadmaps' },
        { k: 'p', t: 'Boiling the ocean, optimizing a model nobody uses, and treating AI as a research project instead of a product. The antidote to all three is the same: get one real thing in front of real users fast, then let what you learn redraw the map.' },
      ],
    },
    fa: {
      lead: 'بیشتر ابتکارهای هوش مصنوعی سازمانی در شکاف میان دستور هیئت‌مدیره و یک قابلیت منتشرشده می‌میرند. این نقشه‌ی راه ۹۰روزه‌ای است که برای عبور از آن می‌سازم — از کشف تا اولین استقرار تولید.',
      blocks: [
        { k: 'h2', t: 'روز ۰ تا ۳۰: کشف، نه ساختن اسلاید' },
        { k: 'p', t: 'ماه اول صرف یافتن موارد کاربری‌ای می‌شود که هم ارزشمند و هم شدنی‌اند. با کسانی که کار را انجام می‌دهند مصاحبه می‌کنم، داده‌ای را که واقعاً وجود دارد نگاشت می‌کنم (نه داده‌ای که سازمان آرزویش را دارد) و گزینه‌ها را بر اساس اثر در برابر تلاش امتیاز می‌دهم. خروجی، فهرست کوتاهی از سه مورد است، نه یک راهبرد ۴۰اسلایدی.' },
        { k: 'h2', t: 'روز ۳۰ تا ۶۰: یک برش نازک تا تولید' },
        { k: 'p', t: 'تک‌مورد با بیشترین اهرم را برمی‌گزینیم و آن را سرتاسری برای گروه کوچکی از کاربران واقعی منتشر می‌کنیم. نه یک pilot در sandbox — یک برش نازک در تولید، با پایش، ارزیابی و مسیر بازگشت. هدف، یادگرفتن چیزی است که وقتی واقعیت می‌رسد می‌شکند.' },
        { k: 'ul', items: [
          'معیارهای موفقیت را پیش از نوشتن کد تعریف کنید.',
          'هزینه و کیفیت را از همان درخواست اول ابزارگذاری کنید.',
          'پشت یک flag برای یک گروه کنترل‌شده منتشر کنید.',
        ] },
        { k: 'h2', t: 'روز ۶۰ تا ۹۰: تثبیت و قالب‌سازی' },
        { k: 'p', t: 'با یک بار کاری واقعی در حال اجرا، ماه آخر ساخت سفارشی را به الگویی تکرارپذیر تبدیل می‌کند: harness ارزیابی مشترک، یک معماری مرجع و قطعات پلتفرمی که سه مورد بعدی بازاستفاده خواهند کرد. پروژه‌ی دوم باید نصف زمان اولی را ببرد.' },
        { k: 'quote', t: 'نقشه‌ی راه فهرستی از قابلیت‌ها نیست. ترتیبی است که در آن عدم‌قطعیت را کاهش می‌دهید.' },
        { k: 'h2', t: 'چه چیزی نقشه‌ی راه را می‌کشد' },
        { k: 'p', t: 'جوشاندن اقیانوس، بهینه‌سازی مدلی که کسی استفاده نمی‌کند و رفتار با هوش مصنوعی به‌مثابه‌ی پروژه‌ی پژوهشی به‌جای محصول. پادزهر هر سه یکی است: یک چیز واقعی را سریع جلوی کاربران واقعی بگذارید، سپس بگذارید آنچه می‌آموزید نقشه را دوباره بکشد.' },
      ],
    },
  },
};

/**
 * The keys of `POSTS`, in the order `Object.keys` yields them.
 *
 * Computed once when the module is evaluated; entries added to `POSTS`
 * afterwards are not reflected here.
 */
export const POST_SLUGS: string[] = Object.keys(POSTS);