From e987e33c0ad26ba3e9b4df6166f63b07382e76db Mon Sep 17 00:00:00 2001
From: "soroush.asadi" <soroush.asadi@aliasaas.com>
Date: Tue, 9 Jun 2026 18:42:19 +0330
Subject: [PATCH] =?UTF-8?q?M2:=20eval=20harness=20=E2=80=94=20golden=20tes?=
 =?UTF-8?q?ts=20gated=20on=20edit=20distance?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- SkillEvaluator (internal to Skills): runs each golden test through an ISkillExecutor and
  passes only if normalized edit distance <= threshold (the north-star metric). The executor
  is a stub in M2 (no model runtime); M4's assembler supplies the real one and publishing is
  gated on the report. The indexer's structural gate (roles + >=1 golden test) stands until then.
- InternalsVisibleTo the integration tests so the harness is exercised directly.

Verified: build green; ArchitectureTests 8/8; IntegrationTests 25/25 (+3 eval-harness unit
tests: pass on match, fail on divergence, fail with no golden tests).
---
 .../Eval/SkillEvaluator.cs                    | 45 +++++++++++++++++
 .../TeamUp.Modules.Skills.csproj              |  5 ++
 .../SkillEvaluatorTests.cs                    | 49 +++++++++++++++++++
 3 files changed, 99 insertions(+)
 create mode 100644 src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs
 create mode 100644 tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs
diff --git a/src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs b/src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs
new file mode 100644
index 0000000..8d57322
--- /dev/null
+++ b/src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs
@@ -0,0 +1,45 @@
+using TeamUp.Modules.Skills.Domain;
+using TeamUp.SharedKernel.Metrics;
+
+namespace TeamUp.Modules.Skills.Eval;
+
+/// <summary>Runs a skill body against a single golden input and returns the output to be scored.</summary>
+internal interface ISkillExecutor
+{
+    Task<string> ExecuteAsync(string skillBody, string input, CancellationToken cancellationToken = default);
+}
+/// <summary>Outcome of one golden test: its input, the measured distance, and the pass verdict.</summary>
+internal sealed record GoldenResult(string Input, double Distance, bool Passed);
+/// <summary>Aggregate outcome: the overall verdict, the worst distance, and the per-test results.</summary>
+internal sealed record EvalReport(bool Passed, double WorstDistance, IReadOnlyList<GoldenResult> Results);
+
+/// <summary>
+/// The eval harness: runs each golden test through an executor and gates on normalized edit
+/// distance (the north-star metric); an empty golden set always fails. In M2 the executor is a
+/// stub; M4's assembler supplies the real one and publishing gates on <see cref="EvalReport.Passed"/>.
+/// </summary>
+internal sealed class SkillEvaluator(double passThreshold = 0.34)
+{
+    public async Task<EvalReport> EvaluateAsync(
+        IReadOnlyList<GoldenExample> goldenTests,
+        string skillBody,
+        ISkillExecutor executor,
+        CancellationToken cancellationToken = default)
+    {
+        ArgumentNullException.ThrowIfNull(goldenTests); // fail fast, not a bare NullReferenceException
+        ArgumentNullException.ThrowIfNull(executor);
+        if (goldenTests.Count == 0)
+        {
+            return new EvalReport(false, 1.0, []); // nothing to verify counts as a failure
+        }
+        var results = new List<GoldenResult>(goldenTests.Count);
+        foreach (var test in goldenTests)
+        {
+            cancellationToken.ThrowIfCancellationRequested(); // honored even if the executor ignores it
+            var output = await executor.ExecuteAsync(skillBody, test.Input, cancellationToken);
+            var distance = EditDistance.Normalized(test.Expected, output);
+            results.Add(new GoldenResult(test.Input, distance, distance <= passThreshold));
+        }
+        return new EvalReport(results.All(r => r.Passed), results.Max(r => r.Distance), results);
+    }
+}
diff --git a/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj b/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj
index 662caa0..79f6ce6 100644
--- a/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj
+++ b/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj
@@ -7,6 +7,11 @@
     <ProjectReference Include="..\..\Shared\TeamUp.SharedKernel\TeamUp.SharedKernel.csproj" />
   </ItemGroup>
 
+  <!-- The eval harness is internal; let the integration tests exercise it directly. -->
+  <ItemGroup>
+    <InternalsVisibleTo Include="TeamUp.IntegrationTests" />
+  </ItemGroup>
+
   <ItemGroup>
     <PackageReference Include="Microsoft.EntityFrameworkCore" />
     <PackageReference Include="Microsoft.EntityFrameworkCore.Design" PrivateAssets="all" />
diff --git a/tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs b/tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs
new file mode 100644
index 0000000..bf18ed4
--- /dev/null
+++ b/tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs
@@ -0,0 +1,49 @@
+using TeamUp.Modules.Skills.Domain;
+using TeamUp.Modules.Skills.Eval;
+using Xunit;
+
+namespace TeamUp.IntegrationTests;
+
+/// <summary>Unit coverage for the eval harness (no database). Uses a stub executor for the model.</summary>
+public sealed class SkillEvaluatorTests
+{
+    private sealed class StubExecutor(Func<string, string> respond) : ISkillExecutor
+    {
+        public Task<string> ExecuteAsync(string skillBody, string input, CancellationToken cancellationToken = default) =>
+            Task.FromResult(respond(input)); // the skill body is ignored; only the golden input drives the reply
+    }
+
+    private static List<GoldenExample> Golden(string input, string expected) =>
+        [new GoldenExample { Input = input, Expected = expected }];
+
+    [Fact]
+    public async Task Passes_when_output_matches_expected()
+    {
+        var report = await new SkillEvaluator().EvaluateAsync(
+            Golden("anything", "a clear logout button in the header"), "body",
+            new StubExecutor(_ => "a clear logout button in the header"));
+
+        Assert.True(report.Passed);
+        Assert.Equal(0d, report.WorstDistance, precision: 3);
+    }
+
+    [Fact]
+    public async Task Fails_when_output_diverges()
+    {
+        var report = await new SkillEvaluator().EvaluateAsync(
+            Golden("anything", "a clear logout button in the header"), "body",
+            new StubExecutor(_ => "something completely unrelated and very different indeed"));
+
+        Assert.False(report.Passed);
+        Assert.True(report.WorstDistance > 0.34); // 0.34 is SkillEvaluator's default pass threshold
+    }
+
+    [Fact]
+    public async Task Fails_when_there_are_no_golden_tests()
+    {
+        var report = await new SkillEvaluator().EvaluateAsync([], "body", new StubExecutor(_ => "x"));
+        Assert.False(report.Passed);
+        Assert.Equal(1d, report.WorstDistance, precision: 3); // pin the rest of the empty-report contract
+        Assert.Empty(report.Results);
+    }
+}