From e987e33c0ad26ba3e9b4df6166f63b07382e76db Mon Sep 17 00:00:00 2001
From: "soroush.asadi"
Date: Tue, 9 Jun 2026 18:42:19 +0330
Subject: [PATCH] =?UTF-8?q?M2:=20eval=20harness=20=E2=80=94=20golden=20tes?=
 =?UTF-8?q?ts=20gated=20on=20edit=20distance?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- SkillEvaluator (internal to Skills): runs each golden test through an
  ISkillExecutor and passes only if normalized edit distance <= threshold
  (the north-star metric). The executor is a stub in M2 (no model runtime);
  M4's assembler supplies the real one and publishing is gated on the
  report. The indexer's structural gate (roles + >=1 golden test) stands
  until then.
- InternalsVisibleTo the integration tests so the harness is exercised
  directly.

Verified: build green; ArchitectureTests 8/8; IntegrationTests 25/25
(+3 eval-harness unit tests: pass on match, fail on divergence, fail with
no golden tests).
---
 .../Eval/SkillEvaluator.cs                    | 45 +++++++++++++++++
 .../TeamUp.Modules.Skills.csproj              |  5 ++
 .../SkillEvaluatorTests.cs                    | 49 +++++++++++++++++++
 3 files changed, 99 insertions(+)
 create mode 100644 src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs
 create mode 100644 tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs

diff --git a/src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs b/src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs
new file mode 100644
index 0000000..8d57322
--- /dev/null
+++ b/src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs
@@ -0,0 +1,45 @@
+using TeamUp.Modules.Skills.Domain;
+using TeamUp.SharedKernel.Metrics;
+
+namespace TeamUp.Modules.Skills.Eval;
+
+/// <summary>Runs a skill against one golden input and returns its output.</summary>
+internal interface ISkillExecutor
+{
+    Task<string> ExecuteAsync(string skillBody, string input, CancellationToken cancellationToken = default);
+}
+
+internal sealed record GoldenResult(string Input, double Distance, bool Passed);
+
+internal sealed record EvalReport(bool Passed, double WorstDistance, IReadOnlyList<GoldenResult> Results);
+
+/// <summary>
+/// The eval harness: runs each golden test through an executor and gates on normalized edit
+/// distance (the north-star metric). In M2 the executor is a stub (no model runtime); M4's
+/// assembler supplies the real one, and publishing is gated on <see cref="EvalReport"/>.
+/// </summary>
+internal sealed class SkillEvaluator(double passThreshold = 0.34)
+{
+    public async Task<EvalReport> EvaluateAsync(
+        IReadOnlyList<GoldenExample> goldenTests,
+        string skillBody,
+        ISkillExecutor executor,
+        CancellationToken cancellationToken = default)
+    {
+        if (goldenTests.Count == 0)
+        {
+            return new EvalReport(false, 1.0, []);
+        }
+
+        var results = new List<GoldenResult>(goldenTests.Count);
+        foreach (var test in goldenTests)
+        {
+            var output = await executor.ExecuteAsync(skillBody, test.Input, cancellationToken);
+            var distance = EditDistance.Normalized(test.Expected, output);
+            results.Add(new GoldenResult(test.Input, distance, distance <= passThreshold));
+        }
+
+        var worst = results.Max(r => r.Distance);
+        return new EvalReport(results.All(r => r.Passed), worst, results);
+    }
+}
diff --git a/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj b/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj
index 662caa0..79f6ce6 100644
--- a/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj
+++ b/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj
@@ -7,6 +7,11 @@
+
+
+
+
+
diff --git a/tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs b/tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs
new file mode 100644
index 0000000..bf18ed4
--- /dev/null
+++ b/tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs
@@ -0,0 +1,49 @@
+using TeamUp.Modules.Skills.Domain;
+using TeamUp.Modules.Skills.Eval;
+using Xunit;
+
+namespace TeamUp.IntegrationTests;
+
+/// <summary>Unit coverage for the eval harness (no database). Uses a stub executor for the model.</summary>
+public sealed class SkillEvaluatorTests
+{
+    private sealed class StubExecutor(Func<string, string> respond) : ISkillExecutor
+    {
+        public Task<string> ExecuteAsync(string skillBody, string input, CancellationToken cancellationToken = default) =>
+            Task.FromResult(respond(input));
+    }
+
+    private static List<GoldenExample> Golden(string input, string expected) =>
+        [new GoldenExample { Input = input, Expected = expected }];
+
+    [Fact]
+    public async Task Passes_when_output_matches_expected()
+    {
+        var report = await new SkillEvaluator().EvaluateAsync(
+            Golden("anything", "a clear logout button in the header"),
+            "body",
+            new StubExecutor(_ => "a clear logout button in the header"));
+
+        Assert.True(report.Passed);
+        Assert.Equal(0d, report.WorstDistance, precision: 3);
+    }
+
+    [Fact]
+    public async Task Fails_when_output_diverges()
+    {
+        var report = await new SkillEvaluator().EvaluateAsync(
+            Golden("anything", "a clear logout button in the header"),
+            "body",
+            new StubExecutor(_ => "something completely unrelated and very different indeed"));
+
+        Assert.False(report.Passed);
+        Assert.True(report.WorstDistance > 0.34);
+    }
+
+    [Fact]
+    public async Task Fails_when_there_are_no_golden_tests()
+    {
+        var report = await new SkillEvaluator().EvaluateAsync([], "body", new StubExecutor(_ => "x"));
+        Assert.False(report.Passed);
+    }
+}