diff --git a/src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs b/src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs
new file mode 100644
index 0000000..8d57322
--- /dev/null
+++ b/src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs
@@ -0,0 +1,64 @@
+using TeamUp.Modules.Skills.Domain;
+using TeamUp.SharedKernel.Metrics;
+
+namespace TeamUp.Modules.Skills.Eval;
+
+/// <summary>Runs a skill against one golden input and returns its output.</summary>
+internal interface ISkillExecutor
+{
+    Task<string> ExecuteAsync(string skillBody, string input, CancellationToken cancellationToken = default);
+}
+
+/// <summary>Outcome of one golden test: its input, the measured distance, and whether it passed.</summary>
+internal sealed record GoldenResult(string Input, double Distance, bool Passed);
+
+/// <summary>Aggregate outcome of an evaluation run: overall verdict, largest distance, and per-test results.</summary>
+internal sealed record EvalReport(bool Passed, double WorstDistance, IReadOnlyList<GoldenResult> Results);
+
+/// <summary>
+/// The eval harness: runs each golden test through an executor and gates on normalized edit
+/// distance (the north-star metric). In M2 the executor is a stub (no model runtime); M4's
+/// assembler supplies the real one, and publishing is gated on <see cref="EvalReport.Passed"/>.
+/// </summary>
+/// <param name="passThreshold">Largest distance (inclusive) at which a golden test still passes.</param>
+internal sealed class SkillEvaluator(double passThreshold = 0.34)
+{
+    /// <summary>
+    /// Executes <paramref name="skillBody"/> against every golden test, in order, and scores each output
+    /// with <c>EditDistance.Normalized</c> against the test's expected text.
+    /// </summary>
+    /// <returns>
+    /// A report that passes only when every test's distance is at or below the threshold. An empty
+    /// golden set yields a failing report with a worst distance of 1.0 and no results.
+    /// </returns>
+    /// <exception cref="ArgumentNullException">
+    /// <paramref name="goldenTests"/> or <paramref name="executor"/> is <see langword="null"/>.
+    /// </exception>
+    public async Task<EvalReport> EvaluateAsync(
+        IReadOnlyList<GoldenExample> goldenTests,
+        string skillBody,
+        ISkillExecutor executor,
+        CancellationToken cancellationToken = default)
+    {
+        // Fail fast with a descriptive exception instead of a NullReferenceException further down.
+        ArgumentNullException.ThrowIfNull(goldenTests);
+        ArgumentNullException.ThrowIfNull(executor);
+
+        // With no golden tests there is nothing to measure, so the report fails by construction.
+        if (goldenTests.Count == 0)
+        {
+            return new EvalReport(false, 1.0, []);
+        }
+
+        var results = new List<GoldenResult>(goldenTests.Count);
+        foreach (var test in goldenTests)
+        {
+            var output = await executor.ExecuteAsync(skillBody, test.Input, cancellationToken);
+            var distance = EditDistance.Normalized(test.Expected, output);
+            results.Add(new GoldenResult(test.Input, distance, distance <= passThreshold));
+        }
+
+        var worst = results.Max(r => r.Distance);
+        return new EvalReport(results.All(r => r.Passed), worst, results);
+    }
+}
diff --git a/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj b/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj
index 662caa0..79f6ce6 100644
--- a/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj
+++ b/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj
@@ -7,6 +7,11 @@ + + + + +
diff --git a/tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs b/tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs
new file mode 100644
index 0000000..bf18ed4
--- /dev/null
+++ b/tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs
@@ -0,0 +1,49 @@
+using TeamUp.Modules.Skills.Domain;
+using TeamUp.Modules.Skills.Eval;
+using Xunit;
+
+namespace TeamUp.IntegrationTests;
+
+/// <summary>Unit coverage for the eval harness (no database). Uses a stub executor for the model.</summary>
+public sealed class SkillEvaluatorTests
+{
+    private sealed class StubExecutor(Func<string, string> respond) : ISkillExecutor // canned reply per input; no model runtime
+    {
+        public Task<string> ExecuteAsync(string skillBody, string input, CancellationToken cancellationToken = default) =>
+            Task.FromResult(respond(input));
+    }
+
+    private static List<GoldenExample> Golden(string input, string expected) => // builds a one-example golden set
+        [new GoldenExample { Input = input, Expected = expected }];
+
+    [Fact]
+    public async Task Passes_when_output_matches_expected()
+    {
+        var report = await new SkillEvaluator().EvaluateAsync(
+            Golden("anything", "a clear logout button in the header"),
+            "body",
+            new StubExecutor(_ => "a clear logout button in the header"));
+
+        Assert.True(report.Passed);
+        Assert.Equal(0d, report.WorstDistance, precision: 3); // identical strings => zero distance
+    }
+
+    [Fact]
+    public async Task Fails_when_output_diverges()
+    {
+        var report = await new SkillEvaluator().EvaluateAsync(
+            Golden("anything", "a clear logout button in the header"),
+            "body",
+            new StubExecutor(_ => "something completely unrelated and very different indeed"));
+
+        Assert.False(report.Passed);
+        Assert.True(report.WorstDistance > 0.34); // 0.34 mirrors SkillEvaluator's default passThreshold
+    }
+
+    [Fact]
+    public async Task Fails_when_there_are_no_golden_tests()
+    {
+        var report = await new SkillEvaluator().EvaluateAsync([], "body", new StubExecutor(_ => "x"));
+        Assert.False(report.Passed); // an empty golden set must never pass the gate
+    }
+}