M2: eval harness — golden tests gated on edit distance
- SkillEvaluator (internal to Skills): runs each golden test through an ISkillExecutor and passes only if the normalized edit distance is <= the threshold (the north-star metric). The executor is a stub in M2 (no model runtime); M4's assembler supplies the real one, and publishing is gated on the report. The indexer's structural gate (roles + >=1 golden test) stands until then.
- InternalsVisibleTo is granted to the integration tests so the harness is exercised directly.
- Verified: build green; ArchitectureTests 8/8; IntegrationTests 25/25 (+3 eval-harness unit tests: pass on match, fail on divergence, fail with no golden tests).
This commit is contained in:
@@ -0,0 +1,45 @@
|
|||||||
|
using TeamUp.Modules.Skills.Domain;
|
||||||
|
using TeamUp.SharedKernel.Metrics;
|
||||||
|
|
||||||
|
namespace TeamUp.Modules.Skills.Eval;
|
||||||
|
|
||||||
|
/// <summary>
/// Abstraction over whatever executes a skill: given the skill body and a single golden
/// input, it asynchronously yields the output the skill produced for that input.
/// </summary>
internal interface ISkillExecutor
{
    /// <summary>Executes the skill against one input.</summary>
    /// <param name="skillBody">The body of the skill to run.</param>
    /// <param name="input">The golden-test input handed to the skill.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the execution.</param>
    /// <returns>A task whose result is the skill's output text.</returns>
    Task<string> ExecuteAsync(string skillBody, string input, CancellationToken cancellationToken = default);
}
|
||||||
|
|
||||||
|
/// <summary>Outcome of running one golden test through the eval harness.</summary>
/// <param name="Input">The golden input that was executed.</param>
/// <param name="Distance">Normalized edit distance between the expected and the actual output.</param>
/// <param name="Passed">Whether <paramref name="Distance"/> was within the evaluator's pass threshold.</param>
internal sealed record GoldenResult(
    string Input,
    double Distance,
    bool Passed);
|
||||||
|
|
||||||
|
/// <summary>Aggregate outcome of evaluating a skill against its golden tests.</summary>
/// <param name="Passed">Overall verdict for the run.</param>
/// <param name="WorstDistance">The largest distance recorded across <paramref name="Results"/>.</param>
/// <param name="Results">The per-golden-test results that make up the report.</param>
internal sealed record EvalReport(
    bool Passed,
    double WorstDistance,
    IReadOnlyList<GoldenResult> Results);
|
||||||
|
|
||||||
|
/// <summary>
/// The eval harness: runs each golden test through an executor and gates on normalized edit
/// distance (the north-star metric). In M2 the executor is a stub (no model runtime); M4's
/// assembler supplies the real one, and publishing is gated on <see cref="EvalReport.Passed"/>.
/// </summary>
/// <param name="passThreshold">
/// Largest normalized edit distance at which a golden test still passes (the bound is inclusive).
/// </param>
internal sealed class SkillEvaluator(double passThreshold = 0.34)
{
    /// <summary>
    /// Executes <paramref name="skillBody"/> once per golden test, in order, and scores each
    /// output against the test's expected text.
    /// </summary>
    /// <param name="goldenTests">The golden examples to run. An empty list yields a failing report.</param>
    /// <param name="skillBody">The skill body handed to the executor unchanged.</param>
    /// <param name="executor">Produces the skill's output for a given input.</param>
    /// <param name="cancellationToken">
    /// Checked before every golden test and also forwarded to the executor.
    /// </param>
    /// <returns>
    /// A report that passes only when every golden test is within the threshold; its
    /// <see cref="EvalReport.WorstDistance"/> is the largest distance observed.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="goldenTests"/> or <paramref name="executor"/> is <see langword="null"/>.
    /// </exception>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled before a golden test started.
    /// </exception>
    public async Task<EvalReport> EvaluateAsync(
        IReadOnlyList<GoldenExample> goldenTests,
        string skillBody,
        ISkillExecutor executor,
        CancellationToken cancellationToken = default)
    {
        // Validate up front so a wiring mistake is reported by name instead of surfacing as a
        // NullReferenceException part-way through the run.
        ArgumentNullException.ThrowIfNull(goldenTests);
        ArgumentNullException.ThrowIfNull(executor);

        if (goldenTests.Count == 0)
        {
            // Fail closed: a skill without golden tests has proven nothing, so it never passes.
            return new EvalReport(false, 1.0, []);
        }

        var results = new List<GoldenResult>(goldenTests.Count);
        foreach (var test in goldenTests)
        {
            // The executor is not guaranteed to observe the token (the stub does not), so honor
            // cancellation here between golden tests as well.
            cancellationToken.ThrowIfCancellationRequested();

            var output = await executor.ExecuteAsync(skillBody, test.Input, cancellationToken);
            var distance = EditDistance.Normalized(test.Expected, output);
            results.Add(new GoldenResult(test.Input, distance, distance <= passThreshold));
        }

        var worst = results.Max(r => r.Distance);
        return new EvalReport(results.All(r => r.Passed), worst, results);
    }
}
|
||||||
@@ -7,6 +7,11 @@
|
|||||||
<ProjectReference Include="..\..\Shared\TeamUp.SharedKernel\TeamUp.SharedKernel.csproj" />
|
<ProjectReference Include="..\..\Shared\TeamUp.SharedKernel\TeamUp.SharedKernel.csproj" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
|
<!-- The eval harness types are internal; expose them to the integration-test assembly so it can exercise the harness directly. -->
<ItemGroup>
  <InternalsVisibleTo Include="TeamUp.IntegrationTests" />
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Microsoft.EntityFrameworkCore" />
|
<PackageReference Include="Microsoft.EntityFrameworkCore" />
|
||||||
<PackageReference Include="Microsoft.EntityFrameworkCore.Design" PrivateAssets="all" />
|
<PackageReference Include="Microsoft.EntityFrameworkCore.Design" PrivateAssets="all" />
|
||||||
|
|||||||
@@ -0,0 +1,49 @@
|
|||||||
|
using TeamUp.Modules.Skills.Domain;
|
||||||
|
using TeamUp.Modules.Skills.Eval;
|
||||||
|
using Xunit;
|
||||||
|
|
||||||
|
namespace TeamUp.IntegrationTests;
|
||||||
|
|
||||||
|
/// <summary>Unit coverage for the eval harness (no database). Uses a stub executor for the model.</summary>
public sealed class SkillEvaluatorTests
{
    // Mirrors the default pass threshold of SkillEvaluator, which every test here relies on.
    private const double DefaultThreshold = 0.34;

    private const string ExpectedOutput = "a clear logout button in the header";
    private const string UnrelatedOutput = "something completely unrelated and very different indeed";

    /// <summary>Executor stand-in that derives its output from the golden input alone.</summary>
    private sealed class StubExecutor(Func<string, string> respond) : ISkillExecutor
    {
        public Task<string> ExecuteAsync(string skillBody, string input, CancellationToken cancellationToken = default) =>
            Task.FromResult(respond(input));
    }

    /// <summary>Builds a one-element golden set.</summary>
    private static List<GoldenExample> Golden(string input, string expected) =>
        [new GoldenExample { Input = input, Expected = expected }];

    [Fact]
    public async Task Passes_when_output_matches_expected()
    {
        var report = await new SkillEvaluator().EvaluateAsync(
            Golden("anything", ExpectedOutput),
            "body",
            new StubExecutor(_ => ExpectedOutput));

        Assert.True(report.Passed);
        Assert.Equal(0d, report.WorstDistance, precision: 3);
    }

    [Fact]
    public async Task Fails_when_output_diverges()
    {
        var report = await new SkillEvaluator().EvaluateAsync(
            Golden("anything", ExpectedOutput),
            "body",
            new StubExecutor(_ => UnrelatedOutput));

        Assert.False(report.Passed);
        Assert.True(
            report.WorstDistance > DefaultThreshold,
            $"Expected worst distance above {DefaultThreshold}, got {report.WorstDistance}.");
    }

    [Fact]
    public async Task Fails_when_any_golden_test_diverges()
    {
        List<GoldenExample> golden =
        [
            new GoldenExample { Input = "match", Expected = ExpectedOutput },
            new GoldenExample { Input = "diverge", Expected = ExpectedOutput },
        ];

        var report = await new SkillEvaluator().EvaluateAsync(
            golden,
            "body",
            new StubExecutor(input => input == "match" ? ExpectedOutput : UnrelatedOutput));

        // A single failing golden test must sink the whole report, and the report's worst
        // distance must be the failing test's distance.
        Assert.False(report.Passed);
        Assert.Equal(2, report.Results.Count);
        Assert.True(report.Results[0].Passed);
        Assert.False(report.Results[1].Passed);
        Assert.Equal(report.Results[1].Distance, report.WorstDistance);
    }

    [Fact]
    public async Task Fails_when_there_are_no_golden_tests()
    {
        var report = await new SkillEvaluator().EvaluateAsync([], "body", new StubExecutor(_ => "x"));

        // Pin the whole fail-closed report, not just the verdict.
        Assert.False(report.Passed);
        Assert.Equal(1d, report.WorstDistance, precision: 3);
        Assert.Empty(report.Results);
    }
}
|
||||||
Reference in New Issue
Block a user