M2: eval harness — golden tests gated on edit distance

- SkillEvaluator (internal to Skills): runs each golden test through an ISkillExecutor and
  passes only if normalized edit distance <= threshold (the north-star metric). The executor
  is a stub in M2 (no model runtime); M4's assembler supplies the real one and publishing is
  gated on the report. The indexer's structural gate (roles + >=1 golden test) stands until then.
- Add InternalsVisibleTo for the integration tests so the harness can be exercised directly.

Verified: build green; ArchitectureTests 8/8; IntegrationTests 25/25 (+3 eval-harness unit
tests: pass on match, fail on divergence, fail with no golden tests).
This commit is contained in:
soroush.asadi
2026-06-09 18:42:19 +03:30
parent bfcd223374
commit e987e33c0a
3 changed files with 99 additions and 0 deletions
@@ -0,0 +1,45 @@
using TeamUp.Modules.Skills.Domain;
using TeamUp.SharedKernel.Metrics;
namespace TeamUp.Modules.Skills.Eval;
/// <summary>Runs a skill against one golden input and returns its output.</summary>
internal interface ISkillExecutor
{
/// <summary>Executes the skill once against a single input.</summary>
/// <param name="skillBody">The body of the skill to run.</param>
/// <param name="input">The input to feed to the skill (the evaluator passes a golden test's input).</param>
/// <param name="cancellationToken">Token that can be used to cancel the run.</param>
/// <returns>A task whose result is the output the skill produced, as text.</returns>
Task<string> ExecuteAsync(string skillBody, string input, CancellationToken cancellationToken = default);
}
/// <summary>Outcome of a single golden test as recorded by the evaluator.</summary>
/// <param name="Input">The golden test's input.</param>
/// <param name="Distance">Normalized edit distance between the expected output and the executor's output.</param>
/// <param name="Passed">Whether <paramref name="Distance"/> was at or below the evaluator's pass threshold.</param>
internal sealed record GoldenResult(string Input, double Distance, bool Passed);
/// <summary>Aggregate result of evaluating a skill against its golden tests.</summary>
/// <param name="Passed">True only when every golden test passed; false when there were no golden tests.</param>
/// <param name="WorstDistance">The largest distance across all results (1.0 when there were no golden tests).</param>
/// <param name="Results">Per-test results, in the order the tests were run (empty when there were no golden tests).</param>
internal sealed record EvalReport(bool Passed, double WorstDistance, IReadOnlyList<GoldenResult> Results);
/// <summary>
/// Golden-test eval harness. Each golden test is run through an <see cref="ISkillExecutor"/> and
/// scored by the normalized edit distance (the north-star metric) between the expected output and
/// what the executor produced. In M2 the executor is a stub (no model runtime); M4's assembler
/// supplies the real one, and publishing is gated on <see cref="EvalReport.Passed"/>.
/// </summary>
/// <param name="passThreshold">
/// Largest normalized edit distance that still counts as a pass (the comparison is inclusive).
/// </param>
internal sealed class SkillEvaluator(double passThreshold = 0.34)
{
    /// <summary>Runs every golden test sequentially and summarizes the outcome.</summary>
    /// <param name="goldenTests">The golden examples (input plus expected output) to evaluate.</param>
    /// <param name="skillBody">The skill body handed to the executor for every test.</param>
    /// <param name="executor">Produces the skill's output for each golden input.</param>
    /// <param name="cancellationToken">Forwarded to every executor call.</param>
    /// <returns>
    /// A report that passes only when every test is within the threshold. With no golden tests
    /// the report fails with a worst distance of 1.0 and no results.
    /// </returns>
    public async Task<EvalReport> EvaluateAsync(
        IReadOnlyList<GoldenExample> goldenTests,
        string skillBody,
        ISkillExecutor executor,
        CancellationToken cancellationToken = default)
    {
        // Nothing to verify means nothing was proven: report a failure rather than a vacuous pass.
        if (goldenTests.Count is 0)
        {
            return new EvalReport(Passed: false, WorstDistance: 1.0, Results: []);
        }

        var everyTestPassed = true;
        var outcomes = new List<GoldenResult>(goldenTests.Count);

        foreach (var golden in goldenTests)
        {
            var actual = await executor.ExecuteAsync(skillBody, golden.Input, cancellationToken);
            var score = EditDistance.Normalized(golden.Expected, actual);
            var withinThreshold = score <= passThreshold;

            everyTestPassed &= withinThreshold;
            outcomes.Add(new GoldenResult(golden.Input, score, withinThreshold));
        }

        // The report's headline distance is the worst (largest) one seen across all tests.
        var largestDistance = outcomes.Select(outcome => outcome.Distance).Max();
        return new EvalReport(everyTestPassed, largestDistance, outcomes);
    }
}
@@ -7,6 +7,11 @@
<ProjectReference Include="..\..\Shared\TeamUp.SharedKernel\TeamUp.SharedKernel.csproj" />
</ItemGroup>
<!-- The eval harness types are internal; exposing internals to TeamUp.IntegrationTests lets the integration tests exercise the harness directly. -->
<ItemGroup>
<InternalsVisibleTo Include="TeamUp.IntegrationTests" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.EntityFrameworkCore" />
<PackageReference Include="Microsoft.EntityFrameworkCore.Design" PrivateAssets="all" />