diff --git a/src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs b/src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs
new file mode 100644
index 0000000..8d57322
--- /dev/null
+++ b/src/Modules/TeamUp.Modules.Skills/Eval/SkillEvaluator.cs
@@ -0,0 +1,45 @@
+using TeamUp.Modules.Skills.Domain;
+using TeamUp.SharedKernel.Metrics;
+
+namespace TeamUp.Modules.Skills.Eval;
+
+/// <summary>Runs a skill against one golden input and returns its output.</summary>
+internal interface ISkillExecutor
+{
+    Task<string> ExecuteAsync(string skillBody, string input, CancellationToken cancellationToken = default);
+}
+
+/// <summary>Outcome of one golden test: the input used, the measured distance, and whether it was within the pass threshold.</summary>
+internal sealed record GoldenResult(string Input, double Distance, bool Passed);
+/// <summary>Result of an eval run: overall verdict, the largest per-test distance (1.0 when there were no tests), and the per-test results.</summary>
+internal sealed record EvalReport(bool Passed, double WorstDistance, IReadOnlyList<GoldenResult> Results);
+/// <summary>
+/// The eval harness: runs each golden test through an executor and gates on normalized edit
+/// distance (the north-star metric). In M2 the executor is a stub (no model runtime); M4's
+/// assembler supplies the real one, and publishing is gated on <see cref="EvalReport.Passed"/>.
+/// </summary>
+internal sealed class SkillEvaluator(double passThreshold = 0.34)
+{
+    /// <summary>Runs every golden test through <paramref name="executor"/>; a test passes when its distance is at most the threshold.</summary>
+    public async Task<EvalReport> EvaluateAsync(
+        IReadOnlyList<GoldenExample> goldenTests,
+        string skillBody,
+        ISkillExecutor executor,
+        CancellationToken cancellationToken = default)
+    {
+        // An empty suite verifies nothing, so it fails instead of passing vacuously.
+        if (goldenTests.Count == 0)
+        {
+            return new EvalReport(false, 1.0, []);
+        }
+
+        var results = new List<GoldenResult>(goldenTests.Count);
+        foreach (var test in goldenTests)
+        {
+            var output = await executor.ExecuteAsync(skillBody, test.Input, cancellationToken);
+            var distance = EditDistance.Normalized(test.Expected, output);
+            results.Add(new GoldenResult(test.Input, distance, distance <= passThreshold));
+        }
+        return new EvalReport(results.All(r => r.Passed), results.Max(r => r.Distance), results);
+    }
+}
diff --git a/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj b/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj
index 662caa0..79f6ce6 100644
--- a/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj
+++ b/src/Modules/TeamUp.Modules.Skills/TeamUp.Modules.Skills.csproj
@@ -7,6 +7,11 @@
+
+
+
+
+
diff --git a/tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs b/tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs
new file mode 100644
index 0000000..bf18ed4
--- /dev/null
+++ b/tests/TeamUp.IntegrationTests/SkillEvaluatorTests.cs
@@ -0,0 +1,49 @@
+using TeamUp.Modules.Skills.Domain;
+using TeamUp.Modules.Skills.Eval;
+using Xunit;
+
+namespace TeamUp.IntegrationTests;
+
+/// <summary>Unit coverage for the eval harness (no database). Uses a stub executor for the model.</summary>
+public sealed class SkillEvaluatorTests
+{
+    private sealed class StubExecutor(Func<string, string> respond) : ISkillExecutor
+    {
+        public Task<string> ExecuteAsync(string skillBody, string input, CancellationToken cancellationToken = default) =>
+            Task.FromResult(respond(input));
+    }
+
+    private static List<GoldenExample> Golden(string input, string expected) =>
+        [new GoldenExample { Input = input, Expected = expected }];
+
+    [Fact]
+    public async Task Passes_when_output_matches_expected()
+    {
+        var report = await new SkillEvaluator().EvaluateAsync(
+            Golden("anything", "a clear logout button in the header"),
+            "body",
+            new StubExecutor(_ => "a clear logout button in the header"));
+
+        Assert.True(report.Passed);
+        Assert.Equal(0d, report.WorstDistance, precision: 3);
+    }
+
+    [Fact]
+    public async Task Fails_when_output_diverges()
+    {
+        var report = await new SkillEvaluator().EvaluateAsync(
+            Golden("anything", "a clear logout button in the header"),
+            "body",
+            new StubExecutor(_ => "something completely unrelated and very different indeed"));
+
+        Assert.False(report.Passed);
+        Assert.True(report.WorstDistance > 0.34);
+    }
+
+    [Fact]
+    public async Task Fails_when_there_are_no_golden_tests()
+    {
+        var report = await new SkillEvaluator().EvaluateAsync([], "body", new StubExecutor(_ => "x"));
+        Assert.False(report.Passed);
+    }
+}