Files
dreamweaver/backend/app/services/harness/evaluation_replay.py

323 lines
11 KiB
Python

"""Internal golden-case replay support for harness evaluations.
The replay helpers are intentionally not wired to user-facing APIs. They exist
to make evaluation behavior reproducible in tests and internal tooling.
"""
import json
from collections import Counter
from dataclasses import dataclass, field
from enum import StrEnum
from pathlib import Path
from typing import Any, Iterable
from app.services.adapters.storybook.primary import Storybook, StorybookPage
from app.services.adapters.text.models import StoryOutput
from app.services.harness.evaluators import (
EvaluationDimension,
EvaluationResult,
evaluate_story_output,
evaluate_storybook_output,
)
class EvaluationReplayArtifact(StrEnum):
"""Artifacts supported by deterministic evaluation replay."""
STORY = "story"
STORYBOOK = "storybook"
@dataclass(frozen=True)
class ExpectedEvaluation:
    """Outcome a golden case expects the evaluator to produce.

    ``passed`` and ``blocking`` are always required. Score bounds are optional
    and are only checked when set. The tuple fields default to empty.
    """

    # Expected pass/fail flag of the evaluation.
    passed: bool
    # Expected blocking flag of the evaluation.
    blocking: bool
    # Inclusive lower bound on the overall score; ``None`` disables the check.
    min_overall_score: float | None = None
    # Inclusive upper bound on the overall score; ``None`` disables the check.
    max_overall_score: float | None = None
    # Dimensions that must appear among the evaluation's scores.
    required_dimensions: tuple[EvaluationDimension, ...] = field(default_factory=tuple)
    # Quality gate issue codes the evaluation is expected to report.
    quality_gate_codes: tuple[str, ...] = field(default_factory=tuple)
    # Fragments that must each occur in at least one warning.
    warning_substrings: tuple[str, ...] = field(default_factory=tuple)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "ExpectedEvaluation":
        """Create expectations from a decoded JSON object.

        Args:
            payload: Mapping with required ``passed`` and ``blocking`` keys and
                optional ``min_overall_score``, ``max_overall_score``,
                ``required_dimensions``, ``quality_gate_codes`` and
                ``warning_substrings`` keys.

        Raises:
            KeyError: If ``passed`` or ``blocking`` is absent.
        """
        expected_passed = bool(payload["passed"])
        expected_blocking = bool(payload["blocking"])
        lower_bound = payload.get("min_overall_score")
        upper_bound = payload.get("max_overall_score")
        # Each raw entry is converted through the EvaluationDimension constructor.
        dimensions = tuple(
            map(EvaluationDimension, payload.get("required_dimensions", []))
        )
        gate_codes = tuple(payload.get("quality_gate_codes", []))
        warning_parts = tuple(payload.get("warning_substrings", []))
        return cls(
            passed=expected_passed,
            blocking=expected_blocking,
            min_overall_score=lower_bound,
            max_overall_score=upper_bound,
            required_dimensions=dimensions,
            quality_gate_codes=gate_codes,
            warning_substrings=warning_parts,
        )
@dataclass(frozen=True)
class EvaluationReplayCoverage:
"""Internal coverage labels for one golden replay case."""
age_band: str = "unknown"
content_shape: str = "unknown"
risk_area: str = "unknown"
tags: tuple[str, ...] = field(default_factory=tuple)
@classmethod
def from_payload(cls, payload: dict[str, Any] | None) -> "EvaluationReplayCoverage":
"""Build coverage labels from a JSON-safe payload."""
payload = payload or {}
return cls(
age_band=str(payload.get("age_band", "unknown")),
content_shape=str(payload.get("content_shape", "unknown")),
risk_area=str(payload.get("risk_area", "unknown")),
tags=tuple(str(tag) for tag in payload.get("tags", [])),
)
@dataclass(frozen=True)
class EvaluationReplayCase:
    """A single golden case: recorded output plus the expected evaluation."""

    # Identifier used when reporting results for this case.
    case_id: str
    # Which evaluator the recorded output is run through.
    artifact: EvaluationReplayArtifact
    # Recorded output that gets rebuilt into a story or storybook object.
    output_payload: dict[str, Any]
    # Outcome the evaluation is compared against.
    expected: ExpectedEvaluation
    # Passed through to the evaluator unchanged.
    education_theme: str | None = None
    # Passed through to the evaluator unchanged.
    minimum_score: float = 0.7
    description: str = ""
    input_payload: dict[str, Any] = field(default_factory=dict)
    coverage: EvaluationReplayCoverage = field(default_factory=EvaluationReplayCoverage)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "EvaluationReplayCase":
        """Create a replay case from a decoded JSON object.

        ``minimum_score`` and ``education_theme`` are looked up in the nested
        ``input`` object first and in the top-level payload second.

        Raises:
            KeyError: If ``id``, ``artifact``, ``output`` or ``expected`` is
                absent.
        """
        overrides = dict(payload.get("input", {}))

        def setting(name: str, default: Any = None) -> Any:
            # Values under "input" win over top-level values of the same name.
            if name in overrides:
                return overrides[name]
            return payload.get(name, default)

        raw_minimum = setting("minimum_score", 0.7)
        theme = setting("education_theme")

        identifier = str(payload["id"])
        artifact_kind = EvaluationReplayArtifact(payload["artifact"])
        summary = str(payload.get("description", ""))
        recorded_output = dict(payload["output"])
        threshold = float(raw_minimum)
        expectation = ExpectedEvaluation.from_payload(payload["expected"])
        labels = EvaluationReplayCoverage.from_payload(payload.get("coverage"))

        return cls(
            case_id=identifier,
            artifact=artifact_kind,
            description=summary,
            input_payload=overrides,
            output_payload=recorded_output,
            education_theme=theme,
            minimum_score=threshold,
            expected=expectation,
            coverage=labels,
        )

    def evaluate(self) -> EvaluationResult:
        """Rebuild the recorded output and run the matching evaluator on it."""
        shared_options = {
            "education_theme": self.education_theme,
            "minimum_score": self.minimum_score,
        }
        # Anything that is not a story artifact goes to the storybook evaluator.
        if self.artifact != EvaluationReplayArtifact.STORY:
            storybook = _storybook_from_payload(self.output_payload)
            return evaluate_storybook_output(storybook, **shared_options)
        story = _story_output_from_payload(self.output_payload)
        return evaluate_story_output(story, **shared_options)

    def replay(self) -> "EvaluationReplayCaseResult":
        """Run the evaluation and record every mismatch against ``expected``."""
        outcome = self.evaluate()
        mismatches = _compare_evaluation(self, outcome)
        return EvaluationReplayCaseResult(
            case_id=self.case_id,
            artifact=self.artifact,
            coverage=self.coverage,
            evaluation=outcome,
            failures=tuple(mismatches),
        )
@dataclass(frozen=True)
class EvaluationReplayCaseResult:
    """Outcome of replaying one golden case."""

    case_id: str
    artifact: EvaluationReplayArtifact
    coverage: EvaluationReplayCoverage
    # The evaluation produced while replaying the case.
    evaluation: EvaluationResult
    # One message per expectation mismatch; empty when everything matched.
    failures: tuple[str, ...] = field(default_factory=tuple)

    @property
    def expectations_met(self) -> bool:
        """Tell whether the replay produced no expectation mismatches."""
        if self.failures:
            return False
        return True
@dataclass(frozen=True)
class EvaluationReplaySuiteResult:
    """Aggregated outcome of replaying a collection of golden cases."""

    cases: tuple[EvaluationReplayCaseResult, ...]

    @property
    def passed(self) -> bool:
        """Tell whether every case matched its expectations."""
        for case in self.cases:
            if not case.expectations_met:
                return False
        return True

    @property
    def failed_case_ids(self) -> tuple[str, ...]:
        """List the IDs of cases with at least one mismatch, in suite order."""
        mismatched = [
            case.case_id for case in self.cases if not case.expectations_met
        ]
        return tuple(mismatched)

    def failure_report(self) -> str:
        """Render one ``case_id: failure`` line per mismatch.

        The result is an empty string when no case has failures.
        """
        return "\n".join(
            f"{case.case_id}: {failure}"
            for case in self.cases
            for failure in case.failures
        )

    def coverage_summary(self) -> dict[str, dict[str, int]]:
        """Count how many cases fall under each coverage label.

        Returns:
            A mapping keyed by ``artifact``, ``age_band``, ``content_shape``,
            ``risk_area``, ``tags`` and ``outcome``; each value maps a label
            to the number of times it occurs in the suite.
        """
        cases = self.cases
        summary: dict[str, dict[str, int]] = {}
        summary["artifact"] = _count_values(case.artifact.value for case in cases)
        for label in ("age_band", "content_shape", "risk_area"):
            summary[label] = _count_values(
                getattr(case.coverage, label) for case in cases
            )
        summary["tags"] = _count_values(
            tag for case in cases for tag in case.coverage.tags
        )
        # Any evaluation that did not pass is counted as "blocked" here.
        summary["outcome"] = _count_values(
            "passed" if case.evaluation.passed else "blocked" for case in cases
        )
        return summary
def load_evaluation_replay_cases(path: str | Path) -> tuple[EvaluationReplayCase, ...]:
    """Read a UTF-8 JSON fixture and turn each entry into a replay case.

    Args:
        path: Location of the fixture file.

    Raises:
        ValueError: If the top-level JSON value is not an array.
    """
    fixture_text = Path(path).read_text(encoding="utf-8")
    decoded = json.loads(fixture_text)
    if isinstance(decoded, list):
        return tuple(map(EvaluationReplayCase.from_payload, decoded))
    raise ValueError("Evaluation replay fixture must be a JSON array.")
def run_evaluation_replay_cases(
    cases: Iterable[EvaluationReplayCase],
) -> EvaluationReplaySuiteResult:
    """Replay each case in order and bundle the results into a suite result."""
    replayed = [case.replay() for case in cases]
    return EvaluationReplaySuiteResult(cases=tuple(replayed))
def replay_evaluation_golden_cases(path: str | Path) -> EvaluationReplaySuiteResult:
    """Load the golden cases stored at ``path`` and replay all of them."""
    golden_cases = load_evaluation_replay_cases(path)
    return run_evaluation_replay_cases(golden_cases)
def _story_output_from_payload(payload: dict[str, Any]) -> StoryOutput:
    """Rebuild a ``StoryOutput`` from a recorded payload.

    Missing keys fall back to ``"generated"`` for ``mode`` and to an empty
    string for every other field.
    """
    fallbacks = {
        "mode": "generated",
        "title": "",
        "story_text": "",
        "cover_prompt_suggestion": "",
    }
    story_fields = {
        name: payload.get(name, fallback) for name, fallback in fallbacks.items()
    }
    return StoryOutput(**story_fields)
def _storybook_from_payload(payload: dict[str, Any]) -> Storybook:
    """Rebuild a ``Storybook`` and its pages from a recorded payload.

    Text fields default to an empty string and URL fields to ``None``. A page
    without ``page_number`` is numbered by its 1-based position in ``pages``.
    """
    rebuilt_pages = []
    for position, raw_page in enumerate(payload.get("pages", []), start=1):
        rebuilt_pages.append(
            StorybookPage(
                page_number=raw_page.get("page_number", position),
                text=raw_page.get("text", ""),
                image_prompt=raw_page.get("image_prompt", ""),
                image_url=raw_page.get("image_url"),
            )
        )

    return Storybook(
        title=payload.get("title", ""),
        main_character=payload.get("main_character", ""),
        art_style=payload.get("art_style", ""),
        pages=rebuilt_pages,
        cover_prompt=payload.get("cover_prompt", ""),
        cover_url=payload.get("cover_url"),
    )
def _count_values(values: Iterable[str]) -> dict[str, int]:
counts = Counter(value for value in values if value)
return dict(sorted(counts.items(), key=lambda item: (-item[1], item[0])))
def _compare_evaluation(
    case: EvaluationReplayCase,
    evaluation: EvaluationResult,
) -> list[str]:
    """Describe every way ``evaluation`` deviates from ``case.expected``.

    Checks run in a fixed order: passed flag, blocking flag, score bounds,
    required dimensions, quality gate codes, then warning substrings.

    Returns:
        One message per mismatch; an empty list when everything matches.
    """
    want = case.expected
    problems: list[str] = []

    if evaluation.passed != want.passed:
        problems.append(f"expected passed={want.passed}, got {evaluation.passed}")

    if evaluation.blocking != want.blocking:
        problems.append(
            f"expected blocking={want.blocking}, got {evaluation.blocking}"
        )

    # Score bounds are inclusive and only enforced when configured.
    floor = want.min_overall_score
    if floor is not None and evaluation.overall_score < floor:
        problems.append(
            f"expected overall_score >= {floor}, got {evaluation.overall_score}"
        )

    ceiling = want.max_overall_score
    if ceiling is not None and evaluation.overall_score > ceiling:
        problems.append(
            f"expected overall_score <= {ceiling}, got {evaluation.overall_score}"
        )

    scored_dimensions = {score.dimension for score in evaluation.scores}
    absent = [
        dimension.value
        for dimension in want.required_dimensions
        if dimension not in scored_dimensions
    ]
    if absent:
        problems.append("missing dimensions: " + ", ".join(absent))

    # Gate codes are compared as ordered tuples, so both order and count matter.
    if evaluation.gate_error is None:
        reported_codes: tuple[str, ...] = ()
    else:
        reported_codes = tuple(
            issue.code.value for issue in evaluation.gate_error.issues
        )
    if reported_codes != want.quality_gate_codes:
        problems.append(
            f"expected quality_gate_codes={list(want.quality_gate_codes)}, "
            f"got {list(reported_codes)}"
        )

    for fragment in want.warning_substrings:
        if all(fragment not in warning for warning in evaluation.warnings):
            problems.append(f"missing warning containing: {fragment}")

    return problems