323 lines
11 KiB
Python
323 lines
11 KiB
Python
"""Internal golden-case replay support for harness evaluations.

The replay helpers are intentionally not wired to user-facing APIs. They exist
to make evaluation behavior reproducible in tests and internal tooling.
"""
|
|
|
|
import json
|
|
from collections import Counter
|
|
from dataclasses import dataclass, field
|
|
from enum import StrEnum
|
|
from pathlib import Path
|
|
from typing import Any, Iterable
|
|
|
|
from app.services.adapters.storybook.primary import Storybook, StorybookPage
|
|
from app.services.adapters.text.models import StoryOutput
|
|
from app.services.harness.evaluators import (
|
|
EvaluationDimension,
|
|
EvaluationResult,
|
|
evaluate_story_output,
|
|
evaluate_storybook_output,
|
|
)
|
|
|
|
|
|
class EvaluationReplayArtifact(StrEnum):
|
|
"""Artifacts supported by deterministic evaluation replay."""
|
|
|
|
STORY = "story"
|
|
STORYBOOK = "storybook"
|
|
|
|
|
|
@dataclass(frozen=True)
class ExpectedEvaluation:
    """Outcome that one golden case expects the evaluator to produce.

    ``passed`` and ``blocking`` are mandatory. The score bounds are optional
    (``None`` means "no bound"), and the three tuple fields default to empty.
    """

    passed: bool
    blocking: bool
    min_overall_score: float | None = None
    max_overall_score: float | None = None
    required_dimensions: tuple[EvaluationDimension, ...] = field(default_factory=tuple)
    quality_gate_codes: tuple[str, ...] = field(default_factory=tuple)
    warning_substrings: tuple[str, ...] = field(default_factory=tuple)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "ExpectedEvaluation":
        """Create expectations from a decoded JSON mapping.

        Args:
            payload: Mapping with required ``"passed"`` and ``"blocking"``
                keys and optional ``"min_overall_score"``,
                ``"max_overall_score"``, ``"required_dimensions"``,
                ``"quality_gate_codes"`` and ``"warning_substrings"`` keys.

        Returns:
            The populated, immutable expectation record.

        Raises:
            KeyError: If ``"passed"`` or ``"blocking"`` is absent.
        """
        # Required keys are read first so a missing one surfaces before any
        # optional value is interpreted.
        passed_flag = bool(payload["passed"])
        blocking_flag = bool(payload["blocking"])
        lower_bound = payload.get("min_overall_score")
        upper_bound = payload.get("max_overall_score")
        # Each raw entry is converted through the EvaluationDimension enum.
        dimensions = [
            EvaluationDimension(raw_dimension)
            for raw_dimension in payload.get("required_dimensions", [])
        ]
        gate_codes = tuple(payload.get("quality_gate_codes", []))
        warning_fragments = tuple(payload.get("warning_substrings", []))

        return cls(
            passed=passed_flag,
            blocking=blocking_flag,
            min_overall_score=lower_bound,
            max_overall_score=upper_bound,
            required_dimensions=tuple(dimensions),
            quality_gate_codes=gate_codes,
            warning_substrings=warning_fragments,
        )
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class EvaluationReplayCoverage:
|
|
"""Internal coverage labels for one golden replay case."""
|
|
|
|
age_band: str = "unknown"
|
|
content_shape: str = "unknown"
|
|
risk_area: str = "unknown"
|
|
tags: tuple[str, ...] = field(default_factory=tuple)
|
|
|
|
@classmethod
|
|
def from_payload(cls, payload: dict[str, Any] | None) -> "EvaluationReplayCoverage":
|
|
"""Build coverage labels from a JSON-safe payload."""
|
|
|
|
payload = payload or {}
|
|
return cls(
|
|
age_band=str(payload.get("age_band", "unknown")),
|
|
content_shape=str(payload.get("content_shape", "unknown")),
|
|
risk_area=str(payload.get("risk_area", "unknown")),
|
|
tags=tuple(str(tag) for tag in payload.get("tags", [])),
|
|
)
|
|
|
|
|
|
@dataclass(frozen=True)
class EvaluationReplayCase:
    """A single golden case: an output payload plus its expected evaluation.

    ``artifact`` decides which evaluator runs, ``output_payload`` is the raw
    mapping that gets converted into the evaluator's input, and ``expected``
    is what the evaluation result is compared against.
    """

    case_id: str
    artifact: EvaluationReplayArtifact
    output_payload: dict[str, Any]
    expected: ExpectedEvaluation
    education_theme: str | None = None
    minimum_score: float = 0.7
    description: str = ""
    input_payload: dict[str, Any] = field(default_factory=dict)
    coverage: EvaluationReplayCoverage = field(default_factory=EvaluationReplayCoverage)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "EvaluationReplayCase":
        """Create a replay case from a decoded JSON mapping.

        Args:
            payload: Mapping with required ``"id"``, ``"artifact"``,
                ``"output"`` and ``"expected"`` keys, and optional
                ``"description"``, ``"input"``, ``"minimum_score"``,
                ``"education_theme"`` and ``"coverage"`` keys.

        Returns:
            The populated, immutable replay case.

        Raises:
            KeyError: If a required key is absent.
        """
        case_input = dict(payload.get("input", {}))

        # A value nested under "input" takes precedence over the same key at
        # the top level of the case; 0.7 is the final fallback for the score.
        if "minimum_score" in case_input:
            threshold = case_input["minimum_score"]
        else:
            threshold = payload.get("minimum_score", 0.7)

        if "education_theme" in case_input:
            theme = case_input["education_theme"]
        else:
            theme = payload.get("education_theme")

        return cls(
            case_id=str(payload["id"]),
            artifact=EvaluationReplayArtifact(payload["artifact"]),
            description=str(payload.get("description", "")),
            input_payload=case_input,
            output_payload=dict(payload["output"]),
            education_theme=theme,
            minimum_score=float(threshold),
            expected=ExpectedEvaluation.from_payload(payload["expected"]),
            coverage=EvaluationReplayCoverage.from_payload(payload.get("coverage")),
        )

    def evaluate(self) -> EvaluationResult:
        """Convert the output payload and run the matching evaluator.

        ``STORY`` cases go through ``evaluate_story_output``; every other
        artifact goes through ``evaluate_storybook_output``.
        """
        shared_options = {
            "education_theme": self.education_theme,
            "minimum_score": self.minimum_score,
        }

        if self.artifact != EvaluationReplayArtifact.STORY:
            storybook = _storybook_from_payload(self.output_payload)
            return evaluate_storybook_output(storybook, **shared_options)

        story = _story_output_from_payload(self.output_payload)
        return evaluate_story_output(story, **shared_options)

    def replay(self) -> "EvaluationReplayCaseResult":
        """Evaluate this case and record every mismatch with ``expected``."""
        outcome = self.evaluate()
        mismatches = _compare_evaluation(self, outcome)
        return EvaluationReplayCaseResult(
            case_id=self.case_id,
            artifact=self.artifact,
            coverage=self.coverage,
            evaluation=outcome,
            failures=tuple(mismatches),
        )
|
|
|
|
|
|
@dataclass(frozen=True)
class EvaluationReplayCaseResult:
    """Outcome of replaying one golden case.

    Carries the evaluation that was produced together with the list of
    expectation mismatches; an empty ``failures`` tuple means a clean match.
    """

    case_id: str
    artifact: EvaluationReplayArtifact
    coverage: EvaluationReplayCoverage
    evaluation: EvaluationResult
    failures: tuple[str, ...] = field(default_factory=tuple)

    @property
    def expectations_met(self) -> bool:
        """Return ``True`` when no expectation mismatch was recorded."""
        return False if self.failures else True
|
|
|
|
|
|
@dataclass(frozen=True)
class EvaluationReplaySuiteResult:
    """Aggregated outcome of replaying a collection of golden cases."""

    cases: tuple[EvaluationReplayCaseResult, ...]

    @property
    def passed(self) -> bool:
        """Return ``True`` only if no case has an expectation mismatch."""
        for result in self.cases:
            if not result.expectations_met:
                return False
        return True

    @property
    def failed_case_ids(self) -> tuple[str, ...]:
        """Return the IDs of cases with mismatches, in suite order."""
        failed: list[str] = []
        for result in self.cases:
            if not result.expectations_met:
                failed.append(result.case_id)
        return tuple(failed)

    def failure_report(self) -> str:
        """Return one ``"<case_id>: <failure>"`` line per recorded mismatch.

        Lines are newline-joined; the result is an empty string when no case
        has failures.
        """
        return "\n".join(
            f"{result.case_id}: {mismatch}"
            for result in self.cases
            for mismatch in result.failures
        )

    def coverage_summary(self) -> dict[str, dict[str, int]]:
        """Return per-axis label counts for reviewing golden-case coverage.

        The outer keys are ``"artifact"``, ``"age_band"``, ``"content_shape"``,
        ``"risk_area"``, ``"tags"`` and ``"outcome"``; each maps a label to the
        number of times it occurs. The ``"outcome"`` axis labels a case
        ``"passed"`` when its evaluation passed and ``"blocked"`` otherwise.
        """
        results = self.cases
        labels_by_axis = {
            "artifact": [result.artifact.value for result in results],
            "age_band": [result.coverage.age_band for result in results],
            "content_shape": [result.coverage.content_shape for result in results],
            "risk_area": [result.coverage.risk_area for result in results],
            "tags": [tag for result in results for tag in result.coverage.tags],
            "outcome": [
                "passed" if result.evaluation.passed else "blocked"
                for result in results
            ],
        }
        return {axis: _count_values(labels) for axis, labels in labels_by_axis.items()}
|
|
|
|
|
|
def load_evaluation_replay_cases(path: str | Path) -> tuple[EvaluationReplayCase, ...]:
    """Read a JSON fixture file and build one replay case per array entry.

    Args:
        path: Location of the UTF-8 encoded JSON fixture.

    Returns:
        The replay cases, in the order they appear in the file.

    Raises:
        ValueError: If the top-level JSON value is not an array.
    """
    fixture_text = Path(path).read_text(encoding="utf-8")
    decoded = json.loads(fixture_text)

    if isinstance(decoded, list):
        return tuple(EvaluationReplayCase.from_payload(entry) for entry in decoded)

    raise ValueError("Evaluation replay fixture must be a JSON array.")
|
|
|
|
|
|
def run_evaluation_replay_cases(
    cases: Iterable[EvaluationReplayCase],
) -> EvaluationReplaySuiteResult:
    """Replay every given case and collect the results into one suite.

    Args:
        cases: Replay cases to run; they are replayed in iteration order.

    Returns:
        A suite result holding one case result per input case.
    """
    replayed = [case.replay() for case in cases]
    return EvaluationReplaySuiteResult(cases=tuple(replayed))
|
|
|
|
|
|
def replay_evaluation_golden_cases(path: str | Path) -> EvaluationReplaySuiteResult:
    """Load the golden cases stored at ``path`` and replay all of them.

    Args:
        path: Location of the JSON fixture containing the replay cases.

    Returns:
        The suite result produced by replaying every loaded case.
    """
    loaded_cases = load_evaluation_replay_cases(path)
    return run_evaluation_replay_cases(loaded_cases)
|
|
|
|
|
|
def _story_output_from_payload(payload: dict[str, Any]) -> StoryOutput:
    """Build a ``StoryOutput`` from a raw mapping, defaulting absent keys.

    ``mode`` falls back to ``"generated"``; ``title``, ``story_text`` and
    ``cover_prompt_suggestion`` fall back to the empty string.
    """
    fallbacks = {
        "mode": "generated",
        "title": "",
        "story_text": "",
        "cover_prompt_suggestion": "",
    }
    story_fields = {
        name: payload.get(name, fallback) for name, fallback in fallbacks.items()
    }
    return StoryOutput(**story_fields)
|
|
|
|
|
|
def _storybook_from_payload(payload: dict[str, Any]) -> Storybook:
    """Build a ``Storybook`` (with its pages) from a raw mapping.

    Absent text fields default to the empty string and absent URLs to
    ``None``. A page without an explicit ``page_number`` is numbered by its
    1-based position in the ``"pages"`` list.
    """
    pages = []
    for position, raw_page in enumerate(payload.get("pages", []), start=1):
        pages.append(
            StorybookPage(
                page_number=raw_page.get("page_number", position),
                text=raw_page.get("text", ""),
                image_prompt=raw_page.get("image_prompt", ""),
                image_url=raw_page.get("image_url"),
            )
        )

    return Storybook(
        title=payload.get("title", ""),
        main_character=payload.get("main_character", ""),
        art_style=payload.get("art_style", ""),
        pages=pages,
        cover_prompt=payload.get("cover_prompt", ""),
        cover_url=payload.get("cover_url"),
    )
|
|
|
|
|
|
def _count_values(values: Iterable[str]) -> dict[str, int]:
|
|
counts = Counter(value for value in values if value)
|
|
return dict(sorted(counts.items(), key=lambda item: (-item[1], item[0])))
|
|
|
|
|
|
def _compare_evaluation(
|
|
case: EvaluationReplayCase,
|
|
evaluation: EvaluationResult,
|
|
) -> list[str]:
|
|
expected = case.expected
|
|
failures: list[str] = []
|
|
|
|
if evaluation.passed != expected.passed:
|
|
failures.append(f"expected passed={expected.passed}, got {evaluation.passed}")
|
|
|
|
if evaluation.blocking != expected.blocking:
|
|
failures.append(f"expected blocking={expected.blocking}, got {evaluation.blocking}")
|
|
|
|
if (
|
|
expected.min_overall_score is not None
|
|
and evaluation.overall_score < expected.min_overall_score
|
|
):
|
|
failures.append(
|
|
"expected overall_score >= "
|
|
f"{expected.min_overall_score}, got {evaluation.overall_score}"
|
|
)
|
|
|
|
if (
|
|
expected.max_overall_score is not None
|
|
and evaluation.overall_score > expected.max_overall_score
|
|
):
|
|
failures.append(
|
|
"expected overall_score <= "
|
|
f"{expected.max_overall_score}, got {evaluation.overall_score}"
|
|
)
|
|
|
|
actual_dimensions = {score.dimension for score in evaluation.scores}
|
|
missing_dimensions = [
|
|
dimension.value
|
|
for dimension in expected.required_dimensions
|
|
if dimension not in actual_dimensions
|
|
]
|
|
if missing_dimensions:
|
|
failures.append(f"missing dimensions: {', '.join(missing_dimensions)}")
|
|
|
|
actual_quality_gate_codes = tuple(
|
|
issue.code.value for issue in evaluation.gate_error.issues
|
|
) if evaluation.gate_error is not None else ()
|
|
if actual_quality_gate_codes != expected.quality_gate_codes:
|
|
failures.append(
|
|
"expected quality_gate_codes="
|
|
f"{list(expected.quality_gate_codes)}, got {list(actual_quality_gate_codes)}"
|
|
)
|
|
|
|
for expected_warning in expected.warning_substrings:
|
|
if not any(expected_warning in warning for warning in evaluation.warnings):
|
|
failures.append(f"missing warning containing: {expected_warning}")
|
|
|
|
return failures
|