Expand generation harness observability
This commit is contained in:
322
backend/app/services/harness/evaluation_replay.py
Normal file
322
backend/app/services/harness/evaluation_replay.py
Normal file
@@ -0,0 +1,322 @@
|
||||
"""Internal golden-case replay support for harness evaluations.
|
||||
|
||||
The replay helpers are intentionally not wired to user-facing APIs. They exist
|
||||
to make evaluation behavior reproducible in tests and internal tooling.
|
||||
"""
|
||||
|
||||
import json
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from enum import StrEnum
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
|
||||
from app.services.adapters.storybook.primary import Storybook, StorybookPage
|
||||
from app.services.adapters.text.models import StoryOutput
|
||||
from app.services.harness.evaluators import (
|
||||
EvaluationDimension,
|
||||
EvaluationResult,
|
||||
evaluate_story_output,
|
||||
evaluate_storybook_output,
|
||||
)
|
||||
|
||||
|
||||
class EvaluationReplayArtifact(StrEnum):
|
||||
"""Artifacts supported by deterministic evaluation replay."""
|
||||
|
||||
STORY = "story"
|
||||
STORYBOOK = "storybook"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExpectedEvaluation:
    """Outcome that a golden case expects the evaluator to produce."""

    passed: bool
    blocking: bool
    # Optional inclusive bounds on the overall score; ``None`` disables a bound.
    min_overall_score: float | None = None
    max_overall_score: float | None = None
    # Dimensions that must appear among the evaluation's scores.
    required_dimensions: tuple[EvaluationDimension, ...] = ()
    # Quality gate issue codes the evaluation is expected to report.
    quality_gate_codes: tuple[str, ...] = ()
    # Fragments that must each occur in at least one evaluation warning.
    warning_substrings: tuple[str, ...] = ()

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "ExpectedEvaluation":
        """Create the expectation from a decoded JSON object.

        ``passed`` and ``blocking`` are mandatory keys (a ``KeyError`` is
        raised when either is absent); every other key is optional and falls
        back to the field default.
        """

        passed = bool(payload["passed"])
        blocking = bool(payload["blocking"])
        lower_bound = payload.get("min_overall_score")
        upper_bound = payload.get("max_overall_score")
        dimensions = tuple(
            EvaluationDimension(raw_dimension)
            for raw_dimension in payload.get("required_dimensions", [])
        )
        gate_codes = tuple(payload.get("quality_gate_codes", []))
        warning_fragments = tuple(payload.get("warning_substrings", []))

        return cls(
            passed=passed,
            blocking=blocking,
            min_overall_score=lower_bound,
            max_overall_score=upper_bound,
            required_dimensions=dimensions,
            quality_gate_codes=gate_codes,
            warning_substrings=warning_fragments,
        )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class EvaluationReplayCoverage:
|
||||
"""Internal coverage labels for one golden replay case."""
|
||||
|
||||
age_band: str = "unknown"
|
||||
content_shape: str = "unknown"
|
||||
risk_area: str = "unknown"
|
||||
tags: tuple[str, ...] = field(default_factory=tuple)
|
||||
|
||||
@classmethod
|
||||
def from_payload(cls, payload: dict[str, Any] | None) -> "EvaluationReplayCoverage":
|
||||
"""Build coverage labels from a JSON-safe payload."""
|
||||
|
||||
payload = payload or {}
|
||||
return cls(
|
||||
age_band=str(payload.get("age_band", "unknown")),
|
||||
content_shape=str(payload.get("content_shape", "unknown")),
|
||||
risk_area=str(payload.get("risk_area", "unknown")),
|
||||
tags=tuple(str(tag) for tag in payload.get("tags", [])),
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EvaluationReplayCase:
    """One internal golden evaluation case.

    A case pairs a recorded generation output with the outcome the
    deterministic evaluator is expected to produce for it.
    """

    case_id: str
    artifact: EvaluationReplayArtifact
    # Recorded output that is rebuilt into the evaluator's input model.
    output_payload: dict[str, Any]
    expected: ExpectedEvaluation
    education_theme: str | None = None
    # Forwarded to the evaluator as its ``minimum_score`` argument.
    minimum_score: float = 0.7
    description: str = ""
    input_payload: dict[str, Any] = field(default_factory=dict)
    coverage: EvaluationReplayCoverage = field(default_factory=EvaluationReplayCoverage)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "EvaluationReplayCase":
        """Build a replay case from a JSON-safe payload.

        ``id``, ``artifact``, ``output`` and ``expected`` are required keys.
        ``minimum_score`` and ``education_theme`` are read from the ``input``
        section first and from the top level of the payload second.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``artifact`` is not a known artifact value.
        """

        # ``"input": null`` is treated like a missing section, mirroring how
        # the optional ``coverage`` section tolerates null.
        input_payload = dict(payload.get("input") or {})

        # A null ``minimum_score`` counts as "not set" at either level so
        # that ``float(None)`` can never be reached.
        minimum_score = input_payload.get("minimum_score")
        if minimum_score is None:
            minimum_score = payload.get("minimum_score")
        if minimum_score is None:
            minimum_score = 0.7

        education_theme = input_payload.get(
            "education_theme", payload.get("education_theme")
        )

        return cls(
            case_id=str(payload["id"]),
            artifact=EvaluationReplayArtifact(payload["artifact"]),
            description=str(payload.get("description", "")),
            input_payload=input_payload,
            output_payload=dict(payload["output"]),
            education_theme=education_theme,
            minimum_score=float(minimum_score),
            expected=ExpectedEvaluation.from_payload(payload["expected"]),
            coverage=EvaluationReplayCoverage.from_payload(payload.get("coverage")),
        )

    def evaluate(self) -> EvaluationResult:
        """Run the deterministic evaluator for this case.

        Raises:
            ValueError: If ``artifact`` is neither a story nor a storybook.
        """

        if self.artifact == EvaluationReplayArtifact.STORY:
            return evaluate_story_output(
                _story_output_from_payload(self.output_payload),
                education_theme=self.education_theme,
                minimum_score=self.minimum_score,
            )

        if self.artifact == EvaluationReplayArtifact.STORYBOOK:
            return evaluate_storybook_output(
                _storybook_from_payload(self.output_payload),
                education_theme=self.education_theme,
                minimum_score=self.minimum_score,
            )

        # Fail loudly instead of silently evaluating an unknown artifact kind
        # with the storybook evaluator.
        raise ValueError(f"Unsupported evaluation replay artifact: {self.artifact!r}")

    def replay(self) -> "EvaluationReplayCaseResult":
        """Evaluate the case and compare it with expected outcomes."""

        evaluation = self.evaluate()
        failures = tuple(_compare_evaluation(self, evaluation))
        return EvaluationReplayCaseResult(
            case_id=self.case_id,
            artifact=self.artifact,
            coverage=self.coverage,
            evaluation=evaluation,
            failures=failures,
        )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EvaluationReplayCaseResult:
    """Outcome of replaying a single golden case."""

    case_id: str
    artifact: EvaluationReplayArtifact
    coverage: EvaluationReplayCoverage
    evaluation: EvaluationResult
    # One message per expectation the evaluation did not satisfy.
    failures: tuple[str, ...] = ()

    @property
    def expectations_met(self) -> bool:
        """Whether the evaluation satisfied every expectation of the case."""

        return False if self.failures else True
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EvaluationReplaySuiteResult:
    """Aggregated outcome of replaying several golden cases."""

    cases: tuple[EvaluationReplayCaseResult, ...]

    @property
    def passed(self) -> bool:
        """Whether no case in the suite has an expectation mismatch."""

        for result in self.cases:
            if not result.expectations_met:
                return False
        return True

    @property
    def failed_case_ids(self) -> tuple[str, ...]:
        """IDs of the cases that did not match their expectations, in order."""

        mismatched: list[str] = []
        for result in self.cases:
            if not result.expectations_met:
                mismatched.append(result.case_id)
        return tuple(mismatched)

    def failure_report(self) -> str:
        """Render every failure as a ``<case id>: <message>`` line.

        The lines are joined with newlines; the report is empty when no case
        has failures.
        """

        return "\n".join(
            f"{result.case_id}: {message}"
            for result in self.cases
            for message in result.failures
        )

    def coverage_summary(self) -> dict[str, dict[str, int]]:
        """Count the suite's cases per coverage label and per outcome.

        The result maps each facet name (``artifact``, ``age_band``,
        ``content_shape``, ``risk_area``, ``tags``, ``outcome``) to the counts
        of the values seen for that facet.
        """

        results = self.cases
        summary: dict[str, dict[str, int]] = {}
        summary["artifact"] = _count_values(result.artifact.value for result in results)
        summary["age_band"] = _count_values(result.coverage.age_band for result in results)
        summary["content_shape"] = _count_values(
            result.coverage.content_shape for result in results
        )
        summary["risk_area"] = _count_values(result.coverage.risk_area for result in results)
        summary["tags"] = _count_values(
            tag for result in results for tag in result.coverage.tags
        )
        # The outcome reflects the evaluator's verdict, not whether the case
        # matched its expectations.
        summary["outcome"] = _count_values(
            "passed" if result.evaluation.passed else "blocked" for result in results
        )
        return summary
|
||||
|
||||
|
||||
def load_evaluation_replay_cases(path: str | Path) -> tuple[EvaluationReplayCase, ...]:
    """Load internal golden replay cases from a JSON file.

    Args:
        path: Location of a UTF-8 encoded JSON fixture whose top-level value
            is an array of case objects.

    Returns:
        The parsed cases, in fixture order.

    Raises:
        ValueError: If the top-level value is not an array, or if any entry
            of the array is not a JSON object.
    """

    raw_cases = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(raw_cases, list):
        raise ValueError("Evaluation replay fixture must be a JSON array.")

    # Check the shape of every entry before building anything, so a malformed
    # fixture is reported with its position instead of an opaque
    # AttributeError raised from inside the case parser.
    for index, item in enumerate(raw_cases):
        if not isinstance(item, dict):
            raise ValueError(
                f"Evaluation replay case at index {index} must be a JSON object, "
                f"got {type(item).__name__}."
            )

    return tuple(EvaluationReplayCase.from_payload(item) for item in raw_cases)
|
||||
|
||||
|
||||
def run_evaluation_replay_cases(
    cases: Iterable[EvaluationReplayCase],
) -> EvaluationReplaySuiteResult:
    """Replay each case in order and bundle the results into one suite result."""

    outcomes = [case.replay() for case in cases]
    return EvaluationReplaySuiteResult(cases=tuple(outcomes))
|
||||
|
||||
|
||||
def replay_evaluation_golden_cases(path: str | Path) -> EvaluationReplaySuiteResult:
    """Read the golden cases stored at ``path`` and replay all of them."""

    loaded_cases = load_evaluation_replay_cases(path)
    return run_evaluation_replay_cases(loaded_cases)
|
||||
|
||||
|
||||
def _story_output_from_payload(payload: dict[str, Any]) -> StoryOutput:
    """Rebuild a ``StoryOutput`` from the recorded output of a replay case.

    A missing ``mode`` defaults to ``"generated"``; the missing text fields
    default to the empty string.
    """

    fallbacks = {
        "mode": "generated",
        "title": "",
        "story_text": "",
        "cover_prompt_suggestion": "",
    }
    arguments = {name: payload.get(name, fallback) for name, fallback in fallbacks.items()}
    return StoryOutput(**arguments)
|
||||
|
||||
|
||||
def _storybook_from_payload(payload: dict[str, Any]) -> Storybook:
    """Rebuild a ``Storybook`` from the recorded output of a replay case.

    Missing text fields default to ``""`` and missing URLs to ``None``. A page
    without ``page_number`` is numbered by its 1-based position in ``pages``.
    """

    def build_page(position: int, raw_page: dict[str, Any]) -> StorybookPage:
        # ``position`` is only the fallback; an explicit page number wins.
        return StorybookPage(
            page_number=raw_page.get("page_number", position),
            text=raw_page.get("text", ""),
            image_prompt=raw_page.get("image_prompt", ""),
            image_url=raw_page.get("image_url"),
        )

    raw_pages = payload.get("pages", [])
    built_pages = [
        build_page(position, raw_page)
        for position, raw_page in enumerate(raw_pages, start=1)
    ]

    return Storybook(
        title=payload.get("title", ""),
        main_character=payload.get("main_character", ""),
        art_style=payload.get("art_style", ""),
        pages=built_pages,
        cover_prompt=payload.get("cover_prompt", ""),
        cover_url=payload.get("cover_url"),
    )
|
||||
|
||||
|
||||
def _count_values(values: Iterable[str]) -> dict[str, int]:
|
||||
counts = Counter(value for value in values if value)
|
||||
return dict(sorted(counts.items(), key=lambda item: (-item[1], item[0])))
|
||||
|
||||
|
||||
def _compare_evaluation(
|
||||
case: EvaluationReplayCase,
|
||||
evaluation: EvaluationResult,
|
||||
) -> list[str]:
|
||||
expected = case.expected
|
||||
failures: list[str] = []
|
||||
|
||||
if evaluation.passed != expected.passed:
|
||||
failures.append(f"expected passed={expected.passed}, got {evaluation.passed}")
|
||||
|
||||
if evaluation.blocking != expected.blocking:
|
||||
failures.append(f"expected blocking={expected.blocking}, got {evaluation.blocking}")
|
||||
|
||||
if (
|
||||
expected.min_overall_score is not None
|
||||
and evaluation.overall_score < expected.min_overall_score
|
||||
):
|
||||
failures.append(
|
||||
"expected overall_score >= "
|
||||
f"{expected.min_overall_score}, got {evaluation.overall_score}"
|
||||
)
|
||||
|
||||
if (
|
||||
expected.max_overall_score is not None
|
||||
and evaluation.overall_score > expected.max_overall_score
|
||||
):
|
||||
failures.append(
|
||||
"expected overall_score <= "
|
||||
f"{expected.max_overall_score}, got {evaluation.overall_score}"
|
||||
)
|
||||
|
||||
actual_dimensions = {score.dimension for score in evaluation.scores}
|
||||
missing_dimensions = [
|
||||
dimension.value
|
||||
for dimension in expected.required_dimensions
|
||||
if dimension not in actual_dimensions
|
||||
]
|
||||
if missing_dimensions:
|
||||
failures.append(f"missing dimensions: {', '.join(missing_dimensions)}")
|
||||
|
||||
actual_quality_gate_codes = tuple(
|
||||
issue.code.value for issue in evaluation.gate_error.issues
|
||||
) if evaluation.gate_error is not None else ()
|
||||
if actual_quality_gate_codes != expected.quality_gate_codes:
|
||||
failures.append(
|
||||
"expected quality_gate_codes="
|
||||
f"{list(expected.quality_gate_codes)}, got {list(actual_quality_gate_codes)}"
|
||||
)
|
||||
|
||||
for expected_warning in expected.warning_substrings:
|
||||
if not any(expected_warning in warning for warning in evaluation.warnings):
|
||||
failures.append(f"missing warning containing: {expected_warning}")
|
||||
|
||||
return failures
|
||||
Reference in New Issue
Block a user