Expand generation harness observability

This commit is contained in:
2026-06-24 10:48:23 +08:00
parent 459ca9edef
commit 1f34d80083
35 changed files with 8003 additions and 112 deletions

View File

@@ -0,0 +1,267 @@
"""Deterministic evaluation helpers for generated child-facing content."""
from dataclasses import dataclass, field
from enum import StrEnum
from typing import Any
from app.services.adapters.storybook.primary import Storybook
from app.services.adapters.text.models import StoryOutput
from app.services.harness.quality_gates import (
QualityGateError,
validate_story_output,
validate_storybook_output,
)
class EvaluationDimension(StrEnum):
"""Stable dimensions used by harness evaluations."""
STRUCTURE = "structure"
SAFETY = "safety"
AGE_FIT = "age_fit"
EDUCATIONAL_VALUE = "educational_value"
READABILITY = "readability"
@dataclass(frozen=True)
class EvaluationScore:
    """Immutable score for a single evaluation dimension.

    Attributes are declared in constructor order: the dimension being
    scored, the numeric score, and a human-readable reason for it.
    """

    dimension: EvaluationDimension
    score: float
    reason: str

    def to_metadata(self) -> dict[str, Any]:
        """Flatten the score into a plain dict.

        The dimension is emitted as its string value rather than as the
        enum member.
        """
        return dict(
            dimension=self.dimension.value,
            score=self.score,
            reason=self.reason,
        )
@dataclass(frozen=True)
class EvaluationResult:
    """Immutable outcome of evaluating one generated artifact.

    Holds the aggregate score, the pass/blocking flags, the per-dimension
    scores, the quality-gate error when one was raised, and any warnings.
    """

    overall_score: float
    passed: bool
    blocking: bool
    scores: tuple[EvaluationScore, ...]
    gate_error: QualityGateError | None = None
    warnings: tuple[str, ...] = field(default_factory=tuple)

    def to_metadata(self) -> dict[str, Any]:
        """Flatten the result into a plain dict.

        Scores and warnings are emitted as lists. The ``quality_gate`` key
        is present only when a gate error is attached.
        """
        payload: dict[str, Any] = {
            "overall_score": self.overall_score,
            "passed": self.passed,
            "blocking": self.blocking,
            "scores": [entry.to_metadata() for entry in self.scores],
            "warnings": list(self.warnings),
        }
        gate_failure = self.gate_error
        if gate_failure is None:
            return payload
        payload["quality_gate"] = gate_failure.to_metadata()
        return payload
def _clamp_score(value: float) -> float:
    """Round ``value`` to two decimals, then limit it to the 0.0-1.0 range."""
    rounded = round(value, 2)
    capped = min(1.0, rounded)
    return max(0.0, capped)
def _story_text_readability_score(story_text: str) -> float:
    """Map the stripped length of a story onto a fixed readability score.

    Bands, by character count after stripping surrounding whitespace:
    under 30 -> 0.45, 30-1800 -> 0.96, 1801-2500 -> 0.84, over 2500 -> 0.72.
    """
    char_count = len(story_text.strip())
    if char_count < 30:
        score = 0.45
    elif char_count <= 1800:
        score = 0.96
    elif char_count <= 2500:
        score = 0.84
    else:
        score = 0.72
    return score
def _educational_value_score(story_text: str, education_theme: str | None) -> float:
if not education_theme:
return 0.82
return 0.96 if education_theme.strip() in story_text else 0.88
def _storybook_readability_score(page_texts: list[str]) -> float:
    """Map per-page text lengths onto a fixed readability score.

    Lengths are measured after stripping surrounding whitespace. An empty
    page list scores 0.0. A page shorter than 8 characters takes priority
    (0.62); otherwise the longest page decides: over 320 -> 0.78,
    over 220 -> 0.88, else 0.96.
    """
    if not page_texts:
        return 0.0

    stripped_sizes = [len(page.strip()) for page in page_texts]
    shortest = min(stripped_sizes)
    longest = max(stripped_sizes)

    if shortest < 8:
        return 0.62
    if longest > 320:
        return 0.78
    if longest > 220:
        return 0.88
    return 0.96
def _storybook_educational_value_score(
page_texts: list[str],
education_theme: str | None,
) -> float:
if not education_theme:
return 0.82
combined_text = " ".join(page_texts)
return 0.96 if education_theme.strip() in combined_text else 0.88
def evaluate_story_output(
    output: StoryOutput,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Score a generated text story before it is persisted.

    The story first goes through ``validate_story_output``. If that raises
    ``QualityGateError``, the result is blocking, has an overall score of
    0.0, and carries the error in ``gate_error``. Otherwise five dimensions
    are scored, averaged, and passed through ``_clamp_score``.

    Args:
        output: Generated story; its ``story_text`` is what gets scored.
        education_theme: Optional theme forwarded to the educational-value
            heuristic.
        minimum_score: Overall score at or above which the result passes.

    Returns:
        The evaluation result, with a warning attached when the
        length-based score is below 0.8.
    """
    try:
        validate_story_output(output)
    except QualityGateError as gate_failure:
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=(
                EvaluationScore(
                    dimension=EvaluationDimension.STRUCTURE,
                    score=0.0,
                    reason="故事结构未通过质量门。",
                ),
                EvaluationScore(
                    dimension=EvaluationDimension.SAFETY,
                    score=0.0,
                    reason="内容未通过儿童安全或结构完整性检查。",
                ),
            ),
            gate_error=gate_failure,
        )

    length_fit = _story_text_readability_score(output.story_text)
    theme_fit = _educational_value_score(output.story_text, education_theme)

    # AGE_FIT and READABILITY both reuse the same length-based score.
    graded = (
        (EvaluationDimension.STRUCTURE, 1.0, "标题、正文和封面提示词完整。"),
        (EvaluationDimension.SAFETY, 1.0, "未命中确定性儿童安全风险词。"),
        (EvaluationDimension.AGE_FIT, length_fit, "根据正文长度估算低龄儿童阅读适配度。"),
        (
            EvaluationDimension.EDUCATIONAL_VALUE,
            theme_fit,
            "根据教育主题是否清晰融入正文估算。",
        ),
        (
            EvaluationDimension.READABILITY,
            length_fit,
            "根据正文长度估算朗读和亲子共读流畅度。",
        ),
    )
    dimension_scores = tuple(
        EvaluationScore(dimension=axis, score=points, reason=why)
        for axis, points, why in graded
    )
    total = sum(entry.score for entry in dimension_scores)
    overall = _clamp_score(total / len(dimension_scores))

    if length_fit < 0.8:
        notes: tuple[str, ...] = ("故事正文长度可能不适合 3-8 岁儿童的完整阅读体验。",)
    else:
        notes = ()

    return EvaluationResult(
        overall_score=overall,
        passed=overall >= minimum_score,
        blocking=overall < minimum_score,
        scores=dimension_scores,
        warnings=notes,
    )
def evaluate_storybook_output(
    output: Storybook,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Score a generated storybook before it is persisted.

    The storybook first goes through ``validate_storybook_output``. If that
    raises ``QualityGateError``, the result is blocking, has an overall
    score of 0.0, and carries the error in ``gate_error``. Otherwise five
    dimensions are scored from the page texts, averaged, and passed through
    ``_clamp_score``.

    Args:
        output: Generated storybook; the ``text`` of each entry in
            ``pages`` is what gets scored.
        education_theme: Optional theme forwarded to the educational-value
            heuristic.
        minimum_score: Overall score at or above which the result passes.

    Returns:
        The evaluation result, with a warning attached when the
        page-length score is below 0.8.
    """
    try:
        validate_storybook_output(output)
    except QualityGateError as gate_failure:
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=(
                EvaluationScore(
                    dimension=EvaluationDimension.STRUCTURE,
                    score=0.0,
                    reason="绘本结构未通过质量门。",
                ),
                EvaluationScore(
                    dimension=EvaluationDimension.SAFETY,
                    score=0.0,
                    reason="绘本内容未通过儿童安全或结构完整性检查。",
                ),
            ),
            gate_error=gate_failure,
        )

    texts_by_page = [page.text for page in output.pages]
    length_fit = _storybook_readability_score(texts_by_page)
    theme_fit = _storybook_educational_value_score(texts_by_page, education_theme)

    # AGE_FIT and READABILITY both reuse the same page-length score.
    graded = (
        (EvaluationDimension.STRUCTURE, 1.0, "绘本标题、分页和页码结构完整。"),
        (EvaluationDimension.SAFETY, 1.0, "未命中确定性儿童安全风险词。"),
        (
            EvaluationDimension.AGE_FIT,
            length_fit,
            "根据每页正文长度估算低龄儿童翻页阅读适配度。",
        ),
        (
            EvaluationDimension.EDUCATIONAL_VALUE,
            theme_fit,
            "根据教育主题是否清晰融入分页正文估算。",
        ),
        (
            EvaluationDimension.READABILITY,
            length_fit,
            "根据分页正文长度估算亲子共读流畅度。",
        ),
    )
    dimension_scores = tuple(
        EvaluationScore(dimension=axis, score=points, reason=why)
        for axis, points, why in graded
    )
    total = sum(entry.score for entry in dimension_scores)
    overall = _clamp_score(total / len(dimension_scores))

    if length_fit < 0.8:
        notes: tuple[str, ...] = ("绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。",)
    else:
        notes = ()

    return EvaluationResult(
        overall_score=overall,
        passed=overall >= minimum_score,
        blocking=overall < minimum_score,
        scores=dimension_scores,
        warnings=notes,
    )