Expand generation harness observability
This commit is contained in:
267
backend/app/services/harness/evaluators.py
Normal file
267
backend/app/services/harness/evaluators.py
Normal file
@@ -0,0 +1,267 @@
|
||||
"""Deterministic evaluation helpers for generated child-facing content."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from enum import StrEnum
|
||||
from typing import Any
|
||||
|
||||
from app.services.adapters.storybook.primary import Storybook
|
||||
from app.services.adapters.text.models import StoryOutput
|
||||
from app.services.harness.quality_gates import (
|
||||
QualityGateError,
|
||||
validate_story_output,
|
||||
validate_storybook_output,
|
||||
)
|
||||
|
||||
|
||||
class EvaluationDimension(StrEnum):
|
||||
"""Stable dimensions used by harness evaluations."""
|
||||
|
||||
STRUCTURE = "structure"
|
||||
SAFETY = "safety"
|
||||
AGE_FIT = "age_fit"
|
||||
EDUCATIONAL_VALUE = "educational_value"
|
||||
READABILITY = "readability"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EvaluationScore:
    """Immutable score for a single evaluation dimension, with its rationale."""

    # Dimension this score belongs to.
    dimension: EvaluationDimension
    # Numeric score; the evaluators in this module emit values from 0.0 to 1.0.
    score: float
    # Human-readable explanation of how the score was derived.
    reason: str

    def to_metadata(self) -> dict[str, Any]:
        """Serialize this score to a plain dict.

        Returns:
            A dict with ``dimension`` (the enum's string value), ``score``
            and ``reason`` keys, in that order.
        """
        payload: dict[str, Any] = {}
        payload["dimension"] = self.dimension.value
        payload["score"] = self.score
        payload["reason"] = self.reason
        return payload
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EvaluationResult:
    """Immutable outcome of deterministically evaluating one generated artifact."""

    # Aggregate score across all evaluated dimensions.
    overall_score: float
    # Whether the artifact was accepted by the evaluator.
    passed: bool
    # Set by the evaluators on a quality-gate failure or a below-threshold score.
    blocking: bool
    # Per-dimension scores, in the order the evaluator produced them.
    scores: tuple[EvaluationScore, ...]
    # The quality-gate exception that caused a failure, when there was one.
    gate_error: QualityGateError | None = None
    # Non-blocking advisory messages.
    warnings: tuple[str, ...] = field(default_factory=tuple)

    def to_metadata(self) -> dict[str, Any]:
        """Serialize this result to a plain dict.

        Returns:
            A dict with ``overall_score``, ``passed``, ``blocking``,
            ``scores`` (list of per-score dicts) and ``warnings`` (list of
            strings). A ``quality_gate`` entry is appended only when a gate
            error is attached.
        """
        serialized_scores = [item.to_metadata() for item in self.scores]
        payload: dict[str, Any] = {
            "overall_score": self.overall_score,
            "passed": self.passed,
            "blocking": self.blocking,
            "scores": serialized_scores,
            "warnings": [*self.warnings],
        }
        if self.gate_error is None:
            return payload

        payload["quality_gate"] = self.gate_error.to_metadata()
        return payload
|
||||
|
||||
|
||||
def _clamp_score(value: float) -> float:
    """Round *value* to two decimals and confine it to the 0.0-1.0 range."""
    rounded = round(value, 2)
    capped = min(1.0, rounded)
    return max(0.0, capped)
|
||||
|
||||
|
||||
def _story_text_readability_score(story_text: str) -> float:
    """Map the trimmed story length to a readability score for ages 3-8.

    Bands (in characters, after stripping surrounding whitespace):
    under 30 -> 0.45, 30-1800 -> 0.96, 1801-2500 -> 0.84, over 2500 -> 0.72.
    """
    char_count = len(story_text.strip())

    if char_count < 30:
        band_score = 0.45
    elif char_count <= 1800:
        band_score = 0.96
    elif char_count <= 2500:
        band_score = 0.84
    else:
        band_score = 0.72
    return band_score
|
||||
|
||||
|
||||
def _educational_value_score(story_text: str, education_theme: str | None) -> float:
|
||||
if not education_theme:
|
||||
return 0.82
|
||||
return 0.96 if education_theme.strip() in story_text else 0.88
|
||||
|
||||
|
||||
def _storybook_readability_score(page_texts: list[str]) -> float:
    """Score per-page text length for a storybook.

    Returns 0.0 when there are no pages. Otherwise, using each page's length
    after stripping surrounding whitespace: any page shorter than 8
    characters -> 0.62; else any page longer than 320 -> 0.78; else any page
    longer than 220 -> 0.88; else 0.96.
    """
    if not page_texts:
        return 0.0

    trimmed_lengths = [len(page.strip()) for page in page_texts]
    shortest = min(trimmed_lengths)
    longest = max(trimmed_lengths)

    # The too-short check wins over the too-long checks.
    if shortest < 8:
        return 0.62
    if longest > 320:
        return 0.78
    if longest > 220:
        return 0.88
    return 0.96
|
||||
|
||||
|
||||
def _storybook_educational_value_score(
|
||||
page_texts: list[str],
|
||||
education_theme: str | None,
|
||||
) -> float:
|
||||
if not education_theme:
|
||||
return 0.82
|
||||
combined_text = " ".join(page_texts)
|
||||
return 0.96 if education_theme.strip() in combined_text else 0.88
|
||||
|
||||
|
||||
def evaluate_story_output(
    output: StoryOutput,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Score a generated text story; intended to run before persistence.

    Args:
        output: Generated story to evaluate.
        education_theme: Optional theme the story was asked to convey.
        minimum_score: Overall score at or above which the story passes.

    Returns:
        A blocking, zero-scored result carrying the gate error when
        ``validate_story_output`` raises ``QualityGateError``. Otherwise a
        result with five dimension scores, whose overall score is their mean
        rounded to two decimals and clamped to 0.0-1.0.
    """
    try:
        validate_story_output(output)
    except QualityGateError as gate_failure:
        # A gate failure short-circuits: only structure and safety are reported.
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=(
                EvaluationScore(
                    dimension=EvaluationDimension.STRUCTURE,
                    score=0.0,
                    reason="故事结构未通过质量门。",
                ),
                EvaluationScore(
                    dimension=EvaluationDimension.SAFETY,
                    score=0.0,
                    reason="内容未通过儿童安全或结构完整性检查。",
                ),
            ),
            gate_error=gate_failure,
        )

    length_score = _story_text_readability_score(output.story_text)
    theme_score = _educational_value_score(output.story_text, education_theme)

    # Advisory only: a low length score adds a warning but does not fail the story by itself.
    advisories: tuple[str, ...] = (
        ("故事正文长度可能不适合 3-8 岁儿童的完整阅读体验。",)
        if length_score < 0.8
        else ()
    )

    # Age fit and readability both reuse the length-based score.
    dimension_scores = (
        EvaluationScore(
            dimension=EvaluationDimension.STRUCTURE,
            score=1.0,
            reason="标题、正文和封面提示词完整。",
        ),
        EvaluationScore(
            dimension=EvaluationDimension.SAFETY,
            score=1.0,
            reason="未命中确定性儿童安全风险词。",
        ),
        EvaluationScore(
            dimension=EvaluationDimension.AGE_FIT,
            score=length_score,
            reason="根据正文长度估算低龄儿童阅读适配度。",
        ),
        EvaluationScore(
            dimension=EvaluationDimension.EDUCATIONAL_VALUE,
            score=theme_score,
            reason="根据教育主题是否清晰融入正文估算。",
        ),
        EvaluationScore(
            dimension=EvaluationDimension.READABILITY,
            score=length_score,
            reason="根据正文长度估算朗读和亲子共读流畅度。",
        ),
    )
    mean_score = sum(item.score for item in dimension_scores) / len(dimension_scores)
    overall = _clamp_score(mean_score)

    return EvaluationResult(
        overall_score=overall,
        passed=overall >= minimum_score,
        blocking=overall < minimum_score,
        scores=dimension_scores,
        warnings=advisories,
    )
|
||||
|
||||
|
||||
def evaluate_storybook_output(
    output: Storybook,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Score a generated storybook; intended to run before persistence.

    Args:
        output: Generated storybook to evaluate.
        education_theme: Optional theme the storybook was asked to convey.
        minimum_score: Overall score at or above which the storybook passes.

    Returns:
        A blocking, zero-scored result carrying the gate error when
        ``validate_storybook_output`` raises ``QualityGateError``. Otherwise
        a result with five dimension scores, whose overall score is their
        mean rounded to two decimals and clamped to 0.0-1.0.
    """
    try:
        validate_storybook_output(output)
    except QualityGateError as gate_failure:
        # A gate failure short-circuits: only structure and safety are reported.
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=(
                EvaluationScore(
                    dimension=EvaluationDimension.STRUCTURE,
                    score=0.0,
                    reason="绘本结构未通过质量门。",
                ),
                EvaluationScore(
                    dimension=EvaluationDimension.SAFETY,
                    score=0.0,
                    reason="绘本内容未通过儿童安全或结构完整性检查。",
                ),
            ),
            gate_error=gate_failure,
        )

    texts_by_page = [page.text for page in output.pages]
    length_score = _storybook_readability_score(texts_by_page)
    theme_score = _storybook_educational_value_score(texts_by_page, education_theme)

    # Advisory only: a low length score adds a warning but does not fail the storybook by itself.
    advisories: tuple[str, ...] = (
        ("绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。",)
        if length_score < 0.8
        else ()
    )

    # Age fit and readability both reuse the per-page length score.
    dimension_scores = (
        EvaluationScore(
            dimension=EvaluationDimension.STRUCTURE,
            score=1.0,
            reason="绘本标题、分页和页码结构完整。",
        ),
        EvaluationScore(
            dimension=EvaluationDimension.SAFETY,
            score=1.0,
            reason="未命中确定性儿童安全风险词。",
        ),
        EvaluationScore(
            dimension=EvaluationDimension.AGE_FIT,
            score=length_score,
            reason="根据每页正文长度估算低龄儿童翻页阅读适配度。",
        ),
        EvaluationScore(
            dimension=EvaluationDimension.EDUCATIONAL_VALUE,
            score=theme_score,
            reason="根据教育主题是否清晰融入分页正文估算。",
        ),
        EvaluationScore(
            dimension=EvaluationDimension.READABILITY,
            score=length_score,
            reason="根据分页正文长度估算亲子共读流畅度。",
        ),
    )
    mean_score = sum(item.score for item in dimension_scores) / len(dimension_scores)
    overall = _clamp_score(mean_score)

    return EvaluationResult(
        overall_score=overall,
        passed=overall >= minimum_score,
        blocking=overall < minimum_score,
        scores=dimension_scores,
        warnings=advisories,
    )
|
||||
Reference in New Issue
Block a user