268 lines
8.2 KiB
Python
268 lines
8.2 KiB
Python
"""Deterministic evaluation helpers for generated child-facing content."""
|
|
|
|
from dataclasses import dataclass, field
|
|
from enum import StrEnum
|
|
from typing import Any
|
|
|
|
from app.services.adapters.storybook.primary import Storybook
|
|
from app.services.adapters.text.models import StoryOutput
|
|
from app.services.harness.quality_gates import (
|
|
QualityGateError,
|
|
validate_story_output,
|
|
validate_storybook_output,
|
|
)
|
|
|
|
|
|
class EvaluationDimension(StrEnum):
|
|
"""Stable dimensions used by harness evaluations."""
|
|
|
|
STRUCTURE = "structure"
|
|
SAFETY = "safety"
|
|
AGE_FIT = "age_fit"
|
|
EDUCATIONAL_VALUE = "educational_value"
|
|
READABILITY = "readability"
|
|
|
|
|
|
@dataclass(frozen=True)
class EvaluationScore:
    """Immutable score for a single evaluation dimension.

    Attributes are documented inline. Scores produced elsewhere in this module
    all lie between 0.0 and 1.0; the class itself does not enforce a range.
    """

    # Which axis this score belongs to.
    dimension: EvaluationDimension
    # Numeric score for that axis.
    score: float
    # Human-readable explanation of how the score was obtained.
    reason: str

    def to_metadata(self) -> dict[str, Any]:
        """Serialize this score to a plain dict with JSON-friendly values.

        Returns:
            A dict with the keys ``dimension`` (the enum's string value),
            ``score`` and ``reason``, in that order.
        """
        return dict(
            dimension=self.dimension.value,
            score=self.score,
            reason=self.reason,
        )
|
|
|
|
|
|
@dataclass(frozen=True)
class EvaluationResult:
    """Immutable outcome of evaluating one generated artifact.

    Bundles the aggregate score, the pass/block verdict, the per-dimension
    scores, and (when the quality gate rejected the artifact) the gate error.
    """

    # Aggregate score across all dimensions.
    overall_score: float
    # True when the artifact met the caller's minimum score.
    passed: bool
    # True when the artifact must not proceed (gate failure or low score).
    blocking: bool
    # Per-dimension breakdown, in the order the evaluator produced it.
    scores: tuple[EvaluationScore, ...]
    # Set only when the quality gate raised; None otherwise.
    gate_error: QualityGateError | None = None
    # Non-blocking advisory messages.
    warnings: tuple[str, ...] = field(default_factory=tuple)

    def to_metadata(self) -> dict[str, Any]:
        """Serialize the result to a plain dict with JSON-friendly values.

        Returns:
            A dict with ``overall_score``, ``passed``, ``blocking``, ``scores``
            (list of per-score dicts) and ``warnings`` (list of strings). A
            ``quality_gate`` entry is appended only when ``gate_error`` is set;
            its content is whatever ``QualityGateError.to_metadata()`` returns.
        """
        serialized_scores = [entry.to_metadata() for entry in self.scores]
        payload: dict[str, Any] = {
            "overall_score": self.overall_score,
            "passed": self.passed,
            "blocking": self.blocking,
            "scores": serialized_scores,
            "warnings": list(self.warnings),
        }

        gate_error = self.gate_error
        if gate_error is None:
            return payload

        payload["quality_gate"] = gate_error.to_metadata()
        return payload
|
|
|
|
|
|
def _clamp_score(value: float) -> float:
|
|
return max(0.0, min(1.0, round(value, 2)))
|
|
|
|
|
|
def _story_text_readability_score(story_text: str) -> float:
|
|
"""Score text length with a conservative 3-8 age readability heuristic."""
|
|
|
|
normalized_length = len(story_text.strip())
|
|
if normalized_length < 30:
|
|
return 0.45
|
|
if normalized_length > 2500:
|
|
return 0.72
|
|
if normalized_length > 1800:
|
|
return 0.84
|
|
return 0.96
|
|
|
|
|
|
def _educational_value_score(story_text: str, education_theme: str | None) -> float:
|
|
if not education_theme:
|
|
return 0.82
|
|
return 0.96 if education_theme.strip() in story_text else 0.88
|
|
|
|
|
|
def _storybook_readability_score(page_texts: list[str]) -> float:
|
|
if not page_texts:
|
|
return 0.0
|
|
|
|
page_lengths = [len(text.strip()) for text in page_texts]
|
|
if any(length < 8 for length in page_lengths):
|
|
return 0.62
|
|
if any(length > 320 for length in page_lengths):
|
|
return 0.78
|
|
if any(length > 220 for length in page_lengths):
|
|
return 0.88
|
|
return 0.96
|
|
|
|
|
|
def _storybook_educational_value_score(
|
|
page_texts: list[str],
|
|
education_theme: str | None,
|
|
) -> float:
|
|
if not education_theme:
|
|
return 0.82
|
|
combined_text = " ".join(page_texts)
|
|
return 0.96 if education_theme.strip() in combined_text else 0.88
|
|
|
|
|
|
def evaluate_story_output(
    output: StoryOutput,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Score a generated text story before it is persisted.

    ``validate_story_output`` runs first. If it raises ``QualityGateError`` the
    result is a blocking failure: overall score 0.0, structure and safety
    scored 0.0, and the error attached as ``gate_error``. Otherwise five
    dimensions are scored and averaged, and the clamped average is compared
    with ``minimum_score``.

    Args:
        output: Generated story; its ``story_text`` is what gets scored.
        education_theme: Optional theme expected to appear in the story text.
        minimum_score: Threshold the overall score must reach to pass.

    Returns:
        An ``EvaluationResult``. ``passed`` is true when the overall score is
        at least ``minimum_score``; ``blocking`` is true when it is below.
    """
    try:
        validate_story_output(output)
    except QualityGateError as gate_failure:
        # Hard gate failed: report zeroed structure/safety and stop here.
        failed_rubric = (
            (EvaluationDimension.STRUCTURE, "故事结构未通过质量门。"),
            (EvaluationDimension.SAFETY, "内容未通过儿童安全或结构完整性检查。"),
        )
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=tuple(
                EvaluationScore(dimension=dimension, score=0.0, reason=reason)
                for dimension, reason in failed_rubric
            ),
            gate_error=gate_failure,
        )

    body = output.story_text
    length_fit = _story_text_readability_score(body)
    theme_fit = _educational_value_score(body, education_theme)

    # Age fit and readability intentionally share the same length-based score.
    rubric = (
        (EvaluationDimension.STRUCTURE, 1.0, "标题、正文和封面提示词完整。"),
        (EvaluationDimension.SAFETY, 1.0, "未命中确定性儿童安全风险词。"),
        (EvaluationDimension.AGE_FIT, length_fit, "根据正文长度估算低龄儿童阅读适配度。"),
        (
            EvaluationDimension.EDUCATIONAL_VALUE,
            theme_fit,
            "根据教育主题是否清晰融入正文估算。",
        ),
        (
            EvaluationDimension.READABILITY,
            length_fit,
            "根据正文长度估算朗读和亲子共读流畅度。",
        ),
    )
    scores = tuple(
        EvaluationScore(dimension=dimension, score=value, reason=reason)
        for dimension, value, reason in rubric
    )

    mean_score = sum(entry.score for entry in scores) / len(scores)
    overall_score = _clamp_score(mean_score)

    # A weak length score is advisory only; it does not block by itself.
    advisories: tuple[str, ...] = ()
    if length_fit < 0.8:
        advisories = ("故事正文长度可能不适合 3-8 岁儿童的完整阅读体验。",)

    return EvaluationResult(
        overall_score=overall_score,
        passed=overall_score >= minimum_score,
        blocking=overall_score < minimum_score,
        scores=scores,
        warnings=advisories,
    )
|
|
|
|
|
|
def evaluate_storybook_output(
    output: Storybook,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Score a generated storybook before it is persisted.

    ``validate_storybook_output`` runs first. If it raises ``QualityGateError``
    the result is a blocking failure: overall score 0.0, structure and safety
    scored 0.0, and the error attached as ``gate_error``. Otherwise five
    dimensions are scored from the page texts and averaged, and the clamped
    average is compared with ``minimum_score``.

    Args:
        output: Generated storybook; the ``text`` of each entry in ``pages``
            is what gets scored.
        education_theme: Optional theme expected to appear in the page texts.
        minimum_score: Threshold the overall score must reach to pass.

    Returns:
        An ``EvaluationResult``. ``passed`` is true when the overall score is
        at least ``minimum_score``; ``blocking`` is true when it is below.
    """
    try:
        validate_storybook_output(output)
    except QualityGateError as gate_failure:
        # Hard gate failed: report zeroed structure/safety and stop here.
        failed_rubric = (
            (EvaluationDimension.STRUCTURE, "绘本结构未通过质量门。"),
            (EvaluationDimension.SAFETY, "绘本内容未通过儿童安全或结构完整性检查。"),
        )
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=tuple(
                EvaluationScore(dimension=dimension, score=0.0, reason=reason)
                for dimension, reason in failed_rubric
            ),
            gate_error=gate_failure,
        )

    page_texts = [page.text for page in output.pages]
    length_fit = _storybook_readability_score(page_texts)
    theme_fit = _storybook_educational_value_score(page_texts, education_theme)

    # Age fit and readability intentionally share the same length-based score.
    rubric = (
        (EvaluationDimension.STRUCTURE, 1.0, "绘本标题、分页和页码结构完整。"),
        (EvaluationDimension.SAFETY, 1.0, "未命中确定性儿童安全风险词。"),
        (
            EvaluationDimension.AGE_FIT,
            length_fit,
            "根据每页正文长度估算低龄儿童翻页阅读适配度。",
        ),
        (
            EvaluationDimension.EDUCATIONAL_VALUE,
            theme_fit,
            "根据教育主题是否清晰融入分页正文估算。",
        ),
        (
            EvaluationDimension.READABILITY,
            length_fit,
            "根据分页正文长度估算亲子共读流畅度。",
        ),
    )
    scores = tuple(
        EvaluationScore(dimension=dimension, score=value, reason=reason)
        for dimension, value, reason in rubric
    )

    mean_score = sum(entry.score for entry in scores) / len(scores)
    overall_score = _clamp_score(mean_score)

    # A weak length score is advisory only; it does not block by itself.
    advisories: tuple[str, ...] = ()
    if length_fit < 0.8:
        advisories = ("绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。",)

    return EvaluationResult(
        overall_score=overall_score,
        passed=overall_score >= minimum_score,
        blocking=overall_score < minimum_score,
        scores=scores,
        warnings=advisories,
    )
|