Expand generation harness observability

This commit is contained in:
2026-06-24 10:48:23 +08:00
parent 459ca9edef
commit 1f34d80083
35 changed files with 8003 additions and 112 deletions

View File

@@ -0,0 +1,322 @@
"""Internal golden-case replay support for harness evaluations.
The replay helpers are intentionally not wired to user-facing APIs. They exist
to make evaluation behavior reproducible in tests and internal tooling.
"""
import json
from collections import Counter
from dataclasses import dataclass, field
from enum import StrEnum
from pathlib import Path
from typing import Any, Iterable
from app.services.adapters.storybook.primary import Storybook, StorybookPage
from app.services.adapters.text.models import StoryOutput
from app.services.harness.evaluators import (
EvaluationDimension,
EvaluationResult,
evaluate_story_output,
evaluate_storybook_output,
)
class EvaluationReplayArtifact(StrEnum):
    """Artifacts supported by deterministic evaluation replay.

    Each member's value is the string a replay fixture uses in its
    ``artifact`` field.
    """

    # Plain text story output.
    STORY = "story"
    # Paged storybook output.
    STORYBOOK = "storybook"
@dataclass(frozen=True)
class ExpectedEvaluation:
    """Outcome that a golden case expects from the evaluator.

    ``passed`` and ``blocking`` are mandatory. Score bounds are optional and
    stay ``None`` when the fixture omits them; the remaining tuple fields
    default to empty.
    """

    passed: bool
    blocking: bool
    min_overall_score: float | None = None
    max_overall_score: float | None = None
    required_dimensions: tuple[EvaluationDimension, ...] = field(default_factory=tuple)
    quality_gate_codes: tuple[str, ...] = field(default_factory=tuple)
    warning_substrings: tuple[str, ...] = field(default_factory=tuple)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "ExpectedEvaluation":
        """Create expectations from the ``expected`` mapping of a fixture.

        Raises:
            KeyError: if ``passed`` or ``blocking`` is missing.
            ValueError: if a required dimension name is not a known
                ``EvaluationDimension`` value.
        """
        # Values are read in field order so a malformed payload fails on the
        # same key it always did.
        should_pass = bool(payload["passed"])
        should_block = bool(payload["blocking"])
        lower_bound = payload.get("min_overall_score")
        upper_bound = payload.get("max_overall_score")
        dimensions = tuple(
            EvaluationDimension(name)
            for name in payload.get("required_dimensions", [])
        )
        gate_codes = tuple(payload.get("quality_gate_codes", []))
        warning_parts = tuple(payload.get("warning_substrings", []))
        return cls(
            passed=should_pass,
            blocking=should_block,
            min_overall_score=lower_bound,
            max_overall_score=upper_bound,
            required_dimensions=dimensions,
            quality_gate_codes=gate_codes,
            warning_substrings=warning_parts,
        )
@dataclass(frozen=True)
class EvaluationReplayCoverage:
    """Coverage labels attached to a single golden replay case.

    Every scalar label falls back to ``"unknown"`` and ``tags`` falls back to
    an empty tuple.
    """

    age_band: str = "unknown"
    content_shape: str = "unknown"
    risk_area: str = "unknown"
    tags: tuple[str, ...] = field(default_factory=tuple)

    @classmethod
    def from_payload(cls, payload: dict[str, Any] | None) -> "EvaluationReplayCoverage":
        """Create coverage labels from a fixture mapping (``None`` means empty)."""
        source = payload if payload else {}

        def label(key: str) -> str:
            # Labels are stringified so numeric fixture values stay comparable.
            return str(source.get(key, "unknown"))

        age_band = label("age_band")
        content_shape = label("content_shape")
        risk_area = label("risk_area")
        tags = tuple([str(tag) for tag in source.get("tags", [])])
        return cls(
            age_band=age_band,
            content_shape=content_shape,
            risk_area=risk_area,
            tags=tags,
        )
@dataclass(frozen=True)
class EvaluationReplayCase:
    """One internal golden evaluation case.

    A case couples a recorded generator output (``output_payload``) with the
    evaluation outcome that output is expected to produce (``expected``).
    """

    case_id: str
    artifact: EvaluationReplayArtifact
    output_payload: dict[str, Any]
    expected: ExpectedEvaluation
    education_theme: str | None = None
    minimum_score: float = 0.7
    description: str = ""
    input_payload: dict[str, Any] = field(default_factory=dict)
    coverage: EvaluationReplayCoverage = field(default_factory=EvaluationReplayCoverage)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "EvaluationReplayCase":
        """Build a replay case from a JSON-safe payload.

        ``minimum_score`` and ``education_theme`` are read from the ``input``
        mapping first and fall back to the top-level keys of ``payload``.

        Raises:
            KeyError: if ``id``, ``artifact``, ``output`` or ``expected`` is
                missing.
            ValueError: if ``artifact`` is not a supported artifact value.
        """
        # A fixture may spell "no inputs" as JSON null; ``dict(None)`` would
        # raise TypeError, so null is treated like a missing ``input`` key.
        input_payload = dict(payload.get("input") or {})
        minimum_score = input_payload.get(
            "minimum_score", payload.get("minimum_score", 0.7)
        )
        education_theme = input_payload.get(
            "education_theme", payload.get("education_theme")
        )
        return cls(
            case_id=str(payload["id"]),
            artifact=EvaluationReplayArtifact(payload["artifact"]),
            description=str(payload.get("description", "")),
            input_payload=input_payload,
            output_payload=dict(payload["output"]),
            education_theme=education_theme,
            minimum_score=float(minimum_score),
            expected=ExpectedEvaluation.from_payload(payload["expected"]),
            coverage=EvaluationReplayCoverage.from_payload(payload.get("coverage")),
        )

    def evaluate(self) -> EvaluationResult:
        """Run the deterministic evaluator that matches this case's artifact."""
        if self.artifact == EvaluationReplayArtifact.STORY:
            return evaluate_story_output(
                _story_output_from_payload(self.output_payload),
                education_theme=self.education_theme,
                minimum_score=self.minimum_score,
            )
        # Every non-story artifact is evaluated as a storybook.
        return evaluate_storybook_output(
            _storybook_from_payload(self.output_payload),
            education_theme=self.education_theme,
            minimum_score=self.minimum_score,
        )

    def replay(self) -> "EvaluationReplayCaseResult":
        """Evaluate the case and compare the result with its expectations."""
        evaluation = self.evaluate()
        failures = tuple(_compare_evaluation(self, evaluation))
        return EvaluationReplayCaseResult(
            case_id=self.case_id,
            artifact=self.artifact,
            coverage=self.coverage,
            evaluation=evaluation,
            failures=failures,
        )
@dataclass(frozen=True)
class EvaluationReplayCaseResult:
    """What happened when one golden case was replayed.

    ``failures`` holds one human-readable message per expectation that the
    evaluation did not satisfy.
    """

    case_id: str
    artifact: EvaluationReplayArtifact
    coverage: EvaluationReplayCoverage
    evaluation: EvaluationResult
    failures: tuple[str, ...] = field(default_factory=tuple)

    @property
    def expectations_met(self) -> bool:
        """True when no expectation mismatch was recorded."""
        return len(self.failures) == 0
@dataclass(frozen=True)
class EvaluationReplaySuiteResult:
    """Aggregated outcome of replaying a collection of golden cases."""

    cases: tuple[EvaluationReplayCaseResult, ...]

    @property
    def passed(self) -> bool:
        """True when no case reported a mismatch (also true for zero cases)."""
        for case in self.cases:
            if not case.expectations_met:
                return False
        return True

    @property
    def failed_case_ids(self) -> tuple[str, ...]:
        """IDs of the cases whose expectations were not met, in suite order."""
        mismatched = [
            case.case_id for case in self.cases if not case.expectations_met
        ]
        return tuple(mismatched)

    def failure_report(self) -> str:
        """Render every failure as ``<case_id>: <message>``, one per line."""
        return "\n".join(
            f"{case.case_id}: {failure}"
            for case in self.cases
            for failure in case.failures
        )

    def coverage_summary(self) -> dict[str, dict[str, int]]:
        """Count cases per artifact, coverage label, tag and evaluation outcome."""
        results = self.cases
        summary: dict[str, dict[str, int]] = {}
        summary["artifact"] = _count_values(entry.artifact.value for entry in results)
        summary["age_band"] = _count_values(entry.coverage.age_band for entry in results)
        summary["content_shape"] = _count_values(
            entry.coverage.content_shape for entry in results
        )
        summary["risk_area"] = _count_values(
            entry.coverage.risk_area for entry in results
        )
        summary["tags"] = _count_values(
            tag for entry in results for tag in entry.coverage.tags
        )
        # "outcome" reflects the evaluator verdict, not whether the case met
        # its expectations.
        summary["outcome"] = _count_values(
            "passed" if entry.evaluation.passed else "blocked" for entry in results
        )
        return summary
def load_evaluation_replay_cases(path: str | Path) -> tuple[EvaluationReplayCase, ...]:
    """Read a UTF-8 JSON fixture and turn each array item into a replay case.

    Raises:
        ValueError: if the top-level JSON value is not an array.
    """
    fixture_text = Path(path).read_text(encoding="utf-8")
    parsed = json.loads(fixture_text)
    if isinstance(parsed, list):
        return tuple(EvaluationReplayCase.from_payload(entry) for entry in parsed)
    raise ValueError("Evaluation replay fixture must be a JSON array.")
def run_evaluation_replay_cases(
    cases: Iterable[EvaluationReplayCase],
) -> EvaluationReplaySuiteResult:
    """Replay each case in iteration order and collect the results."""
    case_results = [case.replay() for case in cases]
    return EvaluationReplaySuiteResult(cases=tuple(case_results))
def replay_evaluation_golden_cases(path: str | Path) -> EvaluationReplaySuiteResult:
    """Load the golden cases stored at ``path`` and replay all of them."""
    loaded_cases = load_evaluation_replay_cases(path)
    return run_evaluation_replay_cases(loaded_cases)
def _story_output_from_payload(payload: dict[str, Any]) -> StoryOutput:
    """Build a ``StoryOutput`` from a fixture mapping.

    Missing text fields become empty strings and a missing ``mode`` becomes
    ``"generated"``.
    """
    read = payload.get
    mode = read("mode", "generated")
    title = read("title", "")
    story_text = read("story_text", "")
    cover_prompt_suggestion = read("cover_prompt_suggestion", "")
    return StoryOutput(
        mode=mode,
        title=title,
        story_text=story_text,
        cover_prompt_suggestion=cover_prompt_suggestion,
    )
def _storybook_from_payload(payload: dict[str, Any]) -> Storybook:
    """Build a ``Storybook`` (and its pages) from a fixture mapping.

    A page without ``page_number`` is numbered by its 1-based position in the
    ``pages`` list.
    """
    pages = []
    for position, raw_page in enumerate(payload.get("pages", []), start=1):
        pages.append(
            StorybookPage(
                page_number=raw_page.get("page_number", position),
                text=raw_page.get("text", ""),
                image_prompt=raw_page.get("image_prompt", ""),
                image_url=raw_page.get("image_url"),
            )
        )

    read = payload.get
    return Storybook(
        title=read("title", ""),
        main_character=read("main_character", ""),
        art_style=read("art_style", ""),
        pages=pages,
        cover_prompt=read("cover_prompt", ""),
        cover_url=read("cover_url"),
    )
def _count_values(values: Iterable[str]) -> dict[str, int]:
counts = Counter(value for value in values if value)
return dict(sorted(counts.items(), key=lambda item: (-item[1], item[0])))
def _compare_evaluation(
case: EvaluationReplayCase,
evaluation: EvaluationResult,
) -> list[str]:
expected = case.expected
failures: list[str] = []
if evaluation.passed != expected.passed:
failures.append(f"expected passed={expected.passed}, got {evaluation.passed}")
if evaluation.blocking != expected.blocking:
failures.append(f"expected blocking={expected.blocking}, got {evaluation.blocking}")
if (
expected.min_overall_score is not None
and evaluation.overall_score < expected.min_overall_score
):
failures.append(
"expected overall_score >= "
f"{expected.min_overall_score}, got {evaluation.overall_score}"
)
if (
expected.max_overall_score is not None
and evaluation.overall_score > expected.max_overall_score
):
failures.append(
"expected overall_score <= "
f"{expected.max_overall_score}, got {evaluation.overall_score}"
)
actual_dimensions = {score.dimension for score in evaluation.scores}
missing_dimensions = [
dimension.value
for dimension in expected.required_dimensions
if dimension not in actual_dimensions
]
if missing_dimensions:
failures.append(f"missing dimensions: {', '.join(missing_dimensions)}")
actual_quality_gate_codes = tuple(
issue.code.value for issue in evaluation.gate_error.issues
) if evaluation.gate_error is not None else ()
if actual_quality_gate_codes != expected.quality_gate_codes:
failures.append(
"expected quality_gate_codes="
f"{list(expected.quality_gate_codes)}, got {list(actual_quality_gate_codes)}"
)
for expected_warning in expected.warning_substrings:
if not any(expected_warning in warning for warning in evaluation.warnings):
failures.append(f"missing warning containing: {expected_warning}")
return failures

View File

@@ -0,0 +1,267 @@
"""Deterministic evaluation helpers for generated child-facing content."""
from dataclasses import dataclass, field
from enum import StrEnum
from typing import Any
from app.services.adapters.storybook.primary import Storybook
from app.services.adapters.text.models import StoryOutput
from app.services.harness.quality_gates import (
QualityGateError,
validate_story_output,
validate_storybook_output,
)
class EvaluationDimension(StrEnum):
    """Stable dimensions used by harness evaluations.

    Member values are the strings written to score metadata under the
    ``dimension`` key.
    """

    STRUCTURE = "structure"
    SAFETY = "safety"
    AGE_FIT = "age_fit"
    EDUCATIONAL_VALUE = "educational_value"
    READABILITY = "readability"
@dataclass(frozen=True)
class EvaluationScore:
    """Score and explanation for a single evaluation dimension."""

    dimension: EvaluationDimension
    score: float
    reason: str

    def to_metadata(self) -> dict[str, Any]:
        """Serialize the score, replacing the dimension with its string value."""
        return dict(
            dimension=self.dimension.value,
            score=self.score,
            reason=self.reason,
        )
@dataclass(frozen=True)
class EvaluationResult:
    """Outcome of deterministically evaluating one generated artifact.

    ``gate_error`` carries the quality-gate failure when one occurred and is
    ``None`` otherwise.
    """

    overall_score: float
    passed: bool
    blocking: bool
    scores: tuple[EvaluationScore, ...]
    gate_error: QualityGateError | None = None
    warnings: tuple[str, ...] = field(default_factory=tuple)

    def to_metadata(self) -> dict[str, Any]:
        """Serialize the result; ``quality_gate`` appears only on gate failure."""
        serialized_scores = [entry.to_metadata() for entry in self.scores]
        metadata: dict[str, Any] = dict(
            overall_score=self.overall_score,
            passed=self.passed,
            blocking=self.blocking,
            scores=serialized_scores,
            warnings=[*self.warnings],
        )
        gate = self.gate_error
        if gate is None:
            return metadata
        metadata["quality_gate"] = gate.to_metadata()
        return metadata
def _clamp_score(value: float) -> float:
return max(0.0, min(1.0, round(value, 2)))
def _story_text_readability_score(story_text: str) -> float:
"""Score text length with a conservative 3-8 age readability heuristic."""
normalized_length = len(story_text.strip())
if normalized_length < 30:
return 0.45
if normalized_length > 2500:
return 0.72
if normalized_length > 1800:
return 0.84
return 0.96
def _educational_value_score(story_text: str, education_theme: str | None) -> float:
if not education_theme:
return 0.82
return 0.96 if education_theme.strip() in story_text else 0.88
def _storybook_readability_score(page_texts: list[str]) -> float:
if not page_texts:
return 0.0
page_lengths = [len(text.strip()) for text in page_texts]
if any(length < 8 for length in page_lengths):
return 0.62
if any(length > 320 for length in page_lengths):
return 0.78
if any(length > 220 for length in page_lengths):
return 0.88
return 0.96
def _storybook_educational_value_score(
page_texts: list[str],
education_theme: str | None,
) -> float:
if not education_theme:
return 0.82
combined_text = " ".join(page_texts)
return 0.96 if education_theme.strip() in combined_text else 0.88
def evaluate_story_output(
    output: StoryOutput,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Score a generated text story across the harness dimensions.

    The story is first run through ``validate_story_output``. A gate failure
    yields a blocking zero-score result carrying the gate error. Otherwise
    the overall score is the clamped mean of five dimension scores, and the
    result blocks when that mean is below ``minimum_score``.
    """
    try:
        validate_story_output(output)
    except QualityGateError as gate_failure:
        # Gate failures skip heuristic scoring entirely.
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=(
                EvaluationScore(
                    EvaluationDimension.STRUCTURE,
                    0.0,
                    "故事结构未通过质量门。",
                ),
                EvaluationScore(
                    EvaluationDimension.SAFETY,
                    0.0,
                    "内容未通过儿童安全或结构完整性检查。",
                ),
            ),
            gate_error=gate_failure,
        )

    length_score = _story_text_readability_score(output.story_text)
    theme_score = _educational_value_score(output.story_text, education_theme)

    # Structure and safety are full marks here because the gate passed.
    scores = (
        EvaluationScore(
            EvaluationDimension.STRUCTURE,
            1.0,
            "标题、正文和封面提示词完整。",
        ),
        EvaluationScore(
            EvaluationDimension.SAFETY,
            1.0,
            "未命中确定性儿童安全风险词。",
        ),
        EvaluationScore(
            EvaluationDimension.AGE_FIT,
            length_score,
            "根据正文长度估算低龄儿童阅读适配度。",
        ),
        EvaluationScore(
            EvaluationDimension.EDUCATIONAL_VALUE,
            theme_score,
            "根据教育主题是否清晰融入正文估算。",
        ),
        EvaluationScore(
            EvaluationDimension.READABILITY,
            length_score,
            "根据正文长度估算朗读和亲子共读流畅度。",
        ),
    )
    mean_score = _clamp_score(sum(entry.score for entry in scores) / len(scores))

    notes: tuple[str, ...] = ()
    if length_score < 0.8:
        notes = ("故事正文长度可能不适合 3-8 岁儿童的完整阅读体验。",)

    return EvaluationResult(
        overall_score=mean_score,
        passed=mean_score >= minimum_score,
        blocking=mean_score < minimum_score,
        scores=scores,
        warnings=notes,
    )
def evaluate_storybook_output(
    output: Storybook,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Score a generated storybook across the harness dimensions.

    The storybook is first run through ``validate_storybook_output``. A gate
    failure yields a blocking zero-score result carrying the gate error.
    Otherwise the overall score is the clamped mean of five dimension scores
    derived from the page texts, and the result blocks when that mean is
    below ``minimum_score``.
    """
    try:
        validate_storybook_output(output)
    except QualityGateError as gate_failure:
        # Gate failures skip heuristic scoring entirely.
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=(
                EvaluationScore(
                    EvaluationDimension.STRUCTURE,
                    0.0,
                    "绘本结构未通过质量门。",
                ),
                EvaluationScore(
                    EvaluationDimension.SAFETY,
                    0.0,
                    "绘本内容未通过儿童安全或结构完整性检查。",
                ),
            ),
            gate_error=gate_failure,
        )

    texts = [page.text for page in output.pages]
    length_score = _storybook_readability_score(texts)
    theme_score = _storybook_educational_value_score(texts, education_theme)

    # Structure and safety are full marks here because the gate passed.
    scores = (
        EvaluationScore(
            EvaluationDimension.STRUCTURE,
            1.0,
            "绘本标题、分页和页码结构完整。",
        ),
        EvaluationScore(
            EvaluationDimension.SAFETY,
            1.0,
            "未命中确定性儿童安全风险词。",
        ),
        EvaluationScore(
            EvaluationDimension.AGE_FIT,
            length_score,
            "根据每页正文长度估算低龄儿童翻页阅读适配度。",
        ),
        EvaluationScore(
            EvaluationDimension.EDUCATIONAL_VALUE,
            theme_score,
            "根据教育主题是否清晰融入分页正文估算。",
        ),
        EvaluationScore(
            EvaluationDimension.READABILITY,
            length_score,
            "根据分页正文长度估算亲子共读流畅度。",
        ),
    )
    mean_score = _clamp_score(sum(entry.score for entry in scores) / len(scores))

    notes: tuple[str, ...] = ()
    if length_score < 0.8:
        notes = ("绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。",)

    return EvaluationResult(
        overall_score=mean_score,
        passed=mean_score >= minimum_score,
        blocking=mean_score < minimum_score,
        scores=scores,
        warnings=notes,
    )

View File

@@ -0,0 +1,150 @@
"""Small-step workflow executor helpers for generation harness adoption."""
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any
from sqlalchemy.ext.asyncio import AsyncSession
from app.services.harness.artifacts import AssetCompletionResult
from app.services.harness.plans import WorkflowPlan
from app.services.harness.trace import TraceRecorder
from app.services.harness.types import ArtifactKind, WorkflowStep
if TYPE_CHECKING:
from app.db.models import GenerationJob
# Zero-argument callable whose awaited result is the AssetCompletionResult
# of one asset-producing task.
AssetTask = Callable[[], Awaitable[AssetCompletionResult]]
@dataclass(frozen=True)
class AssetPlanRunResult:
    """Outcome of running the asset-producing tasks of one workflow plan.

    ``executed_task_keys`` lists plan tasks that had a handler run for them;
    ``ignored_task_keys`` lists plan tasks that were skipped.
    """

    task_results: tuple[AssetCompletionResult, ...]
    executed_task_keys: tuple[str, ...]
    ignored_task_keys: tuple[str, ...]

    @property
    def result_assets(self) -> tuple[str, ...]:
        """The ``asset`` attribute of every task result, in execution order."""
        assets = [completed.asset for completed in self.task_results]
        return tuple(assets)

    def to_metadata(self, plan: WorkflowPlan) -> dict[str, Any]:
        """Summarize how much of ``plan`` was executed, as a JSON-safe dict."""
        executed = list(self.executed_task_keys)
        ignored = list(self.ignored_task_keys)
        assets = list(self.result_assets)
        return dict(
            plan_mode=plan.mode.value,
            planned_task_count=len(plan.tasks),
            executed_task_count=len(executed),
            ignored_task_count=len(ignored),
            result_count=len(self.task_results),
            executed_task_keys=executed,
            ignored_task_keys=ignored,
            result_assets=assets,
        )
async def record_workflow_plan(
    db: AsyncSession,
    *,
    job: "GenerationJob | None",
    plan: WorkflowPlan,
) -> None:
    """Record a ``workflow_planned`` trace step holding the plan snapshot."""
    recorder = TraceRecorder(db)
    plan_metadata = {"plan": plan.to_snapshot()}
    await recorder.record_step(
        job=job,
        event_type="workflow_planned",
        status="succeeded",
        message="Workflow plan selected for this generation request.",
        metadata=plan_metadata,
        step=WorkflowStep.REQUEST_ACCEPTANCE,
        artifact=ArtifactKind.NONE,
        blocks_main_result=True,
    )
async def record_evaluation_result(
    db: AsyncSession,
    *,
    job: "GenerationJob | None",
    story_id: int | None = None,
    metadata: dict[str, Any],
    status: str,
    artifact: ArtifactKind | str = ArtifactKind.STORY_TEXT,
) -> None:
    """Record an ``evaluation_completed`` trace step for a tracked job.

    The step is flagged as blocking the main result whenever ``status`` is
    anything other than ``"succeeded"``.
    """
    recorder = TraceRecorder(db)
    evaluation_succeeded = status == "succeeded"
    await recorder.record_step(
        job=job,
        story_id=story_id,
        event_type="evaluation_completed",
        status=status,
        message="Generated content evaluation completed.",
        metadata=metadata,
        step=WorkflowStep.EVALUATION,
        artifact=artifact,
        blocks_main_result=not evaluation_succeeded,
    )
async def record_executor_result(
    db: AsyncSession,
    *,
    job: "GenerationJob | None",
    plan: WorkflowPlan,
    result: AssetPlanRunResult,
) -> None:
    """Record a non-blocking ``executor_completed`` trace step.

    The step metadata is the executor coverage summary produced by
    ``result.to_metadata(plan)``.
    """
    recorder = TraceRecorder(db)
    coverage_metadata = result.to_metadata(plan)
    await recorder.record_step(
        job=job,
        event_type="executor_completed",
        status="succeeded",
        message="Workflow executor completed planned asset tasks.",
        metadata=coverage_metadata,
        step=WorkflowStep.UNKNOWN,
        artifact=ArtifactKind.NONE,
        blocks_main_result=False,
    )
async def run_asset_plan(
    plan: WorkflowPlan,
    *,
    image_task: AssetTask | None = None,
    audio_task: AssetTask | None = None,
) -> AssetPlanRunResult:
    """Execute asset-producing tasks in the order declared by a workflow plan.

    Only the ``complete_image_asset`` and ``complete_audio_asset`` task keys
    are executed; every other task key is reported as ignored.

    Args:
        plan: Workflow plan whose mode is ``asset_generation`` or
            ``asset_retry``.
        image_task: Handler awaited for each ``complete_image_asset`` task.
        audio_task: Handler awaited for each ``complete_audio_asset`` task.

    Returns:
        The handler results plus the executed and ignored task keys.

    Raises:
        ValueError: if the plan mode is unsupported, or if the plan contains
            an asset task whose handler was not supplied. Both checks happen
            before any handler is awaited.
    """
    if plan.mode.value not in {"asset_generation", "asset_retry"}:
        raise ValueError("run_asset_plan only supports asset workflow plans")

    handlers: dict[str, AssetTask | None] = {
        "complete_image_asset": image_task,
        "complete_audio_asset": audio_task,
    }
    missing_handler_messages = {
        "complete_image_asset": "Asset workflow plan requires an image task handler",
        "complete_audio_asset": "Asset workflow plan requires an audio task handler",
    }

    # Snapshot the tasks so the plan is iterated consistently in both passes.
    tasks = tuple(plan.tasks)

    # Validate up front: a missing handler must not surface only after earlier
    # handlers have already run and produced side effects.
    for task in tasks:
        if task.key in handlers and handlers[task.key] is None:
            raise ValueError(missing_handler_messages[task.key])

    task_results: list[AssetCompletionResult] = []
    executed_task_keys: list[str] = []
    ignored_task_keys: list[str] = []
    for task in tasks:
        handler = handlers.get(task.key)
        if handler is None:
            # Not an asset-producing task this executor knows how to run.
            ignored_task_keys.append(task.key)
            continue
        task_results.append(await handler())
        executed_task_keys.append(task.key)

    return AssetPlanRunResult(
        task_results=tuple(task_results),
        executed_task_keys=tuple(executed_task_keys),
        ignored_task_keys=tuple(ignored_task_keys),
    )

View File

@@ -0,0 +1,400 @@
[
{
"id": "story-safe-theme-pass",
"artifact": "story",
"description": "完整、儿童安全且清晰包含教育主题的普通故事。",
"coverage": {
"age_band": "5-6",
"content_shape": "short_story",
"risk_area": "happy_path",
"tags": ["theme_present", "safe", "story"]
},
"input": {
"keywords": "小兔子, 月光花园",
"education_theme": "复盘"
},
"output": {
"mode": "generated",
"title": "小兔子的月光花园",
"story_text": "小兔子露露在月光花园里照顾一朵会发光的小花。她先给小花浇水,又邀请朋友一起观察花瓣的变化。晚上睡前,露露和朋友们坐在石凳上复盘今天的努力:下次要先分好小水壶,再轮流照顾花朵。大家都觉得,分享和复盘让花园变得更温暖。",
"cover_prompt_suggestion": "A gentle watercolor rabbit in a moonlit garden"
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "story-long-safe-pass",
"artifact": "story",
"description": "较长但仍适合亲子共读的普通故事。",
"coverage": {
"age_band": "7-8",
"content_shape": "long_story",
"risk_area": "length_boundary",
"tags": ["theme_present", "long_text", "story"]
},
"input": {
"keywords": "小海豚, 图书馆",
"education_theme": "合作"
},
"output": {
"mode": "generated",
"title": "小海豚的蓝色图书馆",
"story_text": "小海豚多多住在一片安静的海湾里,那里有一座用贝壳和海草搭成的蓝色图书馆。每天傍晚,多多都会把漂来的故事贝壳整理好,放进不同的篮子。可是这一天,风浪把贝壳吹得到处都是,小章鱼、小海马和小螃蟹都赶来帮忙。大家先一起数贝壳,再按颜色排队,最后把每个故事放回合适的位置。多多发现,合作不是一个人做得最快,而是大家把自己的办法放在一起。夜晚来临时,蓝色图书馆重新亮起柔柔的光,小伙伴们围坐在门口,听多多讲今天学到的合作故事。",
"cover_prompt_suggestion": "A gentle dolphin organizing a blue underwater library"
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "story-missing-text-blocks",
"artifact": "story",
"description": "故事正文缺失会被确定性质量门阻断。",
"coverage": {
"age_band": "unknown",
"content_shape": "empty_story",
"risk_area": "schema_error",
"tags": ["missing_text", "story", "blocking"]
},
"input": {
"keywords": "小熊, 星星"
},
"output": {
"mode": "generated",
"title": "小熊找星星",
"story_text": "",
"cover_prompt_suggestion": "A bear looking at friendly stars"
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_story_text"
]
}
},
{
"id": "story-missing-cover-prompt-blocks",
"artifact": "story",
"description": "故事正文完整但封面提示词缺失会被结构质量门阻断。",
"coverage": {
"age_band": "5-6",
"content_shape": "short_story",
"risk_area": "schema_error",
"tags": ["missing_cover_prompt", "story", "blocking"]
},
"input": {
"keywords": "小松鼠, 风筝",
"education_theme": "勇敢"
},
"output": {
"mode": "generated",
"title": "小松鼠的风筝",
"story_text": "小松鼠第一次放风筝时有点紧张。朋友们陪它一起数一二三,它鼓起勇敢的心,终于让风筝飞上蓝天。",
"cover_prompt_suggestion": ""
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_cover_prompt"
]
}
},
{
"id": "story-unsafe-term-blocks",
"artifact": "story",
"description": "明显不适合儿童的风险词会被安全质量门阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "short_story",
"risk_area": "safety_error",
"tags": ["unsafe_term", "story", "blocking"]
},
"input": {
"keywords": "小猫, 城堡"
},
"output": {
"mode": "generated",
"title": "小猫的城堡",
"story_text": "小猫在城堡里看到血腥场景,然后感到很害怕。",
"cover_prompt_suggestion": "A cat near a castle"
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"unsafe_child_content"
]
}
},
{
"id": "story-short-high-threshold-blocks",
"artifact": "story",
"description": "结构合格但阅读体验偏短的故事在高阈值下会被内部评测阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "very_short_story",
"risk_area": "readability_warning",
"tags": ["short_text", "threshold_block", "story"]
},
"input": {
"keywords": "小鹿, 书签",
"education_theme": "耐心",
"minimum_score": 0.82
},
"output": {
"mode": "generated",
"title": "小鹿的书签",
"story_text": "小鹿学会了耐心等待。",
"cover_prompt_suggestion": "A deer with a golden bookmark"
},
"expected": {
"passed": false,
"blocking": true,
"min_overall_score": 0.7,
"max_overall_score": 0.8,
"required_dimensions": [
"structure",
"safety",
"readability"
],
"quality_gate_codes": [],
"warning_substrings": [
"正文长度"
]
}
},
{
"id": "storybook-safe-theme-pass",
"artifact": "storybook",
"description": "完整、儿童安全且包含教育主题的绘本分页输出。",
"coverage": {
"age_band": "5-6",
"content_shape": "storybook_3_pages",
"risk_area": "happy_path",
"tags": ["theme_present", "safe", "storybook"]
},
"input": {
"keywords": "小狐狸, 彩虹桥",
"education_theme": "合作"
},
"output": {
"title": "彩虹桥上的合作",
"main_character": "小狐狸米米",
"art_style": "温暖水彩",
"cover_prompt": "A warm watercolor fox near a rainbow bridge",
"pages": [
{
"page_number": 1,
"text": "小狐狸米米在雨后的森林里发现一座亮晶晶的彩虹桥。",
"image_prompt": "A little fox finds a rainbow bridge"
},
{
"page_number": 2,
"text": "桥边的小伙伴们一起商量办法,决定合作把落叶清理干净。",
"image_prompt": "Forest friends work together"
},
{
"page_number": 3,
"text": "大家轮流搬叶子、扶篮子,还互相说谢谢,彩虹桥终于露出笑脸。",
"image_prompt": "Friends carrying leaves together"
}
]
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "storybook-duplicate-page-blocks",
"artifact": "storybook",
"description": "重复页码的绘本结构会被质量门阻断。",
"coverage": {
"age_band": "5-6",
"content_shape": "storybook_invalid_pages",
"risk_area": "schema_error",
"tags": ["duplicate_page", "storybook", "blocking"]
},
"input": {
"keywords": "小熊, 森林"
},
"output": {
"title": "森林里的小熊",
"main_character": "小熊布布",
"art_style": "水彩",
"cover_prompt": "A bear in a forest",
"pages": [
{
"page_number": 1,
"text": "布布在森林里找到一颗松果。",
"image_prompt": "Bear finds a pinecone"
},
{
"page_number": 1,
"text": "布布把松果带给朋友一起观察。",
"image_prompt": "Bear shares the pinecone"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"invalid_storybook_page_number"
]
}
},
{
"id": "storybook-missing-page-blocks",
"artifact": "storybook",
"description": "没有分页内容的绘本会被结构质量门阻断。",
"coverage": {
"age_band": "unknown",
"content_shape": "storybook_empty_pages",
"risk_area": "schema_error",
"tags": ["missing_page", "storybook", "blocking"]
},
"input": {
"keywords": "小鸟, 云朵"
},
"output": {
"title": "小鸟和云朵",
"main_character": "小鸟啾啾",
"art_style": "柔和水彩",
"cover_prompt": "A bird near soft clouds",
"pages": []
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_storybook_page"
]
}
},
{
"id": "storybook-unsafe-term-blocks",
"artifact": "storybook",
"description": "绘本分页文字包含明显不适龄风险词时会被安全质量门阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "storybook_2_pages",
"risk_area": "safety_error",
"tags": ["unsafe_term", "storybook", "blocking"]
},
"input": {
"keywords": "小兔子, 山洞"
},
"output": {
"title": "山洞里的声音",
"main_character": "小兔子米粒",
"art_style": "温暖水彩",
"cover_prompt": "A rabbit near a cave",
"pages": [
{
"page_number": 1,
"text": "米粒走到山洞边,听见奇怪的声音。",
"image_prompt": "Rabbit near a cave"
},
{
"page_number": 2,
"text": "洞里出现血腥画面,米粒吓得跑开。",
"image_prompt": "Rabbit running away"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"unsafe_child_content"
]
}
},
{
"id": "storybook-short-page-warning",
"artifact": "storybook",
"description": "分页正文过短时保留内部警告,用于评测回归。",
"coverage": {
"age_band": "3-4",
"content_shape": "storybook_2_pages",
"risk_area": "readability_warning",
"tags": ["short_page_text", "threshold_block", "storybook"]
},
"input": {
"keywords": "小羊, 风铃",
"minimum_score": 0.85
},
"output": {
"title": "风铃响了",
"main_character": "小羊团团",
"art_style": "柔和蜡笔",
"cover_prompt": "A lamb listening to a wind chime",
"pages": [
{
"page_number": 1,
"text": "风响。",
"image_prompt": "Wind chime rings"
},
{
"page_number": 2,
"text": "团团笑。",
"image_prompt": "Lamb smiles"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"min_overall_score": 0.8,
"max_overall_score": 0.82,
"required_dimensions": [
"structure",
"safety",
"readability"
],
"quality_gate_codes": [],
"warning_substrings": [
"分页正文长度"
]
}
}
]

View File

@@ -69,6 +69,11 @@ def build_story_plan(*, generate_images: bool) -> WorkflowPlan:
step=WorkflowStep.NARRATIVE_GENERATION,
artifact=ArtifactKind.STORY_TEXT,
),
WorkflowTask(
key="evaluate_narrative",
step=WorkflowStep.EVALUATION,
artifact=ArtifactKind.STORY_TEXT,
),
WorkflowTask(
key="persist_story",
step=WorkflowStep.STORY_PERSISTENCE,
@@ -124,6 +129,11 @@ def build_storybook_plan(*, generate_images: bool) -> WorkflowPlan:
step=WorkflowStep.NARRATIVE_GENERATION,
artifact=ArtifactKind.STORYBOOK_PAGES,
),
WorkflowTask(
key="evaluate_storybook_pages",
step=WorkflowStep.EVALUATION,
artifact=ArtifactKind.STORYBOOK_PAGES,
),
]
if generate_images:

View File

@@ -11,6 +11,7 @@ class WorkflowStep(StrEnum):
WORKER_START = "worker_start"
CONTEXT_PREPARATION = "context_preparation"
NARRATIVE_GENERATION = "narrative_generation"
EVALUATION = "evaluation"
STORY_PERSISTENCE = "story_persistence"
PROVIDER_INVOCATION = "provider_invocation"
IMAGE_GENERATION = "image_generation"
@@ -64,6 +65,8 @@ class StepStatus(StrEnum):
EVENT_STEP_MAP: dict[str, WorkflowStep] = {
"request_accepted": WorkflowStep.REQUEST_ACCEPTANCE,
"workflow_planned": WorkflowStep.REQUEST_ACCEPTANCE,
"executor_completed": WorkflowStep.UNKNOWN,
"retry_queued": WorkflowStep.REQUEST_ACCEPTANCE,
"worker_started": WorkflowStep.WORKER_START,
"context_prepared": WorkflowStep.CONTEXT_PREPARATION,
@@ -73,6 +76,7 @@ EVENT_STEP_MAP: dict[str, WorkflowStep] = {
"provider_call_succeeded": WorkflowStep.PROVIDER_INVOCATION,
"provider_call_failed": WorkflowStep.PROVIDER_INVOCATION,
"quality_gate_failed": WorkflowStep.NARRATIVE_GENERATION,
"evaluation_completed": WorkflowStep.EVALUATION,
"cover_image_started": WorkflowStep.IMAGE_GENERATION,
"cover_image_succeeded": WorkflowStep.IMAGE_GENERATION,
"cover_image_failed": WorkflowStep.IMAGE_GENERATION,
@@ -100,6 +104,7 @@ EVENT_STEP_MAP: dict[str, WorkflowStep] = {
EVENT_ARTIFACT_MAP: dict[str, ArtifactKind] = {
"narrative_generated": ArtifactKind.STORY_TEXT,
"quality_gate_failed": ArtifactKind.STORY_TEXT,
"evaluation_completed": ArtifactKind.STORY_TEXT,
"cover_image_started": ArtifactKind.COVER_IMAGE,
"cover_image_succeeded": ArtifactKind.COVER_IMAGE,
"cover_image_failed": ArtifactKind.COVER_IMAGE,