Expand generation harness observability

This commit is contained in:
2026-06-24 10:48:23 +08:00
parent 459ca9edef
commit 1f34d80083
35 changed files with 8003 additions and 112 deletions

View File

@@ -1,4 +1,5 @@
from typing import Literal
from datetime import datetime
from typing import Any, Literal
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel, ConfigDict, Field
@@ -9,6 +10,10 @@ from app.core.admin_auth import admin_guard
from app.db.admin_models import Provider
from app.db.database import get_db
from app.services.adapters.registry import AdapterRegistry
from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics
from app.services.admin_executor_coverage import get_admin_executor_coverage
from app.services.admin_generation_trace import get_admin_generation_job_trace
from app.services.admin_harness_readiness import get_admin_harness_readiness
from app.services.admin_provider_analytics import get_admin_provider_analytics
from app.services.cost_tracker import cost_tracker
from app.services.provider_policy import DEFAULT_PROVIDERS, list_capability_policies
@@ -103,6 +108,169 @@ class ProviderAnalyticsResponse(BaseModel):
by_user: list[ProviderAnalyticsUserBucket]
failure_reasons: list[ProviderAnalyticsFailureReason]
class EvaluationAnalyticsArtifactBucket(BaseModel):
    """Evaluation count grouped under a single artifact label."""

    # Artifact label that keys this bucket.
    artifact: str
    # Number of evaluations counted for the artifact.
    count: int
class EvaluationAnalyticsOutputModeBucket(BaseModel):
    """Evaluation count grouped under a single job output mode."""

    # Output mode that keys this bucket.
    output_mode: str
    # Number of evaluations counted for the output mode.
    count: int
class EvaluationAnalyticsScoreBandBucket(BaseModel):
    """Evaluation count grouped under a single score band."""

    # Score band label that keys this bucket.
    band: str
    # Number of evaluations whose overall score fell in the band.
    count: int
class EvaluationAnalyticsDimensionScore(BaseModel):
    """Average score and sample count for one evaluation dimension."""

    # Name of the scored dimension.
    dimension: str
    # Mean of the dimension's scores.
    average_score: float
    # Number of scores that contributed to the mean.
    count: int
class EvaluationAnalyticsQualityGateIssue(BaseModel):
    """Occurrence count for one quality-gate issue code."""

    # Issue code that keys this bucket.
    code: str
    # Number of times the code was reported.
    count: int
class EvaluationAnalyticsFailureCategory(BaseModel):
    """Occurrence count for one failure category."""

    # Failure category that keys this bucket.
    category: str
    # Number of issues reported with this category.
    count: int
class EvaluationAnalyticsWarning(BaseModel):
    """Occurrence count for one evaluation warning message."""

    # Warning text that keys this bucket.
    message: str
    # Number of times the warning was reported.
    count: int
class EvaluationAnalyticsResponse(BaseModel):
    """Response schema of the admin evaluation analytics endpoint."""

    # --- Query echo -------------------------------------------------------
    scope: str
    window_days: int | None = None
    artifact: str | None = None

    # --- Headline totals --------------------------------------------------
    total_evaluations: int
    passed_evaluations: int
    blocked_evaluations: int
    pass_rate: float
    # None when no evaluation carried a numeric overall score.
    average_score: float | None = None

    # --- Distinct entities covered by the aggregated evaluations ----------
    job_count: int
    story_count: int
    user_count: int

    # --- Breakdowns -------------------------------------------------------
    by_artifact: list[EvaluationAnalyticsArtifactBucket]
    by_output_mode: list[EvaluationAnalyticsOutputModeBucket]
    score_bands: list[EvaluationAnalyticsScoreBandBucket]
    dimension_scores: list[EvaluationAnalyticsDimensionScore]
    quality_gate_issues: list[EvaluationAnalyticsQualityGateIssue]
    failure_categories: list[EvaluationAnalyticsFailureCategory]
    warnings: list[EvaluationAnalyticsWarning]
class ExecutorCoveragePlanModeBucket(BaseModel):
    """Executor run count grouped under a single plan mode."""

    # Plan mode that keys this bucket.
    plan_mode: str
    # Number of executor runs counted for the plan mode.
    count: int
class ExecutorCoverageOutputModeBucket(BaseModel):
    """Executor run count grouped under a single job output mode."""

    # Output mode that keys this bucket.
    output_mode: str
    # Number of executor runs counted for the output mode.
    count: int
class ExecutorCoverageTaskKeyBucket(BaseModel):
    """Occurrence count for one executor task key."""

    # Task key that keys this bucket.
    task_key: str
    # Number of runs that listed the task key.
    count: int
class ExecutorCoverageAssetBucket(BaseModel):
    """Occurrence count for one result asset name."""

    # Asset name that keys this bucket.
    asset: str
    # Number of runs that listed the asset.
    count: int
class ExecutorCoverageResponse(BaseModel):
    """Response schema describing aggregated executor coverage."""

    # --- Query echo -------------------------------------------------------
    scope: str
    window_days: int | None = None
    plan_mode: str | None = None

    # --- Headline totals --------------------------------------------------
    total_runs: int
    total_planned_tasks: int
    total_executed_tasks: int
    total_ignored_tasks: int
    coverage_ratio: float

    # --- Distinct entities covered by the aggregated runs -----------------
    job_count: int
    story_count: int
    user_count: int

    # --- Breakdowns -------------------------------------------------------
    by_plan_mode: list[ExecutorCoveragePlanModeBucket]
    by_output_mode: list[ExecutorCoverageOutputModeBucket]
    executed_task_keys: list[ExecutorCoverageTaskKeyBucket]
    ignored_task_keys: list[ExecutorCoverageTaskKeyBucket]
    result_assets: list[ExecutorCoverageAssetBucket]
class AdminGenerationJobEventResponse(BaseModel):
    """One generation job event as returned inside an admin job trace."""

    id: int
    job_id: str
    # Optional: an event may not be linked to a story.
    story_id: int | None = None
    event_type: str
    status: str
    message: str | None = None
    # Free-form event metadata; defaults to an empty dict.
    event_metadata: dict[str, Any] = Field(default_factory=dict)
    created_at: datetime
class AdminGenerationJobTraceResponse(BaseModel):
    """Response schema of the admin generation job trace endpoint."""

    # --- Job identity -----------------------------------------------------
    id: str
    user_id: str
    story_id: int | None = None
    output_mode: str
    input_type: str

    # --- Progress and state -----------------------------------------------
    status: str
    current_step: str
    progress_percent: int
    progress_label: str
    is_terminal: bool
    can_cancel: bool = False
    can_retry: bool = False

    # --- Payloads ---------------------------------------------------------
    result_snapshot: dict[str, Any] = Field(default_factory=dict)
    error_message: str | None = None
    request_payload: dict[str, Any] = Field(default_factory=dict)

    # --- Trace detail -----------------------------------------------------
    executor_coverage: ExecutorCoverageResponse
    events: list[AdminGenerationJobEventResponse] = Field(default_factory=list)

    # --- Timestamps -------------------------------------------------------
    created_at: datetime
    updated_at: datetime
class HarnessReadinessCheck(BaseModel):
    """Outcome of a single harness readiness check."""

    # Identifier of the check.
    code: str
    # One of the three readiness levels.
    status: Literal["ready", "needs_attention", "blocked"]
    # Human-readable explanation of the outcome.
    message: str
    # Supporting figures for the outcome; defaults to an empty dict.
    details: dict[str, Any] = Field(default_factory=dict)
class HarnessReadinessGoldenReplay(BaseModel):
    """Summary of a golden-case replay run."""

    # Overall replay verdict.
    passed: bool
    # Number of golden cases that were replayed.
    total_cases: int
    # Identifiers of the cases that failed.
    failed_case_ids: list[str]
    # Nested name -> name -> count mapping; defaults to an empty dict.
    coverage_summary: dict[str, dict[str, int]] = Field(default_factory=dict)
class HarnessReadinessThresholds(BaseModel):
    """Minimum values applied by the harness readiness checks."""

    # Sample-size minimums.
    min_runtime_evaluations: int
    min_executor_runs: int

    # Quality minimums.
    min_evaluation_pass_rate: float
    min_evaluation_average_score: float
    min_executor_coverage_ratio: float
class HarnessReadinessResponse(BaseModel):
    """Response schema of the admin harness readiness endpoint."""

    # --- Query echo -------------------------------------------------------
    scope: str
    window_days: int | None = None

    # --- Verdict ----------------------------------------------------------
    status: Literal["ready", "needs_attention", "blocked"]
    thresholds: HarnessReadinessThresholds
    checks: list[HarnessReadinessCheck]

    # --- Evidence behind the verdict --------------------------------------
    golden_replay: HarnessReadinessGoldenReplay
    evaluation_analytics: EvaluationAnalyticsResponse
    executor_coverage: ExecutorCoverageResponse
@router.get("/providers/adapters")
async def list_available_adapters():
"""获取所有可用的适配器类型 (定义的类)。"""
@@ -137,6 +305,55 @@ async def get_provider_analytics(
)
@router.get("/evaluations/analytics", response_model=EvaluationAnalyticsResponse)
async def get_evaluation_analytics(
    days: int | None = Query(default=None, ge=1, le=365),
    artifact: Literal["story_text", "storybook_pages"] | None = Query(default=None),
    db: AsyncSession = Depends(get_db),
):
    """Return the internal content-evaluation summary (admin control plane only).

    Args:
        days: Optional look-back window in days (1-365); omitted means no
            time filter.
        artifact: Optional artifact filter.
        db: Database session dependency.
    """
    analytics = await get_admin_evaluation_analytics(db, days=days, artifact=artifact)
    return analytics
@router.get("/executors/coverage", response_model=ExecutorCoverageResponse)
async def get_executor_coverage(
    days: int | None = Query(default=None, ge=1, le=365),
    plan_mode: Literal["asset_generation", "asset_retry"] | None = Query(default=None),
    db: AsyncSession = Depends(get_db),
):
    """Return internal executor execution coverage (admin control plane only).

    Args:
        days: Optional look-back window in days (1-365); omitted means no
            time filter.
        plan_mode: Optional plan-mode filter.
        db: Database session dependency.
    """
    coverage = await get_admin_executor_coverage(db, days=days, plan_mode=plan_mode)
    return coverage
@router.get("/harness/readiness", response_model=HarnessReadinessResponse)
async def get_harness_readiness(
    days: int | None = Query(default=None, ge=1, le=365),
    db: AsyncSession = Depends(get_db),
):
    """Return the internal harness readiness review summary (admin control plane only).

    Args:
        days: Optional look-back window in days (1-365); omitted means no
            time filter.
        db: Database session dependency.
    """
    readiness = await get_admin_harness_readiness(db, days=days)
    return readiness
@router.get(
    "/generations/jobs/{job_id}/trace",
    response_model=AdminGenerationJobTraceResponse,
)
async def get_generation_job_trace(
    job_id: str,
    db: AsyncSession = Depends(get_db),
):
    """Return the complete internal generation trace for one job.

    Intended only for troubleshooting and review from the admin control plane.

    Args:
        job_id: Identifier of the generation job, taken from the URL path.
        db: Database session dependency.
    """
    trace = await get_admin_generation_job_trace(db, job_id=job_id)
    return trace
@router.get("/providers", response_model=list[ProviderResponse])
async def list_providers(db: AsyncSession = Depends(get_db)):
result = await db.execute(select(Provider))

View File

@@ -24,6 +24,7 @@ from app.schemas.story_schemas import (
GenerationProviderStatsResponse,
GenerationRequest,
GenerationResponse,
GenerationTraceSummaryResponse,
StoryAssetRetryRequest,
StoryAudioStatusResponse,
StorybookRequest,
@@ -37,6 +38,7 @@ from app.services import story_service
from app.services.generation_jobs import (
get_generation_job_detail,
get_story_provider_stats,
get_story_trace_summary,
get_user_generation_ops_summary,
get_user_provider_analytics,
list_story_generation_jobs,
@@ -181,6 +183,25 @@ async def get_generation_provider_stats(
)
@router.get(
    "/generations/{story_id}/trace-summary",
    response_model=GenerationTraceSummaryResponse,
)
async def get_generation_trace_summary(
    story_id: int,
    days: int | None = Query(default=None, ge=1, le=365),
    user: User = Depends(require_user),
    db: AsyncSession = Depends(get_db),
):
    """Return the workflow trace summary built from a story's generation job events.

    Args:
        story_id: Story identifier taken from the URL path.
        days: Optional look-back window in days (1-365).
        user: Authenticated user; their id is forwarded to the service.
        db: Database session dependency.
    """
    summary = await get_story_trace_summary(
        db,
        story_id=story_id,
        user_id=user.id,
        days=days,
    )
    return summary
@router.get("/generations/{story_id}", response_model=StoryDetailResponse)
async def get_generation(
story_id: int,

View File

@@ -244,6 +244,25 @@ class GenerationProviderStatsResponse(BaseModel):
failure_reasons: list[GenerationProviderFailureReasonResponse] = Field(default_factory=list)
class GenerationTraceBucketResponse(BaseModel):
    """A single named bucket of an aggregated generation trace."""

    # Label that keys this bucket.
    name: str
    # Number of events counted under the label.
    count: int
class GenerationTraceSummaryResponse(BaseModel):
    """Summary of a story's workflow trace, aggregated from generation job events."""

    story_id: int
    # Look-back window that was applied, if any.
    window_days: int | None = None

    # Headline totals.
    total_events: int
    failed_events: int

    # Breakdowns; each defaults to an empty list.
    by_step: list[GenerationTraceBucketResponse] = Field(default_factory=list)
    by_artifact: list[GenerationTraceBucketResponse] = Field(default_factory=list)
    failure_categories: list[GenerationTraceBucketResponse] = Field(default_factory=list)
class GenerationProviderAnalyticsResponse(BaseModel):
"""Provider call stats aggregated across one user's generation history."""

View File

@@ -0,0 +1,204 @@
"""Admin-only analytics for internal generation evaluation events."""
from __future__ import annotations
from datetime import datetime, timedelta, timezone
from typing import Any
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.models import GenerationJob, GenerationJobEvent
def _as_float(value: Any) -> float | None:
if isinstance(value, int | float):
return float(value)
return None
def _sorted_count_buckets(counts: dict[str, int], *, key_name: str) -> list[dict[str, Any]]:
return [
{key_name: name, "count": count}
for name, count in sorted(
counts.items(),
key=lambda item: (-item[1], item[0]),
)
]
def _average_bucket(
totals: dict[str, float],
counts: dict[str, int],
*,
key_name: str,
) -> list[dict[str, Any]]:
rows = [
{
key_name: name,
"average_score": round(totals[name] / counts[name], 4),
"count": counts[name],
}
for name in totals
if counts.get(name)
]
rows.sort(key=lambda item: (-int(item["count"]), str(item[key_name])))
return rows
def _score_band(score: float) -> str:
if score >= 0.9:
return "excellent"
if score >= 0.8:
return "good"
if score >= 0.7:
return "pass"
if score > 0:
return "blocked_low_score"
return "blocked_quality_gate"
def _metadata_scores(metadata: dict[str, Any]) -> list[dict[str, Any]]:
raw_scores = metadata.get("scores")
if not isinstance(raw_scores, list):
return []
return [score for score in raw_scores if isinstance(score, dict)]
def _quality_gate_issues(metadata: dict[str, Any]) -> list[dict[str, Any]]:
quality_gate = metadata.get("quality_gate")
if not isinstance(quality_gate, dict):
return []
raw_issues = quality_gate.get("issues")
if not isinstance(raw_issues, list):
return []
return [issue for issue in raw_issues if isinstance(issue, dict)]
async def get_admin_evaluation_analytics(
    db: AsyncSession,
    *,
    days: int | None = None,
    artifact: str | None = None,
) -> dict[str, Any]:
    """Aggregate ``evaluation_completed`` events into an admin analytics summary.

    Args:
        db: Async database session used to load the events with their jobs.
        days: Optional look-back window; when given, only events created at or
            after ``now(UTC) - days`` are loaded.
        artifact: Optional artifact filter, applied in Python against each
            event's ``artifact`` metadata (``"unknown"`` when absent).

    Returns:
        A dict with headline totals, distinct job/story/user counts and the
        sorted breakdown buckets.
    """

    def _bump(counter: dict[str, int], key: str) -> None:
        # Increment a plain-dict counter in place.
        counter[key] = counter.get(key, 0) + 1

    statement = (
        select(GenerationJobEvent, GenerationJob)
        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
        .where(GenerationJobEvent.event_type == "evaluation_completed")
        .order_by(GenerationJobEvent.id)
    )
    if days is not None:
        window_start = datetime.now(timezone.utc) - timedelta(days=days)
        statement = statement.where(GenerationJobEvent.created_at >= window_start)
    result = await db.execute(statement)
    rows = result.all()

    evaluation_total = 0
    passed_total = 0
    blocked_total = 0
    score_sum = 0.0
    scored_total = 0
    seen_jobs: set[str] = set()
    seen_stories: set[int] = set()
    seen_users: set[str] = set()
    artifact_counts: dict[str, int] = {}
    output_mode_counts: dict[str, int] = {}
    band_counts: dict[str, int] = {}
    dimension_sums: dict[str, float] = {}
    dimension_samples: dict[str, int] = {}
    gate_code_counts: dict[str, int] = {}
    failure_category_counts: dict[str, int] = {}
    warning_counts: dict[str, int] = {}

    for event, job in rows:
        metadata = event.event_metadata or {}
        event_artifact = str(metadata.get("artifact") or "unknown")
        if artifact is not None and event_artifact != artifact:
            continue

        evaluation_total += 1
        seen_jobs.add(job.id)
        seen_users.add(job.user_id)
        # Prefer the story recorded on the event; fall back to the job's story.
        story_ref = event.story_id if event.story_id is not None else job.story_id
        if story_ref is not None:
            seen_stories.add(int(story_ref))

        _bump(artifact_counts, event_artifact)
        _bump(output_mode_counts, job.output_mode)

        # Only a literal ``True`` counts; truthy non-bool values are ignored.
        if metadata.get("passed") is True:
            passed_total += 1
        if metadata.get("blocking") is True:
            blocked_total += 1

        overall = _as_float(metadata.get("overall_score"))
        if overall is not None:
            score_sum += overall
            scored_total += 1
            _bump(band_counts, _score_band(overall))

        for entry in _metadata_scores(metadata):
            dimension = entry.get("dimension")
            dimension_value = _as_float(entry.get("score"))
            if isinstance(dimension, str) and dimension_value is not None:
                dimension_sums[dimension] = (
                    dimension_sums.get(dimension, 0.0) + dimension_value
                )
                _bump(dimension_samples, dimension)

        for issue in _quality_gate_issues(metadata):
            issue_code = issue.get("code")
            if isinstance(issue_code, str) and issue_code:
                _bump(gate_code_counts, issue_code)
            issue_category = issue.get("failure_category")
            if isinstance(issue_category, str) and issue_category:
                _bump(failure_category_counts, issue_category)

        raw_warnings = metadata.get("warnings")
        if isinstance(raw_warnings, list):
            for text in raw_warnings:
                if isinstance(text, str) and text:
                    _bump(warning_counts, text)

    pass_rate = round(passed_total / evaluation_total, 4) if evaluation_total else 0.0
    average_score = round(score_sum / scored_total, 4) if scored_total else None

    return {
        "scope": "admin_internal_evaluations",
        "window_days": days,
        "artifact": artifact,
        "total_evaluations": evaluation_total,
        "passed_evaluations": passed_total,
        "blocked_evaluations": blocked_total,
        "pass_rate": pass_rate,
        "average_score": average_score,
        "job_count": len(seen_jobs),
        "story_count": len(seen_stories),
        "user_count": len(seen_users),
        "by_artifact": _sorted_count_buckets(artifact_counts, key_name="artifact"),
        "by_output_mode": _sorted_count_buckets(output_mode_counts, key_name="output_mode"),
        "score_bands": _sorted_count_buckets(band_counts, key_name="band"),
        "dimension_scores": _average_bucket(
            dimension_sums,
            dimension_samples,
            key_name="dimension",
        ),
        "quality_gate_issues": _sorted_count_buckets(gate_code_counts, key_name="code"),
        "failure_categories": _sorted_count_buckets(
            failure_category_counts,
            key_name="category",
        ),
        "warnings": _sorted_count_buckets(warning_counts, key_name="message"),
    }

View File

@@ -0,0 +1,147 @@
"""Admin-only analytics for internal workflow executor coverage."""
from __future__ import annotations
from collections.abc import Iterable
from datetime import datetime, timedelta, timezone
from typing import Any
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.models import GenerationJob, GenerationJobEvent
def _as_int(value: Any) -> int:
if isinstance(value, bool):
return int(value)
if isinstance(value, int):
return value
if isinstance(value, float):
return int(value)
return 0
def _sorted_count_buckets(counts: dict[str, int], *, key_name: str) -> list[dict[str, Any]]:
return [
{key_name: name, "count": count}
for name, count in sorted(
counts.items(),
key=lambda item: (-item[1], item[0]),
)
]
def _iter_strings(value: Any) -> Iterable[str]:
if not isinstance(value, list | tuple | set):
return
for item in value:
if isinstance(item, str) and item:
yield item
def summarize_executor_coverage_rows(
    rows: Iterable[tuple[GenerationJobEvent, GenerationJob]],
    *,
    days: int | None = None,
    plan_mode: str | None = None,
    scope: str = "admin_internal_executor_coverage",
) -> dict[str, Any]:
    """Fold ``(event, job)`` rows into an executor coverage summary.

    Args:
        rows: Pairs of an executor event and the job it belongs to.
        days: Look-back window echoed back as ``window_days``; no filtering is
            done with it here.
        plan_mode: Optional filter; rows whose ``plan_mode`` metadata
            (``"unknown"`` when absent) differs are skipped.
        scope: Label echoed back as ``scope``.

    Returns:
        A dict with run and task totals, the coverage ratio, distinct
        job/story/user counts and sorted breakdown buckets.
    """

    def _bump(counter: dict[str, int], key: str) -> None:
        # Increment a plain-dict counter in place.
        counter[key] = counter.get(key, 0) + 1

    run_total = 0
    planned_total = 0
    executed_total = 0
    ignored_total = 0
    seen_jobs: set[str] = set()
    seen_stories: set[int] = set()
    seen_users: set[str] = set()
    plan_mode_counts: dict[str, int] = {}
    output_mode_counts: dict[str, int] = {}
    executed_key_counts: dict[str, int] = {}
    ignored_key_counts: dict[str, int] = {}
    asset_counts: dict[str, int] = {}

    # Metadata list fields paired with the counter each one feeds.
    list_fields = (
        ("executed_task_keys", executed_key_counts),
        ("ignored_task_keys", ignored_key_counts),
        ("result_assets", asset_counts),
    )

    for event, job in rows:
        metadata = event.event_metadata or {}
        event_plan_mode = str(metadata.get("plan_mode") or "unknown")
        if plan_mode is not None and event_plan_mode != plan_mode:
            continue

        run_total += 1
        seen_jobs.add(job.id)
        seen_users.add(job.user_id)
        # Prefer the story recorded on the event; fall back to the job's story.
        story_ref = event.story_id if event.story_id is not None else job.story_id
        if story_ref is not None:
            seen_stories.add(int(story_ref))

        _bump(plan_mode_counts, event_plan_mode)
        _bump(output_mode_counts, job.output_mode)

        planned_total += _as_int(metadata.get("planned_task_count"))
        executed_total += _as_int(metadata.get("executed_task_count"))
        ignored_total += _as_int(metadata.get("ignored_task_count"))

        for field_name, counter in list_fields:
            for name in _iter_strings(metadata.get(field_name)):
                _bump(counter, name)

    # Zero planned tasks means there is no ratio to compute; report 0.0.
    if planned_total:
        coverage_ratio = round(executed_total / planned_total, 4)
    else:
        coverage_ratio = 0.0

    return {
        "scope": scope,
        "window_days": days,
        "plan_mode": plan_mode,
        "total_runs": run_total,
        "total_planned_tasks": planned_total,
        "total_executed_tasks": executed_total,
        "total_ignored_tasks": ignored_total,
        "coverage_ratio": coverage_ratio,
        "job_count": len(seen_jobs),
        "story_count": len(seen_stories),
        "user_count": len(seen_users),
        "by_plan_mode": _sorted_count_buckets(plan_mode_counts, key_name="plan_mode"),
        "by_output_mode": _sorted_count_buckets(output_mode_counts, key_name="output_mode"),
        "executed_task_keys": _sorted_count_buckets(executed_key_counts, key_name="task_key"),
        "ignored_task_keys": _sorted_count_buckets(ignored_key_counts, key_name="task_key"),
        "result_assets": _sorted_count_buckets(asset_counts, key_name="asset"),
    }
async def get_admin_executor_coverage(
    db: AsyncSession,
    *,
    days: int | None = None,
    plan_mode: str | None = None,
) -> dict[str, Any]:
    """Load ``executor_completed`` events and summarize executor coverage.

    Args:
        db: Async database session used to load the events with their jobs.
        days: Optional look-back window; when given, only events created at or
            after ``now(UTC) - days`` are loaded.
        plan_mode: Optional plan-mode filter forwarded to the summarizer.

    Returns:
        The summary produced by ``summarize_executor_coverage_rows``.
    """
    statement = (
        select(GenerationJobEvent, GenerationJob)
        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
        .where(GenerationJobEvent.event_type == "executor_completed")
        .order_by(GenerationJobEvent.id)
    )
    if days is not None:
        window_start = datetime.now(timezone.utc) - timedelta(days=days)
        statement = statement.where(GenerationJobEvent.created_at >= window_start)
    result = await db.execute(statement)
    return summarize_executor_coverage_rows(
        result.all(),
        days=days,
        plan_mode=plan_mode,
    )

View File

@@ -0,0 +1,52 @@
"""Admin-only generation trace detail service."""
from __future__ import annotations
from typing import Any
from fastapi import HTTPException
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.models import GenerationJob, GenerationJobEvent
from app.services.admin_executor_coverage import summarize_executor_coverage_rows
from app.services.generation_jobs import (
generation_event_to_response,
generation_job_to_summary,
)
async def get_admin_generation_job_trace(
    db: AsyncSession,
    *,
    job_id: str,
) -> dict[str, Any]:
    """Build the full internal trace of one generation job.

    Args:
        db: Async database session.
        job_id: Identifier of the generation job to load.

    Returns:
        The job summary extended with ``user_id``, the raw ``request_payload``,
        a per-job executor coverage summary and every event of the job in
        ascending event-id order.

    Raises:
        HTTPException: 404 when no job with ``job_id`` exists.
    """
    job_result = await db.execute(
        select(GenerationJob).where(GenerationJob.id == job_id)
    )
    job = job_result.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Generation job not found")

    event_result = await db.execute(
        select(GenerationJobEvent)
        .where(GenerationJobEvent.job_id == job.id)
        .order_by(GenerationJobEvent.id)
    )
    events = event_result.scalars().all()

    # Only executor events feed the coverage summary; each is paired with its job.
    executor_rows = [
        (entry, job) for entry in events if entry.event_type == "executor_completed"
    ]
    coverage = summarize_executor_coverage_rows(
        executor_rows,
        scope="admin_internal_job_executor_coverage",
    )

    trace: dict[str, Any] = dict(generation_job_to_summary(job))
    trace["user_id"] = job.user_id
    trace["request_payload"] = job.request_payload or {}
    trace["executor_coverage"] = coverage
    trace["events"] = [generation_event_to_response(entry) for entry in events]
    return trace

View File

@@ -0,0 +1,262 @@
"""Admin-only readiness audit for harness-driven generation."""
from __future__ import annotations
from pathlib import Path
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics
from app.services.admin_executor_coverage import get_admin_executor_coverage
from app.services.harness.evaluation_replay import replay_evaluation_golden_cases
# Golden-case fixture file, resolved relative to this module:
# <module dir>/harness/fixtures/evaluation_golden_cases.json
_GOLDEN_CASES_PATH = (
Path(__file__).resolve().parent
/ "harness"
/ "fixtures"
/ "evaluation_golden_cases.json"
)
# Readiness thresholds compared against the analytics by the check helpers
# in this module and echoed back under "thresholds" in the readiness response.
# Minimum number of evaluation samples / executor runs in the window.
_MIN_RUNTIME_EVALUATIONS = 1
_MIN_EXECUTOR_RUNS = 1
# Minimum evaluation pass rate and average score.
_MIN_EVALUATION_PASS_RATE = 0.7
_MIN_EVALUATION_AVERAGE_SCORE = 0.7
# Minimum executed/planned task ratio.
_MIN_EXECUTOR_COVERAGE_RATIO = 0.2
def _check(
*,
code: str,
status: str,
message: str,
details: dict[str, Any] | None = None,
) -> dict[str, Any]:
return {
"code": code,
"status": status,
"message": message,
"details": details or {},
}
def _overall_status(checks: list[dict[str, Any]]) -> str:
statuses = {check["status"] for check in checks}
if "blocked" in statuses:
return "blocked"
if "needs_attention" in statuses:
return "needs_attention"
return "ready"
def _run_golden_replay() -> dict[str, Any]:
    """Replay the golden evaluation cases and summarize the outcome.

    When the fixture file does not exist, a failed result with zero cases and
    the pseudo case id ``"fixture_missing"`` is returned instead of replaying.
    """
    if _GOLDEN_CASES_PATH.exists():
        replay = replay_evaluation_golden_cases(_GOLDEN_CASES_PATH)
        return {
            "passed": replay.passed,
            "total_cases": len(replay.cases),
            "failed_case_ids": list(replay.failed_case_ids),
            "coverage_summary": replay.coverage_summary(),
        }
    return {
        "passed": False,
        "total_cases": 0,
        "failed_case_ids": ["fixture_missing"],
        "coverage_summary": {},
    }
def _golden_replay_check(golden_replay: dict[str, Any]) -> dict[str, Any]:
    """Turn the golden replay summary into a readiness check.

    The check is ``ready`` only when the replay passed and at least one case
    ran; otherwise it is ``blocked`` and the failing case ids are included in
    the details.
    """
    if not (golden_replay["passed"] and golden_replay["total_cases"] > 0):
        return _check(
            code="golden_replay",
            status="blocked",
            message="内部 golden replay 未通过,暂停扩大 harness 接管范围。",
            details={
                "total_cases": golden_replay["total_cases"],
                "failed_case_count": len(golden_replay["failed_case_ids"]),
                "failed_case_ids": golden_replay["failed_case_ids"],
            },
        )
    return _check(
        code="golden_replay",
        status="ready",
        message="内部 golden replay 全部通过。",
        details={
            "total_cases": golden_replay["total_cases"],
            "failed_case_count": len(golden_replay["failed_case_ids"]),
        },
    )
def _evaluation_sample_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Check that the window holds enough runtime evaluation samples.

    The check is ``ready`` when ``total_evaluations`` reaches
    ``_MIN_RUNTIME_EVALUATIONS`` and ``needs_attention`` otherwise.
    """
    total = int(evaluation_analytics["total_evaluations"])
    has_samples = total >= _MIN_RUNTIME_EVALUATIONS
    if has_samples:
        status = "ready"
        message = "当前窗口已有内部 evaluation 运行样本。"
    else:
        status = "needs_attention"
        message = "当前窗口缺少内部 evaluation 运行样本,建议先跑生成烟测。"
    return _check(
        code="runtime_evaluation_samples",
        status=status,
        message=message,
        details={
            "total_evaluations": total,
            "min_required": _MIN_RUNTIME_EVALUATIONS,
        },
    )
def _evaluation_quality_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Check runtime evaluation quality against the readiness thresholds.

    * No evaluations at all: ``needs_attention``.
    * Pass rate below the minimum, or a known average score below the
      minimum: ``blocked``.
    * Otherwise: ``ready``. A missing (``None``) average score does not block.
    """
    total = int(evaluation_analytics["total_evaluations"])
    pass_rate = float(evaluation_analytics["pass_rate"])
    average_score = evaluation_analytics["average_score"]

    if total == 0:
        return _check(
            code="runtime_evaluation_quality",
            status="needs_attention",
            message="暂无运行期 evaluation 质量样本。",
            details={
                "total_evaluations": total,
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )

    observed = {
        "pass_rate": pass_rate,
        "average_score": average_score,
        "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
    }
    below_threshold = pass_rate < _MIN_EVALUATION_PASS_RATE or (
        average_score is not None
        and float(average_score) < _MIN_EVALUATION_AVERAGE_SCORE
    )
    if below_threshold:
        return _check(
            code="runtime_evaluation_quality",
            status="blocked",
            message="运行期 evaluation 质量未达到内部 readiness 门槛。",
            details={
                **observed,
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )
    return _check(
        code="runtime_evaluation_quality",
        status="ready",
        message="运行期 evaluation 通过率和平均分达到内部 readiness 门槛。",
        details=observed,
    )
def _executor_sample_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Check that the window holds enough executor coverage samples.

    The check is ``ready`` when ``total_runs`` reaches ``_MIN_EXECUTOR_RUNS``
    and ``needs_attention`` otherwise.
    """
    total_runs = int(executor_coverage["total_runs"])
    has_samples = total_runs >= _MIN_EXECUTOR_RUNS
    if has_samples:
        status = "ready"
        message = "当前窗口已有 executor coverage 运行样本。"
    else:
        status = "needs_attention"
        message = "当前窗口缺少 executor coverage 样本,建议先跑资产生成或重试烟测。"
    return _check(
        code="executor_coverage_samples",
        status=status,
        message=message,
        details={
            "total_runs": total_runs,
            "min_required": _MIN_EXECUTOR_RUNS,
        },
    )
def _executor_ratio_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Check the executor coverage ratio against the readiness threshold.

    * No executor runs: ``needs_attention``.
    * Ratio below ``_MIN_EXECUTOR_COVERAGE_RATIO``: ``blocked``.
    * Otherwise: ``ready``.
    """
    total_runs = int(executor_coverage["total_runs"])
    coverage_ratio = float(executor_coverage["coverage_ratio"])

    if total_runs == 0:
        return _check(
            code="executor_coverage_ratio",
            status="needs_attention",
            message="暂无 executor coverage 运行样本。",
            details={
                "total_runs": total_runs,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
            },
        )

    meets_threshold = not (coverage_ratio < _MIN_EXECUTOR_COVERAGE_RATIO)
    if meets_threshold:
        return _check(
            code="executor_coverage_ratio",
            status="ready",
            message="executor coverage ratio 达到内部 readiness 门槛。",
            details={
                "coverage_ratio": coverage_ratio,
                "total_planned_tasks": executor_coverage["total_planned_tasks"],
                "total_executed_tasks": executor_coverage["total_executed_tasks"],
            },
        )
    return _check(
        code="executor_coverage_ratio",
        status="blocked",
        message="executor coverage ratio 未达到内部 readiness 门槛。",
        details={
            "coverage_ratio": coverage_ratio,
            "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
            "total_planned_tasks": executor_coverage["total_planned_tasks"],
            "total_executed_tasks": executor_coverage["total_executed_tasks"],
        },
    )
async def get_admin_harness_readiness(
    db: AsyncSession,
    *,
    days: int | None = None,
) -> dict[str, Any]:
    """Assemble the admin-only harness readiness audit.

    Runs the golden replay, loads evaluation analytics and executor coverage
    for the optional ``days`` window, evaluates the five readiness checks and
    reports the most severe status together with the raw evidence.

    Args:
        db: Async database session passed to the analytics services.
        days: Optional look-back window in days, forwarded to both services.
    """
    golden_replay = _run_golden_replay()
    evaluation_analytics = await get_admin_evaluation_analytics(db, days=days)
    executor_coverage = await get_admin_executor_coverage(db, days=days)

    checks = [_golden_replay_check(golden_replay)]
    checks.append(_evaluation_sample_check(evaluation_analytics))
    checks.append(_evaluation_quality_check(evaluation_analytics))
    checks.append(_executor_sample_check(executor_coverage))
    checks.append(_executor_ratio_check(executor_coverage))

    thresholds = {
        "min_runtime_evaluations": _MIN_RUNTIME_EVALUATIONS,
        "min_executor_runs": _MIN_EXECUTOR_RUNS,
        "min_evaluation_pass_rate": _MIN_EVALUATION_PASS_RATE,
        "min_evaluation_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
        "min_executor_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
    }
    return {
        "scope": "admin_internal_harness_readiness",
        "window_days": days,
        "status": _overall_status(checks),
        "thresholds": thresholds,
        "checks": checks,
        "golden_replay": golden_replay,
        "evaluation_analytics": evaluation_analytics,
        "executor_coverage": executor_coverage,
    }

View File

@@ -90,11 +90,13 @@ def _job_progress(job: GenerationJob) -> dict[str, Any]:
progress_map: dict[str, tuple[int, str]] = {
"request_accepted": (5, "已接收请求"),
"workflow_planned": (8, "工作流已规划"),
"retry_queued": (8, "重新排队中"),
"worker_started": (12, "后台任务已开始"),
"cancel_requested": (15, "已请求取消"),
"context_prepared": (20, "上下文已准备"),
"narrative_generated": (45, "正文已生成"),
"evaluation_completed": (52, "内容评测已完成"),
"story_saved": (60, "主记录已保存"),
"provider_call_started": (65, "Provider 调用中"),
"provider_call_succeeded": (72, "Provider 调用成功"),
@@ -307,6 +309,137 @@ def generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any]:
}
# Allow-list of event metadata keys that may be copied into user-facing event
# responses; public_generation_event_metadata() ignores every other key.
_PUBLIC_EVENT_METADATA_KEYS = {
"adapter",
"artifact",
"asset",
"assets",
"attempted_cover",
"audio_status",
"blocks_main_result",
"capability",
"completed_pages",
"cover_prompt_present",
"estimated_cost_usd",
"failed_pages",
"failure_category",
"generation_status",
"has_memory_context",
"image_status",
"input_type",
"latency_ms",
"mode",
"output_mode",
"page_count",
"page_number",
"recoverable",
"requested_from_step",
"retryable",
"scope",
"stale_after_minutes",
"status",
"step",
"strategy",
"text_status",
}
# Allow-list of request payload keys that may be copied into user-facing job
# details; public_generation_request_payload() ignores every other key.
_PUBLIC_REQUEST_PAYLOAD_KEYS = {
"assets",
"child_profile_id",
"generate_images",
"input_type",
"output_mode",
"page_count",
"story_id",
"type",
"universe_id",
}
def _public_metadata_value(value: Any) -> Any:
"""Return a JSON-safe public value or None when the value is internal."""
if isinstance(value, str | int | float | bool) or value is None:
return value
if isinstance(value, list):
public_items = [
item
for item in value
if isinstance(item, str | int | float | bool) or item is None
]
return public_items
return None
def public_generation_request_payload(job: GenerationJob) -> dict[str, Any]:
    """Return the request payload fields that are safe for user-facing job details.

    Only keys in ``_PUBLIC_REQUEST_PAYLOAD_KEYS`` are considered, in sorted
    order. Keys whose sanitized value is ``None`` are omitted.
    """
    raw_payload = job.request_payload or {}
    sanitized = (
        (key, _public_metadata_value(raw_payload[key]))
        for key in sorted(_PUBLIC_REQUEST_PAYLOAD_KEYS)
        if key in raw_payload
    )
    return {key: safe_value for key, safe_value in sanitized if safe_value is not None}
def _public_plan_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
"""Expose only coarse workflow plan metadata to user-facing responses."""
plan = metadata.get("plan")
if not isinstance(plan, dict):
return {}
public: dict[str, Any] = {}
mode = plan.get("mode")
if isinstance(mode, str):
public["plan_mode"] = mode
tasks = plan.get("tasks")
if isinstance(tasks, list):
public["planned_task_count"] = len(tasks)
public["recoverable_task_count"] = sum(
1
for task in tasks
if isinstance(task, dict) and task.get("recoverable") is True
)
return public
def public_generation_event_metadata(event: GenerationJobEvent) -> dict[str, Any]:
    """Return the event metadata that is safe for user-facing event streams.

    Only keys in ``_PUBLIC_EVENT_METADATA_KEYS`` are kept, in sorted order,
    after sanitizing each value. Keys whose sanitized value is ``None`` are
    dropped. ``workflow_planned`` events additionally get the coarse plan
    summary merged in.
    """
    raw_metadata = event.event_metadata or {}
    exposed: dict[str, Any] = {
        key: safe_value
        for key in sorted(_PUBLIC_EVENT_METADATA_KEYS)
        if key in raw_metadata
        and (safe_value := _public_metadata_value(raw_metadata[key])) is not None
    }
    if event.event_type == "workflow_planned":
        exposed.update(_public_plan_metadata(raw_metadata))
    return exposed
def public_generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any] | None:
    """Convert an event for user-facing APIs, stripping internal data.

    Returns ``None`` for ``evaluation_completed`` and ``executor_completed``
    events so callers can drop them. For every other event the regular
    response is returned with its metadata replaced by the public subset.
    """
    hidden_event_types = {"evaluation_completed", "executor_completed"}
    if event.event_type in hidden_event_types:
        return None
    payload = generation_event_to_response(event)
    payload["event_metadata"] = public_generation_event_metadata(event)
    return payload
def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
"""Convert a generation job ORM object to an API summary dict."""
@@ -328,6 +461,23 @@ def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
}
def public_generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
    """Convert a job for user-facing APIs, masking internal steps.

    When the job's current step is ``evaluation_completed`` or
    ``executor_completed``, the summary reports a substitute step
    (``narrative_generated`` or ``workflow_planned``) with that step's progress
    percent and label, and ``is_terminal`` is forced to ``False``.
    """
    # internal step -> (public step, progress percent, progress label)
    masked_steps = {
        "evaluation_completed": ("narrative_generated", 45, "正文已生成"),
        "executor_completed": ("workflow_planned", 8, "工作流已规划"),
    }
    summary = generation_job_to_summary(job)
    replacement = masked_steps.get(summary["current_step"])
    if replacement is not None:
        public_step, percent, label = replacement
        summary["current_step"] = public_step
        summary["progress_percent"] = percent
        summary["progress_label"] = label
        summary["is_terminal"] = False
    return summary
async def get_generation_job_for_user(
db: AsyncSession,
*,
@@ -362,13 +512,13 @@ async def request_generation_job_cancel(
raise HTTPException(status_code=409, detail="当前任务不支持取消")
if job.status == "canceled":
return generation_job_to_summary(job)
return public_generation_job_to_summary(job)
if _is_terminal_status(job.status):
raise HTTPException(status_code=409, detail="当前任务已终止,无法取消")
if job.current_step == "cancel_requested":
return generation_job_to_summary(job)
return public_generation_job_to_summary(job)
if job.current_step in {"request_accepted", "retry_queued"}:
story = None
@@ -391,7 +541,7 @@ async def request_generation_job_cancel(
error_message="Generation canceled by user before worker execution started.",
message="Generation job was canceled before worker execution started.",
)
return generation_job_to_summary(job)
return public_generation_job_to_summary(job)
previous_step = job.current_step
job.error_message = "Cancellation requested by user."
@@ -407,7 +557,7 @@ async def request_generation_job_cancel(
)
await db.commit()
await db.refresh(job)
return generation_job_to_summary(job)
return public_generation_job_to_summary(job)
async def get_generation_job_detail(
@@ -437,9 +587,13 @@ async def get_generation_job_detail(
).scalars().all()
return {
**generation_job_to_summary(job),
"request_payload": job.request_payload or {},
"events": [generation_event_to_response(event) for event in events],
**public_generation_job_to_summary(job),
"request_payload": public_generation_request_payload(job),
"events": [
response
for event in events
if (response := public_generation_event_to_response(event)) is not None
],
}
@@ -461,7 +615,7 @@ async def list_story_generation_jobs(
.order_by(desc(GenerationJob.created_at), desc(GenerationJob.id))
)
).scalars().all()
return [generation_job_to_summary(job) for job in jobs]
return [public_generation_job_to_summary(job) for job in jobs]
async def get_active_story_generation_job(
@@ -513,6 +667,59 @@ def _as_float(value: Any) -> float | None:
return None
def _sorted_buckets(counts: dict[str, int]) -> list[dict[str, Any]]:
return [
{"name": name, "count": count}
for name, count in sorted(
counts.items(),
key=lambda item: (-item[1], item[0]),
)
]
def _aggregate_trace_events(events: list[GenerationJobEvent]) -> dict[str, Any]:
    """Summarize workflow trace metadata over a list of job events.

    ``evaluation_completed`` and ``executor_completed`` events are skipped
    entirely. For the remaining events the result reports the total count, the
    number with status ``failed``, and sorted buckets per step, per artifact
    (ignoring the ``"none"`` artifact) and per failure category. Failed events
    without a usable ``failure_category`` are counted as ``unknown_error``.
    """
    step_counts: dict[str, int] = {}
    artifact_counts: dict[str, int] = {}
    failure_counts: dict[str, int] = {}
    counted = 0
    failed = 0

    def bump(counts: dict[str, int], key: str) -> None:
        counts[key] = counts.get(key, 0) + 1

    for event in events:
        if event.event_type in {"evaluation_completed", "executor_completed"}:
            continue
        counted += 1

        metadata = event.event_metadata or {}
        step = metadata.get("step")
        artifact = metadata.get("artifact")
        failure_category = metadata.get("failure_category")

        # Only non-empty string labels are bucketed.
        if isinstance(step, str) and step:
            bump(step_counts, step)
        if isinstance(artifact, str) and artifact and artifact != "none":
            bump(artifact_counts, artifact)

        if event.status != "failed":
            continue
        failed += 1
        if isinstance(failure_category, str) and failure_category:
            bump(failure_counts, failure_category)
        else:
            bump(failure_counts, "unknown_error")

    return {
        "total_events": counted,
        "failed_events": failed,
        "by_step": _sorted_buckets(step_counts),
        "by_artifact": _sorted_buckets(artifact_counts),
        "failure_categories": _sorted_buckets(failure_counts),
    }
def _aggregate_provider_events(
events: list[GenerationJobEvent],
*,
@@ -679,6 +886,38 @@ async def get_story_provider_stats(
}
async def get_story_trace_summary(
    db: AsyncSession,
    *,
    story_id: int,
    user_id: str,
    days: int | None = None,
) -> dict[str, Any]:
    """Summarize trace metadata for one story's jobs owned by one user.

    Selects the events of every generation job matching ``story_id`` and
    ``user_id``, ordered by event id. When ``days`` is given, only events
    created within the last ``days`` days (relative to the current UTC time)
    are included. The result carries ``story_id``, ``window_days`` and the
    aggregate produced by ``_aggregate_trace_events``.
    """
    statement = (
        select(GenerationJobEvent)
        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
        .where(
            GenerationJob.story_id == story_id,
            GenerationJob.user_id == user_id,
        )
        .order_by(GenerationJobEvent.id)
    )
    if days is not None:
        window_start = datetime.now(timezone.utc) - timedelta(days=days)
        statement = statement.where(GenerationJobEvent.created_at >= window_start)

    result = await db.execute(statement)
    events = result.scalars().all()

    summary: dict[str, Any] = {"story_id": story_id, "window_days": days}
    summary.update(_aggregate_trace_events(events))
    return summary
async def get_user_provider_analytics(
db: AsyncSession,
*,

View File

@@ -0,0 +1,322 @@
"""Internal golden-case replay support for harness evaluations.
The replay helpers are intentionally not wired to user-facing APIs. They exist
to make evaluation behavior reproducible in tests and internal tooling.
"""
import json
from collections import Counter
from dataclasses import dataclass, field
from enum import StrEnum
from pathlib import Path
from typing import Any, Iterable
from app.services.adapters.storybook.primary import Storybook, StorybookPage
from app.services.adapters.text.models import StoryOutput
from app.services.harness.evaluators import (
EvaluationDimension,
EvaluationResult,
evaluate_story_output,
evaluate_storybook_output,
)
class EvaluationReplayArtifact(StrEnum):
"""Artifacts supported by deterministic evaluation replay."""
STORY = "story"
STORYBOOK = "storybook"
@dataclass(frozen=True)
class ExpectedEvaluation:
    """Outcome a golden case expects from the evaluator.

    ``passed`` and ``blocking`` are always required. The score bounds are
    optional, and the three tuple fields default to empty.
    """

    passed: bool
    blocking: bool
    min_overall_score: float | None = None
    max_overall_score: float | None = None
    required_dimensions: tuple[EvaluationDimension, ...] = field(default_factory=tuple)
    quality_gate_codes: tuple[str, ...] = field(default_factory=tuple)
    warning_substrings: tuple[str, ...] = field(default_factory=tuple)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "ExpectedEvaluation":
        """Create expectations from a decoded JSON object.

        ``passed`` and ``blocking`` must be present (``KeyError`` otherwise).
        Each entry of ``required_dimensions`` is converted to an
        ``EvaluationDimension``.
        """
        passed = bool(payload["passed"])
        blocking = bool(payload["blocking"])
        lower_bound = payload.get("min_overall_score")
        upper_bound = payload.get("max_overall_score")
        dimensions = tuple(
            map(EvaluationDimension, payload.get("required_dimensions", []))
        )
        gate_codes = tuple(payload.get("quality_gate_codes", []))
        warning_parts = tuple(payload.get("warning_substrings", []))
        return cls(
            passed=passed,
            blocking=blocking,
            min_overall_score=lower_bound,
            max_overall_score=upper_bound,
            required_dimensions=dimensions,
            quality_gate_codes=gate_codes,
            warning_substrings=warning_parts,
        )
@dataclass(frozen=True)
class EvaluationReplayCoverage:
"""Internal coverage labels for one golden replay case."""
age_band: str = "unknown"
content_shape: str = "unknown"
risk_area: str = "unknown"
tags: tuple[str, ...] = field(default_factory=tuple)
@classmethod
def from_payload(cls, payload: dict[str, Any] | None) -> "EvaluationReplayCoverage":
"""Build coverage labels from a JSON-safe payload."""
payload = payload or {}
return cls(
age_band=str(payload.get("age_band", "unknown")),
content_shape=str(payload.get("content_shape", "unknown")),
risk_area=str(payload.get("risk_area", "unknown")),
tags=tuple(str(tag) for tag in payload.get("tags", [])),
)
@dataclass(frozen=True)
class EvaluationReplayCase:
    """A single golden case: an output payload plus the expected evaluation."""

    case_id: str
    artifact: EvaluationReplayArtifact
    output_payload: dict[str, Any]
    expected: ExpectedEvaluation
    education_theme: str | None = None
    minimum_score: float = 0.7
    description: str = ""
    input_payload: dict[str, Any] = field(default_factory=dict)
    coverage: EvaluationReplayCoverage = field(default_factory=EvaluationReplayCoverage)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "EvaluationReplayCase":
        """Create a case from a decoded JSON object.

        ``minimum_score`` and ``education_theme`` are read from the ``input``
        object first and fall back to the top-level payload; ``minimum_score``
        finally defaults to 0.7.
        """
        inputs = dict(payload.get("input", {}))

        if "minimum_score" in inputs:
            threshold = inputs["minimum_score"]
        else:
            threshold = payload.get("minimum_score", 0.7)

        if "education_theme" in inputs:
            theme = inputs["education_theme"]
        else:
            theme = payload.get("education_theme")

        return cls(
            case_id=str(payload["id"]),
            artifact=EvaluationReplayArtifact(payload["artifact"]),
            description=str(payload.get("description", "")),
            input_payload=inputs,
            output_payload=dict(payload["output"]),
            education_theme=theme,
            minimum_score=float(threshold),
            expected=ExpectedEvaluation.from_payload(payload["expected"]),
            coverage=EvaluationReplayCoverage.from_payload(payload.get("coverage")),
        )

    def evaluate(self) -> EvaluationResult:
        """Evaluate the stored output with the evaluator matching the artifact.

        Story artifacts go through ``evaluate_story_output``; any other
        artifact goes through ``evaluate_storybook_output``.
        """
        if self.artifact == EvaluationReplayArtifact.STORY:
            build_output = _story_output_from_payload
            evaluator = evaluate_story_output
        else:
            build_output = _storybook_from_payload
            evaluator = evaluate_storybook_output
        return evaluator(
            build_output(self.output_payload),
            education_theme=self.education_theme,
            minimum_score=self.minimum_score,
        )

    def replay(self) -> "EvaluationReplayCaseResult":
        """Run the evaluation and record every mismatch against ``expected``."""
        outcome = self.evaluate()
        mismatches = tuple(_compare_evaluation(self, outcome))
        return EvaluationReplayCaseResult(
            case_id=self.case_id,
            artifact=self.artifact,
            coverage=self.coverage,
            evaluation=outcome,
            failures=mismatches,
        )
@dataclass(frozen=True)
class EvaluationReplayCaseResult:
    """Outcome of replaying one golden case.

    ``failures`` holds one message per expectation that was not met and is
    empty when the case matched.
    """

    case_id: str
    artifact: EvaluationReplayArtifact
    coverage: EvaluationReplayCoverage
    evaluation: EvaluationResult
    failures: tuple[str, ...] = field(default_factory=tuple)

    @property
    def expectations_met(self) -> bool:
        """Return ``True`` when no expectation mismatch was recorded."""
        return len(self.failures) == 0
@dataclass(frozen=True)
class EvaluationReplaySuiteResult:
    """Outcome of replaying a collection of golden cases."""

    cases: tuple[EvaluationReplayCaseResult, ...]

    @property
    def passed(self) -> bool:
        """Return ``True`` when every case met its expectations."""
        for case in self.cases:
            if not case.expectations_met:
                return False
        return True

    @property
    def failed_case_ids(self) -> tuple[str, ...]:
        """Return the IDs of the cases that did not meet their expectations."""
        mismatched = [case.case_id for case in self.cases if not case.expectations_met]
        return tuple(mismatched)

    def failure_report(self) -> str:
        """Return one ``case_id: failure`` line per mismatch, newline-joined."""
        return "\n".join(
            f"{case.case_id}: {failure}"
            for case in self.cases
            for failure in case.failures
        )

    def coverage_summary(self) -> dict[str, dict[str, int]]:
        """Count cases per artifact, coverage label, tag and evaluation outcome.

        The ``outcome`` bucket is ``passed`` when the evaluation itself passed
        and ``blocked`` otherwise; it does not reflect whether expectations
        were met.
        """
        cases = self.cases
        artifacts = (case.artifact.value for case in cases)
        age_bands = (case.coverage.age_band for case in cases)
        shapes = (case.coverage.content_shape for case in cases)
        risk_areas = (case.coverage.risk_area for case in cases)
        tags = (tag for case in cases for tag in case.coverage.tags)
        outcomes = (
            "passed" if case.evaluation.passed else "blocked" for case in cases
        )
        return {
            "artifact": _count_values(artifacts),
            "age_band": _count_values(age_bands),
            "content_shape": _count_values(shapes),
            "risk_area": _count_values(risk_areas),
            "tags": _count_values(tags),
            "outcome": _count_values(outcomes),
        }
def load_evaluation_replay_cases(path: str | Path) -> tuple[EvaluationReplayCase, ...]:
    """Read a UTF-8 JSON fixture and build one replay case per array item.

    Raises:
        ValueError: If the top-level JSON value is not an array.
    """
    fixture_text = Path(path).read_text(encoding="utf-8")
    payloads = json.loads(fixture_text)
    if isinstance(payloads, list):
        return tuple(map(EvaluationReplayCase.from_payload, payloads))
    raise ValueError("Evaluation replay fixture must be a JSON array.")
def run_evaluation_replay_cases(
    cases: Iterable[EvaluationReplayCase],
) -> EvaluationReplaySuiteResult:
    """Replay each case in iteration order and collect the results in a suite."""
    replayed = [case.replay() for case in cases]
    return EvaluationReplaySuiteResult(cases=tuple(replayed))
def replay_evaluation_golden_cases(path: str | Path) -> EvaluationReplaySuiteResult:
    """Load the golden cases stored at ``path`` and replay all of them."""
    golden_cases = load_evaluation_replay_cases(path)
    return run_evaluation_replay_cases(golden_cases)
def _story_output_from_payload(payload: dict[str, Any]) -> StoryOutput:
    """Build a ``StoryOutput`` from a fixture payload.

    A missing ``mode`` defaults to ``"generated"``; the other missing fields
    default to empty strings.
    """
    defaults = {
        "mode": "generated",
        "title": "",
        "story_text": "",
        "cover_prompt_suggestion": "",
    }
    fields = {key: payload.get(key, fallback) for key, fallback in defaults.items()}
    return StoryOutput(**fields)
def _storybook_from_payload(payload: dict[str, Any]) -> Storybook:
    """Build a ``Storybook`` from a fixture payload.

    Pages without a ``page_number`` are numbered by their 1-based position in
    the ``pages`` list. Missing text fields default to empty strings and
    missing URLs to ``None``.
    """
    pages = []
    for position, page in enumerate(payload.get("pages", []), start=1):
        pages.append(
            StorybookPage(
                page_number=page.get("page_number", position),
                text=page.get("text", ""),
                image_prompt=page.get("image_prompt", ""),
                image_url=page.get("image_url"),
            )
        )
    return Storybook(
        title=payload.get("title", ""),
        main_character=payload.get("main_character", ""),
        art_style=payload.get("art_style", ""),
        pages=pages,
        cover_prompt=payload.get("cover_prompt", ""),
        cover_url=payload.get("cover_url"),
    )
def _count_values(values: Iterable[str]) -> dict[str, int]:
counts = Counter(value for value in values if value)
return dict(sorted(counts.items(), key=lambda item: (-item[1], item[0])))
def _compare_evaluation(
    case: EvaluationReplayCase,
    evaluation: EvaluationResult,
) -> list[str]:
    """List every way ``evaluation`` deviates from ``case.expected``.

    Checks, in order: the ``passed`` and ``blocking`` flags, the optional
    lower and upper score bounds, the presence of each required dimension, an
    exact (order-sensitive) match of quality gate codes, and that each expected
    warning substring occurs in at least one warning. An empty list means the
    evaluation matched.
    """
    expected = case.expected
    mismatches: list[str] = []

    if evaluation.passed != expected.passed:
        mismatches.append(f"expected passed={expected.passed}, got {evaluation.passed}")
    if evaluation.blocking != expected.blocking:
        mismatches.append(
            f"expected blocking={expected.blocking}, got {evaluation.blocking}"
        )

    lower_bound = expected.min_overall_score
    if lower_bound is not None and evaluation.overall_score < lower_bound:
        mismatches.append(
            f"expected overall_score >= {lower_bound}, got {evaluation.overall_score}"
        )
    upper_bound = expected.max_overall_score
    if upper_bound is not None and evaluation.overall_score > upper_bound:
        mismatches.append(
            f"expected overall_score <= {upper_bound}, got {evaluation.overall_score}"
        )

    scored_dimensions = {score.dimension for score in evaluation.scores}
    absent = [
        dimension.value
        for dimension in expected.required_dimensions
        if dimension not in scored_dimensions
    ]
    if absent:
        mismatches.append(f"missing dimensions: {', '.join(absent)}")

    # Without a gate error there are no issue codes to compare.
    gate_error = evaluation.gate_error
    if gate_error is None:
        actual_codes = ()
    else:
        actual_codes = tuple(issue.code.value for issue in gate_error.issues)
    if actual_codes != expected.quality_gate_codes:
        mismatches.append(
            f"expected quality_gate_codes={list(expected.quality_gate_codes)}, "
            f"got {list(actual_codes)}"
        )

    mismatches.extend(
        f"missing warning containing: {needle}"
        for needle in expected.warning_substrings
        if not any(needle in warning for warning in evaluation.warnings)
    )
    return mismatches

View File

@@ -0,0 +1,267 @@
"""Deterministic evaluation helpers for generated child-facing content."""
from dataclasses import dataclass, field
from enum import StrEnum
from typing import Any
from app.services.adapters.storybook.primary import Storybook
from app.services.adapters.text.models import StoryOutput
from app.services.harness.quality_gates import (
QualityGateError,
validate_story_output,
validate_storybook_output,
)
class EvaluationDimension(StrEnum):
"""Stable dimensions used by harness evaluations."""
STRUCTURE = "structure"
SAFETY = "safety"
AGE_FIT = "age_fit"
EDUCATIONAL_VALUE = "educational_value"
READABILITY = "readability"
@dataclass(frozen=True)
class EvaluationScore:
    """Score and human-readable reason for one evaluation dimension."""

    dimension: EvaluationDimension
    score: float
    reason: str

    def to_metadata(self) -> dict[str, Any]:
        """Return the score as a dict, with the dimension as its string value."""
        return dict(
            dimension=self.dimension.value,
            score=self.score,
            reason=self.reason,
        )
@dataclass(frozen=True)
class EvaluationResult:
    """Result of evaluating one generated artifact.

    ``gate_error`` is set only when a quality gate rejected the artifact.
    """

    overall_score: float
    passed: bool
    blocking: bool
    scores: tuple[EvaluationScore, ...]
    gate_error: QualityGateError | None = None
    warnings: tuple[str, ...] = field(default_factory=tuple)

    def to_metadata(self) -> dict[str, Any]:
        """Return the result as a dict of plain values.

        The ``quality_gate`` key is included only when ``gate_error`` is set.
        """
        payload: dict[str, Any] = dict(
            overall_score=self.overall_score,
            passed=self.passed,
            blocking=self.blocking,
            scores=[score.to_metadata() for score in self.scores],
            warnings=list(self.warnings),
        )
        gate_error = self.gate_error
        if gate_error is None:
            return payload
        payload["quality_gate"] = gate_error.to_metadata()
        return payload
def _clamp_score(value: float) -> float:
return max(0.0, min(1.0, round(value, 2)))
def _story_text_readability_score(story_text: str) -> float:
"""Score text length with a conservative 3-8 age readability heuristic."""
normalized_length = len(story_text.strip())
if normalized_length < 30:
return 0.45
if normalized_length > 2500:
return 0.72
if normalized_length > 1800:
return 0.84
return 0.96
def _educational_value_score(story_text: str, education_theme: str | None) -> float:
if not education_theme:
return 0.82
return 0.96 if education_theme.strip() in story_text else 0.88
def _storybook_readability_score(page_texts: list[str]) -> float:
if not page_texts:
return 0.0
page_lengths = [len(text.strip()) for text in page_texts]
if any(length < 8 for length in page_lengths):
return 0.62
if any(length > 320 for length in page_lengths):
return 0.78
if any(length > 220 for length in page_lengths):
return 0.88
return 0.96
def _storybook_educational_value_score(
page_texts: list[str],
education_theme: str | None,
) -> float:
if not education_theme:
return 0.82
combined_text = " ".join(page_texts)
return 0.96 if education_theme.strip() in combined_text else 0.88
def evaluate_story_output(
    output: StoryOutput,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Evaluate a generated text story.

    The story is first run through ``validate_story_output``. If that raises
    ``QualityGateError`` the result is a blocking failure with an overall score
    of 0.0, zeroed structure and safety scores, and the error attached.

    Otherwise five dimensions are scored. Age fit and readability both use the
    text-length readability score. The overall score is the clamped mean, and
    the result passes when it is at least ``minimum_score``.
    """
    try:
        validate_story_output(output)
    except QualityGateError as gate_error:
        rejected_scores = (
            EvaluationScore(
                dimension=EvaluationDimension.STRUCTURE,
                score=0.0,
                reason="故事结构未通过质量门。",
            ),
            EvaluationScore(
                dimension=EvaluationDimension.SAFETY,
                score=0.0,
                reason="内容未通过儿童安全或结构完整性检查。",
            ),
        )
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=rejected_scores,
            gate_error=gate_error,
        )

    readability = _story_text_readability_score(output.story_text)
    educational = _educational_value_score(output.story_text, education_theme)

    # (dimension, score, reason) rows, in the order they are reported.
    rows = (
        (EvaluationDimension.STRUCTURE, 1.0, "标题、正文和封面提示词完整。"),
        (EvaluationDimension.SAFETY, 1.0, "未命中确定性儿童安全风险词。"),
        (EvaluationDimension.AGE_FIT, readability, "根据正文长度估算低龄儿童阅读适配度。"),
        (
            EvaluationDimension.EDUCATIONAL_VALUE,
            educational,
            "根据教育主题是否清晰融入正文估算。",
        ),
        (
            EvaluationDimension.READABILITY,
            readability,
            "根据正文长度估算朗读和亲子共读流畅度。",
        ),
    )
    dimension_scores = tuple(
        EvaluationScore(dimension=dimension, score=score, reason=reason)
        for dimension, score, reason in rows
    )

    if readability < 0.8:
        warnings = ("故事正文长度可能不适合 3-8 岁儿童的完整阅读体验。",)
    else:
        warnings = ()

    mean_score = sum(entry.score for entry in dimension_scores) / len(dimension_scores)
    overall = _clamp_score(mean_score)
    return EvaluationResult(
        overall_score=overall,
        passed=overall >= minimum_score,
        blocking=overall < minimum_score,
        scores=dimension_scores,
        warnings=warnings,
    )
def evaluate_storybook_output(
    output: Storybook,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Evaluate a generated storybook.

    The storybook is first run through ``validate_storybook_output``. If that
    raises ``QualityGateError`` the result is a blocking failure with an
    overall score of 0.0, zeroed structure and safety scores, and the error
    attached.

    Otherwise five dimensions are scored from the page texts. Age fit and
    readability both use the per-page length readability score. The overall
    score is the clamped mean, and the result passes when it is at least
    ``minimum_score``.
    """
    try:
        validate_storybook_output(output)
    except QualityGateError as gate_error:
        rejected_scores = (
            EvaluationScore(
                dimension=EvaluationDimension.STRUCTURE,
                score=0.0,
                reason="绘本结构未通过质量门。",
            ),
            EvaluationScore(
                dimension=EvaluationDimension.SAFETY,
                score=0.0,
                reason="绘本内容未通过儿童安全或结构完整性检查。",
            ),
        )
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=rejected_scores,
            gate_error=gate_error,
        )

    page_texts = [page.text for page in output.pages]
    readability = _storybook_readability_score(page_texts)
    educational = _storybook_educational_value_score(page_texts, education_theme)

    # (dimension, score, reason) rows, in the order they are reported.
    rows = (
        (EvaluationDimension.STRUCTURE, 1.0, "绘本标题、分页和页码结构完整。"),
        (EvaluationDimension.SAFETY, 1.0, "未命中确定性儿童安全风险词。"),
        (
            EvaluationDimension.AGE_FIT,
            readability,
            "根据每页正文长度估算低龄儿童翻页阅读适配度。",
        ),
        (
            EvaluationDimension.EDUCATIONAL_VALUE,
            educational,
            "根据教育主题是否清晰融入分页正文估算。",
        ),
        (
            EvaluationDimension.READABILITY,
            readability,
            "根据分页正文长度估算亲子共读流畅度。",
        ),
    )
    dimension_scores = tuple(
        EvaluationScore(dimension=dimension, score=score, reason=reason)
        for dimension, score, reason in rows
    )

    if readability < 0.8:
        warnings = ("绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。",)
    else:
        warnings = ()

    mean_score = sum(entry.score for entry in dimension_scores) / len(dimension_scores)
    overall = _clamp_score(mean_score)
    return EvaluationResult(
        overall_score=overall,
        passed=overall >= minimum_score,
        blocking=overall < minimum_score,
        scores=dimension_scores,
        warnings=warnings,
    )

View File

@@ -0,0 +1,150 @@
"""Small-step workflow executor helpers for generation harness adoption."""
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any
from sqlalchemy.ext.asyncio import AsyncSession
from app.services.harness.artifacts import AssetCompletionResult
from app.services.harness.plans import WorkflowPlan
from app.services.harness.trace import TraceRecorder
from app.services.harness.types import ArtifactKind, WorkflowStep
if TYPE_CHECKING:
from app.db.models import GenerationJob
AssetTask = Callable[[], Awaitable[AssetCompletionResult]]
@dataclass(frozen=True)
class AssetPlanRunResult:
    """What happened when the asset tasks of one workflow plan were run.

    Holds the results returned by the executed handlers, the keys of the tasks
    that were executed and the keys of the tasks that were ignored.
    """

    task_results: tuple[AssetCompletionResult, ...]
    executed_task_keys: tuple[str, ...]
    ignored_task_keys: tuple[str, ...]

    @property
    def result_assets(self) -> tuple[str, ...]:
        """Return the ``asset`` of every task result, in result order."""
        assets = [outcome.asset for outcome in self.task_results]
        return tuple(assets)

    def to_metadata(self, plan: WorkflowPlan) -> dict[str, Any]:
        """Return counts and key lists describing how much of ``plan`` ran."""
        executed = list(self.executed_task_keys)
        ignored = list(self.ignored_task_keys)
        return {
            "plan_mode": plan.mode.value,
            "planned_task_count": len(plan.tasks),
            "executed_task_count": len(self.executed_task_keys),
            "ignored_task_count": len(self.ignored_task_keys),
            "result_count": len(self.task_results),
            "executed_task_keys": executed,
            "ignored_task_keys": ignored,
            "result_assets": list(self.result_assets),
        }
async def record_workflow_plan(
    db: AsyncSession,
    *,
    job: "GenerationJob | None",
    plan: WorkflowPlan,
) -> None:
    """Record a ``workflow_planned`` trace step carrying the plan snapshot.

    The step is recorded as succeeded, under the request-acceptance workflow
    step, with no artifact.
    """
    recorder = TraceRecorder(db)
    plan_metadata = {"plan": plan.to_snapshot()}
    await recorder.record_step(
        job=job,
        event_type="workflow_planned",
        status="succeeded",
        message="Workflow plan selected for this generation request.",
        metadata=plan_metadata,
        step=WorkflowStep.REQUEST_ACCEPTANCE,
        artifact=ArtifactKind.NONE,
        blocks_main_result=True,
    )
async def record_evaluation_result(
    db: AsyncSession,
    *,
    job: "GenerationJob | None",
    story_id: int | None = None,
    metadata: dict[str, Any],
    status: str,
    artifact: ArtifactKind | str = ArtifactKind.STORY_TEXT,
) -> None:
    """Record an ``evaluation_completed`` trace step with the given metadata.

    The step is flagged as blocking the main result whenever ``status`` is
    anything other than ``"succeeded"``.
    """
    blocks_main_result = status != "succeeded"
    recorder = TraceRecorder(db)
    await recorder.record_step(
        job=job,
        story_id=story_id,
        event_type="evaluation_completed",
        status=status,
        message="Generated content evaluation completed.",
        metadata=metadata,
        step=WorkflowStep.EVALUATION,
        artifact=artifact,
        blocks_main_result=blocks_main_result,
    )
async def record_executor_result(
    db: AsyncSession,
    *,
    job: "GenerationJob | None",
    plan: WorkflowPlan,
    result: AssetPlanRunResult,
) -> None:
    """Record an ``executor_completed`` trace step with coverage metadata.

    The metadata comes from ``result.to_metadata(plan)``. The step is recorded
    as succeeded, with no artifact, and does not block the main result.
    """
    coverage_metadata = result.to_metadata(plan)
    recorder = TraceRecorder(db)
    await recorder.record_step(
        job=job,
        event_type="executor_completed",
        status="succeeded",
        message="Workflow executor completed planned asset tasks.",
        metadata=coverage_metadata,
        step=WorkflowStep.UNKNOWN,
        artifact=ArtifactKind.NONE,
        blocks_main_result=False,
    )
async def run_asset_plan(
    plan: WorkflowPlan,
    *,
    image_task: AssetTask | None = None,
    audio_task: AssetTask | None = None,
) -> AssetPlanRunResult:
    """Run the image and audio tasks of an asset plan in plan order.

    Tasks keyed ``complete_image_asset`` and ``complete_audio_asset`` are
    dispatched to ``image_task`` and ``audio_task``; every other task key is
    recorded as ignored.

    Raises:
        ValueError: If the plan mode is not ``asset_generation`` or
            ``asset_retry``, or if a task is reached whose handler was not
            supplied. Handlers of earlier tasks have already run by then.
    """
    if plan.mode.value not in {"asset_generation", "asset_retry"}:
        raise ValueError("run_asset_plan only supports asset workflow plans")

    # Task key -> (handler, error message when the handler is missing).
    handlers = {
        "complete_image_asset": (
            image_task,
            "Asset workflow plan requires an image task handler",
        ),
        "complete_audio_asset": (
            audio_task,
            "Asset workflow plan requires an audio task handler",
        ),
    }

    completed: list[AssetCompletionResult] = []
    executed: list[str] = []
    ignored: list[str] = []
    for task in plan.tasks:
        entry = handlers.get(task.key)
        if entry is None:
            ignored.append(task.key)
            continue
        handler, missing_message = entry
        if handler is None:
            raise ValueError(missing_message)
        completed.append(await handler())
        executed.append(task.key)

    return AssetPlanRunResult(
        task_results=tuple(completed),
        executed_task_keys=tuple(executed),
        ignored_task_keys=tuple(ignored),
    )

View File

@@ -0,0 +1,400 @@
[
{
"id": "story-safe-theme-pass",
"artifact": "story",
"description": "完整、儿童安全且清晰包含教育主题的普通故事。",
"coverage": {
"age_band": "5-6",
"content_shape": "short_story",
"risk_area": "happy_path",
"tags": ["theme_present", "safe", "story"]
},
"input": {
"keywords": "小兔子, 月光花园",
"education_theme": "复盘"
},
"output": {
"mode": "generated",
"title": "小兔子的月光花园",
"story_text": "小兔子露露在月光花园里照顾一朵会发光的小花。她先给小花浇水,又邀请朋友一起观察花瓣的变化。晚上睡前,露露和朋友们坐在石凳上复盘今天的努力:下次要先分好小水壶,再轮流照顾花朵。大家都觉得,分享和复盘让花园变得更温暖。",
"cover_prompt_suggestion": "A gentle watercolor rabbit in a moonlit garden"
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "story-long-safe-pass",
"artifact": "story",
"description": "较长但仍适合亲子共读的普通故事。",
"coverage": {
"age_band": "7-8",
"content_shape": "long_story",
"risk_area": "length_boundary",
"tags": ["theme_present", "long_text", "story"]
},
"input": {
"keywords": "小海豚, 图书馆",
"education_theme": "合作"
},
"output": {
"mode": "generated",
"title": "小海豚的蓝色图书馆",
"story_text": "小海豚多多住在一片安静的海湾里,那里有一座用贝壳和海草搭成的蓝色图书馆。每天傍晚,多多都会把漂来的故事贝壳整理好,放进不同的篮子。可是这一天,风浪把贝壳吹得到处都是,小章鱼、小海马和小螃蟹都赶来帮忙。大家先一起数贝壳,再按颜色排队,最后把每个故事放回合适的位置。多多发现,合作不是一个人做得最快,而是大家把自己的办法放在一起。夜晚来临时,蓝色图书馆重新亮起柔柔的光,小伙伴们围坐在门口,听多多讲今天学到的合作故事。",
"cover_prompt_suggestion": "A gentle dolphin organizing a blue underwater library"
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "story-missing-text-blocks",
"artifact": "story",
"description": "故事正文缺失会被确定性质量门阻断。",
"coverage": {
"age_band": "unknown",
"content_shape": "empty_story",
"risk_area": "schema_error",
"tags": ["missing_text", "story", "blocking"]
},
"input": {
"keywords": "小熊, 星星"
},
"output": {
"mode": "generated",
"title": "小熊找星星",
"story_text": "",
"cover_prompt_suggestion": "A bear looking at friendly stars"
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_story_text"
]
}
},
{
"id": "story-missing-cover-prompt-blocks",
"artifact": "story",
"description": "故事正文完整但封面提示词缺失会被结构质量门阻断。",
"coverage": {
"age_band": "5-6",
"content_shape": "short_story",
"risk_area": "schema_error",
"tags": ["missing_cover_prompt", "story", "blocking"]
},
"input": {
"keywords": "小松鼠, 风筝",
"education_theme": "勇敢"
},
"output": {
"mode": "generated",
"title": "小松鼠的风筝",
"story_text": "小松鼠第一次放风筝时有点紧张。朋友们陪它一起数一二三,它鼓起勇敢的心,终于让风筝飞上蓝天。",
"cover_prompt_suggestion": ""
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_cover_prompt"
]
}
},
{
"id": "story-unsafe-term-blocks",
"artifact": "story",
"description": "明显不适合儿童的风险词会被安全质量门阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "short_story",
"risk_area": "safety_error",
"tags": ["unsafe_term", "story", "blocking"]
},
"input": {
"keywords": "小猫, 城堡"
},
"output": {
"mode": "generated",
"title": "小猫的城堡",
"story_text": "小猫在城堡里看到血腥场景,然后感到很害怕。",
"cover_prompt_suggestion": "A cat near a castle"
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"unsafe_child_content"
]
}
},
{
"id": "story-short-high-threshold-blocks",
"artifact": "story",
"description": "结构合格但阅读体验偏短的故事在高阈值下会被内部评测阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "very_short_story",
"risk_area": "readability_warning",
"tags": ["short_text", "threshold_block", "story"]
},
"input": {
"keywords": "小鹿, 书签",
"education_theme": "耐心",
"minimum_score": 0.82
},
"output": {
"mode": "generated",
"title": "小鹿的书签",
"story_text": "小鹿学会了耐心等待。",
"cover_prompt_suggestion": "A deer with a golden bookmark"
},
"expected": {
"passed": false,
"blocking": true,
"min_overall_score": 0.7,
"max_overall_score": 0.8,
"required_dimensions": [
"structure",
"safety",
"readability"
],
"quality_gate_codes": [],
"warning_substrings": [
"正文长度"
]
}
},
{
"id": "storybook-safe-theme-pass",
"artifact": "storybook",
"description": "完整、儿童安全且包含教育主题的绘本分页输出。",
"coverage": {
"age_band": "5-6",
"content_shape": "storybook_3_pages",
"risk_area": "happy_path",
"tags": ["theme_present", "safe", "storybook"]
},
"input": {
"keywords": "小狐狸, 彩虹桥",
"education_theme": "合作"
},
"output": {
"title": "彩虹桥上的合作",
"main_character": "小狐狸米米",
"art_style": "温暖水彩",
"cover_prompt": "A warm watercolor fox near a rainbow bridge",
"pages": [
{
"page_number": 1,
"text": "小狐狸米米在雨后的森林里发现一座亮晶晶的彩虹桥。",
"image_prompt": "A little fox finds a rainbow bridge"
},
{
"page_number": 2,
"text": "桥边的小伙伴们一起商量办法,决定合作把落叶清理干净。",
"image_prompt": "Forest friends work together"
},
{
"page_number": 3,
"text": "大家轮流搬叶子、扶篮子,还互相说谢谢,彩虹桥终于露出笑脸。",
"image_prompt": "Friends carrying leaves together"
}
]
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "storybook-duplicate-page-blocks",
"artifact": "storybook",
"description": "重复页码的绘本结构会被质量门阻断。",
"coverage": {
"age_band": "5-6",
"content_shape": "storybook_invalid_pages",
"risk_area": "schema_error",
"tags": ["duplicate_page", "storybook", "blocking"]
},
"input": {
"keywords": "小熊, 森林"
},
"output": {
"title": "森林里的小熊",
"main_character": "小熊布布",
"art_style": "水彩",
"cover_prompt": "A bear in a forest",
"pages": [
{
"page_number": 1,
"text": "布布在森林里找到一颗松果。",
"image_prompt": "Bear finds a pinecone"
},
{
"page_number": 1,
"text": "布布把松果带给朋友一起观察。",
"image_prompt": "Bear shares the pinecone"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"invalid_storybook_page_number"
]
}
},
{
"id": "storybook-missing-page-blocks",
"artifact": "storybook",
"description": "没有分页内容的绘本会被结构质量门阻断。",
"coverage": {
"age_band": "unknown",
"content_shape": "storybook_empty_pages",
"risk_area": "schema_error",
"tags": ["missing_page", "storybook", "blocking"]
},
"input": {
"keywords": "小鸟, 云朵"
},
"output": {
"title": "小鸟和云朵",
"main_character": "小鸟啾啾",
"art_style": "柔和水彩",
"cover_prompt": "A bird near soft clouds",
"pages": []
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_storybook_page"
]
}
},
{
"id": "storybook-unsafe-term-blocks",
"artifact": "storybook",
"description": "绘本分页文字包含明显不适龄风险词时会被安全质量门阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "storybook_2_pages",
"risk_area": "safety_error",
"tags": ["unsafe_term", "storybook", "blocking"]
},
"input": {
"keywords": "小兔子, 山洞"
},
"output": {
"title": "山洞里的声音",
"main_character": "小兔子米粒",
"art_style": "温暖水彩",
"cover_prompt": "A rabbit near a cave",
"pages": [
{
"page_number": 1,
"text": "米粒走到山洞边,听见奇怪的声音。",
"image_prompt": "Rabbit near a cave"
},
{
"page_number": 2,
"text": "洞里出现血腥画面,米粒吓得跑开。",
"image_prompt": "Rabbit running away"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"unsafe_child_content"
]
}
},
{
"id": "storybook-short-page-warning",
"artifact": "storybook",
"description": "分页正文过短时保留内部警告,用于评测回归。",
"coverage": {
"age_band": "3-4",
"content_shape": "storybook_2_pages",
"risk_area": "readability_warning",
"tags": ["short_page_text", "threshold_block", "storybook"]
},
"input": {
"keywords": "小羊, 风铃",
"minimum_score": 0.85
},
"output": {
"title": "风铃响了",
"main_character": "小羊团团",
"art_style": "柔和蜡笔",
"cover_prompt": "A lamb listening to a wind chime",
"pages": [
{
"page_number": 1,
"text": "风响。",
"image_prompt": "Wind chime rings"
},
{
"page_number": 2,
"text": "团团笑。",
"image_prompt": "Lamb smiles"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"min_overall_score": 0.8,
"max_overall_score": 0.82,
"required_dimensions": [
"structure",
"safety",
"readability"
],
"quality_gate_codes": [],
"warning_substrings": [
"分页正文长度"
]
}
}
]

View File

@@ -69,6 +69,11 @@ def build_story_plan(*, generate_images: bool) -> WorkflowPlan:
step=WorkflowStep.NARRATIVE_GENERATION,
artifact=ArtifactKind.STORY_TEXT,
),
WorkflowTask(
key="evaluate_narrative",
step=WorkflowStep.EVALUATION,
artifact=ArtifactKind.STORY_TEXT,
),
WorkflowTask(
key="persist_story",
step=WorkflowStep.STORY_PERSISTENCE,
@@ -124,6 +129,11 @@ def build_storybook_plan(*, generate_images: bool) -> WorkflowPlan:
step=WorkflowStep.NARRATIVE_GENERATION,
artifact=ArtifactKind.STORYBOOK_PAGES,
),
WorkflowTask(
key="evaluate_storybook_pages",
step=WorkflowStep.EVALUATION,
artifact=ArtifactKind.STORYBOOK_PAGES,
),
]
if generate_images:

View File

@@ -11,6 +11,7 @@ class WorkflowStep(StrEnum):
WORKER_START = "worker_start"
CONTEXT_PREPARATION = "context_preparation"
NARRATIVE_GENERATION = "narrative_generation"
EVALUATION = "evaluation"
STORY_PERSISTENCE = "story_persistence"
PROVIDER_INVOCATION = "provider_invocation"
IMAGE_GENERATION = "image_generation"
@@ -64,6 +65,8 @@ class StepStatus(StrEnum):
EVENT_STEP_MAP: dict[str, WorkflowStep] = {
"request_accepted": WorkflowStep.REQUEST_ACCEPTANCE,
"workflow_planned": WorkflowStep.REQUEST_ACCEPTANCE,
"executor_completed": WorkflowStep.UNKNOWN,
"retry_queued": WorkflowStep.REQUEST_ACCEPTANCE,
"worker_started": WorkflowStep.WORKER_START,
"context_prepared": WorkflowStep.CONTEXT_PREPARATION,
@@ -73,6 +76,7 @@ EVENT_STEP_MAP: dict[str, WorkflowStep] = {
"provider_call_succeeded": WorkflowStep.PROVIDER_INVOCATION,
"provider_call_failed": WorkflowStep.PROVIDER_INVOCATION,
"quality_gate_failed": WorkflowStep.NARRATIVE_GENERATION,
"evaluation_completed": WorkflowStep.EVALUATION,
"cover_image_started": WorkflowStep.IMAGE_GENERATION,
"cover_image_succeeded": WorkflowStep.IMAGE_GENERATION,
"cover_image_failed": WorkflowStep.IMAGE_GENERATION,
@@ -100,6 +104,7 @@ EVENT_STEP_MAP: dict[str, WorkflowStep] = {
EVENT_ARTIFACT_MAP: dict[str, ArtifactKind] = {
"narrative_generated": ArtifactKind.STORY_TEXT,
"quality_gate_failed": ArtifactKind.STORY_TEXT,
"evaluation_completed": ArtifactKind.STORY_TEXT,
"cover_image_started": ArtifactKind.COVER_IMAGE,
"cover_image_succeeded": ArtifactKind.COVER_IMAGE,
"cover_image_failed": ArtifactKind.COVER_IMAGE,

View File

@@ -36,8 +36,8 @@ from app.services.generation_jobs import (
ensure_no_active_story_generation_job,
finish_generation_job,
generation_job_can_retry,
generation_job_to_summary,
get_generation_job_for_user,
public_generation_job_to_summary,
record_generation_event,
)
from app.services.harness.artifacts import (
@@ -57,12 +57,27 @@ from app.services.harness.control import (
ExecutionControl,
GenerationJobCanceledError,
)
from app.services.harness.evaluators import (
EvaluationResult,
evaluate_story_output,
evaluate_storybook_output,
)
from app.services.harness.executor import (
record_evaluation_result,
record_executor_result,
record_workflow_plan,
run_asset_plan,
)
from app.services.harness.plans import (
build_asset_plan,
build_story_plan,
build_storybook_plan,
)
from app.services.harness.quality_gates import (
QualityGateError,
validate_story_output,
validate_storybook_output,
)
from app.services.harness.trace import TraceRecorder
from app.services.harness.types import ArtifactKind
from app.services.memory_service import build_enhanced_memory_context
from app.services.provider_router import (
generate_image,
@@ -129,6 +144,24 @@ async def _record_quality_gate_failure_if_present(
)
async def _record_evaluation_result_if_present(
    db: AsyncSession,
    *,
    job,
    evaluation: EvaluationResult,
    artifact: ArtifactKind | str = ArtifactKind.STORY_TEXT,
) -> None:
    """Forward an evaluation outcome to ``record_evaluation_result`` as a job event.

    Args:
        db: Async database session handed through to the recorder.
        job: Generation job the event belongs to; passed through unchanged.
        evaluation: Result whose ``to_metadata()`` payload and ``passed`` flag
            are recorded.
        artifact: Artifact kind the evaluation refers to; defaults to story text.
    """
    # NOTE(review): despite the ``_if_present`` suffix there is no ``job is None``
    # check here -- presumably ``record_evaluation_result`` copes with untracked
    # calls; confirm against its implementation.
    payload = evaluation.to_metadata()
    # The event status mirrors the pass/fail verdict of the evaluation.
    outcome = "succeeded" if evaluation.passed else "failed"
    await record_evaluation_result(
        db,
        job=job,
        metadata=payload,
        status=outcome,
        artifact=artifact,
    )
def _asset_result_metadata(result: AssetCompletionResult) -> dict:
"""Build JSON-safe metadata for asset workflow events."""
@@ -643,18 +676,33 @@ async def generate_and_save_story(
user_id=user_id,
generation_job=job,
)
validate_story_output(result)
except QualityGateError as exc:
await _record_quality_gate_failure_if_present(db, job=job, error=exc)
raise HTTPException(
status_code=502,
detail="Story generation failed quality checks, please try again.",
) from exc
except Exception as exc:
raise HTTPException(
status_code=502,
detail="Story generation failed, please try again.",
) from exc
evaluation = evaluate_story_output(
result,
education_theme=request.education_theme,
)
if evaluation.gate_error is not None:
await _record_quality_gate_failure_if_present(
db,
job=job,
error=evaluation.gate_error,
)
await _record_evaluation_result_if_present(
db,
job=job,
evaluation=evaluation,
)
if evaluation.blocking:
raise HTTPException(
status_code=502,
detail="Story generation failed quality checks, please try again.",
)
await _record_job_event_if_present(
db,
job=job,
@@ -758,13 +806,32 @@ async def generate_storybook_service(
user_id=user_id,
generation_job=job,
)
validate_storybook_output(storybook)
except QualityGateError as exc:
await _record_quality_gate_failure_if_present(db, job=job, error=exc)
raise HTTPException(status_code=500, detail=f"故事书质量检查失败: {exc}") from exc
except Exception as e:
logger.error("storybook_generation_failed", error=str(e))
raise HTTPException(status_code=500, detail=f"故事书生成失败: {e}")
evaluation = evaluate_storybook_output(
storybook,
education_theme=request.education_theme,
)
if evaluation.gate_error is not None:
await _record_quality_gate_failure_if_present(
db,
job=job,
error=evaluation.gate_error,
)
await _record_evaluation_result_if_present(
db,
job=job,
evaluation=evaluation,
artifact=ArtifactKind.STORYBOOK_PAGES,
)
if evaluation.blocking:
raise HTTPException(
status_code=500,
detail=f"故事书质量检查失败: {evaluation.gate_error or 'evaluation blocked'}",
)
await _record_job_event_if_present(
db,
job=job,
@@ -1025,28 +1092,50 @@ async def _generate_asset_generation_service_with_job(
if not requested_assets:
raise HTTPException(status_code=400, detail="资源任务缺少 assets。")
plan = build_asset_plan(
output_mode="asset_generation",
assets=requested_assets,
)
await record_workflow_plan(
db,
job=job,
plan=plan,
)
story = await get_story_detail(int(story_id), job.user_id, db)
if "image" in requested_assets:
async def complete_image() -> AssetCompletionResult:
if story.mode == "storybook":
await _complete_storybook_image_assets(story, db, job=job)
else:
await _complete_cover_image_asset(
story,
db,
raise_on_failure=True,
log_event="cover_generation_failed",
job=job,
)
return await _complete_storybook_image_assets(story, db, job=job)
if "audio" in requested_assets:
await _complete_audio_asset(
return await _complete_cover_image_asset(
story,
db,
raise_on_failure=True,
log_event="cover_generation_failed",
job=job,
)
async def complete_audio() -> AssetCompletionResult:
return await _complete_audio_asset(
story,
db,
raise_on_failure=True,
job=job,
)
asset_plan_result = await run_asset_plan(
plan,
image_task=complete_image if "image" in requested_assets else None,
audio_task=complete_audio if "audio" in requested_assets else None,
)
await record_executor_result(
db,
job=job,
plan=plan,
result=asset_plan_result,
)
story = await get_story_detail(story.id, job.user_id, db)
await finish_generation_job(
db,
@@ -1096,7 +1185,7 @@ async def retry_generation_job_service(
)
await _dispatch_generation_job(db, job=retry_job)
await db.refresh(retry_job)
return generation_job_to_summary(retry_job)
return public_generation_job_to_summary(retry_job)
async def _generate_generation_service_with_job(
@@ -1109,6 +1198,11 @@ async def _generate_generation_service_with_job(
"""Run the unified generation workflow after the tracking job has been created."""
if request.output_mode == "storybook":
await record_workflow_plan(
db,
job=job,
plan=build_storybook_plan(generate_images=request.generate_images),
)
storybook = await generate_storybook_service(
StorybookRequest(
keywords=request.data,
@@ -1155,6 +1249,9 @@ async def _generate_generation_service_with_job(
retryable_assets=saved_story.retryable_assets,
)
if request.output_mode == "story" and not request.generate_images:
return await _execute_story_without_assets_plan(request, user_id, db, job=job)
generate_request = GenerateRequest(
type=request.type,
data=request.data,
@@ -1164,6 +1261,11 @@ async def _generate_generation_service_with_job(
)
if request.generate_images:
await record_workflow_plan(
db,
job=job,
plan=build_story_plan(generate_images=True),
)
story = await generate_full_story_service(generate_request, user_id, db, job=job)
saved_story = await get_story_detail(story.id, user_id, db)
await _record_postprocessing_event_if_needed(db, job=job, story=saved_story)
@@ -1222,6 +1324,54 @@ async def _generate_generation_service_with_job(
universe_id=story.universe_id,
retryable_assets=story.retryable_assets,
)
async def _execute_story_without_assets_plan(
    request: GenerationRequest,
    user_id: str,
    db: AsyncSession,
    *,
    job,
) -> GenerationResponse:
    """Run the text-only story workflow under an explicitly recorded plan.

    The plan built with ``generate_images=False`` is recorded first, then the
    narrative is generated and saved, a post-processing event is recorded if
    needed, and the job is finished before the response is assembled.

    Args:
        request: Unified generation request supplying the story inputs.
        user_id: Owner of the story being generated.
        db: Async database session used by every step.
        job: Tracking job that receives the plan and completion events.

    Returns:
        A ``GenerationResponse`` populated from the persisted story.
    """
    text_only_plan = build_story_plan(generate_images=False)
    await record_workflow_plan(db, job=job, plan=text_only_plan)

    narrative_request = GenerateRequest(
        type=request.type,
        data=request.data,
        education_theme=request.education_theme,
        child_profile_id=request.child_profile_id,
        universe_id=request.universe_id,
    )
    saved = await generate_and_save_story(narrative_request, user_id, db, job=job)
    await _record_postprocessing_event_if_needed(db, job=job, story=saved)
    await finish_generation_job(
        db,
        job=job,
        story=saved,
        current_step="generation_completed",
        message="Story generation completed with a persisted readable narrative.",
    )

    # ``image_url`` feeds both ``image_url`` and ``cover_url`` in the response.
    response_fields = {
        "id": saved.id,
        "generation_job_id": job.id,
        "title": saved.title,
        "mode": saved.mode,
        "story_text": saved.story_text,
        "cover_prompt": saved.cover_prompt,
        "image_url": saved.image_url,
        "cover_url": saved.image_url,
        "generation_status": saved.generation_status,
        "text_status": saved.text_status,
        "image_status": saved.image_status,
        "audio_status": saved.audio_status,
        "last_error": saved.last_error,
        "child_profile_id": saved.child_profile_id,
        "universe_id": saved.universe_id,
        "retryable_assets": saved.retryable_assets,
    }
    return GenerationResponse(**response_fields)
async def list_stories(
@@ -1321,36 +1471,7 @@ async def queue_story_asset_generation(
)
await _dispatch_generation_job(db, job=job)
await db.refresh(job)
return generation_job_to_summary(job)
async def _retry_cover_image_asset(story: Story, db: AsyncSession, *, job=None) -> None:
"""Retry cover generation for a text story."""
await _complete_cover_image_asset(
story,
db,
last_error_prefix="封面生成失败",
log_event="cover_asset_retry_failed",
job=job,
)
async def _retry_storybook_image_assets(
story: Story,
db: AsyncSession,
*,
job=None,
) -> None:
"""Retry missing storybook cover/page images."""
await _complete_storybook_image_assets(story, db, job=job)
async def _retry_audio_asset(story: Story, db: AsyncSession, *, job=None) -> None:
"""Retry audio generation while preserving persisted status on provider failure."""
await _complete_audio_asset(story, db, raise_on_failure=False, job=job)
return public_generation_job_to_summary(job)
async def retry_story_assets(
@@ -1374,6 +1495,15 @@ async def retry_story_assets(
try:
story = await get_story_detail(story_id, user_id, db)
plan = build_asset_plan(
output_mode="asset_retry",
assets=requested_assets,
)
await record_workflow_plan(
db,
job=job,
plan=plan,
)
await record_generation_event(
db,
job=job,
@@ -1384,14 +1514,37 @@ async def retry_story_assets(
metadata={"assets": requested_assets},
)
if "image" in requested_assets:
async def retry_image() -> AssetCompletionResult:
if story.mode == "storybook":
await _retry_storybook_image_assets(story, db, job=job)
else:
await _retry_cover_image_asset(story, db, job=job)
return await _complete_storybook_image_assets(story, db, job=job)
if "audio" in requested_assets:
await _retry_audio_asset(story, db, job=job)
return await _complete_cover_image_asset(
story,
db,
last_error_prefix="封面生成失败",
log_event="cover_asset_retry_failed",
job=job,
)
async def retry_audio() -> AssetCompletionResult:
return await _complete_audio_asset(
story,
db,
raise_on_failure=False,
job=job,
)
asset_plan_result = await run_asset_plan(
plan,
image_task=retry_image if "image" in requested_assets else None,
audio_task=retry_audio if "audio" in requested_assets else None,
)
await record_executor_result(
db,
job=job,
plan=plan,
result=asset_plan_result,
)
story = await get_story_detail(story_id, user_id, db)
await finish_generation_job(
@@ -1448,13 +1601,29 @@ async def generate_story_cover(
try:
story = await get_story_detail(story_id, user_id, db)
image_result = await _complete_cover_image_asset(
story,
plan = build_asset_plan(output_mode="asset_generation", assets=["image"])
await record_workflow_plan(
db,
raise_on_failure=True,
log_event="cover_generation_failed",
job=job,
plan=plan,
)
asset_result = await run_asset_plan(
plan,
image_task=lambda: _complete_cover_image_asset(
story,
db,
raise_on_failure=True,
log_event="cover_generation_failed",
job=job,
),
)
await record_executor_result(
db,
job=job,
plan=plan,
result=asset_result,
)
image_result = asset_result.task_results[0] if asset_result.task_results else None
story = await get_story_detail(story_id, user_id, db)
await finish_generation_job(
db,
@@ -1464,7 +1633,11 @@ async def generate_story_cover(
message="Cover image generation completed.",
metadata={"assets": ["image"]},
)
if image_result.succeeded and isinstance(image_result.value, str):
if (
image_result is not None
and image_result.succeeded
and isinstance(image_result.value, str)
):
return image_result.value
except HTTPException as exc:
await finish_generation_job(
@@ -1501,12 +1674,28 @@ async def generate_story_audio(
try:
story = await get_story_detail(story_id, user_id, db)
audio_result = await _complete_audio_asset(
story,
plan = build_asset_plan(output_mode="asset_generation", assets=["audio"])
await record_workflow_plan(
db,
raise_on_failure=True,
job=job,
plan=plan,
)
asset_result = await run_asset_plan(
plan,
audio_task=lambda: _complete_audio_asset(
story,
db,
raise_on_failure=True,
job=job,
),
)
await record_executor_result(
db,
job=job,
plan=plan,
result=asset_result,
)
audio_result = asset_result.task_results[0] if asset_result.task_results else None
story = await get_story_detail(story_id, user_id, db)
await finish_generation_job(
db,
@@ -1516,7 +1705,11 @@ async def generate_story_audio(
message="Story audio generation completed.",
metadata={"assets": ["audio"]},
)
if audio_result.succeeded and isinstance(audio_result.value, bytes):
if (
audio_result is not None
and audio_result.succeeded
and isinstance(audio_result.value, bytes)
):
return audio_result.value
except HTTPException as exc:
await finish_generation_job(

View File

@@ -0,0 +1,400 @@
[
{
"id": "story-safe-theme-pass",
"artifact": "story",
"description": "完整、儿童安全且清晰包含教育主题的普通故事。",
"coverage": {
"age_band": "5-6",
"content_shape": "short_story",
"risk_area": "happy_path",
"tags": ["theme_present", "safe", "story"]
},
"input": {
"keywords": "小兔子, 月光花园",
"education_theme": "复盘"
},
"output": {
"mode": "generated",
"title": "小兔子的月光花园",
"story_text": "小兔子露露在月光花园里照顾一朵会发光的小花。她先给小花浇水,又邀请朋友一起观察花瓣的变化。晚上睡前,露露和朋友们坐在石凳上复盘今天的努力:下次要先分好小水壶,再轮流照顾花朵。大家都觉得,分享和复盘让花园变得更温暖。",
"cover_prompt_suggestion": "A gentle watercolor rabbit in a moonlit garden"
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "story-long-safe-pass",
"artifact": "story",
"description": "较长但仍适合亲子共读的普通故事。",
"coverage": {
"age_band": "7-8",
"content_shape": "long_story",
"risk_area": "length_boundary",
"tags": ["theme_present", "long_text", "story"]
},
"input": {
"keywords": "小海豚, 图书馆",
"education_theme": "合作"
},
"output": {
"mode": "generated",
"title": "小海豚的蓝色图书馆",
"story_text": "小海豚多多住在一片安静的海湾里,那里有一座用贝壳和海草搭成的蓝色图书馆。每天傍晚,多多都会把漂来的故事贝壳整理好,放进不同的篮子。可是这一天,风浪把贝壳吹得到处都是,小章鱼、小海马和小螃蟹都赶来帮忙。大家先一起数贝壳,再按颜色排队,最后把每个故事放回合适的位置。多多发现,合作不是一个人做得最快,而是大家把自己的办法放在一起。夜晚来临时,蓝色图书馆重新亮起柔柔的光,小伙伴们围坐在门口,听多多讲今天学到的合作故事。",
"cover_prompt_suggestion": "A gentle dolphin organizing a blue underwater library"
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "story-missing-text-blocks",
"artifact": "story",
"description": "故事正文缺失会被确定性质量门阻断。",
"coverage": {
"age_band": "unknown",
"content_shape": "empty_story",
"risk_area": "schema_error",
"tags": ["missing_text", "story", "blocking"]
},
"input": {
"keywords": "小熊, 星星"
},
"output": {
"mode": "generated",
"title": "小熊找星星",
"story_text": "",
"cover_prompt_suggestion": "A bear looking at friendly stars"
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_story_text"
]
}
},
{
"id": "story-missing-cover-prompt-blocks",
"artifact": "story",
"description": "故事正文完整但封面提示词缺失会被结构质量门阻断。",
"coverage": {
"age_band": "5-6",
"content_shape": "short_story",
"risk_area": "schema_error",
"tags": ["missing_cover_prompt", "story", "blocking"]
},
"input": {
"keywords": "小松鼠, 风筝",
"education_theme": "勇敢"
},
"output": {
"mode": "generated",
"title": "小松鼠的风筝",
"story_text": "小松鼠第一次放风筝时有点紧张。朋友们陪它一起数一二三,它鼓起勇敢的心,终于让风筝飞上蓝天。",
"cover_prompt_suggestion": ""
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_cover_prompt"
]
}
},
{
"id": "story-unsafe-term-blocks",
"artifact": "story",
"description": "明显不适合儿童的风险词会被安全质量门阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "short_story",
"risk_area": "safety_error",
"tags": ["unsafe_term", "story", "blocking"]
},
"input": {
"keywords": "小猫, 城堡"
},
"output": {
"mode": "generated",
"title": "小猫的城堡",
"story_text": "小猫在城堡里看到血腥场景,然后感到很害怕。",
"cover_prompt_suggestion": "A cat near a castle"
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"unsafe_child_content"
]
}
},
{
"id": "story-short-high-threshold-blocks",
"artifact": "story",
"description": "结构合格但阅读体验偏短的故事在高阈值下会被内部评测阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "very_short_story",
"risk_area": "readability_warning",
"tags": ["short_text", "threshold_block", "story"]
},
"input": {
"keywords": "小鹿, 书签",
"education_theme": "耐心",
"minimum_score": 0.82
},
"output": {
"mode": "generated",
"title": "小鹿的书签",
"story_text": "小鹿学会了耐心等待。",
"cover_prompt_suggestion": "A deer with a golden bookmark"
},
"expected": {
"passed": false,
"blocking": true,
"min_overall_score": 0.7,
"max_overall_score": 0.8,
"required_dimensions": [
"structure",
"safety",
"readability"
],
"quality_gate_codes": [],
"warning_substrings": [
"正文长度"
]
}
},
{
"id": "storybook-safe-theme-pass",
"artifact": "storybook",
"description": "完整、儿童安全且包含教育主题的绘本分页输出。",
"coverage": {
"age_band": "5-6",
"content_shape": "storybook_3_pages",
"risk_area": "happy_path",
"tags": ["theme_present", "safe", "storybook"]
},
"input": {
"keywords": "小狐狸, 彩虹桥",
"education_theme": "合作"
},
"output": {
"title": "彩虹桥上的合作",
"main_character": "小狐狸米米",
"art_style": "温暖水彩",
"cover_prompt": "A warm watercolor fox near a rainbow bridge",
"pages": [
{
"page_number": 1,
"text": "小狐狸米米在雨后的森林里发现一座亮晶晶的彩虹桥。",
"image_prompt": "A little fox finds a rainbow bridge"
},
{
"page_number": 2,
"text": "桥边的小伙伴们一起商量办法,决定合作把落叶清理干净。",
"image_prompt": "Forest friends work together"
},
{
"page_number": 3,
"text": "大家轮流搬叶子、扶篮子,还互相说谢谢,彩虹桥终于露出笑脸。",
"image_prompt": "Friends carrying leaves together"
}
]
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "storybook-duplicate-page-blocks",
"artifact": "storybook",
"description": "重复页码的绘本结构会被质量门阻断。",
"coverage": {
"age_band": "5-6",
"content_shape": "storybook_invalid_pages",
"risk_area": "schema_error",
"tags": ["duplicate_page", "storybook", "blocking"]
},
"input": {
"keywords": "小熊, 森林"
},
"output": {
"title": "森林里的小熊",
"main_character": "小熊布布",
"art_style": "水彩",
"cover_prompt": "A bear in a forest",
"pages": [
{
"page_number": 1,
"text": "布布在森林里找到一颗松果。",
"image_prompt": "Bear finds a pinecone"
},
{
"page_number": 1,
"text": "布布把松果带给朋友一起观察。",
"image_prompt": "Bear shares the pinecone"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"invalid_storybook_page_number"
]
}
},
{
"id": "storybook-missing-page-blocks",
"artifact": "storybook",
"description": "没有分页内容的绘本会被结构质量门阻断。",
"coverage": {
"age_band": "unknown",
"content_shape": "storybook_empty_pages",
"risk_area": "schema_error",
"tags": ["missing_page", "storybook", "blocking"]
},
"input": {
"keywords": "小鸟, 云朵"
},
"output": {
"title": "小鸟和云朵",
"main_character": "小鸟啾啾",
"art_style": "柔和水彩",
"cover_prompt": "A bird near soft clouds",
"pages": []
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_storybook_page"
]
}
},
{
"id": "storybook-unsafe-term-blocks",
"artifact": "storybook",
"description": "绘本分页文字包含明显不适龄风险词时会被安全质量门阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "storybook_2_pages",
"risk_area": "safety_error",
"tags": ["unsafe_term", "storybook", "blocking"]
},
"input": {
"keywords": "小兔子, 山洞"
},
"output": {
"title": "山洞里的声音",
"main_character": "小兔子米粒",
"art_style": "温暖水彩",
"cover_prompt": "A rabbit near a cave",
"pages": [
{
"page_number": 1,
"text": "米粒走到山洞边,听见奇怪的声音。",
"image_prompt": "Rabbit near a cave"
},
{
"page_number": 2,
"text": "洞里出现血腥画面,米粒吓得跑开。",
"image_prompt": "Rabbit running away"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"unsafe_child_content"
]
}
},
{
"id": "storybook-short-page-warning",
"artifact": "storybook",
"description": "分页正文过短时保留内部警告,并在高阈值下被内部评测阻断,用于评测回归。",
"coverage": {
"age_band": "3-4",
"content_shape": "storybook_2_pages",
"risk_area": "readability_warning",
"tags": ["short_page_text", "threshold_block", "storybook"]
},
"input": {
"keywords": "小羊, 风铃",
"minimum_score": 0.85
},
"output": {
"title": "风铃响了",
"main_character": "小羊团团",
"art_style": "柔和蜡笔",
"cover_prompt": "A lamb listening to a wind chime",
"pages": [
{
"page_number": 1,
"text": "风响。",
"image_prompt": "Wind chime rings"
},
{
"page_number": 2,
"text": "团团笑。",
"image_prompt": "Lamb smiles"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"min_overall_score": 0.8,
"max_overall_score": 0.82,
"required_dimensions": [
"structure",
"safety",
"readability"
],
"quality_gate_codes": [],
"warning_substrings": [
"分页正文长度"
]
}
}
]

View File

@@ -0,0 +1,610 @@
# Test Cases: Harness Evaluation Driven Generation
## Overview
- **Feature**: Harness evaluation driven generation
- **Requirements Source**: `docs/technical/harness-engineering-modernization.md`
- **Test Coverage**: evaluation scoring, blocking quality failures, workflow plan events, trace aggregation, state transitions, internal golden replay, admin-only analytics, admin-only executor coverage summary, admin-only harness readiness
- **Last Updated**: 2026-06-23
## Test Case Categories
### 1. Functional Tests
#### TC-F-001: 普通故事无图片生成写入评测事件
- **Requirement**: H7-3, H7-4
- **Priority**: High
- **Preconditions**:
- 用户已登录。
- 文本 provider 返回完整、儿童安全的故事。
- **Test Steps**:
1. 调用 `POST /api/generations`,设置 `output_mode=story`、`generate_images=false`。
2. 执行 worker 任务。
3. 查询 job detail。
- **Expected Results**:
- job 状态为 `completed`
- event 顺序包含 `workflow_planned`
- event 顺序包含 `evaluation_completed`
- `evaluation_completed.event_metadata.passed=true`
- `evaluation_completed.event_metadata.overall_score >= 0.7`
- **Postconditions**: 故事已持久化,`story_id` 写入 job。
#### TC-F-003: 用户 Trace summary 不返回评测摘要
- **Requirement**: H7-4, H7B-1
- **Priority**: High
- **Preconditions**:
- 故事已有 `evaluation_completed` job event。
- **Test Steps**:
1. 调用 `GET /api/generations/{story_id}/trace-summary`
2. 检查响应字段。
- **Expected Results**:
- 响应不包含 `evaluation` 字段。
- `by_step` 不包含 `evaluation`
- `by_artifact` 不因 `evaluation_completed` 增加 `story_text` 计数。
- `failed_events` 不统计 `evaluation_completed`
- `total_events` 不统计 `evaluation_completed`,避免通过事件数量泄露内部评测步骤。
- **Postconditions**: 无数据修改。
#### TC-F-004: 用户 Job detail 不返回评测事件
- **Requirement**: H7-4, H7B-2
- **Priority**: High
- **Preconditions**:
- job 已记录 `evaluation_completed` 事件。
- **Test Steps**:
1. 调用 `GET /api/generations/jobs/{job_id}`
2. 检查 `events` 列表。
- **Expected Results**:
- `events` 不包含 `evaluation_completed`
- 响应不包含评测分数、维度分数、通过率或阻断阈值。
- **Postconditions**: 内部数据库事件不被删除。
#### TC-F-002: 完整故事输出获得通过评分
- **Requirement**: H7-1
- **Priority**: High
- **Preconditions**:
- 构造完整 `StoryOutput`
- **Test Steps**:
1. 调用 `evaluate_story_output`
2. 读取 `EvaluationResult`
- **Expected Results**:
- `passed=true`
- `blocking=false`
- scores 包含 `structure`、`safety`、`age_fit`、`educational_value`、`readability`
- **Postconditions**: 无持久化副作用。
#### TC-F-005: 完整绘本输出获得通过评分
- **Requirement**: H7-1, H7C-1
- **Priority**: High
- **Preconditions**:
- 构造完整 `Storybook`
- **Test Steps**:
1. 调用 `evaluate_storybook_output`
2. 读取 `EvaluationResult`
- **Expected Results**:
- `passed=true`
- `blocking=false`
- scores 包含 `structure`、`safety`、`age_fit`、`educational_value`、`readability`
- **Postconditions**: 无持久化副作用。
#### TC-F-006: 内部 golden cases 可回放且全部符合预期
- **Requirement**: H7-7, H7-8
- **Priority**: High
- **Preconditions**:
- `backend/app/services/harness/fixtures/evaluation_golden_cases.json` 存在。
- fixture 只由后端测试、内部工具或 admin-only readiness 读取。
- **Test Steps**:
1. 调用 `replay_evaluation_golden_cases`
2. 读取 `EvaluationReplaySuiteResult`
- **Expected Results**:
- `passed=true`
- `failed_case_ids` 为空。
- 普通故事和绘本样本都被覆盖。
- 样本覆盖完整普通故事、较长普通故事、空正文、缺失封面提示词、安全风险词、短文本阈值阻断、绘本重复页码、绘本缺页、绘本安全风险和绘本短分页。
- 结果不通过任何用户端 API 返回。
- **Postconditions**: 无持久化副作用。
#### TC-F-007: 内部 golden replay 覆盖摘要稳定
- **Requirement**: H7-8
- **Priority**: High
- **Preconditions**:
- golden replay suite 已执行。
- **Test Steps**:
1. 调用 `coverage_summary`
2. 检查 artifact、age_band、risk_area、tags 和 outcome 分布。
- **Expected Results**:
- artifact 覆盖 `story=6`、`storybook=5`
- age_band 覆盖 `3-4`、`5-6`、`7-8`、`unknown`
- risk_area 覆盖 `happy_path`、`schema_error`、`safety_error`、`readability_warning`、`length_boundary`
- outcome 覆盖 `passed=3`、`blocked=8`
- 覆盖摘要不通过任何用户端 API 返回。
- **Postconditions**: 无持久化副作用。
### 2. Edge Case Tests
#### TC-E-001: 很短故事通过结构但产生低龄阅读体验警告
- **Requirement**: H7-1
- **Priority**: Medium
- **Preconditions**:
- 构造标题、正文、封面提示词完整但正文很短的 `StoryOutput`
- **Test Steps**:
1. 调用 `evaluate_story_output`
2. 读取 warnings 和维度分数。
- **Expected Results**:
- 不触发质量门异常。
- `age_fit`、`readability` 分数低于完整故事。
- warnings 包含阅读体验提示。
- **Postconditions**: 无持久化副作用。
#### TC-E-002: 内部 golden replay 能报告预期不匹配
- **Requirement**: H7-7
- **Priority**: Medium
- **Preconditions**:
- 构造一个实际得分低于期望阈值的 `EvaluationReplayCase`
- **Test Steps**:
1. 调用 `run_evaluation_replay_cases`
2. 读取 `failure_report`
- **Expected Results**:
- `passed=false`
- `failed_case_ids` 包含该 case id。
- `failure_report` 包含 `overall_score` 差异。
- **Postconditions**: 无持久化副作用。
### 3. Error Handling Tests
#### TC-ERR-001: 空正文阻断持久化
- **Requirement**: H7-4
- **Priority**: High
- **Preconditions**:
- 文本 provider 返回空 `story_text`
- **Test Steps**:
1. 执行 worker 任务。
2. 查询 job 和 story 表。
3. 查询 job events。
- **Expected Results**:
- job 状态为 `failed`
- 没有 story 被持久化。
- events 包含 `quality_gate_failed`
- events 包含 `evaluation_completed`
- `evaluation_completed.event_metadata.blocking=true`
- **Postconditions**: 用户可重试该 job。
#### TC-ERR-002: 不适龄风险词阻断生成
- **Requirement**: H7-1
- **Priority**: High
- **Preconditions**:
- 构造包含明显不适龄风险词的 `StoryOutput`
- **Test Steps**:
1. 调用 `evaluate_story_output`
2. 读取 `quality_gate` metadata。
- **Expected Results**:
- `passed=false`
- `blocking=true`
- `quality_gate.issues[0].failure_category=safety_error`
- **Postconditions**: 无持久化副作用。
#### TC-ERR-003: 绘本结构错误阻断生成
- **Requirement**: H7-1, H7C-1
- **Priority**: High
- **Preconditions**:
- 构造页码重复或页面缺失的 `Storybook`
- **Test Steps**:
1. 调用 `evaluate_storybook_output`
2. 读取 `quality_gate` metadata。
- **Expected Results**:
- `passed=false`
- `blocking=true`
- `quality_gate.issues[0].code=invalid_storybook_page_number` 或对应结构错误。
- **Postconditions**: 无持久化副作用。
### 4. State Transition Tests
#### TC-ST-001: 普通故事无图片路径事件顺序稳定
- **Requirement**: H7-3
- **Priority**: High
- **Preconditions**:
- job 初始状态为 `running/request_accepted`
- **Test Steps**:
1. 执行 worker 任务。
2. 按 id 查询 events。
- **Expected Results**:
- event 顺序为 `request_accepted` → `worker_started` → `workflow_planned` → `context_prepared` → `evaluation_completed` → `narrative_generated` → `story_saved` → `generation_completed`
- **Postconditions**: job `current_step=generation_completed`
#### TC-ST-002: 普通故事带图片路径记录可恢复资产计划
- **Requirement**: H9-1, H9-3
- **Priority**: High
- **Preconditions**:
- job 初始状态为 `running/request_accepted`
- 请求设置 `output_mode=story`、`generate_images=true`。
- 文本 provider 返回合格故事,图片 provider 返回封面 URL。
- **Test Steps**:
1. 执行 worker 任务。
2. 按 id 查询内部 events。
3. 读取 `workflow_planned.event_metadata.plan`
- **Expected Results**:
- event 顺序为 `request_accepted` → `worker_started` → `workflow_planned` → `context_prepared` → `evaluation_completed` → `narrative_generated` → `story_saved` → `cover_image_started` → `cover_image_succeeded` → `generation_completed`
- `plan.mode=story_with_assets`
- plan tasks 包含 `evaluate_narrative`
- plan tasks 包含 `generate_cover_image`
- `generate_cover_image.required=false`
- `generate_cover_image.recoverable=true`
- **Postconditions**: job `current_step=generation_completed`,故事 `image_status=ready`
#### TC-ST-003: 绘本路径记录绘本计划快照
- **Requirement**: H9-2, H9-3
- **Priority**: High
- **Preconditions**:
- job 初始状态为 `running/request_accepted`
- 请求设置 `output_mode=storybook`
- **Test Steps**:
1. 执行 worker 任务。
2. 按 id 查询内部 events。
3. 读取 `workflow_planned.event_metadata.plan`
- **Expected Results**:
- event 顺序包含 `workflow_planned`,且位于 `worker_started` 与 `context_prepared` 之间。
- `plan.mode=storybook`
- plan tasks 包含 `generate_storybook_pages`
- plan tasks 包含 `evaluate_storybook_pages`
- 当 `generate_images=true` 时,plan tasks 包含 `generate_storybook_images`
- `generate_storybook_images.required=false`
- `generate_storybook_images.recoverable=true`
- **Postconditions**: job `current_step=generation_completed`
#### TC-ST-004: 绘本生成内部记录评测但用户事件脱敏
- **Requirement**: H7C-1, H7B-2, H9-4
- **Priority**: High
- **Preconditions**:
- 绘本生成 job 已执行完成。
- **Test Steps**:
1. 直接查询内部 `generation_job_events`
2. 调用 `GET /api/generations/jobs/{job_id}`
- **Expected Results**:
- 内部事件包含 `evaluation_completed`
- 内部 `evaluation_completed.event_metadata.artifact=storybook_pages`
- 用户 API events 不包含 `evaluation_completed`
- 用户 API 响应不包含 `overall_score`、维度分数、阈值或 golden replay 字段。
- **Postconditions**: job 完成,绘本已持久化。
#### TC-ST-005: 资产生成和重试路径记录资产计划快照
- **Requirement**: H10-1, H10-2, H10-3
- **Priority**: High
- **Preconditions**:
- 故事已有可生成或可重试的图片/音频资源。
- **Test Steps**:
1. 执行 `asset_generation` worker 任务。
2. 调用 `/api/generations/{story_id}/retry-assets`
3. 按 id 查询内部 events。
- **Expected Results**:
- `asset_generation` 事件顺序包含 `workflow_planned`
- `asset_generation` 的 `plan.mode=asset_generation`
- `asset_retry` 事件顺序包含 `workflow_planned`
- `asset_retry` 的 `plan.mode=asset_retry`
- 图片和音频任务在 plan 中为 `required=false`、`recoverable=true`
- **Postconditions**: 资源状态按原有语义更新。
#### TC-ST-006: 用户事件 metadata 使用白名单脱敏
- **Requirement**: H10-4, H10-5
- **Priority**: High
- **Preconditions**:
- 内部 job events 包含原始 `plan.tasks`、`result_snapshot`、内部阈值或内部错误详情。
- **Test Steps**:
1. 调用 `GET /api/generations/jobs/{job_id}`
2. 检查 `events[*].event_metadata`
- **Expected Results**:
- 用户响应保留 `step`、`artifact`、`asset`、`assets`、`failure_category` 等可解释字段。
- `workflow_planned` 只返回 `plan_mode`、`planned_task_count`、`recoverable_task_count`
- 用户响应不包含原始 `plan`、`tasks`、`result_snapshot`、内部阈值、内部错误原文。
- 用户响应仍不包含 `evaluation_completed`、`overall_score`、维度分数或 golden replay 字段。
- **Postconditions**: 内部数据库事件不被修改。
#### TC-ST-007: 用户 request payload 使用白名单脱敏
- **Requirement**: H11-1, H11-4
- **Priority**: High
- **Preconditions**:
- 生成 job 的 `request_payload` 同时包含用户输入、公开控制字段、内部调度 token、Provider override 和评测策略。
- **Test Steps**:
1. 调用 `GET /api/generations/jobs/{job_id}`
2. 检查响应中的 `request_payload`
- **Expected Results**:
- 用户响应只保留 `output_mode`、`input_type`、`type`、`story_id`、`assets`、`page_count`、`generate_images` 等安全控制字段。
- 用户响应不包含原始 `data`、`education_theme`、内部调度 token、Provider override 或 evaluation policy。
- 内部数据库中的完整 request payload 不被修改。
- **Postconditions**: 用户端仍可根据公开字段展示任务进度和可用操作。
#### TC-ST-008: 资产 plan runner 按 WorkflowPlan 顺序执行任务
- **Requirement**: H12-1, H12-5
- **Priority**: High
- **Preconditions**:
- 构造 `asset_generation` 或 `asset_retry` plan,包含图片和音频 task。
- **Test Steps**:
1. 调用 `run_asset_plan(...)`
2. 记录 image/audio handler 的调用顺序。
3. 检查 runner 返回的 executed/ignored task keys。
- **Expected Results**:
- 图片和音频 handler 按 plan 中 `WorkflowTask` 顺序执行。
- `start_asset_*`、`complete_asset_*` 这类非资产生产 task 被记录为 ignored,不触发 provider handler。
- 未知非资产 task 默认 ignored,不影响已知资产 task。
- **Postconditions**: 无数据库修改。
#### TC-ST-009: 后台资产生成由 plan runner 执行组合资产
- **Requirement**: H12-2, H12-5
- **Priority**: High
- **Preconditions**:
- 已持久化故事同时具备可生成图片和音频的输入。
- 创建 `asset_generation` job,`assets=["audio", "image"]`
- **Test Steps**:
1. 调用 worker 执行该 job。
2. 查询 job events 和 story 状态。
- **Expected Results**:
- event stream 为 `workflow_planned` 后依次出现音频和图片生成事件。
- plan tasks 顺序包含 `complete_audio_asset`、`complete_image_asset`
- story 的 `audio_status` 和 `image_status` 均为 `ready`
- 用户 API 仍只暴露 coarse plan metadata,不返回原始 `plan.tasks`
- **Postconditions**: job 完成,资源状态与原有语义一致。
#### TC-ST-010: 用户侧过滤 executor coverage 内部事件
- **Requirement**: H13-4, H13-5
- **Priority**: High
- **Preconditions**:
- 生成 job 包含内部 `executor_completed` 事件。
- `executor_completed.event_metadata` 包含 task keys 和 result assets。
- **Test Steps**:
1. 调用 `GET /api/generations/jobs/{job_id}`
2. 调用 `GET /api/generations/{story_id}/jobs`
3. 调用 `GET /api/generations/{story_id}/trace-summary`
- **Expected Results**:
- 用户 job detail 不包含 `executor_completed`
- 用户 job detail 不包含 `executed_task_keys`、`ignored_task_keys` 或具体 task key。
- 当 job 当前步骤短暂停留在 `executor_completed` 时,用户 summary 显示为安全公开的 `workflow_planned` 进度。
- 用户 trace summary 不包含 `executor_completed` 或具体 task key。
- 用户 trace summary 的 `total_events` 不统计内部 `executor_completed`
- **Postconditions**: 内部数据库事件不被修改。
### 5. Admin-Only Analytics Tests
#### TC-ADM-001: 管理端评测 analytics 聚合内部评测事件
- **Requirement**: H8-1, H8-2
- **Priority**: High
- **Preconditions**:
- 数据库存在多个用户的 `evaluation_completed` 事件。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/evaluations/analytics`
2. 检查聚合结果。
- **Expected Results**:
- 返回通过数、阻断数、通过率和平均分。
- 返回 artifact、output mode、score band、dimension score、quality gate issue、failure category 和 warning 聚合。
- 不返回故事正文、prompt、单条 evaluation event 或评分 reason。
- **Postconditions**: 无数据修改。
#### TC-ADM-002: 管理端评测 analytics 支持过滤
- **Requirement**: H8-3
- **Priority**: Medium
- **Preconditions**:
- 数据库存在新旧评测事件以及不同 artifact。
- **Test Steps**:
1. 调用 `GET /admin/evaluations/analytics?days=7`
2. 调用 `GET /admin/evaluations/analytics?artifact=story_text`
3. 调用非法 artifact。
- **Expected Results**:
- `days` 过滤只统计窗口内事件。
- `artifact` 过滤只统计对应 artifact。
- 非法 artifact 返回 `422`
- **Postconditions**: 无数据修改。
#### TC-ADM-003: 管理端评测 analytics 需要 admin 鉴权
- **Requirement**: H8-2
- **Priority**: High
- **Preconditions**:
- 未提供 admin Basic Auth。
- **Test Steps**:
1. 调用 `GET /admin/evaluations/analytics`
- **Expected Results**:
- 返回 `401`
- 不返回任何评测统计。
- **Postconditions**: 无数据修改。
#### TC-ADM-004: 管理端完整生成 trace 返回内部事件流
- **Requirement**: H11-2, H11-3, H11-4
- **Priority**: High
- **Preconditions**:
- 数据库存在包含 `workflow_planned` 和 `evaluation_completed` 的生成 job。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/generations/jobs/{job_id}/trace`
2. 检查 request payload 与 event stream。
- **Expected Results**:
- 返回完整 request payload,包括原始用户输入和内部调度字段。
- 返回完整 `workflow_planned.event_metadata.plan.tasks`
- 返回 `evaluation_completed` 事件及其内部评分 metadata。
- 响应包含 `user_id`,便于管理控制面审计。
- **Postconditions**: 无数据修改。
#### TC-ADM-005: 管理端完整生成 trace 需要 admin 鉴权
- **Requirement**: H11-3
- **Priority**: High
- **Preconditions**:
- 未提供 admin Basic Auth。
- **Test Steps**:
1. 调用 `GET /admin/generations/jobs/{job_id}/trace`
- **Expected Results**:
- 返回 `401`
- 不返回 request payload 或内部 event metadata。
- **Postconditions**: 无数据修改。
#### TC-ADM-006: 管理端 executor coverage 聚合内部执行事件
- **Requirement**: H13-1, H13-2, H13-3, H13-5
- **Priority**: High
- **Preconditions**:
- 数据库存在多个 `executor_completed` 事件。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/executors/coverage`
2. 调用 `GET /admin/executors/coverage?plan_mode=asset_retry`
3. 调用非法 plan mode。
- **Expected Results**:
- 返回 total runs、planned/executed/ignored task counts 和 coverage ratio。
- 返回 plan mode、output mode、executed task keys、ignored task keys 和 result assets 聚合。
- `plan_mode` 过滤只统计对应 executor run。
- 非法 plan mode 返回 `422`
- **Postconditions**: 无数据修改。
#### TC-ADM-007: 管理端 executor coverage 需要 admin 鉴权
- **Requirement**: H13-3
- **Priority**: High
- **Preconditions**:
- 未提供 admin Basic Auth。
- **Test Steps**:
1. 调用 `GET /admin/executors/coverage`
- **Expected Results**:
- 返回 `401`
- 不返回 executor task keys 或 coverage metadata。
- **Postconditions**: 无数据修改。
#### TC-ADM-008: 管理端完整生成 trace 返回单 job executor coverage 摘要
- **Requirement**: H14-1, H14-2, H14-4
- **Priority**: High
- **Preconditions**:
- 数据库存在包含 `executor_completed` 事件的生成 job。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/generations/jobs/{job_id}/trace`
2. 检查 `executor_coverage`
- **Expected Results**:
- 响应包含 `executor_coverage.scope=admin_internal_job_executor_coverage`
- `executor_coverage` 只统计当前 job 的 runs、planned/executed/ignored task counts 和 coverage ratio。
- `executor_coverage.executed_task_keys`、`ignored_task_keys`、`result_assets` 与当前 job 的内部 executor event 一致。
- 完整 event stream 仍保留 `executor_completed`,便于 admin 调试。
- **Postconditions**: 无数据修改。
#### TC-ADM-009: 管理端 harness readiness 聚合内部质量门
- **Requirement**: H15-1, H15-2, H15-3, H15-4
- **Priority**: High
- **Preconditions**:
- app 内部 harness fixture 存在 golden replay cases。
- 数据库存在至少一条通过的 `evaluation_completed` 事件。
- 数据库存在至少一条 `executor_completed` 事件。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/harness/readiness`
2. 检查 readiness status、checks 和聚合摘要。
- **Expected Results**:
- `status=ready`
- checks 包含 `golden_replay`、`runtime_evaluation_samples`、`runtime_evaluation_quality`、`executor_coverage_samples`、`executor_coverage_ratio`
- golden replay 显示全部通过。
- evaluation analytics 与 executor coverage 只以聚合形式返回。
- 响应不包含故事标题、正文、prompt、score reason 或 quality gate message。
- **Postconditions**: 无数据修改。
#### TC-ADM-010: 管理端 harness readiness 阻断低质量运行样本并需要 admin 鉴权
- **Requirement**: H15-2, H15-3, H15-4, H15-5
- **Priority**: High
- **Preconditions**:
- 数据库存在低质量或 blocking 的 `evaluation_completed` 事件。
- executor coverage 运行样本缺失或不足。
- **Test Steps**:
1. 通过 admin guard 调用 `GET /admin/harness/readiness`
2. 未提供 admin Basic Auth 调用同一路径。
- **Expected Results**:
- 有 admin 权限时返回 `status=blocked`
- `runtime_evaluation_quality.status=blocked`
- executor 样本缺失时对应 check 为 `needs_attention`
- 无 admin 权限时返回 `401`
- 响应不包含 quality gate message 或单条事件明细。
- **Postconditions**: 无数据修改。
## Test Coverage Matrix
| Requirement ID | Test Cases | Coverage Status |
| --- | --- | --- |
| H7-1 | TC-F-002, TC-F-005, TC-E-001, TC-ERR-002, TC-ERR-003 | Complete |
| H7-2 | TC-F-001, TC-ST-001 | Complete |
| H7-3 | TC-F-001, TC-ST-001 | Complete |
| H7-4 | TC-F-003, TC-ERR-001 | Complete |
| H7-5 | This document | Complete |
| H7-7 | TC-F-006, TC-E-002 | Complete |
| H7-8 | TC-F-006, TC-F-007 | Complete |
| H7B-1 | TC-F-003 | Complete |
| H7B-2 | TC-F-004 | Complete |
| H7C-1 | TC-F-005, TC-ERR-003, TC-ST-002 | Complete |
| H8-1 | TC-ADM-001 | Complete |
| H8-2 | TC-ADM-001, TC-ADM-003 | Complete |
| H8-3 | TC-ADM-002 | Complete |
| H8-4 | TC-F-003, TC-F-004, TC-ADM-001 | Complete |
| H9-1 | TC-ST-002 | Complete |
| H9-2 | TC-ST-003 | Complete |
| H9-3 | TC-ST-001, TC-ST-002, TC-ST-003 | Complete |
| H9-4 | TC-F-003, TC-F-004, TC-ST-004 | Complete |
| H10-1 | TC-ST-005 | Complete |
| H10-2 | TC-ST-005 | Complete |
| H10-3 | TC-ST-005 | Complete |
| H10-4 | TC-ST-006 | Complete |
| H10-5 | TC-ST-005, TC-ST-006 | Complete |
| H11-1 | TC-ST-007 | Complete |
| H11-2 | TC-ADM-004 | Complete |
| H11-3 | TC-ADM-004, TC-ADM-005 | Complete |
| H11-4 | TC-ST-007, TC-ADM-004, TC-ADM-005 | Complete |
| H11-5 | This document, `docs/planning/harness-stage-11-report.md` | Complete |
| H12-1 | TC-ST-008 | Complete |
| H12-2 | TC-ST-009 | Complete |
| H12-3 | TC-ST-005, TC-ST-008 | Complete |
| H12-4 | TC-ST-005, backend story endpoint regression tests | Complete |
| H12-5 | TC-ST-008, TC-ST-009 | Complete |
| H13-1 | TC-ADM-006 | Complete |
| H13-2 | TC-ST-009, TC-ADM-006 | Complete |
| H13-3 | TC-ADM-006, TC-ADM-007 | Complete |
| H13-4 | TC-ST-010 | Complete |
| H13-5 | TC-ST-010, TC-ADM-006, TC-ADM-007 | Complete |
| H14-1 | TC-ADM-006, TC-ADM-008 | Complete |
| H14-2 | TC-ADM-008 | Complete |
| H14-3 | TC-ST-010 | Complete |
| H14-4 | TC-ST-010, TC-ADM-008 | Complete |
| H14-5 | This document, `docs/planning/harness-stage-14-report.md` | Complete |
| H15-1 | TC-F-006, TC-ADM-009 | Complete |
| H15-2 | TC-ADM-009, TC-ADM-010 | Complete |
| H15-3 | TC-ADM-009, TC-ADM-010 | Complete |
| H15-4 | TC-ADM-009, TC-ADM-010 | Complete |
| H15-5 | This document, `docs/planning/harness-stage-15-report.md` | Complete |
## Notes
- 当前自动化已覆盖 TC-F-001、TC-F-002、TC-F-003、TC-F-004、TC-F-005、TC-F-006、TC-F-007、TC-E-002、TC-ERR-001、TC-ERR-002、TC-ERR-003、TC-ST-001、TC-ST-002、TC-ST-003、TC-ST-004、TC-ST-005、TC-ST-006、TC-ST-007、TC-ST-008、TC-ST-009、TC-ST-010、TC-ADM-001、TC-ADM-002、TC-ADM-003、TC-ADM-004、TC-ADM-005、TC-ADM-006、TC-ADM-007、TC-ADM-008、TC-ADM-009、TC-ADM-010。
- TC-E-001 可在下一轮补成显式单测。
- 所有 `evaluation_completed`、golden replay 和评分维度数据均按内部质量资产处理,不应进入用户端接口或用户前端。
- `GET /admin/evaluations/analytics` 只允许 admin-only 聚合摘要,不应返回原始内容、prompt、单条事件或评分 reason。
- `GET /admin/generations/jobs/{job_id}/trace` 是 admin-only 调试和审查接口,可返回完整内部链路,不应被用户前端调用。
- `GET /admin/executors/coverage` 是 admin-only executor 覆盖率接口,可返回 task keys 和 result assets,不应被用户前端调用。
- `GET /admin/generations/jobs/{job_id}/trace` 可返回当前 job 的 `executor_coverage` 摘要;该摘要与 task keys 一样属于内部执行资产。
- `GET /admin/harness/readiness` 是 admin-only harness 上线前审查摘要,可返回聚合 readiness、thresholds、golden coverage、evaluation analytics 和 executor coverage,不应返回正文、prompt、score reason、quality gate message 或单条事件明细。

View File

@@ -27,6 +27,17 @@ def _build_admin_test_app(db_session) -> FastAPI:
return app
def _build_admin_auth_required_test_app(db_session) -> FastAPI:
    """Build a FastAPI app that mounts the admin router under ``/admin``.

    Only the ``get_db`` dependency is overridden (it yields ``db_session``);
    no other dependency override is installed by this helper.
    """

    async def _provide_session():
        # Hand the shared test session to every request.
        yield db_session

    test_app = FastAPI()
    test_app.include_router(admin_providers.router, prefix="/admin")
    test_app.dependency_overrides[get_db] = _provide_session
    return test_app
async def _create_story(
db_session,
*,
@@ -51,6 +62,38 @@ async def _create_story(
return story
async def _record_evaluation_event(
    db_session,
    *,
    user_id: str,
    story_id: int,
    output_mode: str,
    artifact: str,
    status: str,
    metadata: dict,
):
    """Create a generation job and record one ``evaluation_completed`` event on it.

    The event metadata always starts with ``step="evaluation"`` and the given
    ``artifact``; entries from ``metadata`` are merged on top of those two keys.
    Returns whatever ``record_generation_event`` returns.
    """
    evaluation_job = await create_generation_job(
        db_session,
        user_id=user_id,
        output_mode=output_mode,
        input_type="keywords",
        request_payload={"data": "测试"},
        story_id=story_id,
    )
    # Caller-supplied metadata wins over the two default keys on conflict.
    event_metadata = {"step": "evaluation", "artifact": artifact}
    event_metadata.update(metadata)
    return await record_generation_event(
        db_session,
        job=evaluation_job,
        story_id=story_id,
        event_type="evaluation_completed",
        status=status,
        metadata=event_metadata,
    )
async def test_admin_provider_analytics_aggregate_across_users(db_session, test_user):
second_user = User(
id="github:67890",
@@ -197,6 +240,616 @@ async def test_admin_provider_analytics_aggregate_across_users(db_session, test_
]
async def test_admin_evaluation_analytics_aggregate_internal_events(
    db_session,
    test_user,
):
    """Evaluation analytics aggregate events recorded for two different users.

    One passing story evaluation and one blocking storybook evaluation are
    recorded. The response must contain the aggregate buckets and must not
    contain the story title, the quality gate message, or a score reason.
    """
    evaluation_user = User(
        id="google:evaluation-user",
        name="Evaluation User",
        avatar_url="https://example.com/eval.png",
        provider="google",
    )
    db_session.add(evaluation_user)
    await db_session.commit()

    text_story = await _create_story(db_session, user_id=test_user.id, title="评测故事")
    picture_book = await _create_story(
        db_session,
        user_id=evaluation_user.id,
        title="评测绘本",
        mode="storybook",
    )

    # Passing evaluation for the plain story.
    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=text_story.id,
        output_mode="story",
        artifact="story_text",
        status="succeeded",
        metadata={
            "overall_score": 0.92,
            "passed": True,
            "blocking": False,
            "scores": [
                {"dimension": "structure", "score": 1.0, "reason": "完整"},
                {"dimension": "readability", "score": 0.84, "reason": "可读"},
            ],
            "warnings": [],
        },
    )

    # Blocking evaluation for the storybook, including a quality gate issue.
    blocking_issue = {
        "code": "unsafe_child_content",
        "message": "风险词",
        "failure_category": "safety_error",
        "field": "pages",
    }
    await _record_evaluation_event(
        db_session,
        user_id=evaluation_user.id,
        story_id=picture_book.id,
        output_mode="storybook",
        artifact="storybook_pages",
        status="failed",
        metadata={
            "overall_score": 0.0,
            "passed": False,
            "blocking": True,
            "scores": [
                {"dimension": "structure", "score": 0.0, "reason": "结构失败"},
                {"dimension": "safety", "score": 0.0, "reason": "安全失败"},
            ],
            "quality_gate": {"issues": [blocking_issue]},
            "warnings": ["绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。"],
        },
    )

    asgi_transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        response = await client.get("/admin/evaluations/analytics")

    assert response.status_code == 200
    body = response.json()

    expected_totals = {
        "scope": "admin_internal_evaluations",
        "total_evaluations": 2,
        "passed_evaluations": 1,
        "blocked_evaluations": 1,
        "pass_rate": 0.5,
        "average_score": 0.46,
        "job_count": 2,
        "story_count": 2,
        "user_count": 2,
    }
    for field, expected in expected_totals.items():
        assert body[field] == expected

    assert body["by_artifact"] == [
        {"artifact": "story_text", "count": 1},
        {"artifact": "storybook_pages", "count": 1},
    ]
    assert body["by_output_mode"] == [
        {"output_mode": "story", "count": 1},
        {"output_mode": "storybook", "count": 1},
    ]
    assert body["score_bands"] == [
        {"band": "blocked_quality_gate", "count": 1},
        {"band": "excellent", "count": 1},
    ]
    assert body["dimension_scores"] == [
        {"dimension": "structure", "average_score": 0.5, "count": 2},
        {"dimension": "readability", "average_score": 0.84, "count": 1},
        {"dimension": "safety", "average_score": 0.0, "count": 1},
    ]
    assert body["quality_gate_issues"] == [
        {"code": "unsafe_child_content", "count": 1},
    ]
    assert body["failure_categories"] == [
        {"category": "safety_error", "count": 1},
    ]
    assert body["warnings"] == [
        {
            "message": "绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。",
            "count": 1,
        },
    ]

    # Title, gate message and score reason must not appear anywhere in the body.
    rendered_body = str(body)
    assert "评测故事" not in rendered_body
    assert "风险词" not in rendered_body
    assert "完整" not in rendered_body
async def test_admin_evaluation_analytics_support_days_and_artifact_filters(
    db_session,
    test_user,
):
    """Evaluation analytics honour the ``days`` and ``artifact`` query filters.

    An event backdated by ten days and a fresh event are recorded; the test
    then checks the 7-day window, the ``story_text`` artifact filter, and
    that the artifact value ``image`` is answered with HTTP 422.
    """
    old_story = await _create_story(db_session, user_id=test_user.id, title="旧评测")
    new_storybook = await _create_story(
        db_session,
        user_id=test_user.id,
        title="新评测",
        mode="storybook",
    )

    stale_event = await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=old_story.id,
        output_mode="story",
        artifact="story_text",
        status="succeeded",
        metadata={
            "overall_score": 0.96,
            "passed": True,
            "blocking": False,
            "scores": [{"dimension": "structure", "score": 1.0, "reason": "完整"}],
            "warnings": [],
        },
    )
    # Backdate the first event by ten days.
    ten_days = timedelta(days=10)
    stale_event.created_at = datetime.now(timezone.utc) - ten_days
    await db_session.commit()

    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=new_storybook.id,
        output_mode="storybook",
        artifact="storybook_pages",
        status="failed",
        metadata={
            "overall_score": 0.72,
            "passed": False,
            "blocking": True,
            "scores": [{"dimension": "readability", "score": 0.62, "reason": "过短"}],
            "warnings": ["分页正文长度偏短"],
        },
    )

    asgi_transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        windowed = await client.get("/admin/evaluations/analytics?days=7")
        assert windowed.status_code == 200
        windowed_body = windowed.json()
        assert windowed_body["window_days"] == 7
        assert windowed_body["total_evaluations"] == 1
        assert windowed_body["artifact"] is None
        assert windowed_body["by_artifact"] == [
            {"artifact": "storybook_pages", "count": 1}
        ]

        by_artifact = await client.get(
            "/admin/evaluations/analytics?artifact=story_text"
        )
        assert by_artifact.status_code == 200
        artifact_body = by_artifact.json()
        assert artifact_body["artifact"] == "story_text"
        assert artifact_body["total_evaluations"] == 1
        assert artifact_body["average_score"] == 0.96

        rejected = await client.get("/admin/evaluations/analytics?artifact=image")
        assert rejected.status_code == 422
async def test_admin_evaluation_analytics_requires_admin_auth(db_session):
    """Evaluation analytics answer 401 when the request carries no credentials."""
    asgi_transport = ASGITransport(
        app=_build_admin_auth_required_test_app(db_session)
    )
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        unauthenticated = await client.get("/admin/evaluations/analytics")

    assert unauthenticated.status_code == 401
async def test_admin_generation_job_trace_returns_internal_event_stream(
    db_session,
    test_user,
):
    """The admin trace endpoint returns the unredacted payload and event stream.

    A job is created with internal-looking request fields, then three events
    (``workflow_planned``, ``evaluation_completed``, ``executor_completed``)
    are recorded. The response must echo the payload and event metadata and
    include an ``executor_coverage`` summary for this single job.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="内部链路故事")
    raw_request_payload = {
        "output_mode": "story",
        "type": "keywords",
        "data": "月亮森林",
        "internal_dispatch_token": "admin-visible-token",
        "provider_override": "internal-provider",
        "evaluation_policy": {"threshold": 0.9},
    }
    job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="story",
        input_type="keywords",
        request_payload=raw_request_payload,
        story_id=story.id,
    )

    narrative_task = {
        "key": "generate_narrative",
        "step": "text_generation",
        "artifact": "story_text",
        "required": True,
        "recoverable": False,
    }
    await record_generation_event(
        db_session,
        job=job,
        story_id=story.id,
        event_type="workflow_planned",
        status="succeeded",
        metadata={
            "step": "request_acceptance",
            "artifact": "none",
            "plan": {"mode": "story", "tasks": [narrative_task]},
            "internal_threshold": 0.9,
        },
    )
    await record_generation_event(
        db_session,
        job=job,
        story_id=story.id,
        event_type="evaluation_completed",
        status="succeeded",
        metadata={
            "step": "evaluation",
            "artifact": "story_text",
            "overall_score": 0.94,
            "passed": True,
            "blocking": False,
            "scores": [{"dimension": "structure", "score": 1.0}],
        },
    )
    await record_generation_event(
        db_session,
        job=job,
        story_id=story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata={
            "plan_mode": "asset_generation",
            "planned_task_count": 3,
            "executed_task_count": 1,
            "ignored_task_count": 2,
            "executed_task_keys": ["complete_image_asset"],
            "ignored_task_keys": [
                "start_asset_generation",
                "complete_asset_generation",
            ],
            "result_assets": ["cover_image"],
        },
    )

    asgi_transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        response = await client.get(f"/admin/generations/jobs/{job.id}/trace")

    assert response.status_code == 200
    trace = response.json()
    assert trace["id"] == job.id
    assert trace["user_id"] == test_user.id

    # The request payload comes back with its internal fields intact.
    returned_payload = trace["request_payload"]
    assert returned_payload["data"] == "月亮森林"
    assert returned_payload["internal_dispatch_token"] == "admin-visible-token"
    assert returned_payload["evaluation_policy"] == {"threshold": 0.9}

    events = trace["events"]
    assert [event["event_type"] for event in events] == [
        "request_accepted",
        "workflow_planned",
        "evaluation_completed",
        "executor_completed",
    ]

    workflow_metadata = events[1]["event_metadata"]
    assert workflow_metadata["plan"]["tasks"][0]["key"] == "generate_narrative"
    assert workflow_metadata["internal_threshold"] == 0.9

    evaluation_metadata = events[2]["event_metadata"]
    assert evaluation_metadata["overall_score"] == 0.94
    assert evaluation_metadata["scores"] == [
        {"dimension": "structure", "score": 1.0}
    ]

    executor_metadata = events[3]["event_metadata"]
    assert executor_metadata["executed_task_keys"] == ["complete_image_asset"]
    assert executor_metadata["result_assets"] == ["cover_image"]

    # Per-job coverage summary derived from the single executor event.
    coverage = trace["executor_coverage"]
    expected_coverage_totals = {
        "scope": "admin_internal_job_executor_coverage",
        "total_runs": 1,
        "total_planned_tasks": 3,
        "total_executed_tasks": 1,
        "total_ignored_tasks": 2,
        "coverage_ratio": 0.3333,
        "job_count": 1,
        "story_count": 1,
        "user_count": 1,
    }
    for field, expected in expected_coverage_totals.items():
        assert coverage[field] == expected

    assert coverage["by_plan_mode"] == [
        {"plan_mode": "asset_generation", "count": 1}
    ]
    assert coverage["by_output_mode"] == [
        {"output_mode": "story", "count": 1}
    ]
    assert coverage["executed_task_keys"] == [
        {"task_key": "complete_image_asset", "count": 1}
    ]
    assert coverage["ignored_task_keys"] == [
        {"task_key": "complete_asset_generation", "count": 1},
        {"task_key": "start_asset_generation", "count": 1},
    ]
    assert coverage["result_assets"] == [
        {"asset": "cover_image", "count": 1}
    ]
async def test_admin_generation_job_trace_requires_admin_auth(db_session):
    """The admin trace endpoint answers 401 when no credentials are sent."""
    asgi_transport = ASGITransport(
        app=_build_admin_auth_required_test_app(db_session)
    )
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        unauthenticated = await client.get("/admin/generations/jobs/missing-job/trace")

    assert unauthenticated.status_code == 401
async def test_admin_executor_coverage_aggregates_internal_events(
    db_session,
    test_user,
):
    """Executor coverage aggregates ``executor_completed`` events across jobs.

    Two jobs on the same story (one ``asset_generation``, one ``asset_retry``)
    each get one executor event. The test checks the unfiltered totals, the
    ``plan_mode=asset_retry`` filter, and that ``plan_mode=story`` yields 422.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="执行器覆盖故事")

    generation_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="asset_generation",
        input_type="audio,image",
        request_payload={"story_id": story.id, "assets": ["audio", "image"]},
        story_id=story.id,
    )
    generation_run = {
        "plan_mode": "asset_generation",
        "planned_task_count": 4,
        "executed_task_count": 2,
        "ignored_task_count": 2,
        "executed_task_keys": ["complete_audio_asset", "complete_image_asset"],
        "ignored_task_keys": [
            "start_asset_generation",
            "complete_asset_generation",
        ],
        "result_assets": ["audio", "cover_image"],
    }
    await record_generation_event(
        db_session,
        job=generation_job,
        story_id=story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata=generation_run,
    )

    retry_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="asset_retry",
        input_type="image",
        request_payload={"story_id": story.id, "assets": ["image"]},
        story_id=story.id,
    )
    retry_run = {
        "plan_mode": "asset_retry",
        "planned_task_count": 3,
        "executed_task_count": 1,
        "ignored_task_count": 2,
        "executed_task_keys": ["complete_image_asset"],
        "ignored_task_keys": ["start_asset_retry", "complete_asset_retry"],
        "result_assets": ["cover_image"],
    }
    await record_generation_event(
        db_session,
        job=retry_job,
        story_id=story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata=retry_run,
    )

    asgi_transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        overall = await client.get("/admin/executors/coverage")
        assert overall.status_code == 200
        overall_body = overall.json()

        expected_totals = {
            "scope": "admin_internal_executor_coverage",
            "total_runs": 2,
            "total_planned_tasks": 7,
            "total_executed_tasks": 3,
            "total_ignored_tasks": 4,
            "coverage_ratio": 0.4286,
            "job_count": 2,
            "story_count": 1,
            "user_count": 1,
        }
        for field, expected in expected_totals.items():
            assert overall_body[field] == expected

        assert overall_body["by_plan_mode"] == [
            {"plan_mode": "asset_generation", "count": 1},
            {"plan_mode": "asset_retry", "count": 1},
        ]
        assert overall_body["executed_task_keys"] == [
            {"task_key": "complete_image_asset", "count": 2},
            {"task_key": "complete_audio_asset", "count": 1},
        ]
        assert overall_body["result_assets"] == [
            {"asset": "cover_image", "count": 2},
            {"asset": "audio", "count": 1},
        ]

        retry_only = await client.get("/admin/executors/coverage?plan_mode=asset_retry")
        assert retry_only.status_code == 200
        retry_body = retry_only.json()
        assert retry_body["plan_mode"] == "asset_retry"
        assert retry_body["total_runs"] == 1
        assert retry_body["total_planned_tasks"] == 3
        assert retry_body["total_executed_tasks"] == 1

        rejected = await client.get("/admin/executors/coverage?plan_mode=story")
        assert rejected.status_code == 422
async def test_admin_executor_coverage_requires_admin_auth(db_session):
    """Executor coverage answers 401 when the request carries no credentials."""
    asgi_transport = ASGITransport(
        app=_build_admin_auth_required_test_app(db_session)
    )
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        unauthenticated = await client.get("/admin/executors/coverage")

    assert unauthenticated.status_code == 401
async def test_admin_harness_readiness_returns_ready_when_internal_gates_pass(
    db_session,
    test_user,
):
    """Harness readiness reports ``ready`` with one passing evaluation and one run.

    A passing evaluation event and one ``executor_completed`` event are
    recorded. The response must list every check as ``ready`` and must not
    contain the score reason text or the story title.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="readiness 故事")
    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=story.id,
        output_mode="story",
        artifact="story_text",
        status="succeeded",
        metadata={
            "overall_score": 0.92,
            "passed": True,
            "blocking": False,
            "scores": [
                {"dimension": "structure", "score": 1.0, "reason": "内部 reason"},
                {"dimension": "readability", "score": 0.84, "reason": "内部 reason"},
            ],
            "warnings": [],
        },
    )

    image_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="asset_generation",
        input_type="image",
        request_payload={"story_id": story.id, "assets": ["image"]},
        story_id=story.id,
    )
    executor_run = {
        "plan_mode": "asset_generation",
        "planned_task_count": 3,
        "executed_task_count": 1,
        "ignored_task_count": 2,
        "executed_task_keys": ["complete_image_asset"],
        "ignored_task_keys": [
            "start_asset_generation",
            "complete_asset_generation",
        ],
        "result_assets": ["cover_image"],
    }
    await record_generation_event(
        db_session,
        job=image_job,
        story_id=story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata=executor_run,
    )

    asgi_transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        response = await client.get("/admin/harness/readiness")

    assert response.status_code == 200
    readiness = response.json()
    assert readiness["scope"] == "admin_internal_harness_readiness"
    assert readiness["status"] == "ready"
    assert readiness["thresholds"] == {
        "min_runtime_evaluations": 1,
        "min_executor_runs": 1,
        "min_evaluation_pass_rate": 0.7,
        "min_evaluation_average_score": 0.7,
        "min_executor_coverage_ratio": 0.2,
    }

    status_by_check = {
        check["code"]: check["status"] for check in readiness["checks"]
    }
    assert status_by_check == {
        "golden_replay": "ready",
        "runtime_evaluation_samples": "ready",
        "runtime_evaluation_quality": "ready",
        "executor_coverage_samples": "ready",
        "executor_coverage_ratio": "ready",
    }

    golden_replay = readiness["golden_replay"]
    assert golden_replay["passed"] is True
    assert golden_replay["total_cases"] == 11

    evaluation_summary = readiness["evaluation_analytics"]
    assert evaluation_summary["total_evaluations"] == 1
    assert evaluation_summary["pass_rate"] == 1.0

    coverage_summary = readiness["executor_coverage"]
    assert coverage_summary["total_runs"] == 1
    assert coverage_summary["coverage_ratio"] == 0.3333

    # Neither the score reason nor the story title may appear in the body.
    rendered_body = str(readiness)
    assert "内部 reason" not in rendered_body
    assert "readiness 故事" not in rendered_body
async def test_admin_harness_readiness_blocks_low_runtime_quality(
    db_session,
    test_user,
):
    """Harness readiness reports ``blocked`` for a failing, blocking evaluation.

    Only a zero-score blocking evaluation is recorded and no executor event
    exists. The evaluation quality check must be ``blocked``, both executor
    checks ``needs_attention``, and the body must not contain the quality
    gate message or the story title.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="低质量 readiness")
    missing_text_issue = {
        "code": "missing_story_text",
        "message": "正文缺失",
        "failure_category": "schema_error",
        "field": "story_text",
    }
    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=story.id,
        output_mode="story",
        artifact="story_text",
        status="failed",
        metadata={
            "overall_score": 0.0,
            "passed": False,
            "blocking": True,
            "scores": [{"dimension": "structure", "score": 0.0, "reason": "缺失"}],
            "quality_gate": {"issues": [missing_text_issue]},
            "warnings": [],
        },
    )

    asgi_transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        response = await client.get("/admin/harness/readiness")

    assert response.status_code == 200
    readiness = response.json()
    assert readiness["status"] == "blocked"

    checks_by_code = {check["code"]: check for check in readiness["checks"]}
    expected_statuses = {
        "golden_replay": "ready",
        "runtime_evaluation_samples": "ready",
        "runtime_evaluation_quality": "blocked",
        "executor_coverage_samples": "needs_attention",
        "executor_coverage_ratio": "needs_attention",
    }
    for code, expected_status in expected_statuses.items():
        assert checks_by_code[code]["status"] == expected_status

    assert readiness["evaluation_analytics"]["blocked_evaluations"] == 1
    assert readiness["executor_coverage"]["total_runs"] == 0

    # Neither the gate message nor the story title may appear in the body.
    rendered_body = str(readiness)
    assert "正文缺失" not in rendered_body
    assert "低质量 readiness" not in rendered_body
async def test_admin_harness_readiness_requires_admin_auth(db_session):
    """Harness readiness answers 401 when the request carries no credentials."""
    asgi_transport = ASGITransport(
        app=_build_admin_auth_required_test_app(db_session)
    )
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        unauthenticated = await client.get("/admin/harness/readiness")

    assert unauthenticated.status_code == 401
async def test_admin_provider_analytics_support_days_and_capability_filters(
db_session,
test_user,

View File

@@ -123,14 +123,19 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"evaluation_completed",
"narrative_generated",
"story_saved",
"generation_completed",
]
assert events[2].event_metadata["has_memory_context"] is False
assert events[3].event_metadata["title"] == "小兔子的冒险"
assert events[4].story_id == job.story_id
assert events[2].event_metadata["plan"]["mode"] == "story"
assert events[3].event_metadata["has_memory_context"] is False
assert events[4].event_metadata["passed"] is True
assert events[4].event_metadata["overall_score"] >= 0.7
assert events[5].event_metadata["title"] == "小兔子的冒险"
assert events[6].story_id == job.story_id
detail_response = await client.get(f"/api/generations/jobs/{job.id}")
assert detail_response.status_code == 200
@@ -143,11 +148,16 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event
assert [event["event_type"] for event in detail["events"]] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"narrative_generated",
"story_saved",
"generation_completed",
]
assert all(
event["event_type"] != "evaluation_completed"
for event in detail["events"]
)
story_response = await client.get(f"/api/generations/{job.story_id}")
assert story_response.status_code == 200
@@ -161,6 +171,13 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event
assert [item["id"] for item in job_list] == [job.id]
assert job_list[0]["progress_percent"] == 100
assert job_list[0]["is_terminal"] is True
trace_response = await client.get(
f"/api/generations/{job.story_id}/trace-summary"
)
assert trace_response.status_code == 200
trace = trace_response.json()
assert "evaluation" not in trace
finally:
app.dependency_overrides.clear()
@@ -220,13 +237,88 @@ async def test_generation_worker_records_quality_gate_failure_without_persisting
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"quality_gate_failed",
"evaluation_completed",
"generation_failed",
]
quality_event = events[3]
quality_event = events[4]
assert quality_event.event_metadata["step"] == "narrative_generation"
assert quality_event.event_metadata["issues"][0]["code"] == "missing_story_text"
evaluation_event = events[5]
assert evaluation_event.event_metadata["step"] == "evaluation"
assert evaluation_event.event_metadata["passed"] is False
assert evaluation_event.event_metadata["blocking"] is True
async def test_story_with_images_worker_records_plan_before_assets(
    db_session,
    test_user,
    mock_text_provider,
    mock_image_provider,
):
    """Run a story job with ``generate_images`` enabled and inspect its trail.

    Verifies that the job finishes as completed with a ready image, that the
    recorded event types appear in the expected order, that the plan stored
    on the ``workflow_planned`` event lists the story-with-assets tasks with
    an optional, recoverable cover-image task, and that the text and image
    provider fixtures were each called exactly once.
    """
    request_payload = {
        "output_mode": "story",
        "type": "keywords",
        "data": "小兔子, 森林",
        "generate_images": True,
    }
    job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="story",
        input_type="keywords",
        request_payload=request_payload,
    )

    await run_generation_job_service(job.id, db_session)

    job_lookup = select(GenerationJob).where(GenerationJob.id == job.id)
    stored_job = (await db_session.execute(job_lookup)).scalar_one()
    assert stored_job.story_id is not None
    assert stored_job.status == "completed"
    assert stored_job.current_step == "generation_completed"
    assert stored_job.result_snapshot["image_status"] == "ready"

    event_lookup = (
        select(GenerationJobEvent)
        .where(GenerationJobEvent.job_id == job.id)
        .order_by(GenerationJobEvent.id)
    )
    recorded = (await db_session.execute(event_lookup)).scalars().all()
    expected_event_types = [
        "request_accepted",
        "worker_started",
        "workflow_planned",
        "context_prepared",
        "evaluation_completed",
        "narrative_generated",
        "story_saved",
        "cover_image_started",
        "cover_image_succeeded",
        "generation_completed",
    ]
    assert [entry.event_type for entry in recorded] == expected_event_types

    # Index 2 is the workflow_planned event in the sequence asserted above.
    plan_snapshot = recorded[2].event_metadata["plan"]
    assert plan_snapshot["mode"] == "story_with_assets"
    expected_task_keys = [
        "prepare_context",
        "generate_narrative",
        "evaluate_narrative",
        "persist_story",
        "generate_cover_image",
        "queue_postprocessing",
        "complete_generation",
    ]
    assert [item["key"] for item in plan_snapshot["tasks"]] == expected_task_keys

    cover_entry = next(
        item
        for item in plan_snapshot["tasks"]
        if item["key"] == "generate_cover_image"
    )
    assert cover_entry["required"] is False
    assert cover_entry["recoverable"] is True

    # Index 4 is evaluation_completed; index 8 is cover_image_succeeded.
    assert recorded[4].event_metadata["passed"] is True
    assert recorded[8].event_metadata["asset"] == "cover_image"

    mock_text_provider.assert_called_once()
    mock_image_provider.assert_called_once()
async def test_asset_retry_records_job_events_and_updates_retryable_assets(
@@ -279,12 +371,30 @@ async def test_asset_retry_records_job_events_and_updates_retryable_assets(
).scalars().all()
assert [event.event_type for event in events] == [
"request_accepted",
"workflow_planned",
"asset_retry_started",
"cover_image_started",
"cover_image_succeeded",
"executor_completed",
"asset_retry_completed",
]
assert events[3].event_metadata["asset"] == "cover_image"
plan = events[1].event_metadata["plan"]
assert plan["mode"] == "asset_retry"
assert [task["key"] for task in plan["tasks"]] == [
"start_asset_retry",
"complete_image_asset",
"complete_asset_retry",
]
image_task = next(
task for task in plan["tasks"] if task["key"] == "complete_image_asset"
)
assert image_task["required"] is False
assert image_task["recoverable"] is True
assert events[4].event_metadata["asset"] == "cover_image"
assert events[5].event_metadata["plan_mode"] == "asset_retry"
assert events[5].event_metadata["executed_task_keys"] == [
"complete_image_asset"
]
finally:
app.dependency_overrides.clear()
@@ -365,10 +475,110 @@ async def test_asset_generation_job_worker_completes_cover_image(
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"cover_image_started",
"cover_image_succeeded",
"executor_completed",
"asset_generation_completed",
]
plan = events[2].event_metadata["plan"]
assert plan["mode"] == "asset_generation"
assert [task["key"] for task in plan["tasks"]] == [
"start_asset_generation",
"complete_image_asset",
"complete_asset_generation",
]
image_task = next(
task for task in plan["tasks"] if task["key"] == "complete_image_asset"
)
assert image_task["required"] is False
assert image_task["recoverable"] is True
executor_event = events[5]
assert executor_event.event_metadata["plan_mode"] == "asset_generation"
assert executor_event.event_metadata["executed_task_keys"] == [
"complete_image_asset"
]
assert executor_event.event_metadata["ignored_task_keys"] == [
"start_asset_generation",
"complete_asset_generation",
]
assert executor_event.event_metadata["result_assets"] == ["cover_image"]
async def test_asset_generation_job_worker_executes_assets_in_plan_order(
    db_session,
    test_story,
    mock_tts_provider,
):
    """Run an audio+image asset job and check assets complete in plan order.

    The job requests ``["audio", "image"]``; the test asserts that the audio
    events are recorded before the cover-image events, that the plan snapshot
    and the ``executor_completed`` metadata list the tasks in that same order,
    and that the story row ends up with the patched image URL and ready audio.
    """
    job = await create_generation_job(
        db_session,
        user_id=test_story.user_id,
        output_mode="asset_generation",
        input_type="audio,image",
        request_payload={"story_id": test_story.id, "assets": ["audio", "image"]},
        story_id=test_story.id,
    )

    # Image generation is patched so the worker never reaches a real provider.
    with patch(
        "app.services.story_service.generate_image",
        new_callable=AsyncMock,
    ) as mock_generate_image:
        mock_generate_image.return_value = "https://example.com/plan-cover.png"
        await run_generation_job_service(job.id, db_session)

    job_lookup = select(GenerationJob).where(GenerationJob.id == job.id)
    stored_job = (await db_session.execute(job_lookup)).scalar_one()
    assert stored_job.status == "completed"
    assert stored_job.current_step == "asset_generation_completed"
    assert stored_job.result_snapshot["image_status"] == "ready"
    assert stored_job.result_snapshot["audio_status"] == "ready"

    story_lookup = select(Story).where(Story.id == test_story.id)
    story = (await db_session.execute(story_lookup)).scalar_one()
    assert story.image_url == "https://example.com/plan-cover.png"
    assert story.audio_status == "ready"
    assert story.audio_path is not None

    event_lookup = (
        select(GenerationJobEvent)
        .where(GenerationJobEvent.job_id == job.id)
        .order_by(GenerationJobEvent.id)
    )
    recorded = (await db_session.execute(event_lookup)).scalars().all()
    expected_event_types = [
        "request_accepted",
        "worker_started",
        "workflow_planned",
        "audio_started",
        "audio_succeeded",
        "cover_image_started",
        "cover_image_succeeded",
        "executor_completed",
        "asset_generation_completed",
    ]
    assert [entry.event_type for entry in recorded] == expected_event_types

    # Index 2 is the workflow_planned event in the sequence asserted above.
    plan_snapshot = recorded[2].event_metadata["plan"]
    assert plan_snapshot["mode"] == "asset_generation"
    expected_task_keys = [
        "start_asset_generation",
        "complete_audio_asset",
        "complete_image_asset",
        "complete_asset_generation",
    ]
    assert [item["key"] for item in plan_snapshot["tasks"]] == expected_task_keys

    # Indexes 4 and 6 are the two *_succeeded events; index 7 is the executor.
    assert recorded[4].event_metadata["asset"] == "audio"
    assert recorded[6].event_metadata["asset"] == "cover_image"
    executor_metadata = recorded[7].event_metadata
    assert executor_metadata["executed_task_keys"] == [
        "complete_audio_asset",
        "complete_image_asset",
    ]
    assert executor_metadata["result_assets"] == ["audio", "cover_image"]

    mock_tts_provider.assert_awaited_once()
    mock_generate_image.assert_awaited_once()
async def test_cancel_queued_asset_generation_job_marks_it_canceled(
@@ -538,7 +748,9 @@ async def test_storybook_generation_is_queued_then_worker_records_page_image_eve
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"evaluation_completed",
"narrative_generated",
"storybook_images_started",
"storybook_cover_image_succeeded",
@@ -548,13 +760,45 @@ async def test_storybook_generation_is_queued_then_worker_records_page_image_eve
"story_saved",
"generation_completed",
]
plan = events[2].event_metadata["plan"]
assert plan["mode"] == "storybook"
assert [task["key"] for task in plan["tasks"]] == [
"prepare_context",
"generate_storybook_pages",
"evaluate_storybook_pages",
"generate_storybook_images",
"persist_storybook",
"queue_postprocessing",
"complete_generation",
]
image_task = next(
task
for task in plan["tasks"]
if task["key"] == "generate_storybook_images"
)
assert image_task["required"] is False
assert image_task["recoverable"] is True
assert events[4].event_metadata["passed"] is True
assert events[4].event_metadata["artifact"] == "storybook_pages"
page_events = [
event
for event in events
if event.event_type == "storybook_page_image_succeeded"
]
assert [event.event_metadata["page_number"] for event in page_events] == [1, 2]
assert events[8].event_metadata["completed_pages"] == [1, 2]
assert events[10].event_metadata["completed_pages"] == [1, 2]
async with AsyncClient(transport=transport, base_url="http://test") as client:
client.cookies.set("access_token", auth_token)
detail_response = await client.get(
f"/api/generations/jobs/{job.id}"
)
assert detail_response.status_code == 200
detail = detail_response.json()
assert "evaluation_completed" not in [
event["event_type"] for event in detail["events"]
]
finally:
app.dependency_overrides.clear()
@@ -716,6 +960,414 @@ async def test_story_provider_stats_aggregate_job_events(
app.dependency_overrides.clear()
async def test_story_trace_summary_aggregates_steps_artifacts_and_failure_categories(
    db_session,
    auth_token,
    degraded_story_with_text,
):
    """Check the story trace-summary endpoint aggregates recorded events.

    Records a cover-image start, a cover-image failure, a quality-gate
    failure and an ``evaluation_completed`` event on an asset-retry job, then
    asserts that the summary counts events by step, artifact and failure
    category, and that no evaluation data (``evaluation`` key or
    ``overall_score``) appears in the response.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # All setup runs inside the try block so the dependency override is
    # cleared even when job or event creation fails; otherwise it would leak
    # into later tests.
    try:
        job = await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_retry",
            input_type="image",
            request_payload={"assets": ["image"]},
            story_id=degraded_story_with_text.id,
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="cover_image_started",
            status="running",
            metadata={
                "step": "image_generation",
                "artifact": "cover_image",
                "failure_category": None,
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="cover_image_failed",
            status="failed",
            metadata={
                "step": "image_generation",
                "artifact": "cover_image",
                "failure_category": "provider_error",
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="quality_gate_failed",
            status="failed",
            metadata={
                "step": "narrative_generation",
                "artifact": "story_text",
                "failure_category": "schema_error",
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="evaluation_completed",
            status="failed",
            metadata={
                "step": "evaluation",
                "artifact": "story_text",
                "failure_category": "schema_error",
                "overall_score": 0.0,
                "passed": False,
                "blocking": True,
                "scores": [
                    {
                        "dimension": "structure",
                        "score": 0.0,
                        "reason": "故事结构未通过质量门。",
                    },
                    {
                        "dimension": "safety",
                        "score": 0.0,
                        "reason": "内容未通过儿童安全或结构完整性检查。",
                    },
                ],
            },
        )

        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)
            response = await client.get(
                f"/api/generations/{degraded_story_with_text.id}/trace-summary"
            )

        assert response.status_code == 200
        data = response.json()
        assert data["story_id"] == degraded_story_with_text.id
        assert data["total_events"] == 4
        assert data["failed_events"] == 2
        assert data["by_step"] == [
            {"name": "image_generation", "count": 2},
            {"name": "narrative_generation", "count": 1},
        ]
        assert data["by_artifact"] == [
            {"name": "cover_image", "count": 2},
            {"name": "story_text", "count": 1},
        ]
        assert data["failure_categories"] == [
            {"name": "provider_error", "count": 1},
            {"name": "schema_error", "count": 1},
        ]
        # Evaluation details must not appear in the user-facing summary.
        assert "evaluation" not in data
        assert "overall_score" not in str(data)
    finally:
        app.dependency_overrides.clear()
async def test_user_generation_job_detail_hides_internal_evaluation_step(
    db_session,
    auth_token,
    test_user,
):
    """Check the job detail endpoint hides an ``evaluation_completed`` event.

    After recording a successful evaluation event on a fresh story job, the
    detail response is expected to report ``narrative_generated`` as the
    current step, list only the ``request_accepted`` event, and contain
    neither the event type nor its ``overall_score`` anywhere in the payload.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # Setup runs inside the try block so the dependency override is cleared
    # even when job or event creation fails; otherwise it would leak into
    # later tests.
    try:
        transport = ASGITransport(app=app)
        job = await create_generation_job(
            db_session,
            user_id=test_user.id,
            output_mode="story",
            input_type="keywords",
            request_payload={
                "output_mode": "story",
                "type": "keywords",
                "data": "小兔子",
                "generate_images": False,
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            event_type="evaluation_completed",
            status="succeeded",
            metadata={
                "step": "evaluation",
                "artifact": "story_text",
                "overall_score": 0.96,
                "passed": True,
                "blocking": False,
                "scores": [
                    {"dimension": "structure", "score": 1.0, "reason": "完整。"},
                ],
            },
        )

        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)
            response = await client.get(f"/api/generations/jobs/{job.id}")

        assert response.status_code == 200
        data = response.json()
        assert data["current_step"] == "narrative_generated"
        assert data["progress_label"] == "正文已生成"
        assert [event["event_type"] for event in data["events"]] == [
            "request_accepted"
        ]
        assert "evaluation_completed" not in str(data)
        assert "overall_score" not in str(data)
    finally:
        app.dependency_overrides.clear()
async def test_user_generation_job_detail_sanitizes_request_payload(
    db_session,
    auth_token,
    test_user,
):
    """Check the job detail endpoint returns only the expected payload keys.

    The job is created with raw keywords, an education theme and several
    internal fields; the response's ``request_payload`` must equal the seven
    keys asserted below and must not contain any of the other values.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # Setup runs inside the try block so the dependency override is cleared
    # even when job creation fails; otherwise it would leak into later tests.
    try:
        transport = ASGITransport(app=app)
        job = await create_generation_job(
            db_session,
            user_id=test_user.id,
            output_mode="story",
            input_type="keywords",
            request_payload={
                "output_mode": "story",
                "input_type": "keywords",
                "type": "keywords",
                "data": "不要回传原始关键词",
                "education_theme": "勇气",
                "generate_images": True,
                "page_count": 6,
                "child_profile_id": "child-public-id",
                "universe_id": "universe-public-id",
                "internal_dispatch_token": "secret-dispatch-token",
                "provider_override": "internal-provider",
                "evaluation_policy": {"threshold": 0.9},
            },
        )

        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)
            response = await client.get(f"/api/generations/jobs/{job.id}")

        assert response.status_code == 200
        data = response.json()
        assert data["request_payload"] == {
            "child_profile_id": "child-public-id",
            "generate_images": True,
            "input_type": "keywords",
            "output_mode": "story",
            "page_count": 6,
            "type": "keywords",
            "universe_id": "universe-public-id",
        }
        # Belt-and-braces: none of the stripped values may survive in any form.
        payload_dump = str(data["request_payload"])
        assert "不要回传原始关键词" not in payload_dump
        assert "education_theme" not in payload_dump
        assert "secret-dispatch-token" not in payload_dump
        assert "internal-provider" not in payload_dump
        assert "evaluation_policy" not in payload_dump
    finally:
        app.dependency_overrides.clear()
async def test_user_generation_job_detail_sanitizes_public_event_metadata(
    db_session,
    auth_token,
    degraded_story_with_text,
):
    """Check the job detail endpoint reduces event metadata to a public view.

    Records ``workflow_planned``, ``asset_generation_completed`` and
    ``executor_completed`` events carrying internal fields, then asserts that
    the workflow event exposes only summary counts, the completion event
    exposes only ``assets``, and that the plan tasks, thresholds, result
    snapshot, error text and executor event are absent from the response.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # Setup runs inside the try block so the dependency override is cleared
    # even when job or event creation fails; otherwise it would leak into
    # later tests.
    try:
        transport = ASGITransport(app=app)
        job = await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_generation",
            input_type="image",
            request_payload={
                "story_id": degraded_story_with_text.id,
                "assets": ["image"],
            },
            story_id=degraded_story_with_text.id,
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="workflow_planned",
            status="succeeded",
            metadata={
                "step": "request_acceptance",
                "artifact": "none",
                "plan": {
                    "mode": "asset_generation",
                    "tasks": [
                        {
                            "key": "complete_image_asset",
                            "step": "image_generation",
                            "artifact": "image",
                            "required": False,
                            "recoverable": True,
                        }
                    ],
                },
                "internal_threshold": 0.72,
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="asset_generation_completed",
            status="completed",
            metadata={
                "assets": ["image"],
                "result_snapshot": {
                    "story_id": degraded_story_with_text.id,
                    "last_error": "internal provider detail",
                },
                "error": "internal provider detail",
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="executor_completed",
            status="succeeded",
            metadata={
                "plan_mode": "asset_generation",
                "planned_task_count": 3,
                "executed_task_keys": ["complete_image_asset"],
                "ignored_task_keys": [
                    "start_asset_generation",
                    "complete_asset_generation",
                ],
                "result_assets": ["cover_image"],
            },
        )

        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)
            response = await client.get(f"/api/generations/jobs/{job.id}")

        assert response.status_code == 200
        data = response.json()
        workflow_event = next(
            event
            for event in data["events"]
            if event["event_type"] == "workflow_planned"
        )
        assert workflow_event["event_metadata"] == {
            "artifact": "none",
            "plan_mode": "asset_generation",
            "planned_task_count": 1,
            "recoverable_task_count": 1,
            "step": "request_acceptance",
        }
        completion_event = next(
            event
            for event in data["events"]
            if event["event_type"] == "asset_generation_completed"
        )
        assert completion_event["event_metadata"] == {"assets": ["image"]}
        assert "plan" not in workflow_event["event_metadata"]
        assert "tasks" not in str(data["events"])
        assert "internal_threshold" not in str(data["events"])
        assert "result_snapshot" not in str(data["events"])
        assert "internal provider detail" not in str(data["events"])
        assert "executor_completed" not in str(data["events"])
        assert "complete_image_asset" not in str(data["events"])
    finally:
        app.dependency_overrides.clear()
async def test_user_generation_job_summary_hides_internal_executor_step(
    db_session,
    auth_token,
    degraded_story_with_text,
):
    """Check an ``executor_completed`` event is hidden from user endpoints.

    After recording the executor event on an asset-generation job, the job
    detail, the per-story job list and the trace summary are each requested;
    the detail and list entries must report ``workflow_planned`` as the
    current step, and neither the detail nor the trace summary may mention
    the executor event or its task keys.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # Setup runs inside the try block so the dependency override is cleared
    # even when job or event creation fails; otherwise it would leak into
    # later tests.
    try:
        transport = ASGITransport(app=app)
        job = await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_generation",
            input_type="image",
            request_payload={
                "story_id": degraded_story_with_text.id,
                "assets": ["image"],
            },
            story_id=degraded_story_with_text.id,
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="executor_completed",
            status="succeeded",
            metadata={
                "plan_mode": "asset_generation",
                "executed_task_keys": ["complete_image_asset"],
            },
        )

        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)
            detail_response = await client.get(f"/api/generations/jobs/{job.id}")
            list_response = await client.get(
                f"/api/generations/{degraded_story_with_text.id}/jobs"
            )
            trace_summary_response = await client.get(
                f"/api/generations/{degraded_story_with_text.id}/trace-summary"
            )

        assert detail_response.status_code == 200
        detail = detail_response.json()
        assert detail["current_step"] == "workflow_planned"
        assert detail["progress_label"] == "工作流已规划"
        assert "executor_completed" not in str(detail)
        assert "complete_image_asset" not in str(detail)

        assert list_response.status_code == 200
        listed_job = next(
            item for item in list_response.json() if item["id"] == job.id
        )
        assert listed_job["current_step"] == "workflow_planned"
        assert listed_job["progress_label"] == "工作流已规划"

        assert trace_summary_response.status_code == 200
        trace_summary = trace_summary_response.json()
        assert "executor_completed" not in str(trace_summary)
        assert "complete_image_asset" not in str(trace_summary)
        assert trace_summary["total_events"] == 1
    finally:
        app.dependency_overrides.clear()
async def test_user_provider_analytics_aggregate_across_stories(
db_session,
auth_token,

View File

@@ -1,5 +1,7 @@
"""Tests for generation harness runtime support."""
from pathlib import Path
import pytest
from sqlalchemy import select
@@ -7,8 +9,21 @@ from app.db.models import GenerationJob, GenerationJobEvent
from app.services.adapters.storybook.primary import Storybook, StorybookPage
from app.services.adapters.text.models import StoryOutput
from app.services.generation_jobs import create_generation_job, record_generation_event
from app.services.harness.artifacts import AssetCompletionResult
from app.services.harness.control import ExecutionControl, GenerationJobCanceledError
from app.services.harness.evaluation_replay import (
EvaluationReplayArtifact,
EvaluationReplayCase,
ExpectedEvaluation,
replay_evaluation_golden_cases,
run_evaluation_replay_cases,
)
from app.services.harness.evaluators import evaluate_story_output, evaluate_storybook_output
from app.services.harness.executor import run_asset_plan
from app.services.harness.plans import (
WorkflowMode,
WorkflowPlan,
WorkflowTask,
build_asset_plan,
build_story_plan,
build_storybook_plan,
@@ -27,12 +42,18 @@ from app.services.harness.types import (
normalize_trace_metadata,
step_for_event,
)
from app.services.story_status import StoryAssetStatus
# Harness fixture directory, resolved from the directory one level above the
# one containing this test module.
FIXTURES_DIR = Path(__file__).parents[1].joinpath(
    "app", "services", "harness", "fixtures"
)
def test_event_type_maps_to_standard_workflow_step():
assert step_for_event("request_accepted") == WorkflowStep.REQUEST_ACCEPTANCE
assert step_for_event("context_prepared") == WorkflowStep.CONTEXT_PREPARATION
assert step_for_event("narrative_generated") == WorkflowStep.NARRATIVE_GENERATION
assert step_for_event("evaluation_completed") == WorkflowStep.EVALUATION
assert step_for_event("story_saved") == WorkflowStep.STORY_PERSISTENCE
assert step_for_event("provider_call_succeeded") == WorkflowStep.PROVIDER_INVOCATION
assert step_for_event("quality_gate_failed") == WorkflowStep.NARRATIVE_GENERATION
@@ -46,6 +67,7 @@ def test_event_type_maps_to_standard_workflow_step():
def test_event_type_maps_to_standard_artifact():
assert artifact_for_event("narrative_generated") == ArtifactKind.STORY_TEXT
assert artifact_for_event("quality_gate_failed") == ArtifactKind.STORY_TEXT
assert artifact_for_event("evaluation_completed") == ArtifactKind.STORY_TEXT
assert artifact_for_event("cover_image_succeeded") == ArtifactKind.COVER_IMAGE
assert artifact_for_event("storybook_page_image_failed") == ArtifactKind.PAGE_IMAGE
assert artifact_for_event("audio_cache_hit") == ArtifactKind.AUDIO
@@ -108,6 +130,13 @@ def test_story_plan_without_assets_snapshot():
"required": True,
"recoverable": False,
},
{
"key": "evaluate_narrative",
"step": "evaluation",
"artifact": "story_text",
"required": True,
"recoverable": False,
},
{
"key": "persist_story",
"step": "story_persistence",
@@ -137,7 +166,7 @@ def test_story_plan_with_assets_marks_cover_recoverable():
plan = build_story_plan(generate_images=True).to_snapshot()
assert plan["mode"] == "story_with_assets"
assert plan["tasks"][3] == {
assert plan["tasks"][4] == {
"key": "generate_cover_image",
"step": "image_generation",
"artifact": "cover_image",
@@ -153,13 +182,14 @@ def test_storybook_plan_with_images_marks_storybook_images_recoverable():
assert [task["key"] for task in plan["tasks"]] == [
"prepare_context",
"generate_storybook_pages",
"evaluate_storybook_pages",
"generate_storybook_images",
"persist_storybook",
"queue_postprocessing",
"complete_generation",
]
assert plan["tasks"][2]["artifact"] == "image"
assert plan["tasks"][2]["recoverable"] is True
assert plan["tasks"][3]["artifact"] == "image"
assert plan["tasks"][3]["recoverable"] is True
def test_asset_retry_plan_deduplicates_assets():
@@ -200,6 +230,86 @@ def test_asset_retry_plan_deduplicates_assets():
}
@pytest.mark.asyncio
async def test_run_asset_plan_executes_asset_tasks_in_plan_order():
    """Check ``run_asset_plan`` calls the asset callbacks in plan order.

    The plan is built for ``["audio", "image"]``, so the audio callback must
    run before the image callback, and the start/complete bookkeeping tasks
    must be reported as ignored rather than executed.
    """
    invocation_order: list[str] = []

    async def fake_image_task() -> AssetCompletionResult:
        invocation_order.append("image")
        return AssetCompletionResult(
            asset="cover_image",
            status=StoryAssetStatus.READY,
            value="https://example.com/cover.png",
        )

    async def fake_audio_task() -> AssetCompletionResult:
        invocation_order.append("audio")
        return AssetCompletionResult(
            asset="audio",
            status=StoryAssetStatus.READY,
            value=b"audio",
        )

    plan = build_asset_plan(
        output_mode="asset_generation",
        assets=["audio", "image"],
    )
    outcome = await run_asset_plan(
        plan,
        image_task=fake_image_task,
        audio_task=fake_audio_task,
    )

    assert invocation_order == ["audio", "image"]
    assert outcome.executed_task_keys == (
        "complete_audio_asset",
        "complete_image_asset",
    )
    assert outcome.ignored_task_keys == (
        "start_asset_generation",
        "complete_asset_generation",
    )
    produced_assets = [entry.asset for entry in outcome.task_results]
    assert produced_assets == ["audio", "cover_image"]
@pytest.mark.asyncio
async def test_run_asset_plan_ignores_unknown_non_asset_tasks():
    """Check ``run_asset_plan`` skips tasks it has no callback for.

    The hand-built retry plan contains only bookkeeping tasks and an unknown
    ``complete_video_asset`` task; nothing should be executed, the supplied
    image callback must never run, and every task key is reported as ignored.
    """
    invoked: list[str] = []

    async def fake_image_task() -> AssetCompletionResult:
        invoked.append("image")
        return AssetCompletionResult(
            asset="cover_image",
            status=StoryAssetStatus.READY,
        )

    retry_start = WorkflowTask(
        key="start_asset_retry",
        step=WorkflowStep.ASSET_RETRY,
        artifact=ArtifactKind.NONE,
    )
    unknown_video = WorkflowTask(
        key="complete_video_asset",
        step=WorkflowStep.UNKNOWN,
        artifact=ArtifactKind.UNKNOWN,
        required=False,
        recoverable=True,
    )
    retry_finish = WorkflowTask(
        key="complete_asset_retry",
        step=WorkflowStep.ASSET_RETRY,
        artifact=ArtifactKind.NONE,
    )
    plan = WorkflowPlan(
        mode=WorkflowMode.ASSET_RETRY,
        tasks=(retry_start, unknown_video, retry_finish),
    )

    outcome = await run_asset_plan(plan, image_task=fake_image_task)

    assert invoked == []
    assert outcome.task_results == ()
    assert outcome.executed_task_keys == ()
    assert outcome.ignored_task_keys == (
        "start_asset_retry",
        "complete_video_asset",
        "complete_asset_retry",
    )
def test_story_quality_gate_accepts_complete_child_safe_story():
validate_story_output(
StoryOutput(
@@ -211,6 +321,166 @@ def test_story_quality_gate_accepts_complete_child_safe_story():
)
def test_story_evaluator_scores_complete_child_safe_story():
    """A complete story evaluated with a theme passes without blocking.

    The overall score must be at least 0.9 and the first entry of the
    metadata ``scores`` list must be the ``structure`` dimension.
    """
    story = StoryOutput(
        mode="generated",
        title="小兔子的月光花园",
        story_text="小兔子在花园里学会了和朋友轮流分享水壶,也学会了复盘今天的努力。",
        cover_prompt_suggestion="A gentle moonlit garden with a rabbit",
    )

    evaluation = evaluate_story_output(story, education_theme="复盘")

    assert evaluation.passed is True
    assert evaluation.blocking is False
    assert evaluation.overall_score >= 0.9
    first_score = evaluation.to_metadata()["scores"][0]
    assert first_score["dimension"] == "structure"
def test_story_evaluator_blocks_quality_gate_failure():
    """A story with empty text is evaluated as a blocking failure.

    The result must carry a gate error, score 0.0 overall, and report
    ``missing_story_text`` as the first quality-gate issue in its metadata.
    """
    empty_story = StoryOutput(
        mode="generated",
        title="空白故事",
        story_text="",
        cover_prompt_suggestion="A cover",
    )

    evaluation = evaluate_story_output(empty_story)

    assert evaluation.passed is False
    assert evaluation.blocking is True
    assert evaluation.overall_score == 0.0
    assert evaluation.gate_error is not None
    gate_issues = evaluation.to_metadata()["quality_gate"]["issues"]
    assert gate_issues[0]["code"] == "missing_story_text"
def test_storybook_evaluator_scores_complete_child_safe_storybook():
    """A two-page storybook evaluated with a theme passes without blocking.

    The overall score must be at least 0.9.
    """
    first_page = StorybookPage(
        page_number=1,
        text="露露在森林里发现一颗会提醒她复盘的小星星。",
        image_prompt="Lulu finds a star",
    )
    second_page = StorybookPage(
        page_number=2,
        text="她回想今天的努力,学会下次先和朋友商量。",
        image_prompt="Lulu thinking with friends",
    )
    storybook = Storybook(
        title="森林里的复盘星星",
        main_character="小兔子露露",
        art_style="温暖水彩",
        cover_prompt="A warm watercolor forest cover",
        pages=[first_page, second_page],
    )

    evaluation = evaluate_storybook_output(storybook, education_theme="复盘")

    assert evaluation.passed is True
    assert evaluation.blocking is False
    assert evaluation.overall_score >= 0.9
def test_storybook_evaluator_blocks_quality_gate_failure():
    """A storybook whose two pages share page number 1 is blocked.

    The result must carry a gate error and report
    ``invalid_storybook_page_number`` as the first quality-gate issue.
    """
    # Both pages deliberately use page_number=1 to trigger the gate.
    duplicated_pages = [
        StorybookPage(page_number=1, text="第一页。", image_prompt="page 1"),
        StorybookPage(page_number=1, text="第二页。", image_prompt="page 2"),
    ]
    storybook = Storybook(
        title="森林绘本",
        main_character="小兔子",
        art_style="水彩",
        cover_prompt="A forest cover",
        pages=duplicated_pages,
    )

    evaluation = evaluate_storybook_output(storybook)

    assert evaluation.passed is False
    assert evaluation.blocking is True
    assert evaluation.gate_error is not None
    gate_issues = evaluation.to_metadata()["quality_gate"]["issues"]
    assert gate_issues[0]["code"] == "invalid_storybook_page_number"
def test_evaluation_golden_cases_replay_successfully():
    """Replay the golden-case fixture and expect every case to pass.

    The fixture must yield 11 cases with no failed ids, and between them the
    cases must cover exactly the story and storybook artifacts.
    """
    golden_path = FIXTURES_DIR / "evaluation_golden_cases.json"

    replay = replay_evaluation_golden_cases(golden_path)

    # The failure report is attached so a regression explains itself.
    assert replay.passed is True, replay.failure_report()
    assert replay.failed_case_ids == ()
    assert len(replay.cases) == 11
    covered_artifacts = {entry.artifact for entry in replay.cases}
    assert covered_artifacts == {
        EvaluationReplayArtifact.STORY,
        EvaluationReplayArtifact.STORYBOOK,
    }
def test_evaluation_golden_cases_report_internal_coverage_summary():
    """Pin the coverage summary computed from the golden-case fixture.

    Checks the per-artifact, per-age-band, per-risk-area and per-outcome
    counts exactly, plus four individual tag counts.
    """
    replay = replay_evaluation_golden_cases(
        FIXTURES_DIR / "evaluation_golden_cases.json"
    )
    coverage = replay.coverage_summary()

    expected_artifacts = {"storybook": 5, "story": 6}
    expected_age_bands = {"3-4": 4, "5-6": 4, "unknown": 2, "7-8": 1}
    expected_risk_areas = {
        "schema_error": 4,
        "happy_path": 2,
        "readability_warning": 2,
        "safety_error": 2,
        "length_boundary": 1,
    }
    expected_outcomes = {"blocked": 8, "passed": 3}

    assert coverage["artifact"] == expected_artifacts
    assert coverage["age_band"] == expected_age_bands
    assert coverage["risk_area"] == expected_risk_areas
    assert coverage["outcome"] == expected_outcomes

    tag_counts = coverage["tags"]
    assert tag_counts["story"] == 6
    assert tag_counts["storybook"] == 5
    assert tag_counts["blocking"] == 6
    assert tag_counts["threshold_block"] == 2
def test_evaluation_replay_reports_expectation_mismatch():
    """A case whose expectation is not met must be reported as failed.

    The single case demands ``min_overall_score=0.99``; the replay is
    expected to fail, list the case id, and mention the score expectation in
    its failure report.
    """
    story_payload = {
        "mode": "generated",
        "title": "小兔子的花园",
        "story_text": "小兔子学会了和朋友分享水壶。",
        "cover_prompt_suggestion": "A rabbit sharing a watering can",
    }
    expectation = ExpectedEvaluation(
        passed=True,
        blocking=False,
        min_overall_score=0.99,
    )
    mismatching_case = EvaluationReplayCase(
        case_id="expectation-mismatch",
        artifact=EvaluationReplayArtifact.STORY,
        input_payload={"keywords": "小兔子"},
        output_payload=story_payload,
        expected=expectation,
    )

    replay = run_evaluation_replay_cases([mismatching_case])

    assert replay.passed is False
    assert replay.failed_case_ids == ("expectation-mismatch",)
    assert "expected overall_score >=" in replay.failure_report()
def test_story_quality_gate_rejects_missing_story_text():
output = StoryOutput(
mode="generated",