Expand generation harness observability
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
from typing import Literal
|
||||
from datetime import datetime
|
||||
from typing import Any, Literal
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
@@ -9,6 +10,10 @@ from app.core.admin_auth import admin_guard
|
||||
from app.db.admin_models import Provider
|
||||
from app.db.database import get_db
|
||||
from app.services.adapters.registry import AdapterRegistry
|
||||
from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics
|
||||
from app.services.admin_executor_coverage import get_admin_executor_coverage
|
||||
from app.services.admin_generation_trace import get_admin_generation_job_trace
|
||||
from app.services.admin_harness_readiness import get_admin_harness_readiness
|
||||
from app.services.admin_provider_analytics import get_admin_provider_analytics
|
||||
from app.services.cost_tracker import cost_tracker
|
||||
from app.services.provider_policy import DEFAULT_PROVIDERS, list_capability_policies
|
||||
@@ -103,6 +108,169 @@ class ProviderAnalyticsResponse(BaseModel):
|
||||
by_user: list[ProviderAnalyticsUserBucket]
|
||||
failure_reasons: list[ProviderAnalyticsFailureReason]
|
||||
|
||||
|
||||
class EvaluationAnalyticsArtifactBucket(BaseModel):
    """One row of an evaluation breakdown keyed by artifact name."""

    # Artifact label the evaluations were grouped under.
    artifact: str
    # Number of evaluations that fell into this group.
    count: int
|
||||
|
||||
|
||||
class EvaluationAnalyticsOutputModeBucket(BaseModel):
    """One row of an evaluation breakdown keyed by job output mode."""

    # Output mode label the evaluations were grouped under.
    output_mode: str
    # Number of evaluations that fell into this group.
    count: int
|
||||
|
||||
|
||||
class EvaluationAnalyticsScoreBandBucket(BaseModel):
    """One row of an evaluation breakdown keyed by score band."""

    # Name of the score band.
    band: str
    # Number of evaluations whose score landed in this band.
    count: int
|
||||
|
||||
|
||||
class EvaluationAnalyticsDimensionScore(BaseModel):
    """Average score for a single evaluation dimension."""

    # Dimension name.
    dimension: str
    # Mean score recorded for the dimension.
    average_score: float
    # Number of score samples behind the average.
    count: int
|
||||
|
||||
|
||||
class EvaluationAnalyticsQualityGateIssue(BaseModel):
    """Occurrence count for one quality-gate issue code."""

    # Issue code.
    code: str
    # How many times the code was seen.
    count: int
|
||||
|
||||
|
||||
class EvaluationAnalyticsFailureCategory(BaseModel):
    """Occurrence count for one failure category."""

    # Failure category label.
    category: str
    # How many times the category was seen.
    count: int
|
||||
|
||||
|
||||
class EvaluationAnalyticsWarning(BaseModel):
    """Occurrence count for one distinct warning message."""

    # Warning text.
    message: str
    # How many times the message was seen.
    count: int
|
||||
|
||||
|
||||
class EvaluationAnalyticsResponse(BaseModel):
    """Response body of the admin evaluation analytics endpoint.

    Carries headline totals plus one list per breakdown. ``window_days``,
    ``artifact`` and ``average_score`` are optional and default to ``None``.
    """

    # --- request echo -----------------------------------------------------
    scope: str
    window_days: int | None = None
    artifact: str | None = None

    # --- headline totals --------------------------------------------------
    total_evaluations: int
    passed_evaluations: int
    blocked_evaluations: int
    pass_rate: float
    average_score: float | None = None

    # --- distinct entity counts -------------------------------------------
    job_count: int
    story_count: int
    user_count: int

    # --- breakdowns -------------------------------------------------------
    by_artifact: list[EvaluationAnalyticsArtifactBucket]
    by_output_mode: list[EvaluationAnalyticsOutputModeBucket]
    score_bands: list[EvaluationAnalyticsScoreBandBucket]
    dimension_scores: list[EvaluationAnalyticsDimensionScore]
    quality_gate_issues: list[EvaluationAnalyticsQualityGateIssue]
    failure_categories: list[EvaluationAnalyticsFailureCategory]
    warnings: list[EvaluationAnalyticsWarning]
|
||||
|
||||
|
||||
class ExecutorCoveragePlanModeBucket(BaseModel):
    """One row of an executor-coverage breakdown keyed by plan mode."""

    # Plan mode label the runs were grouped under.
    plan_mode: str
    # Number of runs in this group.
    count: int
|
||||
|
||||
|
||||
class ExecutorCoverageOutputModeBucket(BaseModel):
    """One row of an executor-coverage breakdown keyed by output mode."""

    # Output mode label the runs were grouped under.
    output_mode: str
    # Number of runs in this group.
    count: int
|
||||
|
||||
|
||||
class ExecutorCoverageTaskKeyBucket(BaseModel):
    """Occurrence count for one task key (used for executed and ignored lists)."""

    # Task key.
    task_key: str
    # How many times the key was seen.
    count: int
|
||||
|
||||
|
||||
class ExecutorCoverageAssetBucket(BaseModel):
    """Occurrence count for one result asset name."""

    # Asset name.
    asset: str
    # How many times the asset was seen.
    count: int
|
||||
|
||||
|
||||
class ExecutorCoverageResponse(BaseModel):
    """Executor coverage summary: run/task totals plus per-key breakdowns.

    ``window_days`` and ``plan_mode`` are optional and default to ``None``.
    """

    # --- request echo -----------------------------------------------------
    scope: str
    window_days: int | None = None
    plan_mode: str | None = None

    # --- run and task totals ----------------------------------------------
    total_runs: int
    total_planned_tasks: int
    total_executed_tasks: int
    total_ignored_tasks: int
    coverage_ratio: float

    # --- distinct entity counts -------------------------------------------
    job_count: int
    story_count: int
    user_count: int

    # --- breakdowns -------------------------------------------------------
    by_plan_mode: list[ExecutorCoveragePlanModeBucket]
    by_output_mode: list[ExecutorCoverageOutputModeBucket]
    executed_task_keys: list[ExecutorCoverageTaskKeyBucket]
    ignored_task_keys: list[ExecutorCoverageTaskKeyBucket]
    result_assets: list[ExecutorCoverageAssetBucket]
|
||||
|
||||
|
||||
class AdminGenerationJobEventResponse(BaseModel):
    """A single generation job event as exposed in the admin trace."""

    id: int
    job_id: str
    # Optional; absent story ids are reported as None.
    story_id: int | None = None
    event_type: str
    status: str
    message: str | None = None
    # Free-form event payload; defaults to an empty dict.
    event_metadata: dict[str, Any] = Field(default_factory=dict)
    created_at: datetime
|
||||
|
||||
|
||||
class AdminGenerationJobTraceResponse(BaseModel):
    """Full trace of one generation job for the admin trace endpoint.

    Combines the job's own state with its request payload, an executor
    coverage summary and the list of job events.
    """

    # --- identity ---------------------------------------------------------
    id: str
    user_id: str
    story_id: int | None = None

    # --- job configuration and state --------------------------------------
    output_mode: str
    input_type: str
    status: str
    current_step: str
    progress_percent: int
    progress_label: str
    is_terminal: bool
    can_cancel: bool = False
    can_retry: bool = False

    # --- payloads (dict fields default to empty) --------------------------
    result_snapshot: dict[str, Any] = Field(default_factory=dict)
    error_message: str | None = None
    request_payload: dict[str, Any] = Field(default_factory=dict)

    # --- trace detail -----------------------------------------------------
    executor_coverage: ExecutorCoverageResponse
    events: list[AdminGenerationJobEventResponse] = Field(default_factory=list)

    # --- timestamps -------------------------------------------------------
    created_at: datetime
    updated_at: datetime
|
||||
|
||||
|
||||
class HarnessReadinessCheck(BaseModel):
    """Outcome of one readiness check."""

    # Identifier of the check.
    code: str
    # Restricted to the three literal values below.
    status: Literal["ready", "needs_attention", "blocked"]
    # Human-readable explanation of the outcome.
    message: str
    # Supporting figures for the outcome; defaults to an empty dict.
    details: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class HarnessReadinessGoldenReplay(BaseModel):
    """Result summary of the golden-case replay."""

    passed: bool
    total_cases: int
    # Identifiers of the cases that did not pass.
    failed_case_ids: list[str]
    # Nested name -> name -> count mapping; defaults to an empty dict.
    coverage_summary: dict[str, dict[str, int]] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class HarnessReadinessThresholds(BaseModel):
    """Minimum values reported alongside the readiness audit."""

    # Sample-size minimums.
    min_runtime_evaluations: int
    min_executor_runs: int

    # Quality and coverage minimums.
    min_evaluation_pass_rate: float
    min_evaluation_average_score: float
    min_executor_coverage_ratio: float
|
||||
|
||||
|
||||
class HarnessReadinessResponse(BaseModel):
    """Response body of the harness readiness endpoint.

    Bundles the overall status, the thresholds, the individual checks and
    the underlying replay / analytics / coverage payloads.
    """

    scope: str
    window_days: int | None = None
    # Restricted to the three literal values below.
    status: Literal["ready", "needs_attention", "blocked"]
    thresholds: HarnessReadinessThresholds
    checks: list[HarnessReadinessCheck]

    # Underlying data included with the audit.
    golden_replay: HarnessReadinessGoldenReplay
    evaluation_analytics: EvaluationAnalyticsResponse
    executor_coverage: ExecutorCoverageResponse
|
||||
|
||||
|
||||
@router.get("/providers/adapters")
|
||||
async def list_available_adapters():
|
||||
"""获取所有可用的适配器类型 (定义的类)。"""
|
||||
@@ -137,6 +305,55 @@ async def get_provider_analytics(
|
||||
)
|
||||
|
||||
|
||||
@router.get("/evaluations/analytics", response_model=EvaluationAnalyticsResponse)
async def get_evaluation_analytics(
    days: int | None = Query(default=None, ge=1, le=365),
    artifact: Literal["story_text", "storybook_pages"] | None = Query(default=None),
    db: AsyncSession = Depends(get_db),
):
    """Get the internal content evaluation summary (admin control plane only).

    Args:
        days: Optional look-back window, 1-365 days; ``None`` means no window.
        artifact: Optional artifact filter; ``None`` means all artifacts.
        db: Database session injected by FastAPI.
    """
    analytics = await get_admin_evaluation_analytics(db, days=days, artifact=artifact)
    return analytics
|
||||
|
||||
|
||||
@router.get("/executors/coverage", response_model=ExecutorCoverageResponse)
async def get_executor_coverage(
    days: int | None = Query(default=None, ge=1, le=365),
    plan_mode: Literal["asset_generation", "asset_retry"] | None = Query(default=None),
    db: AsyncSession = Depends(get_db),
):
    """Get internal executor coverage figures (admin control plane only).

    Args:
        days: Optional look-back window, 1-365 days; ``None`` means no window.
        plan_mode: Optional plan-mode filter; ``None`` means all plan modes.
        db: Database session injected by FastAPI.
    """
    coverage = await get_admin_executor_coverage(db, days=days, plan_mode=plan_mode)
    return coverage
|
||||
|
||||
|
||||
@router.get("/harness/readiness", response_model=HarnessReadinessResponse)
async def get_harness_readiness(
    days: int | None = Query(default=None, ge=1, le=365),
    db: AsyncSession = Depends(get_db),
):
    """Get the internal harness readiness audit summary (admin control plane only).

    Args:
        days: Optional look-back window, 1-365 days; ``None`` means no window.
        db: Database session injected by FastAPI.
    """
    readiness = await get_admin_harness_readiness(db, days=days)
    return readiness
|
||||
|
||||
|
||||
@router.get(
    "/generations/jobs/{job_id}/trace",
    response_model=AdminGenerationJobTraceResponse,
)
async def get_generation_job_trace(
    job_id: str,
    db: AsyncSession = Depends(get_db),
):
    """Get the full internal generation trace of one job.

    Intended for admin control-plane troubleshooting and review only.

    Args:
        job_id: Identifier of the generation job, taken from the URL path.
        db: Database session injected by FastAPI.
    """
    trace = await get_admin_generation_job_trace(db, job_id=job_id)
    return trace
|
||||
|
||||
|
||||
@router.get("/providers", response_model=list[ProviderResponse])
|
||||
async def list_providers(db: AsyncSession = Depends(get_db)):
|
||||
result = await db.execute(select(Provider))
|
||||
|
||||
@@ -24,6 +24,7 @@ from app.schemas.story_schemas import (
|
||||
GenerationProviderStatsResponse,
|
||||
GenerationRequest,
|
||||
GenerationResponse,
|
||||
GenerationTraceSummaryResponse,
|
||||
StoryAssetRetryRequest,
|
||||
StoryAudioStatusResponse,
|
||||
StorybookRequest,
|
||||
@@ -37,6 +38,7 @@ from app.services import story_service
|
||||
from app.services.generation_jobs import (
|
||||
get_generation_job_detail,
|
||||
get_story_provider_stats,
|
||||
get_story_trace_summary,
|
||||
get_user_generation_ops_summary,
|
||||
get_user_provider_analytics,
|
||||
list_story_generation_jobs,
|
||||
@@ -181,6 +183,25 @@ async def get_generation_provider_stats(
|
||||
)
|
||||
|
||||
|
||||
@router.get(
    "/generations/{story_id}/trace-summary",
    response_model=GenerationTraceSummaryResponse,
)
async def get_generation_trace_summary(
    story_id: int,
    days: int | None = Query(default=None, ge=1, le=365),
    user: User = Depends(require_user),
    db: AsyncSession = Depends(get_db),
):
    """Return the workflow trace summary built from generation job events.

    Args:
        story_id: Story identifier taken from the URL path.
        days: Optional look-back window, 1-365 days; ``None`` means no window.
        user: Authenticated user; their id is forwarded to the service.
        db: Database session injected by FastAPI.
    """
    summary = await get_story_trace_summary(
        db,
        story_id=story_id,
        user_id=user.id,
        days=days,
    )
    return summary
|
||||
|
||||
|
||||
@router.get("/generations/{story_id}", response_model=StoryDetailResponse)
|
||||
async def get_generation(
|
||||
story_id: int,
|
||||
|
||||
@@ -244,6 +244,25 @@ class GenerationProviderStatsResponse(BaseModel):
|
||||
failure_reasons: list[GenerationProviderFailureReasonResponse] = Field(default_factory=list)
|
||||
|
||||
|
||||
class GenerationTraceBucketResponse(BaseModel):
    """A named bucket with its count, used in trace summaries."""

    # Bucket label.
    name: str
    # Number of items counted under the label.
    count: int
|
||||
|
||||
|
||||
class GenerationTraceSummaryResponse(BaseModel):
    """Workflow trace summary for one story, built from generation job events."""

    story_id: int
    # Optional look-back window in days; None when not set.
    window_days: int | None = None
    total_events: int
    failed_events: int

    # Breakdown lists; each defaults to an empty list.
    by_step: list[GenerationTraceBucketResponse] = Field(default_factory=list)
    by_artifact: list[GenerationTraceBucketResponse] = Field(default_factory=list)
    failure_categories: list[GenerationTraceBucketResponse] = Field(default_factory=list)
|
||||
|
||||
|
||||
class GenerationProviderAnalyticsResponse(BaseModel):
|
||||
"""Provider call stats aggregated across one user's generation history."""
|
||||
|
||||
|
||||
204
backend/app/services/admin_evaluation_analytics.py
Normal file
204
backend/app/services/admin_evaluation_analytics.py
Normal file
@@ -0,0 +1,204 @@
|
||||
"""Admin-only analytics for internal generation evaluation events."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db.models import GenerationJob, GenerationJobEvent
|
||||
|
||||
|
||||
def _as_float(value: Any) -> float | None:
|
||||
if isinstance(value, int | float):
|
||||
return float(value)
|
||||
return None
|
||||
|
||||
|
||||
def _sorted_count_buckets(counts: dict[str, int], *, key_name: str) -> list[dict[str, Any]]:
    """Turn a ``name -> count`` mapping into a list of bucket dicts.

    Buckets are ordered by count, largest first; equal counts are ordered by
    name ascending. Each bucket stores the name under ``key_name`` and the
    count under ``"count"``.
    """
    # Two stable passes: names ascending first, then counts descending.
    # A reversed stable sort keeps equal-count entries in name order.
    ordered = sorted(counts.items(), key=lambda pair: pair[0])
    ordered.sort(key=lambda pair: pair[1], reverse=True)

    buckets: list[dict[str, Any]] = []
    for label, total in ordered:
        buckets.append({key_name: label, "count": total})
    return buckets
|
||||
|
||||
|
||||
def _average_bucket(
    totals: dict[str, float],
    counts: dict[str, int],
    *,
    key_name: str,
) -> list[dict[str, Any]]:
    """Build per-name average rows from running totals and sample counts.

    Names whose count is missing or zero are skipped. Averages are rounded to
    four decimals. Rows are ordered by count, largest first, then by name.
    """
    averaged: list[dict[str, Any]] = []
    for label, total in totals.items():
        samples = counts.get(label)
        if not samples:
            continue
        averaged.append(
            {
                key_name: label,
                "average_score": round(total / samples, 4),
                "count": samples,
            }
        )
    return sorted(averaged, key=lambda row: (-int(row["count"]), str(row[key_name])))
|
||||
|
||||
|
||||
def _score_band(score: float) -> str:
    """Map an overall score to its band label.

    ``>= 0.9`` is ``"excellent"``, ``>= 0.8`` is ``"good"``, ``>= 0.7`` is
    ``"pass"``, any other positive score is ``"blocked_low_score"``, and
    everything else is ``"blocked_quality_gate"``.
    """
    # Checked from the highest floor down; the first match wins.
    passing_bands = ((0.9, "excellent"), (0.8, "good"), (0.7, "pass"))
    for floor, label in passing_bands:
        if score >= floor:
            return label
    return "blocked_low_score" if score > 0 else "blocked_quality_gate"
|
||||
|
||||
|
||||
def _metadata_scores(metadata: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict entries of ``metadata["scores"]``.

    Yields an empty list when ``"scores"`` is missing or not a list; non-dict
    entries inside the list are dropped.
    """
    candidates = metadata.get("scores")
    if isinstance(candidates, list):
        return [entry for entry in candidates if isinstance(entry, dict)]
    return []
|
||||
|
||||
|
||||
def _quality_gate_issues(metadata: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict entries of ``metadata["quality_gate"]["issues"]``.

    Yields an empty list when ``"quality_gate"`` is not a dict or its
    ``"issues"`` value is not a list; non-dict entries are dropped.
    """
    gate = metadata.get("quality_gate")
    issues = gate.get("issues") if isinstance(gate, dict) else None
    if isinstance(issues, list):
        return [entry for entry in issues if isinstance(entry, dict)]
    return []
|
||||
|
||||
|
||||
async def get_admin_evaluation_analytics(
    db: AsyncSession,
    *,
    days: int | None = None,
    artifact: str | None = None,
) -> dict[str, Any]:
    """Summarise ``evaluation_completed`` events for the admin control plane.

    Args:
        db: Async database session used to load events joined with their jobs.
        days: When given, only events created within the last ``days`` days
            (measured from the current UTC time) are loaded.
        artifact: When given, only events whose metadata ``"artifact"`` value
            equals it are counted. This filter is applied in Python after the
            rows are loaded.

    Returns:
        A dict with headline totals, distinct job/story/user counts and the
        sorted breakdown lists.
    """

    def bump(bucket: dict[str, int], key: str) -> None:
        # Increment a plain-dict counter, starting from zero.
        bucket[key] = bucket.get(key, 0) + 1

    statement = (
        select(GenerationJobEvent, GenerationJob)
        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
        .where(GenerationJobEvent.event_type == "evaluation_completed")
        .order_by(GenerationJobEvent.id)
    )
    if days is not None:
        window_start = datetime.now(timezone.utc) - timedelta(days=days)
        statement = statement.where(GenerationJobEvent.created_at >= window_start)

    fetched = (await db.execute(statement)).all()

    evaluated = 0
    passed_total = 0
    blocked_total = 0
    score_sum = 0.0
    scored = 0
    seen_jobs: set[str] = set()
    seen_stories: set[int] = set()
    seen_users: set[str] = set()
    per_artifact: dict[str, int] = {}
    per_output_mode: dict[str, int] = {}
    per_band: dict[str, int] = {}
    dimension_sums: dict[str, float] = {}
    dimension_samples: dict[str, int] = {}
    per_issue_code: dict[str, int] = {}
    per_failure_category: dict[str, int] = {}
    per_warning: dict[str, int] = {}

    for event, job in fetched:
        meta = event.event_metadata or {}
        artifact_name = str(meta.get("artifact") or "unknown")
        if artifact is not None and artifact_name != artifact:
            continue

        evaluated += 1
        seen_jobs.add(job.id)
        seen_users.add(job.user_id)
        # Prefer the story id on the event; fall back to the job's.
        if event.story_id is not None:
            seen_stories.add(int(event.story_id))
        elif job.story_id is not None:
            seen_stories.add(int(job.story_id))

        bump(per_artifact, artifact_name)
        bump(per_output_mode, job.output_mode)

        # Only a literal ``True`` counts; truthy non-bools are ignored.
        if meta.get("passed") is True:
            passed_total += 1
        if meta.get("blocking") is True:
            blocked_total += 1

        overall = _as_float(meta.get("overall_score"))
        if overall is not None:
            score_sum += overall
            scored += 1
            bump(per_band, _score_band(overall))

        for entry in _metadata_scores(meta):
            dimension_name = entry.get("dimension")
            dimension_value = _as_float(entry.get("score"))
            if isinstance(dimension_name, str) and dimension_value is not None:
                dimension_sums[dimension_name] = (
                    dimension_sums.get(dimension_name, 0.0) + dimension_value
                )
                bump(dimension_samples, dimension_name)

        for issue in _quality_gate_issues(meta):
            issue_code = issue.get("code")
            if isinstance(issue_code, str) and issue_code:
                bump(per_issue_code, issue_code)
            category = issue.get("failure_category")
            if isinstance(category, str) and category:
                bump(per_failure_category, category)

        raw_warnings = meta.get("warnings")
        if isinstance(raw_warnings, list):
            for text in raw_warnings:
                if isinstance(text, str) and text:
                    bump(per_warning, text)

    pass_rate = round(passed_total / evaluated, 4) if evaluated else 0.0
    average_score = round(score_sum / scored, 4) if scored else None

    return {
        "scope": "admin_internal_evaluations",
        "window_days": days,
        "artifact": artifact,
        "total_evaluations": evaluated,
        "passed_evaluations": passed_total,
        "blocked_evaluations": blocked_total,
        "pass_rate": pass_rate,
        "average_score": average_score,
        "job_count": len(seen_jobs),
        "story_count": len(seen_stories),
        "user_count": len(seen_users),
        "by_artifact": _sorted_count_buckets(per_artifact, key_name="artifact"),
        "by_output_mode": _sorted_count_buckets(per_output_mode, key_name="output_mode"),
        "score_bands": _sorted_count_buckets(per_band, key_name="band"),
        "dimension_scores": _average_bucket(
            dimension_sums,
            dimension_samples,
            key_name="dimension",
        ),
        "quality_gate_issues": _sorted_count_buckets(per_issue_code, key_name="code"),
        "failure_categories": _sorted_count_buckets(
            per_failure_category,
            key_name="category",
        ),
        "warnings": _sorted_count_buckets(per_warning, key_name="message"),
    }
|
||||
147
backend/app/services/admin_executor_coverage.py
Normal file
147
backend/app/services/admin_executor_coverage.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""Admin-only analytics for internal workflow executor coverage."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Iterable
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db.models import GenerationJob, GenerationJobEvent
|
||||
|
||||
|
||||
def _as_int(value: Any) -> int:
    """Coerce a metadata counter to ``int``, falling back to ``0``.

    Args:
        value: Arbitrary value read from event metadata.

    Returns:
        ``int(value)`` for ints and finite floats (floats truncate toward
        zero; ``True``/``False`` become ``1``/``0``). Every other value,
        including NaN and infinities, yields ``0``. Non-finite floats are
        rejected because ``int()`` raises on them, and one malformed counter
        must not abort the whole aggregation.
    """
    import math  # local import keeps this helper self-contained

    # bool is a subclass of int, so this branch covers both.
    if isinstance(value, int):
        return int(value)
    if isinstance(value, float) and math.isfinite(value):
        return int(value)
    return 0
|
||||
|
||||
|
||||
def _sorted_count_buckets(counts: dict[str, int], *, key_name: str) -> list[dict[str, Any]]:
    """Convert a ``name -> count`` mapping into ordered bucket dicts.

    Ordering is by count descending, then name ascending. Each bucket holds
    the name under ``key_name`` and the count under ``"count"``.
    """

    def rank(entry: tuple[str, int]) -> tuple[int, str]:
        label, total = entry
        # Negating the count sorts larger counts first.
        return (-total, label)

    buckets: list[dict[str, Any]] = []
    for label, total in sorted(counts.items(), key=rank):
        buckets.append({key_name: label, "count": total})
    return buckets
|
||||
|
||||
|
||||
def _iter_strings(value: Any) -> Iterable[str]:
    """Yield the non-empty strings contained in a list, tuple or set.

    Any other kind of ``value`` (including a plain string or ``None``)
    yields nothing; non-string and empty-string items are skipped.
    """
    if isinstance(value, (list, tuple, set)):
        yield from (item for item in value if isinstance(item, str) and item)
|
||||
|
||||
|
||||
def summarize_executor_coverage_rows(
    rows: Iterable[tuple[GenerationJobEvent, GenerationJob]],
    *,
    days: int | None = None,
    plan_mode: str | None = None,
    scope: str = "admin_internal_executor_coverage",
) -> dict[str, Any]:
    """Fold ``(event, job)`` rows into an executor coverage summary.

    Args:
        rows: Pairs of an executor event and the job it belongs to.
        days: Echoed back as ``"window_days"``; not used for filtering here.
        plan_mode: When given, rows whose metadata ``"plan_mode"`` differs
            are skipped.
        scope: Label echoed back as ``"scope"``.

    Returns:
        A dict with run and task totals, the executed/planned coverage ratio
        (``0.0`` when nothing was planned), distinct job/story/user counts
        and the sorted breakdown lists.
    """

    def bump(bucket: dict[str, int], key: str) -> None:
        # Increment a plain-dict counter, starting from zero.
        bucket[key] = bucket.get(key, 0) + 1

    runs = 0
    planned = 0
    executed = 0
    ignored = 0
    seen_jobs: set[str] = set()
    seen_stories: set[int] = set()
    seen_users: set[str] = set()
    per_plan_mode: dict[str, int] = {}
    per_output_mode: dict[str, int] = {}
    executed_keys: dict[str, int] = {}
    ignored_keys: dict[str, int] = {}
    assets: dict[str, int] = {}

    for event, job in rows:
        meta = event.event_metadata or {}
        row_plan_mode = str(meta.get("plan_mode") or "unknown")
        if plan_mode is not None and row_plan_mode != plan_mode:
            continue

        runs += 1
        seen_jobs.add(job.id)
        seen_users.add(job.user_id)
        # Prefer the story id on the event; fall back to the job's.
        if event.story_id is not None:
            seen_stories.add(int(event.story_id))
        elif job.story_id is not None:
            seen_stories.add(int(job.story_id))

        bump(per_plan_mode, row_plan_mode)
        bump(per_output_mode, job.output_mode)

        planned += _as_int(meta.get("planned_task_count"))
        executed += _as_int(meta.get("executed_task_count"))
        ignored += _as_int(meta.get("ignored_task_count"))

        for task_key in _iter_strings(meta.get("executed_task_keys")):
            bump(executed_keys, task_key)
        for task_key in _iter_strings(meta.get("ignored_task_keys")):
            bump(ignored_keys, task_key)
        for asset_name in _iter_strings(meta.get("result_assets")):
            bump(assets, asset_name)

    if planned:
        ratio = round(executed / planned, 4)
    else:
        ratio = 0.0

    return {
        "scope": scope,
        "window_days": days,
        "plan_mode": plan_mode,
        "total_runs": runs,
        "total_planned_tasks": planned,
        "total_executed_tasks": executed,
        "total_ignored_tasks": ignored,
        "coverage_ratio": ratio,
        "job_count": len(seen_jobs),
        "story_count": len(seen_stories),
        "user_count": len(seen_users),
        "by_plan_mode": _sorted_count_buckets(per_plan_mode, key_name="plan_mode"),
        "by_output_mode": _sorted_count_buckets(per_output_mode, key_name="output_mode"),
        "executed_task_keys": _sorted_count_buckets(executed_keys, key_name="task_key"),
        "ignored_task_keys": _sorted_count_buckets(ignored_keys, key_name="task_key"),
        "result_assets": _sorted_count_buckets(assets, key_name="asset"),
    }
|
||||
|
||||
|
||||
async def get_admin_executor_coverage(
    db: AsyncSession,
    *,
    days: int | None = None,
    plan_mode: str | None = None,
) -> dict[str, Any]:
    """Load ``executor_completed`` events and summarise their coverage.

    Args:
        db: Async database session used to load events joined with their jobs.
        days: When given, only events created within the last ``days`` days
            (measured from the current UTC time) are loaded.
        plan_mode: Optional plan-mode filter, passed on to the summariser.

    Returns:
        The dict produced by ``summarize_executor_coverage_rows``.
    """
    statement = (
        select(GenerationJobEvent, GenerationJob)
        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
        .where(GenerationJobEvent.event_type == "executor_completed")
        .order_by(GenerationJobEvent.id)
    )
    if days is not None:
        window_start = datetime.now(timezone.utc) - timedelta(days=days)
        statement = statement.where(GenerationJobEvent.created_at >= window_start)

    result = await db.execute(statement)
    return summarize_executor_coverage_rows(result.all(), days=days, plan_mode=plan_mode)
|
||||
52
backend/app/services/admin_generation_trace.py
Normal file
52
backend/app/services/admin_generation_trace.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""Admin-only generation trace detail service."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db.models import GenerationJob, GenerationJobEvent
|
||||
from app.services.admin_executor_coverage import summarize_executor_coverage_rows
|
||||
from app.services.generation_jobs import (
|
||||
generation_event_to_response,
|
||||
generation_job_to_summary,
|
||||
)
|
||||
|
||||
|
||||
async def get_admin_generation_job_trace(
    db: AsyncSession,
    *,
    job_id: str,
) -> dict[str, Any]:
    """Assemble the full internal trace of one generation job.

    Args:
        db: Async database session.
        job_id: Identifier of the job to load.

    Returns:
        The job summary extended with ``"user_id"``, ``"request_payload"``,
        an executor coverage summary built from this job's
        ``executor_completed`` events, and every event of the job ordered
        by id.

    Raises:
        HTTPException: 404 when no job has the given id.
    """
    job_lookup = await db.execute(select(GenerationJob).where(GenerationJob.id == job_id))
    job = job_lookup.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Generation job not found")

    event_lookup = await db.execute(
        select(GenerationJobEvent)
        .where(GenerationJobEvent.job_id == job.id)
        .order_by(GenerationJobEvent.id)
    )
    events = event_lookup.scalars().all()

    # The coverage summariser expects (event, job) pairs.
    executor_rows = []
    for event in events:
        if event.event_type == "executor_completed":
            executor_rows.append((event, job))

    trace = dict(generation_job_to_summary(job))
    # Keys set below override same-named keys from the summary.
    trace["user_id"] = job.user_id
    trace["request_payload"] = job.request_payload or {}
    trace["executor_coverage"] = summarize_executor_coverage_rows(
        executor_rows,
        scope="admin_internal_job_executor_coverage",
    )
    trace["events"] = [generation_event_to_response(event) for event in events]
    return trace
|
||||
262
backend/app/services/admin_harness_readiness.py
Normal file
262
backend/app/services/admin_harness_readiness.py
Normal file
@@ -0,0 +1,262 @@
|
||||
"""Admin-only readiness audit for harness-driven generation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics
|
||||
from app.services.admin_executor_coverage import get_admin_executor_coverage
|
||||
from app.services.harness.evaluation_replay import replay_evaluation_golden_cases
|
||||
|
||||
# Golden evaluation cases live next to this module, under harness/fixtures/.
_GOLDEN_CASES_PATH = Path(__file__).resolve().parent.joinpath(
    "harness",
    "fixtures",
    "evaluation_golden_cases.json",
)

# Readiness thresholds: sample-size minimums first, then quality/coverage floors.
_MIN_RUNTIME_EVALUATIONS = 1
_MIN_EXECUTOR_RUNS = 1
_MIN_EVALUATION_PASS_RATE = 0.7
_MIN_EVALUATION_AVERAGE_SCORE = 0.7
_MIN_EXECUTOR_COVERAGE_RATIO = 0.2
|
||||
|
||||
|
||||
def _check(
|
||||
*,
|
||||
code: str,
|
||||
status: str,
|
||||
message: str,
|
||||
details: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
return {
|
||||
"code": code,
|
||||
"status": status,
|
||||
"message": message,
|
||||
"details": details or {},
|
||||
}
|
||||
|
||||
|
||||
def _overall_status(checks: list[dict[str, Any]]) -> str:
    """Reduce individual check statuses to a single overall status.

    ``"blocked"`` wins over ``"needs_attention"``, which wins over
    ``"ready"``. An empty list is ``"ready"``.
    """
    seen = set()
    for check in checks:
        seen.add(check["status"])

    # Listed from most to least severe; the first one present decides.
    for level in ("blocked", "needs_attention"):
        if level in seen:
            return level
    return "ready"
|
||||
|
||||
|
||||
def _run_golden_replay() -> dict[str, Any]:
    """Replay the golden evaluation cases and summarise the outcome.

    When the fixture file does not exist, a failed result with zero cases
    and the single failed id ``"fixture_missing"`` is returned instead.
    """
    if _GOLDEN_CASES_PATH.exists():
        replay = replay_evaluation_golden_cases(_GOLDEN_CASES_PATH)
        return {
            "passed": replay.passed,
            "total_cases": len(replay.cases),
            "failed_case_ids": list(replay.failed_case_ids),
            "coverage_summary": replay.coverage_summary(),
        }

    # No fixture to replay: report it as a failure.
    return {
        "passed": False,
        "total_cases": 0,
        "failed_case_ids": ["fixture_missing"],
        "coverage_summary": {},
    }
|
||||
|
||||
|
||||
def _golden_replay_check(golden_replay: dict[str, Any]) -> dict[str, Any]:
    """Turn the golden replay summary into a readiness check.

    The check is ``"ready"`` only when the replay passed and ran at least one
    case; otherwise it is ``"blocked"`` and the failed case ids are included
    in the details.
    """
    replay_ok = golden_replay["passed"] and golden_replay["total_cases"] > 0

    details: dict[str, Any] = {
        "total_cases": golden_replay["total_cases"],
        "failed_case_count": len(golden_replay["failed_case_ids"]),
    }
    if replay_ok:
        return _check(
            code="golden_replay",
            status="ready",
            message="内部 golden replay 全部通过。",
            details=details,
        )

    details["failed_case_ids"] = golden_replay["failed_case_ids"]
    return _check(
        code="golden_replay",
        status="blocked",
        message="内部 golden replay 未通过,暂停扩大 harness 接管范围。",
        details=details,
    )
|
||||
|
||||
|
||||
def _evaluation_sample_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Check that the window holds enough runtime evaluation samples.

    ``"ready"`` when ``total_evaluations`` reaches the minimum, otherwise
    ``"needs_attention"``. Both outcomes carry the same details.
    """
    sample_count = int(evaluation_analytics["total_evaluations"])
    has_enough = sample_count >= _MIN_RUNTIME_EVALUATIONS

    if has_enough:
        status = "ready"
        message = "当前窗口已有内部 evaluation 运行样本。"
    else:
        status = "needs_attention"
        message = "当前窗口缺少内部 evaluation 运行样本,建议先跑生成烟测。"

    return _check(
        code="runtime_evaluation_samples",
        status=status,
        message=message,
        details={
            "total_evaluations": sample_count,
            "min_required": _MIN_RUNTIME_EVALUATIONS,
        },
    )
|
||||
|
||||
|
||||
def _evaluation_quality_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Check runtime evaluation pass rate and average score against the floors.

    * No evaluations at all -> ``"needs_attention"``.
    * Pass rate below its floor, or a known average score below its floor
      -> ``"blocked"``.
    * Otherwise -> ``"ready"``. A missing average score (``None``) does not
      block on its own.
    """
    sample_count = int(evaluation_analytics["total_evaluations"])
    pass_rate = float(evaluation_analytics["pass_rate"])
    average_score = evaluation_analytics["average_score"]

    if sample_count == 0:
        return _check(
            code="runtime_evaluation_quality",
            status="needs_attention",
            message="暂无运行期 evaluation 质量样本。",
            details={
                "total_evaluations": sample_count,
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )

    # The average is only consulted when the pass rate itself is acceptable.
    below_floor = pass_rate < _MIN_EVALUATION_PASS_RATE
    if not below_floor and average_score is not None:
        below_floor = float(average_score) < _MIN_EVALUATION_AVERAGE_SCORE

    details: dict[str, Any] = {
        "pass_rate": pass_rate,
        "average_score": average_score,
        "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
    }
    if not below_floor:
        return _check(
            code="runtime_evaluation_quality",
            status="ready",
            message="运行期 evaluation 通过率和平均分达到内部 readiness 门槛。",
            details=details,
        )

    details["min_pass_rate"] = _MIN_EVALUATION_PASS_RATE
    details["min_average_score"] = _MIN_EVALUATION_AVERAGE_SCORE
    return _check(
        code="runtime_evaluation_quality",
        status="blocked",
        message="运行期 evaluation 质量未达到内部 readiness 门槛。",
        details=details,
    )
|
||||
|
||||
|
||||
def _executor_sample_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Check that the window holds enough executor coverage runs.

    ``"ready"`` when ``total_runs`` reaches the minimum, otherwise
    ``"needs_attention"``. Both outcomes carry the same details.
    """
    run_count = int(executor_coverage["total_runs"])
    has_enough = run_count >= _MIN_EXECUTOR_RUNS

    if has_enough:
        status = "ready"
        message = "当前窗口已有 executor coverage 运行样本。"
    else:
        status = "needs_attention"
        message = "当前窗口缺少 executor coverage 样本,建议先跑资产生成或重试烟测。"

    return _check(
        code="executor_coverage_samples",
        status=status,
        message=message,
        details={
            "total_runs": run_count,
            "min_required": _MIN_EXECUTOR_RUNS,
        },
    )
|
||||
|
||||
|
||||
def _executor_ratio_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Build the readiness check for the executor coverage ratio.

    Args:
        executor_coverage: Coverage payload. Reads ``"total_runs"`` and, when
            there is at least one run, ``"coverage_ratio"``,
            ``"total_planned_tasks"`` and ``"total_executed_tasks"``.

    Returns:
        The ``_check`` payload with code ``executor_coverage_ratio``:
        ``needs_attention`` when there are no runs, ``blocked`` when the ratio
        is below ``_MIN_EXECUTOR_COVERAGE_RATIO``, otherwise ``ready``.
    """

    total_runs = int(executor_coverage["total_runs"])

    if total_runs == 0:
        return _check(
            code="executor_coverage_ratio",
            status="needs_attention",
            message="暂无 executor coverage 运行样本。",
            details={
                "total_runs": total_runs,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
            },
        )

    # Convert the ratio only after the empty-window guard: this branch is the
    # only one that uses it, and an empty window should never fail on a
    # missing or non-numeric ratio.
    # NOTE(review): confirm what get_admin_executor_coverage reports as
    # coverage_ratio when total_runs is 0.
    coverage_ratio = float(executor_coverage["coverage_ratio"])

    if coverage_ratio < _MIN_EXECUTOR_COVERAGE_RATIO:
        return _check(
            code="executor_coverage_ratio",
            status="blocked",
            message="executor coverage ratio 未达到内部 readiness 门槛。",
            details={
                "coverage_ratio": coverage_ratio,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
                "total_planned_tasks": executor_coverage["total_planned_tasks"],
                "total_executed_tasks": executor_coverage["total_executed_tasks"],
            },
        )

    return _check(
        code="executor_coverage_ratio",
        status="ready",
        message="executor coverage ratio 达到内部 readiness 门槛。",
        details={
            "coverage_ratio": coverage_ratio,
            "total_planned_tasks": executor_coverage["total_planned_tasks"],
            "total_executed_tasks": executor_coverage["total_executed_tasks"],
        },
    )
|
||||
|
||||
|
||||
async def get_admin_harness_readiness(
    db: AsyncSession,
    *,
    days: int | None = None,
) -> dict[str, Any]:
    """Return an admin-only readiness audit for harness release decisions.

    Args:
        db: Async session passed through to the analytics and coverage queries.
        days: Optional window in days, forwarded unchanged to both queries and
            echoed back as ``window_days``.

    Returns:
        A dict with the overall status, the thresholds applied, the individual
        checks, and the raw golden replay / evaluation / coverage payloads the
        checks were derived from.
    """

    golden_replay = _run_golden_replay()
    evaluation_analytics = await get_admin_evaluation_analytics(db, days=days)
    executor_coverage = await get_admin_executor_coverage(db, days=days)

    checks = [
        _golden_replay_check(golden_replay),
        _evaluation_sample_check(evaluation_analytics),
        _evaluation_quality_check(evaluation_analytics),
        _executor_sample_check(executor_coverage),
        _executor_ratio_check(executor_coverage),
    ]

    return {
        "scope": "admin_internal_harness_readiness",
        "window_days": days,
        "status": _overall_status(checks),
        "thresholds": {
            "min_runtime_evaluations": _MIN_RUNTIME_EVALUATIONS,
            "min_executor_runs": _MIN_EXECUTOR_RUNS,
            "min_evaluation_pass_rate": _MIN_EVALUATION_PASS_RATE,
            "min_evaluation_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            "min_executor_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
        },
        "checks": checks,
        "golden_replay": golden_replay,
        "evaluation_analytics": evaluation_analytics,
        "executor_coverage": executor_coverage,
    }
|
||||
@@ -90,11 +90,13 @@ def _job_progress(job: GenerationJob) -> dict[str, Any]:
|
||||
|
||||
progress_map: dict[str, tuple[int, str]] = {
|
||||
"request_accepted": (5, "已接收请求"),
|
||||
"workflow_planned": (8, "工作流已规划"),
|
||||
"retry_queued": (8, "重新排队中"),
|
||||
"worker_started": (12, "后台任务已开始"),
|
||||
"cancel_requested": (15, "已请求取消"),
|
||||
"context_prepared": (20, "上下文已准备"),
|
||||
"narrative_generated": (45, "正文已生成"),
|
||||
"evaluation_completed": (52, "内容评测已完成"),
|
||||
"story_saved": (60, "主记录已保存"),
|
||||
"provider_call_started": (65, "Provider 调用中"),
|
||||
"provider_call_succeeded": (72, "Provider 调用成功"),
|
||||
@@ -307,6 +309,137 @@ def generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any]:
|
||||
}
|
||||
|
||||
|
||||
# Allow-list of event metadata keys that may be copied into user-facing
# responses; any key not listed here is dropped by the public serializers.
_PUBLIC_EVENT_METADATA_KEYS = {
    "adapter",
    "artifact",
    "asset",
    "assets",
    "attempted_cover",
    "audio_status",
    "blocks_main_result",
    "capability",
    "completed_pages",
    "cover_prompt_present",
    "estimated_cost_usd",
    "failed_pages",
    "failure_category",
    "generation_status",
    "has_memory_context",
    "image_status",
    "input_type",
    "latency_ms",
    "mode",
    "output_mode",
    "page_count",
    "page_number",
    "recoverable",
    "requested_from_step",
    "retryable",
    "scope",
    "stale_after_minutes",
    "status",
    "step",
    "strategy",
    "text_status",
}
|
||||
|
||||
# Allow-list of request payload keys that may be echoed back in user-facing
# job details.
_PUBLIC_REQUEST_PAYLOAD_KEYS = {
    "assets",
    "child_profile_id",
    "generate_images",
    "input_type",
    "output_mode",
    "page_count",
    "story_id",
    "type",
    "universe_id",
}
|
||||
|
||||
|
||||
def _public_metadata_value(value: Any) -> Any:
    """Return a JSON-safe public value or None when the value is internal.

    Scalars (``str``, ``int``, ``float``, ``bool``) and ``None`` pass through
    unchanged. A list is reduced to its scalar / ``None`` items, so nested
    lists and dicts inside it are dropped. Any other type yields ``None``.
    """

    scalar_types = (str, int, float, bool)

    if value is None or isinstance(value, scalar_types):
        return value

    if isinstance(value, list):
        # Keep only flat items; nested structures are treated as internal.
        return [
            item
            for item in value
            if item is None or isinstance(item, scalar_types)
        ]

    return None
|
||||
|
||||
|
||||
def public_generation_request_payload(job: GenerationJob) -> dict[str, Any]:
    """Return request payload fields safe for user-facing job details.

    Only keys listed in ``_PUBLIC_REQUEST_PAYLOAD_KEYS`` are considered, and
    each value is filtered through ``_public_metadata_value``. Keys whose
    filtered value is ``None`` are omitted. A missing payload is treated as
    empty.
    """

    payload = job.request_payload or {}
    public_payload: dict[str, Any] = {}

    # Sorted iteration keeps the output key order deterministic.
    for key in sorted(_PUBLIC_REQUEST_PAYLOAD_KEYS):
        if key not in payload:
            continue
        value = _public_metadata_value(payload[key])
        if value is not None:
            public_payload[key] = value

    return public_payload
|
||||
|
||||
|
||||
def _public_plan_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
    """Expose only coarse workflow plan metadata to user-facing responses.

    Args:
        metadata: Event metadata; only its ``"plan"`` entry is inspected.

    Returns:
        An empty dict when ``"plan"`` is not a dict. Otherwise a dict that may
        contain ``plan_mode`` (when the plan's ``mode`` is a string), and
        ``planned_task_count`` / ``recoverable_task_count`` (when the plan's
        ``tasks`` is a list). Task contents themselves are never exposed.
    """

    plan = metadata.get("plan")
    if not isinstance(plan, dict):
        return {}

    public: dict[str, Any] = {}

    mode = plan.get("mode")
    if isinstance(mode, str):
        public["plan_mode"] = mode

    tasks = plan.get("tasks")
    if isinstance(tasks, list):
        public["planned_task_count"] = len(tasks)
        # Only an explicit boolean True counts; truthy values such as 1 do not.
        public["recoverable_task_count"] = sum(
            1
            for task in tasks
            if isinstance(task, dict) and task.get("recoverable") is True
        )

    return public
|
||||
|
||||
|
||||
def public_generation_event_metadata(event: GenerationJobEvent) -> dict[str, Any]:
    """Return event metadata safe for user-facing job event streams.

    Copies only keys listed in ``_PUBLIC_EVENT_METADATA_KEYS``, filtering each
    value through ``_public_metadata_value`` and omitting keys whose filtered
    value is ``None``. For ``workflow_planned`` events the coarse plan summary
    from ``_public_plan_metadata`` is merged in as well.
    """

    metadata = event.event_metadata or {}
    public_metadata: dict[str, Any] = {}

    # Sorted iteration keeps the output key order deterministic.
    for key in sorted(_PUBLIC_EVENT_METADATA_KEYS):
        if key not in metadata:
            continue
        value = _public_metadata_value(metadata[key])
        if value is not None:
            public_metadata[key] = value

    if event.event_type == "workflow_planned":
        public_metadata.update(_public_plan_metadata(metadata))

    return public_metadata
|
||||
|
||||
|
||||
def public_generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any] | None:
    """Convert a generation event for user-facing APIs with internal data removed.

    Returns:
        ``None`` for ``evaluation_completed`` and ``executor_completed``
        events, which are hidden from users entirely. Otherwise the standard
        event response with ``event_metadata`` replaced by its public subset.
    """

    if event.event_type in {"evaluation_completed", "executor_completed"}:
        return None

    response = generation_event_to_response(event)
    response["event_metadata"] = public_generation_event_metadata(event)
    return response
|
||||
|
||||
|
||||
def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
|
||||
"""Convert a generation job ORM object to an API summary dict."""
|
||||
|
||||
@@ -328,6 +461,23 @@ def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
|
||||
}
|
||||
|
||||
|
||||
def public_generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
    """Convert a generation job for user-facing APIs with internal steps hidden.

    When the job's current step is an internal one, the summary is rewritten
    to a user-visible step with a fixed progress percent and label, and
    ``is_terminal`` is forced to ``False``:

    * ``evaluation_completed`` is reported as ``narrative_generated`` (45%).
    * ``executor_completed`` is reported as ``workflow_planned`` (8%).
    """

    summary = generation_job_to_summary(job)
    current_step = summary["current_step"]

    if current_step == "evaluation_completed":
        summary["current_step"] = "narrative_generated"
        summary["progress_percent"] = 45
        summary["progress_label"] = "正文已生成"
        summary["is_terminal"] = False
    elif current_step == "executor_completed":
        summary["current_step"] = "workflow_planned"
        summary["progress_percent"] = 8
        summary["progress_label"] = "工作流已规划"
        summary["is_terminal"] = False

    return summary
|
||||
|
||||
|
||||
async def get_generation_job_for_user(
|
||||
db: AsyncSession,
|
||||
*,
|
||||
@@ -362,13 +512,13 @@ async def request_generation_job_cancel(
|
||||
raise HTTPException(status_code=409, detail="当前任务不支持取消")
|
||||
|
||||
if job.status == "canceled":
|
||||
return generation_job_to_summary(job)
|
||||
return public_generation_job_to_summary(job)
|
||||
|
||||
if _is_terminal_status(job.status):
|
||||
raise HTTPException(status_code=409, detail="当前任务已终止,无法取消")
|
||||
|
||||
if job.current_step == "cancel_requested":
|
||||
return generation_job_to_summary(job)
|
||||
return public_generation_job_to_summary(job)
|
||||
|
||||
if job.current_step in {"request_accepted", "retry_queued"}:
|
||||
story = None
|
||||
@@ -391,7 +541,7 @@ async def request_generation_job_cancel(
|
||||
error_message="Generation canceled by user before worker execution started.",
|
||||
message="Generation job was canceled before worker execution started.",
|
||||
)
|
||||
return generation_job_to_summary(job)
|
||||
return public_generation_job_to_summary(job)
|
||||
|
||||
previous_step = job.current_step
|
||||
job.error_message = "Cancellation requested by user."
|
||||
@@ -407,7 +557,7 @@ async def request_generation_job_cancel(
|
||||
)
|
||||
await db.commit()
|
||||
await db.refresh(job)
|
||||
return generation_job_to_summary(job)
|
||||
return public_generation_job_to_summary(job)
|
||||
|
||||
|
||||
async def get_generation_job_detail(
|
||||
@@ -437,9 +587,13 @@ async def get_generation_job_detail(
|
||||
).scalars().all()
|
||||
|
||||
return {
|
||||
**generation_job_to_summary(job),
|
||||
"request_payload": job.request_payload or {},
|
||||
"events": [generation_event_to_response(event) for event in events],
|
||||
**public_generation_job_to_summary(job),
|
||||
"request_payload": public_generation_request_payload(job),
|
||||
"events": [
|
||||
response
|
||||
for event in events
|
||||
if (response := public_generation_event_to_response(event)) is not None
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@@ -461,7 +615,7 @@ async def list_story_generation_jobs(
|
||||
.order_by(desc(GenerationJob.created_at), desc(GenerationJob.id))
|
||||
)
|
||||
).scalars().all()
|
||||
return [generation_job_to_summary(job) for job in jobs]
|
||||
return [public_generation_job_to_summary(job) for job in jobs]
|
||||
|
||||
|
||||
async def get_active_story_generation_job(
|
||||
@@ -513,6 +667,59 @@ def _as_float(value: Any) -> float | None:
|
||||
return None
|
||||
|
||||
|
||||
def _sorted_buckets(counts: dict[str, int]) -> list[dict[str, Any]]:
    """Convert a name-to-count mapping into an ordered list of buckets.

    Returns:
        ``{"name": ..., "count": ...}`` dicts ordered by descending count,
        with ties broken by ascending name.
    """

    ordered = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    return [{"name": name, "count": count} for name, count in ordered]
|
||||
|
||||
|
||||
def _aggregate_trace_events(events: list[GenerationJobEvent]) -> dict[str, Any]:
    """Aggregate workflow trace metadata across job events.

    ``evaluation_completed`` and ``executor_completed`` events are skipped and
    do not count towards any total.

    Returns:
        A dict with ``total_events``, ``failed_events``, and three bucket
        lists (``by_step``, ``by_artifact``, ``failure_categories``) ordered
        by ``_sorted_buckets``.
    """

    by_step: dict[str, int] = {}
    by_artifact: dict[str, int] = {}
    failure_categories: dict[str, int] = {}
    failed_events = 0
    total_events = 0

    for event in events:
        if event.event_type in {"evaluation_completed", "executor_completed"}:
            continue

        total_events += 1
        metadata = event.event_metadata or {}
        step = metadata.get("step")
        artifact = metadata.get("artifact")
        failure_category = metadata.get("failure_category")

        if isinstance(step, str) and step:
            by_step[step] = by_step.get(step, 0) + 1

        # The literal artifact name "none" is not counted as an artifact.
        if isinstance(artifact, str) and artifact and artifact != "none":
            by_artifact[artifact] = by_artifact.get(artifact, 0) + 1

        if event.status == "failed":
            failed_events += 1
            # Failed events without a usable category fall into one bucket.
            if isinstance(failure_category, str) and failure_category:
                category = failure_category
            else:
                category = "unknown_error"
            failure_categories[category] = failure_categories.get(category, 0) + 1

    return {
        "total_events": total_events,
        "failed_events": failed_events,
        "by_step": _sorted_buckets(by_step),
        "by_artifact": _sorted_buckets(by_artifact),
        "failure_categories": _sorted_buckets(failure_categories),
    }
|
||||
|
||||
|
||||
def _aggregate_provider_events(
|
||||
events: list[GenerationJobEvent],
|
||||
*,
|
||||
@@ -679,6 +886,38 @@ async def get_story_provider_stats(
|
||||
}
|
||||
|
||||
|
||||
async def get_story_trace_summary(
    db: AsyncSession,
    *,
    story_id: int,
    user_id: str,
    days: int | None = None,
) -> dict[str, Any]:
    """Aggregate workflow trace metadata from all user-owned jobs for one story.

    Args:
        db: Async session used to run the query.
        story_id: Story whose generation jobs are inspected.
        user_id: Owner filter; only jobs with this ``user_id`` are included.
        days: Optional window; when set, only events created within the last
            ``days`` days (relative to the current UTC time) are included.

    Returns:
        ``story_id`` and ``window_days`` merged with the aggregate produced by
        ``_aggregate_trace_events``.
    """

    query = (
        select(GenerationJobEvent)
        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
        .where(
            GenerationJob.story_id == story_id,
            GenerationJob.user_id == user_id,
        )
        .order_by(GenerationJobEvent.id)
    )

    if days is not None:
        cutoff = datetime.now(timezone.utc) - timedelta(days=days)
        query = query.where(GenerationJobEvent.created_at >= cutoff)

    events = (await db.execute(query)).scalars().all()

    return {
        "story_id": story_id,
        "window_days": days,
        **_aggregate_trace_events(events),
    }
|
||||
|
||||
|
||||
async def get_user_provider_analytics(
|
||||
db: AsyncSession,
|
||||
*,
|
||||
|
||||
322
backend/app/services/harness/evaluation_replay.py
Normal file
322
backend/app/services/harness/evaluation_replay.py
Normal file
@@ -0,0 +1,322 @@
|
||||
"""Internal golden-case replay support for harness evaluations.
|
||||
|
||||
The replay helpers are intentionally not wired to user-facing APIs. They exist
|
||||
to make evaluation behavior reproducible in tests and internal tooling.
|
||||
"""
|
||||
|
||||
import json
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from enum import StrEnum
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
|
||||
from app.services.adapters.storybook.primary import Storybook, StorybookPage
|
||||
from app.services.adapters.text.models import StoryOutput
|
||||
from app.services.harness.evaluators import (
|
||||
EvaluationDimension,
|
||||
EvaluationResult,
|
||||
evaluate_story_output,
|
||||
evaluate_storybook_output,
|
||||
)
|
||||
|
||||
|
||||
class EvaluationReplayArtifact(StrEnum):
|
||||
"""Artifacts supported by deterministic evaluation replay."""
|
||||
|
||||
STORY = "story"
|
||||
STORYBOOK = "storybook"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExpectedEvaluation:
    """Expected evaluation outcome for one golden case."""

    passed: bool
    blocking: bool
    # Optional inclusive bounds on the overall score; None disables the bound.
    min_overall_score: float | None = None
    max_overall_score: float | None = None
    required_dimensions: tuple[EvaluationDimension, ...] = field(default_factory=tuple)
    quality_gate_codes: tuple[str, ...] = field(default_factory=tuple)
    warning_substrings: tuple[str, ...] = field(default_factory=tuple)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "ExpectedEvaluation":
        """Build expectations from a JSON-safe payload.

        ``passed`` and ``blocking`` are required keys; every other key is
        optional. Each ``required_dimensions`` entry is converted to an
        ``EvaluationDimension``.
        """

        return cls(
            passed=bool(payload["passed"]),
            blocking=bool(payload["blocking"]),
            min_overall_score=payload.get("min_overall_score"),
            max_overall_score=payload.get("max_overall_score"),
            required_dimensions=tuple(
                EvaluationDimension(dimension)
                for dimension in payload.get("required_dimensions", [])
            ),
            quality_gate_codes=tuple(payload.get("quality_gate_codes", [])),
            warning_substrings=tuple(payload.get("warning_substrings", [])),
        )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class EvaluationReplayCoverage:
|
||||
"""Internal coverage labels for one golden replay case."""
|
||||
|
||||
age_band: str = "unknown"
|
||||
content_shape: str = "unknown"
|
||||
risk_area: str = "unknown"
|
||||
tags: tuple[str, ...] = field(default_factory=tuple)
|
||||
|
||||
@classmethod
|
||||
def from_payload(cls, payload: dict[str, Any] | None) -> "EvaluationReplayCoverage":
|
||||
"""Build coverage labels from a JSON-safe payload."""
|
||||
|
||||
payload = payload or {}
|
||||
return cls(
|
||||
age_band=str(payload.get("age_band", "unknown")),
|
||||
content_shape=str(payload.get("content_shape", "unknown")),
|
||||
risk_area=str(payload.get("risk_area", "unknown")),
|
||||
tags=tuple(str(tag) for tag in payload.get("tags", [])),
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EvaluationReplayCase:
    """One internal golden evaluation case."""

    case_id: str
    artifact: EvaluationReplayArtifact
    output_payload: dict[str, Any]
    expected: ExpectedEvaluation
    education_theme: str | None = None
    minimum_score: float = 0.7
    description: str = ""
    input_payload: dict[str, Any] = field(default_factory=dict)
    coverage: EvaluationReplayCoverage = field(default_factory=EvaluationReplayCoverage)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "EvaluationReplayCase":
        """Build a replay case from a JSON-safe payload.

        ``id``, ``artifact``, ``output`` and ``expected`` are required keys.
        ``minimum_score`` and ``education_theme`` are read from the ``input``
        object first and fall back to the top-level payload.
        """

        input_payload = dict(payload.get("input", {}))
        minimum_score = input_payload.get("minimum_score", payload.get("minimum_score", 0.7))
        education_theme = input_payload.get("education_theme", payload.get("education_theme"))

        return cls(
            case_id=str(payload["id"]),
            artifact=EvaluationReplayArtifact(payload["artifact"]),
            description=str(payload.get("description", "")),
            input_payload=input_payload,
            output_payload=dict(payload["output"]),
            education_theme=education_theme,
            minimum_score=float(minimum_score),
            expected=ExpectedEvaluation.from_payload(payload["expected"]),
            coverage=EvaluationReplayCoverage.from_payload(payload.get("coverage")),
        )

    def evaluate(self) -> EvaluationResult:
        """Run the deterministic evaluator for this case.

        Story artifacts go through ``evaluate_story_output``; every other
        artifact goes through ``evaluate_storybook_output``.
        """

        if self.artifact == EvaluationReplayArtifact.STORY:
            return evaluate_story_output(
                _story_output_from_payload(self.output_payload),
                education_theme=self.education_theme,
                minimum_score=self.minimum_score,
            )

        return evaluate_storybook_output(
            _storybook_from_payload(self.output_payload),
            education_theme=self.education_theme,
            minimum_score=self.minimum_score,
        )

    def replay(self) -> "EvaluationReplayCaseResult":
        """Evaluate the case and compare it with expected outcomes."""

        evaluation = self.evaluate()
        failures = tuple(_compare_evaluation(self, evaluation))
        return EvaluationReplayCaseResult(
            case_id=self.case_id,
            artifact=self.artifact,
            coverage=self.coverage,
            evaluation=evaluation,
            failures=failures,
        )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EvaluationReplayCaseResult:
    """Replay result for one golden case."""

    case_id: str
    artifact: EvaluationReplayArtifact
    coverage: EvaluationReplayCoverage
    evaluation: EvaluationResult
    # Human-readable mismatch descriptions; empty when expectations were met.
    failures: tuple[str, ...] = field(default_factory=tuple)

    @property
    def expectations_met(self) -> bool:
        """Return whether the case matched all expectations."""

        return not self.failures
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EvaluationReplaySuiteResult:
    """Replay result for a set of golden cases."""

    cases: tuple[EvaluationReplayCaseResult, ...]

    @property
    def passed(self) -> bool:
        """Return whether every replay case matched expectations."""

        return all(case.expectations_met for case in self.cases)

    @property
    def failed_case_ids(self) -> tuple[str, ...]:
        """Return case IDs with expectation mismatches."""

        return tuple(case.case_id for case in self.cases if not case.expectations_met)

    def failure_report(self) -> str:
        """Return a compact failure report for assertion messages.

        One ``"<case_id>: <failure>"`` line per failure, in case order; an
        empty string when nothing failed.
        """

        return "\n".join(
            f"{case.case_id}: {failure}"
            for case in self.cases
            for failure in case.failures
        )

    def coverage_summary(self) -> dict[str, dict[str, int]]:
        """Return internal coverage counts for golden replay review.

        Each entry maps a label value to the number of cases carrying it. The
        ``outcome`` entry counts cases by whether their evaluation passed.
        """

        return {
            "artifact": _count_values(case.artifact.value for case in self.cases),
            "age_band": _count_values(case.coverage.age_band for case in self.cases),
            "content_shape": _count_values(
                case.coverage.content_shape for case in self.cases
            ),
            "risk_area": _count_values(case.coverage.risk_area for case in self.cases),
            "tags": _count_values(
                tag for case in self.cases for tag in case.coverage.tags
            ),
            "outcome": _count_values(
                "passed" if case.evaluation.passed else "blocked"
                for case in self.cases
            ),
        }
|
||||
|
||||
|
||||
def load_evaluation_replay_cases(path: str | Path) -> tuple[EvaluationReplayCase, ...]:
    """Load internal golden replay cases from a JSON file.

    Args:
        path: Location of a UTF-8 JSON fixture whose top level is an array.

    Returns:
        One ``EvaluationReplayCase`` per array item, in file order.

    Raises:
        ValueError: If the top-level JSON value is not an array.
    """

    raw_cases = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(raw_cases, list):
        raise ValueError("Evaluation replay fixture must be a JSON array.")
    return tuple(EvaluationReplayCase.from_payload(item) for item in raw_cases)
|
||||
|
||||
|
||||
def run_evaluation_replay_cases(
    cases: Iterable[EvaluationReplayCase],
) -> EvaluationReplaySuiteResult:
    """Run a set of internal golden evaluation replay cases.

    Each case is replayed in iteration order and the results are collected
    into a single suite result.
    """

    return EvaluationReplaySuiteResult(cases=tuple(case.replay() for case in cases))
|
||||
|
||||
|
||||
def replay_evaluation_golden_cases(path: str | Path) -> EvaluationReplaySuiteResult:
    """Load and run internal golden evaluation replay cases.

    Convenience wrapper: loads the fixture at ``path`` with
    ``load_evaluation_replay_cases`` and replays every case.
    """

    return run_evaluation_replay_cases(load_evaluation_replay_cases(path))
|
||||
|
||||
|
||||
def _story_output_from_payload(payload: dict[str, Any]) -> StoryOutput:
    """Build a ``StoryOutput`` from a fixture payload.

    Missing keys fall back to ``"generated"`` for ``mode`` and to an empty
    string for every other field.
    """

    return StoryOutput(
        mode=payload.get("mode", "generated"),
        title=payload.get("title", ""),
        story_text=payload.get("story_text", ""),
        cover_prompt_suggestion=payload.get("cover_prompt_suggestion", ""),
    )
|
||||
|
||||
|
||||
def _storybook_from_payload(payload: dict[str, Any]) -> Storybook:
    """Build a ``Storybook`` from a fixture payload.

    Missing text fields default to an empty string and missing URLs to
    ``None``. A page without ``page_number`` is numbered by its 1-based
    position in the ``pages`` list.
    """

    pages = [
        StorybookPage(
            page_number=page.get("page_number", index + 1),
            text=page.get("text", ""),
            image_prompt=page.get("image_prompt", ""),
            image_url=page.get("image_url"),
        )
        for index, page in enumerate(payload.get("pages", []))
    ]

    return Storybook(
        title=payload.get("title", ""),
        main_character=payload.get("main_character", ""),
        art_style=payload.get("art_style", ""),
        pages=pages,
        cover_prompt=payload.get("cover_prompt", ""),
        cover_url=payload.get("cover_url"),
    )
|
||||
|
||||
|
||||
def _count_values(values: Iterable[str]) -> dict[str, int]:
    """Count non-empty values, ordered by descending count then by value.

    Falsy values (such as the empty string) are ignored.
    """

    counts = Counter(value for value in values if value)
    return dict(sorted(counts.items(), key=lambda item: (-item[1], item[0])))
|
||||
|
||||
|
||||
def _compare_evaluation(
    case: EvaluationReplayCase,
    evaluation: EvaluationResult,
) -> list[str]:
    """Compare an evaluation with a case's expectations.

    Returns:
        One human-readable message per mismatch, in the order the checks run:
        ``passed``, ``blocking``, score bounds, required dimensions, quality
        gate codes, then expected warnings. Empty when everything matches.
    """

    expected = case.expected
    failures: list[str] = []

    if evaluation.passed != expected.passed:
        failures.append(f"expected passed={expected.passed}, got {evaluation.passed}")

    if evaluation.blocking != expected.blocking:
        failures.append(f"expected blocking={expected.blocking}, got {evaluation.blocking}")

    if (
        expected.min_overall_score is not None
        and evaluation.overall_score < expected.min_overall_score
    ):
        failures.append(
            "expected overall_score >= "
            f"{expected.min_overall_score}, got {evaluation.overall_score}"
        )

    if (
        expected.max_overall_score is not None
        and evaluation.overall_score > expected.max_overall_score
    ):
        failures.append(
            "expected overall_score <= "
            f"{expected.max_overall_score}, got {evaluation.overall_score}"
        )

    actual_dimensions = {score.dimension for score in evaluation.scores}
    missing_dimensions = [
        dimension.value
        for dimension in expected.required_dimensions
        if dimension not in actual_dimensions
    ]
    if missing_dimensions:
        failures.append(f"missing dimensions: {', '.join(missing_dimensions)}")

    # Gate codes are compared as ordered tuples, so both the set of codes and
    # their order must match the expectation exactly.
    if evaluation.gate_error is not None:
        actual_quality_gate_codes = tuple(
            issue.code.value for issue in evaluation.gate_error.issues
        )
    else:
        actual_quality_gate_codes = ()
    if actual_quality_gate_codes != expected.quality_gate_codes:
        failures.append(
            "expected quality_gate_codes="
            f"{list(expected.quality_gate_codes)}, got {list(actual_quality_gate_codes)}"
        )

    for expected_warning in expected.warning_substrings:
        if not any(expected_warning in warning for warning in evaluation.warnings):
            failures.append(f"missing warning containing: {expected_warning}")

    return failures
|
||||
267
backend/app/services/harness/evaluators.py
Normal file
267
backend/app/services/harness/evaluators.py
Normal file
@@ -0,0 +1,267 @@
|
||||
"""Deterministic evaluation helpers for generated child-facing content."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from enum import StrEnum
|
||||
from typing import Any
|
||||
|
||||
from app.services.adapters.storybook.primary import Storybook
|
||||
from app.services.adapters.text.models import StoryOutput
|
||||
from app.services.harness.quality_gates import (
|
||||
QualityGateError,
|
||||
validate_story_output,
|
||||
validate_storybook_output,
|
||||
)
|
||||
|
||||
|
||||
class EvaluationDimension(StrEnum):
|
||||
"""Stable dimensions used by harness evaluations."""
|
||||
|
||||
STRUCTURE = "structure"
|
||||
SAFETY = "safety"
|
||||
AGE_FIT = "age_fit"
|
||||
EDUCATIONAL_VALUE = "educational_value"
|
||||
READABILITY = "readability"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EvaluationScore:
    """One scored evaluation dimension."""

    dimension: EvaluationDimension
    score: float
    reason: str

    def to_metadata(self) -> dict[str, Any]:
        """Return a JSON-safe metadata payload.

        The dimension is emitted as its string value.
        """

        return {
            "dimension": self.dimension.value,
            "score": self.score,
            "reason": self.reason,
        }
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EvaluationResult:
    """Deterministic evaluation result for one generated artifact."""

    overall_score: float
    passed: bool
    blocking: bool
    scores: tuple[EvaluationScore, ...]
    # Set only when the quality gate rejected the artifact.
    gate_error: QualityGateError | None = None
    warnings: tuple[str, ...] = field(default_factory=tuple)

    def to_metadata(self) -> dict[str, Any]:
        """Return a JSON-safe metadata payload.

        The ``quality_gate`` key is present only when ``gate_error`` is set.
        """

        metadata: dict[str, Any] = {
            "overall_score": self.overall_score,
            "passed": self.passed,
            "blocking": self.blocking,
            "scores": [score.to_metadata() for score in self.scores],
            "warnings": list(self.warnings),
        }
        if self.gate_error is not None:
            metadata["quality_gate"] = self.gate_error.to_metadata()
        return metadata
|
||||
|
||||
|
||||
def _clamp_score(value: float) -> float:
    """Round ``value`` to two decimals and clamp it to the range [0.0, 1.0]."""

    return max(0.0, min(1.0, round(value, 2)))
|
||||
|
||||
|
||||
def _story_text_readability_score(story_text: str) -> float:
    """Score text length with a conservative 3-8 age readability heuristic.

    The length is measured after stripping surrounding whitespace:

    * under 30 characters: 0.45
    * over 2500 characters: 0.72
    * over 1800 characters: 0.84
    * otherwise: 0.96
    """

    normalized_length = len(story_text.strip())
    if normalized_length < 30:
        return 0.45
    if normalized_length > 2500:
        return 0.72
    if normalized_length > 1800:
        return 0.84
    return 0.96
|
||||
|
||||
|
||||
def _educational_value_score(story_text: str, education_theme: str | None) -> float:
|
||||
if not education_theme:
|
||||
return 0.82
|
||||
return 0.96 if education_theme.strip() in story_text else 0.88
|
||||
|
||||
|
||||
def _storybook_readability_score(page_texts: list[str]) -> float:
    """Score per-page text length for a storybook.

    Page lengths are measured after stripping surrounding whitespace. Checks
    run in this order and the first match wins:

    * no pages: 0.0
    * any page under 8 characters: 0.62
    * any page over 320 characters: 0.78
    * any page over 220 characters: 0.88
    * otherwise: 0.96
    """

    if not page_texts:
        return 0.0

    page_lengths = [len(text.strip()) for text in page_texts]
    if any(length < 8 for length in page_lengths):
        return 0.62
    if any(length > 320 for length in page_lengths):
        return 0.78
    if any(length > 220 for length in page_lengths):
        return 0.88
    return 0.96
|
||||
|
||||
|
||||
def _storybook_educational_value_score(
|
||||
page_texts: list[str],
|
||||
education_theme: str | None,
|
||||
) -> float:
|
||||
if not education_theme:
|
||||
return 0.82
|
||||
combined_text = " ".join(page_texts)
|
||||
return 0.96 if education_theme.strip() in combined_text else 0.88
|
||||
|
||||
|
||||
def evaluate_story_output(
    output: StoryOutput,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Evaluate a generated text story before persistence.

    Args:
        output: The generated story to evaluate.
        education_theme: Optional theme used for the educational-value score.
        minimum_score: Overall score needed for the result to pass.

    Returns:
        A blocking, zero-score result carrying the gate error when
        ``validate_story_output`` raises ``QualityGateError``. Otherwise a
        five-dimension result whose overall score is the clamped mean of the
        dimension scores; it passes when that score reaches ``minimum_score``
        and is blocking when it does not.
    """

    try:
        validate_story_output(output)
    except QualityGateError as exc:
        scores = (
            EvaluationScore(
                dimension=EvaluationDimension.STRUCTURE,
                score=0.0,
                reason="故事结构未通过质量门。",
            ),
            EvaluationScore(
                dimension=EvaluationDimension.SAFETY,
                score=0.0,
                reason="内容未通过儿童安全或结构完整性检查。",
            ),
        )
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=scores,
            gate_error=exc,
        )

    readability_score = _story_text_readability_score(output.story_text)
    educational_score = _educational_value_score(output.story_text, education_theme)
    warnings: list[str] = []

    if readability_score < 0.8:
        warnings.append("故事正文长度可能不适合 3-8 岁儿童的完整阅读体验。")

    # Age fit and readability both reuse the length-based readability score.
    scores = (
        EvaluationScore(
            dimension=EvaluationDimension.STRUCTURE,
            score=1.0,
            reason="标题、正文和封面提示词完整。",
        ),
        EvaluationScore(
            dimension=EvaluationDimension.SAFETY,
            score=1.0,
            reason="未命中确定性儿童安全风险词。",
        ),
        EvaluationScore(
            dimension=EvaluationDimension.AGE_FIT,
            score=readability_score,
            reason="根据正文长度估算低龄儿童阅读适配度。",
        ),
        EvaluationScore(
            dimension=EvaluationDimension.EDUCATIONAL_VALUE,
            score=educational_score,
            reason="根据教育主题是否清晰融入正文估算。",
        ),
        EvaluationScore(
            dimension=EvaluationDimension.READABILITY,
            score=readability_score,
            reason="根据正文长度估算朗读和亲子共读流畅度。",
        ),
    )
    overall_score = _clamp_score(sum(score.score for score in scores) / len(scores))

    return EvaluationResult(
        overall_score=overall_score,
        passed=overall_score >= minimum_score,
        blocking=overall_score < minimum_score,
        scores=scores,
        warnings=tuple(warnings),
    )
|
||||
|
||||
|
||||
def evaluate_storybook_output(
    output: Storybook,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Score a generated storybook deterministically before it is persisted.

    The storybook is first passed to ``validate_storybook_output``. When that
    raises ``QualityGateError`` the result is a blocking failure: overall
    score 0.0, zeroed structure and safety scores, and the error attached as
    ``gate_error``. Otherwise five dimensions are scored, averaged, clamped
    with ``_clamp_score`` and compared against ``minimum_score``.

    Args:
        output: The storybook to evaluate; its pages' ``text`` is scored.
        education_theme: Optional theme forwarded to the educational-value
            scorer.
        minimum_score: Threshold the overall score is compared against for
            ``passed`` / ``blocking``.

    Returns:
        The evaluation result, including per-dimension scores and warnings.
    """

    try:
        validate_storybook_output(output)
    except QualityGateError as gate_failure:
        # A gate failure short-circuits scoring: only structure and safety
        # are reported, both at zero.
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=(
                EvaluationScore(
                    dimension=EvaluationDimension.STRUCTURE,
                    score=0.0,
                    reason="绘本结构未通过质量门。",
                ),
                EvaluationScore(
                    dimension=EvaluationDimension.SAFETY,
                    score=0.0,
                    reason="绘本内容未通过儿童安全或结构完整性检查。",
                ),
            ),
            gate_error=gate_failure,
        )

    texts = [page.text for page in output.pages]
    readability = _storybook_readability_score(texts)
    educational = _storybook_educational_value_score(texts, education_theme)

    notices: list[str] = []
    if readability < 0.8:
        notices.append("绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。")

    # (dimension, score, reason) rows; AGE_FIT and READABILITY both reuse the
    # readability estimate.
    rows = (
        (EvaluationDimension.STRUCTURE, 1.0, "绘本标题、分页和页码结构完整。"),
        (EvaluationDimension.SAFETY, 1.0, "未命中确定性儿童安全风险词。"),
        (
            EvaluationDimension.AGE_FIT,
            readability,
            "根据每页正文长度估算低龄儿童翻页阅读适配度。",
        ),
        (
            EvaluationDimension.EDUCATIONAL_VALUE,
            educational,
            "根据教育主题是否清晰融入分页正文估算。",
        ),
        (
            EvaluationDimension.READABILITY,
            readability,
            "根据分页正文长度估算亲子共读流畅度。",
        ),
    )
    dimension_scores = tuple(
        EvaluationScore(dimension=dimension, score=value, reason=reason)
        for dimension, value, reason in rows
    )

    total = sum(entry.score for entry in dimension_scores)
    overall = _clamp_score(total / len(dimension_scores))

    return EvaluationResult(
        overall_score=overall,
        passed=overall >= minimum_score,
        blocking=overall < minimum_score,
        scores=dimension_scores,
        warnings=tuple(notices),
    )
|
||||
150
backend/app/services/harness/executor.py
Normal file
150
backend/app/services/harness/executor.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""Small-step workflow executor helpers for generation harness adoption."""
|
||||
|
||||
from collections.abc import Awaitable, Callable
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.services.harness.artifacts import AssetCompletionResult
|
||||
from app.services.harness.plans import WorkflowPlan
|
||||
from app.services.harness.trace import TraceRecorder
|
||||
from app.services.harness.types import ArtifactKind, WorkflowStep
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from app.db.models import GenerationJob
|
||||
|
||||
AssetTask = Callable[[], Awaitable[AssetCompletionResult]]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AssetPlanRunResult:
    """Immutable outcome of running the asset tasks of one workflow plan."""

    # Results returned by the handlers that were actually awaited.
    task_results: tuple[AssetCompletionResult, ...]
    # Keys of plan tasks for which a handler ran.
    executed_task_keys: tuple[str, ...]
    # Keys of plan tasks that no handler was run for.
    ignored_task_keys: tuple[str, ...]

    @property
    def result_assets(self) -> tuple[str, ...]:
        """Return the ``asset`` attribute of every task result, in order."""

        return tuple([completion.asset for completion in self.task_results])

    def to_metadata(self, plan: WorkflowPlan) -> dict[str, Any]:
        """Build the executor coverage metadata dict for ``plan``.

        Combines counts and key lists from this result with the plan's mode
        value and number of planned tasks.
        """

        executed = list(self.executed_task_keys)
        ignored = list(self.ignored_task_keys)
        assets = list(self.result_assets)
        metadata: dict[str, Any] = {
            "plan_mode": plan.mode.value,
            "planned_task_count": len(plan.tasks),
            "executed_task_count": len(executed),
            "ignored_task_count": len(ignored),
            "result_count": len(self.task_results),
            "executed_task_keys": executed,
            "ignored_task_keys": ignored,
            "result_assets": assets,
        }
        return metadata
|
||||
|
||||
|
||||
async def record_workflow_plan(
    db: AsyncSession,
    *,
    job: "GenerationJob | None",
    plan: WorkflowPlan,
) -> None:
    """Record the selected workflow plan as a ``workflow_planned`` trace step.

    The plan's ``to_snapshot()`` output is stored under the ``"plan"``
    metadata key and handed to ``TraceRecorder.record_step``.
    """

    recorder = TraceRecorder(db)
    step_fields: dict[str, Any] = {
        "job": job,
        "event_type": "workflow_planned",
        "status": "succeeded",
        "message": "Workflow plan selected for this generation request.",
        "metadata": {"plan": plan.to_snapshot()},
        "step": WorkflowStep.REQUEST_ACCEPTANCE,
        "artifact": ArtifactKind.NONE,
        "blocks_main_result": True,
    }
    await recorder.record_step(**step_fields)
|
||||
|
||||
|
||||
async def record_evaluation_result(
    db: AsyncSession,
    *,
    job: "GenerationJob | None",
    story_id: int | None = None,
    metadata: dict[str, Any],
    status: str,
    artifact: ArtifactKind | str = ArtifactKind.STORY_TEXT,
) -> None:
    """Record an ``evaluation_completed`` trace step for a tracked job.

    Args:
        db: Session handed to ``TraceRecorder``.
        job: The generation job the step belongs to, if any.
        story_id: Optional story identifier forwarded with the step.
        metadata: Evaluation metadata stored with the step.
        status: Step status; any value other than ``"succeeded"`` marks the
            step as blocking the main result.
        artifact: Artifact the evaluation refers to.
    """

    recorder = TraceRecorder(db)
    is_blocking = status != "succeeded"
    await recorder.record_step(
        job=job,
        story_id=story_id,
        event_type="evaluation_completed",
        status=status,
        message="Generated content evaluation completed.",
        metadata=metadata,
        step=WorkflowStep.EVALUATION,
        artifact=artifact,
        blocks_main_result=is_blocking,
    )
|
||||
|
||||
|
||||
async def record_executor_result(
    db: AsyncSession,
    *,
    job: "GenerationJob | None",
    plan: WorkflowPlan,
    result: AssetPlanRunResult,
) -> None:
    """Record an ``executor_completed`` trace step with coverage metadata.

    The metadata comes from ``result.to_metadata(plan)``. The step is
    recorded as non-blocking, with ``WorkflowStep.UNKNOWN`` and no artifact.
    """

    recorder = TraceRecorder(db)
    coverage = result.to_metadata(plan)
    await recorder.record_step(
        job=job,
        event_type="executor_completed",
        status="succeeded",
        message="Workflow executor completed planned asset tasks.",
        metadata=coverage,
        step=WorkflowStep.UNKNOWN,
        artifact=ArtifactKind.NONE,
        blocks_main_result=False,
    )
|
||||
|
||||
|
||||
async def run_asset_plan(
    plan: WorkflowPlan,
    *,
    image_task: AssetTask | None = None,
    audio_task: AssetTask | None = None,
) -> AssetPlanRunResult:
    """Execute asset-producing tasks in the order declared by a workflow plan.

    Tasks keyed ``complete_image_asset`` and ``complete_audio_asset`` are
    dispatched to ``image_task`` and ``audio_task`` respectively and awaited
    one after another; every other task key is reported as ignored.

    All required handlers are checked before any handler runs, so a plan
    that is missing a handler fails without side effects from earlier tasks.

    Args:
        plan: Workflow plan whose mode value must be ``"asset_generation"``
            or ``"asset_retry"``.
        image_task: Handler awaited for each image task in the plan.
        audio_task: Handler awaited for each audio task in the plan.

    Returns:
        The handler results plus the executed and ignored task keys.

    Raises:
        ValueError: If the plan is not an asset plan, or if it contains an
            image/audio task whose handler was not supplied.
    """

    if plan.mode.value not in {"asset_generation", "asset_retry"}:
        raise ValueError("run_asset_plan only supports asset workflow plans")

    # task key -> (handler, error raised when the handler is missing)
    handlers = {
        "complete_image_asset": (
            image_task,
            "Asset workflow plan requires an image task handler",
        ),
        "complete_audio_asset": (
            audio_task,
            "Asset workflow plan requires an audio task handler",
        ),
    }
    tasks = tuple(plan.tasks)

    # Fail fast: validate every handler up front so a misconfigured plan
    # never leaves half of its assets generated.
    for task in tasks:
        entry = handlers.get(task.key)
        if entry is not None and entry[0] is None:
            raise ValueError(entry[1])

    task_results: list[AssetCompletionResult] = []
    executed_task_keys: list[str] = []
    ignored_task_keys: list[str] = []

    for task in tasks:
        entry = handlers.get(task.key)
        if entry is None:
            ignored_task_keys.append(task.key)
            continue
        handler = entry[0]
        task_results.append(await handler())
        executed_task_keys.append(task.key)

    return AssetPlanRunResult(
        task_results=tuple(task_results),
        executed_task_keys=tuple(executed_task_keys),
        ignored_task_keys=tuple(ignored_task_keys),
    )
|
||||
@@ -0,0 +1,400 @@
|
||||
[
|
||||
{
|
||||
"id": "story-safe-theme-pass",
|
||||
"artifact": "story",
|
||||
"description": "完整、儿童安全且清晰包含教育主题的普通故事。",
|
||||
"coverage": {
|
||||
"age_band": "5-6",
|
||||
"content_shape": "short_story",
|
||||
"risk_area": "happy_path",
|
||||
"tags": ["theme_present", "safe", "story"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小兔子, 月光花园",
|
||||
"education_theme": "复盘"
|
||||
},
|
||||
"output": {
|
||||
"mode": "generated",
|
||||
"title": "小兔子的月光花园",
|
||||
"story_text": "小兔子露露在月光花园里照顾一朵会发光的小花。她先给小花浇水,又邀请朋友一起观察花瓣的变化。晚上睡前,露露和朋友们坐在石凳上复盘今天的努力:下次要先分好小水壶,再轮流照顾花朵。大家都觉得,分享和复盘让花园变得更温暖。",
|
||||
"cover_prompt_suggestion": "A gentle watercolor rabbit in a moonlit garden"
|
||||
},
|
||||
"expected": {
|
||||
"passed": true,
|
||||
"blocking": false,
|
||||
"min_overall_score": 0.9,
|
||||
"required_dimensions": [
|
||||
"structure",
|
||||
"safety",
|
||||
"age_fit",
|
||||
"educational_value",
|
||||
"readability"
|
||||
],
|
||||
"quality_gate_codes": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "story-long-safe-pass",
|
||||
"artifact": "story",
|
||||
"description": "较长但仍适合亲子共读的普通故事。",
|
||||
"coverage": {
|
||||
"age_band": "7-8",
|
||||
"content_shape": "long_story",
|
||||
"risk_area": "length_boundary",
|
||||
"tags": ["theme_present", "long_text", "story"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小海豚, 图书馆",
|
||||
"education_theme": "合作"
|
||||
},
|
||||
"output": {
|
||||
"mode": "generated",
|
||||
"title": "小海豚的蓝色图书馆",
|
||||
"story_text": "小海豚多多住在一片安静的海湾里,那里有一座用贝壳和海草搭成的蓝色图书馆。每天傍晚,多多都会把漂来的故事贝壳整理好,放进不同的篮子。可是这一天,风浪把贝壳吹得到处都是,小章鱼、小海马和小螃蟹都赶来帮忙。大家先一起数贝壳,再按颜色排队,最后把每个故事放回合适的位置。多多发现,合作不是一个人做得最快,而是大家把自己的办法放在一起。夜晚来临时,蓝色图书馆重新亮起柔柔的光,小伙伴们围坐在门口,听多多讲今天学到的合作故事。",
|
||||
"cover_prompt_suggestion": "A gentle dolphin organizing a blue underwater library"
|
||||
},
|
||||
"expected": {
|
||||
"passed": true,
|
||||
"blocking": false,
|
||||
"min_overall_score": 0.9,
|
||||
"required_dimensions": [
|
||||
"structure",
|
||||
"safety",
|
||||
"age_fit",
|
||||
"educational_value",
|
||||
"readability"
|
||||
],
|
||||
"quality_gate_codes": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "story-missing-text-blocks",
|
||||
"artifact": "story",
|
||||
"description": "故事正文缺失会被确定性质量门阻断。",
|
||||
"coverage": {
|
||||
"age_band": "unknown",
|
||||
"content_shape": "empty_story",
|
||||
"risk_area": "schema_error",
|
||||
"tags": ["missing_text", "story", "blocking"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小熊, 星星"
|
||||
},
|
||||
"output": {
|
||||
"mode": "generated",
|
||||
"title": "小熊找星星",
|
||||
"story_text": "",
|
||||
"cover_prompt_suggestion": "A bear looking at friendly stars"
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"max_overall_score": 0.0,
|
||||
"quality_gate_codes": [
|
||||
"missing_story_text"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "story-missing-cover-prompt-blocks",
|
||||
"artifact": "story",
|
||||
"description": "故事正文完整但封面提示词缺失会被结构质量门阻断。",
|
||||
"coverage": {
|
||||
"age_band": "5-6",
|
||||
"content_shape": "short_story",
|
||||
"risk_area": "schema_error",
|
||||
"tags": ["missing_cover_prompt", "story", "blocking"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小松鼠, 风筝",
|
||||
"education_theme": "勇敢"
|
||||
},
|
||||
"output": {
|
||||
"mode": "generated",
|
||||
"title": "小松鼠的风筝",
|
||||
"story_text": "小松鼠第一次放风筝时有点紧张。朋友们陪它一起数一二三,它鼓起勇敢的心,终于让风筝飞上蓝天。",
|
||||
"cover_prompt_suggestion": ""
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"max_overall_score": 0.0,
|
||||
"quality_gate_codes": [
|
||||
"missing_cover_prompt"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "story-unsafe-term-blocks",
|
||||
"artifact": "story",
|
||||
"description": "明显不适合儿童的风险词会被安全质量门阻断。",
|
||||
"coverage": {
|
||||
"age_band": "3-4",
|
||||
"content_shape": "short_story",
|
||||
"risk_area": "safety_error",
|
||||
"tags": ["unsafe_term", "story", "blocking"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小猫, 城堡"
|
||||
},
|
||||
"output": {
|
||||
"mode": "generated",
|
||||
"title": "小猫的城堡",
|
||||
"story_text": "小猫在城堡里看到血腥场景,然后感到很害怕。",
|
||||
"cover_prompt_suggestion": "A cat near a castle"
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"max_overall_score": 0.0,
|
||||
"quality_gate_codes": [
|
||||
"unsafe_child_content"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "story-short-high-threshold-blocks",
|
||||
"artifact": "story",
|
||||
"description": "结构合格但阅读体验偏短的故事在高阈值下会被内部评测阻断。",
|
||||
"coverage": {
|
||||
"age_band": "3-4",
|
||||
"content_shape": "very_short_story",
|
||||
"risk_area": "readability_warning",
|
||||
"tags": ["short_text", "threshold_block", "story"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小鹿, 书签",
|
||||
"education_theme": "耐心",
|
||||
"minimum_score": 0.82
|
||||
},
|
||||
"output": {
|
||||
"mode": "generated",
|
||||
"title": "小鹿的书签",
|
||||
"story_text": "小鹿学会了耐心等待。",
|
||||
"cover_prompt_suggestion": "A deer with a golden bookmark"
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"min_overall_score": 0.7,
|
||||
"max_overall_score": 0.8,
|
||||
"required_dimensions": [
|
||||
"structure",
|
||||
"safety",
|
||||
"readability"
|
||||
],
|
||||
"quality_gate_codes": [],
|
||||
"warning_substrings": [
|
||||
"正文长度"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "storybook-safe-theme-pass",
|
||||
"artifact": "storybook",
|
||||
"description": "完整、儿童安全且包含教育主题的绘本分页输出。",
|
||||
"coverage": {
|
||||
"age_band": "5-6",
|
||||
"content_shape": "storybook_3_pages",
|
||||
"risk_area": "happy_path",
|
||||
"tags": ["theme_present", "safe", "storybook"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小狐狸, 彩虹桥",
|
||||
"education_theme": "合作"
|
||||
},
|
||||
"output": {
|
||||
"title": "彩虹桥上的合作",
|
||||
"main_character": "小狐狸米米",
|
||||
"art_style": "温暖水彩",
|
||||
"cover_prompt": "A warm watercolor fox near a rainbow bridge",
|
||||
"pages": [
|
||||
{
|
||||
"page_number": 1,
|
||||
"text": "小狐狸米米在雨后的森林里发现一座亮晶晶的彩虹桥。",
|
||||
"image_prompt": "A little fox finds a rainbow bridge"
|
||||
},
|
||||
{
|
||||
"page_number": 2,
|
||||
"text": "桥边的小伙伴们一起商量办法,决定合作把落叶清理干净。",
|
||||
"image_prompt": "Forest friends work together"
|
||||
},
|
||||
{
|
||||
"page_number": 3,
|
||||
"text": "大家轮流搬叶子、扶篮子,还互相说谢谢,彩虹桥终于露出笑脸。",
|
||||
"image_prompt": "Friends carrying leaves together"
|
||||
}
|
||||
]
|
||||
},
|
||||
"expected": {
|
||||
"passed": true,
|
||||
"blocking": false,
|
||||
"min_overall_score": 0.9,
|
||||
"required_dimensions": [
|
||||
"structure",
|
||||
"safety",
|
||||
"age_fit",
|
||||
"educational_value",
|
||||
"readability"
|
||||
],
|
||||
"quality_gate_codes": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "storybook-duplicate-page-blocks",
|
||||
"artifact": "storybook",
|
||||
"description": "重复页码的绘本结构会被质量门阻断。",
|
||||
"coverage": {
|
||||
"age_band": "5-6",
|
||||
"content_shape": "storybook_invalid_pages",
|
||||
"risk_area": "schema_error",
|
||||
"tags": ["duplicate_page", "storybook", "blocking"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小熊, 森林"
|
||||
},
|
||||
"output": {
|
||||
"title": "森林里的小熊",
|
||||
"main_character": "小熊布布",
|
||||
"art_style": "水彩",
|
||||
"cover_prompt": "A bear in a forest",
|
||||
"pages": [
|
||||
{
|
||||
"page_number": 1,
|
||||
"text": "布布在森林里找到一颗松果。",
|
||||
"image_prompt": "Bear finds a pinecone"
|
||||
},
|
||||
{
|
||||
"page_number": 1,
|
||||
"text": "布布把松果带给朋友一起观察。",
|
||||
"image_prompt": "Bear shares the pinecone"
|
||||
}
|
||||
]
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"max_overall_score": 0.0,
|
||||
"quality_gate_codes": [
|
||||
"invalid_storybook_page_number"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "storybook-missing-page-blocks",
|
||||
"artifact": "storybook",
|
||||
"description": "没有分页内容的绘本会被结构质量门阻断。",
|
||||
"coverage": {
|
||||
"age_band": "unknown",
|
||||
"content_shape": "storybook_empty_pages",
|
||||
"risk_area": "schema_error",
|
||||
"tags": ["missing_page", "storybook", "blocking"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小鸟, 云朵"
|
||||
},
|
||||
"output": {
|
||||
"title": "小鸟和云朵",
|
||||
"main_character": "小鸟啾啾",
|
||||
"art_style": "柔和水彩",
|
||||
"cover_prompt": "A bird near soft clouds",
|
||||
"pages": []
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"max_overall_score": 0.0,
|
||||
"quality_gate_codes": [
|
||||
"missing_storybook_page"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "storybook-unsafe-term-blocks",
|
||||
"artifact": "storybook",
|
||||
"description": "绘本分页文字包含明显不适龄风险词时会被安全质量门阻断。",
|
||||
"coverage": {
|
||||
"age_band": "3-4",
|
||||
"content_shape": "storybook_2_pages",
|
||||
"risk_area": "safety_error",
|
||||
"tags": ["unsafe_term", "storybook", "blocking"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小兔子, 山洞"
|
||||
},
|
||||
"output": {
|
||||
"title": "山洞里的声音",
|
||||
"main_character": "小兔子米粒",
|
||||
"art_style": "温暖水彩",
|
||||
"cover_prompt": "A rabbit near a cave",
|
||||
"pages": [
|
||||
{
|
||||
"page_number": 1,
|
||||
"text": "米粒走到山洞边,听见奇怪的声音。",
|
||||
"image_prompt": "Rabbit near a cave"
|
||||
},
|
||||
{
|
||||
"page_number": 2,
|
||||
"text": "洞里出现血腥画面,米粒吓得跑开。",
|
||||
"image_prompt": "Rabbit running away"
|
||||
}
|
||||
]
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"max_overall_score": 0.0,
|
||||
"quality_gate_codes": [
|
||||
"unsafe_child_content"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "storybook-short-page-warning",
|
||||
"artifact": "storybook",
|
||||
"description": "分页正文过短时保留内部警告,用于评测回归。",
|
||||
"coverage": {
|
||||
"age_band": "3-4",
|
||||
"content_shape": "storybook_2_pages",
|
||||
"risk_area": "readability_warning",
|
||||
"tags": ["short_page_text", "threshold_block", "storybook"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小羊, 风铃",
|
||||
"minimum_score": 0.85
|
||||
},
|
||||
"output": {
|
||||
"title": "风铃响了",
|
||||
"main_character": "小羊团团",
|
||||
"art_style": "柔和蜡笔",
|
||||
"cover_prompt": "A lamb listening to a wind chime",
|
||||
"pages": [
|
||||
{
|
||||
"page_number": 1,
|
||||
"text": "风响。",
|
||||
"image_prompt": "Wind chime rings"
|
||||
},
|
||||
{
|
||||
"page_number": 2,
|
||||
"text": "团团笑。",
|
||||
"image_prompt": "Lamb smiles"
|
||||
}
|
||||
]
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"min_overall_score": 0.8,
|
||||
"max_overall_score": 0.82,
|
||||
"required_dimensions": [
|
||||
"structure",
|
||||
"safety",
|
||||
"readability"
|
||||
],
|
||||
"quality_gate_codes": [],
|
||||
"warning_substrings": [
|
||||
"分页正文长度"
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
@@ -69,6 +69,11 @@ def build_story_plan(*, generate_images: bool) -> WorkflowPlan:
|
||||
step=WorkflowStep.NARRATIVE_GENERATION,
|
||||
artifact=ArtifactKind.STORY_TEXT,
|
||||
),
|
||||
WorkflowTask(
|
||||
key="evaluate_narrative",
|
||||
step=WorkflowStep.EVALUATION,
|
||||
artifact=ArtifactKind.STORY_TEXT,
|
||||
),
|
||||
WorkflowTask(
|
||||
key="persist_story",
|
||||
step=WorkflowStep.STORY_PERSISTENCE,
|
||||
@@ -124,6 +129,11 @@ def build_storybook_plan(*, generate_images: bool) -> WorkflowPlan:
|
||||
step=WorkflowStep.NARRATIVE_GENERATION,
|
||||
artifact=ArtifactKind.STORYBOOK_PAGES,
|
||||
),
|
||||
WorkflowTask(
|
||||
key="evaluate_storybook_pages",
|
||||
step=WorkflowStep.EVALUATION,
|
||||
artifact=ArtifactKind.STORYBOOK_PAGES,
|
||||
),
|
||||
]
|
||||
|
||||
if generate_images:
|
||||
|
||||
@@ -11,6 +11,7 @@ class WorkflowStep(StrEnum):
|
||||
WORKER_START = "worker_start"
|
||||
CONTEXT_PREPARATION = "context_preparation"
|
||||
NARRATIVE_GENERATION = "narrative_generation"
|
||||
EVALUATION = "evaluation"
|
||||
STORY_PERSISTENCE = "story_persistence"
|
||||
PROVIDER_INVOCATION = "provider_invocation"
|
||||
IMAGE_GENERATION = "image_generation"
|
||||
@@ -64,6 +65,8 @@ class StepStatus(StrEnum):
|
||||
|
||||
EVENT_STEP_MAP: dict[str, WorkflowStep] = {
|
||||
"request_accepted": WorkflowStep.REQUEST_ACCEPTANCE,
|
||||
"workflow_planned": WorkflowStep.REQUEST_ACCEPTANCE,
|
||||
"executor_completed": WorkflowStep.UNKNOWN,
|
||||
"retry_queued": WorkflowStep.REQUEST_ACCEPTANCE,
|
||||
"worker_started": WorkflowStep.WORKER_START,
|
||||
"context_prepared": WorkflowStep.CONTEXT_PREPARATION,
|
||||
@@ -73,6 +76,7 @@ EVENT_STEP_MAP: dict[str, WorkflowStep] = {
|
||||
"provider_call_succeeded": WorkflowStep.PROVIDER_INVOCATION,
|
||||
"provider_call_failed": WorkflowStep.PROVIDER_INVOCATION,
|
||||
"quality_gate_failed": WorkflowStep.NARRATIVE_GENERATION,
|
||||
"evaluation_completed": WorkflowStep.EVALUATION,
|
||||
"cover_image_started": WorkflowStep.IMAGE_GENERATION,
|
||||
"cover_image_succeeded": WorkflowStep.IMAGE_GENERATION,
|
||||
"cover_image_failed": WorkflowStep.IMAGE_GENERATION,
|
||||
@@ -100,6 +104,7 @@ EVENT_STEP_MAP: dict[str, WorkflowStep] = {
|
||||
EVENT_ARTIFACT_MAP: dict[str, ArtifactKind] = {
|
||||
"narrative_generated": ArtifactKind.STORY_TEXT,
|
||||
"quality_gate_failed": ArtifactKind.STORY_TEXT,
|
||||
"evaluation_completed": ArtifactKind.STORY_TEXT,
|
||||
"cover_image_started": ArtifactKind.COVER_IMAGE,
|
||||
"cover_image_succeeded": ArtifactKind.COVER_IMAGE,
|
||||
"cover_image_failed": ArtifactKind.COVER_IMAGE,
|
||||
|
||||
@@ -36,8 +36,8 @@ from app.services.generation_jobs import (
|
||||
ensure_no_active_story_generation_job,
|
||||
finish_generation_job,
|
||||
generation_job_can_retry,
|
||||
generation_job_to_summary,
|
||||
get_generation_job_for_user,
|
||||
public_generation_job_to_summary,
|
||||
record_generation_event,
|
||||
)
|
||||
from app.services.harness.artifacts import (
|
||||
@@ -57,12 +57,27 @@ from app.services.harness.control import (
|
||||
ExecutionControl,
|
||||
GenerationJobCanceledError,
|
||||
)
|
||||
from app.services.harness.evaluators import (
|
||||
EvaluationResult,
|
||||
evaluate_story_output,
|
||||
evaluate_storybook_output,
|
||||
)
|
||||
from app.services.harness.executor import (
|
||||
record_evaluation_result,
|
||||
record_executor_result,
|
||||
record_workflow_plan,
|
||||
run_asset_plan,
|
||||
)
|
||||
from app.services.harness.plans import (
|
||||
build_asset_plan,
|
||||
build_story_plan,
|
||||
build_storybook_plan,
|
||||
)
|
||||
from app.services.harness.quality_gates import (
|
||||
QualityGateError,
|
||||
validate_story_output,
|
||||
validate_storybook_output,
|
||||
)
|
||||
from app.services.harness.trace import TraceRecorder
|
||||
from app.services.harness.types import ArtifactKind
|
||||
from app.services.memory_service import build_enhanced_memory_context
|
||||
from app.services.provider_router import (
|
||||
generate_image,
|
||||
@@ -129,6 +144,24 @@ async def _record_quality_gate_failure_if_present(
|
||||
)
|
||||
|
||||
|
||||
async def _record_evaluation_result_if_present(
    db: AsyncSession,
    *,
    job,
    evaluation: EvaluationResult,
    artifact: ArtifactKind | str = ArtifactKind.STORY_TEXT,
) -> None:
    """Forward an evaluation outcome to ``record_evaluation_result``.

    The evaluation's ``to_metadata()`` output is recorded with status
    ``"succeeded"`` when ``evaluation.passed`` is truthy and ``"failed"``
    otherwise.
    """

    details = evaluation.to_metadata()
    if evaluation.passed:
        outcome = "succeeded"
    else:
        outcome = "failed"

    await record_evaluation_result(
        db,
        job=job,
        metadata=details,
        status=outcome,
        artifact=artifact,
    )
|
||||
|
||||
|
||||
def _asset_result_metadata(result: AssetCompletionResult) -> dict:
|
||||
"""Build JSON-safe metadata for asset workflow events."""
|
||||
|
||||
@@ -643,18 +676,33 @@ async def generate_and_save_story(
|
||||
user_id=user_id,
|
||||
generation_job=job,
|
||||
)
|
||||
validate_story_output(result)
|
||||
except QualityGateError as exc:
|
||||
await _record_quality_gate_failure_if_present(db, job=job, error=exc)
|
||||
raise HTTPException(
|
||||
status_code=502,
|
||||
detail="Story generation failed quality checks, please try again.",
|
||||
) from exc
|
||||
except Exception as exc:
|
||||
raise HTTPException(
|
||||
status_code=502,
|
||||
detail="Story generation failed, please try again.",
|
||||
) from exc
|
||||
|
||||
evaluation = evaluate_story_output(
|
||||
result,
|
||||
education_theme=request.education_theme,
|
||||
)
|
||||
if evaluation.gate_error is not None:
|
||||
await _record_quality_gate_failure_if_present(
|
||||
db,
|
||||
job=job,
|
||||
error=evaluation.gate_error,
|
||||
)
|
||||
await _record_evaluation_result_if_present(
|
||||
db,
|
||||
job=job,
|
||||
evaluation=evaluation,
|
||||
)
|
||||
if evaluation.blocking:
|
||||
raise HTTPException(
|
||||
status_code=502,
|
||||
detail="Story generation failed quality checks, please try again.",
|
||||
)
|
||||
|
||||
await _record_job_event_if_present(
|
||||
db,
|
||||
job=job,
|
||||
@@ -758,13 +806,32 @@ async def generate_storybook_service(
|
||||
user_id=user_id,
|
||||
generation_job=job,
|
||||
)
|
||||
validate_storybook_output(storybook)
|
||||
except QualityGateError as exc:
|
||||
await _record_quality_gate_failure_if_present(db, job=job, error=exc)
|
||||
raise HTTPException(status_code=500, detail=f"故事书质量检查失败: {exc}") from exc
|
||||
except Exception as e:
|
||||
logger.error("storybook_generation_failed", error=str(e))
|
||||
raise HTTPException(status_code=500, detail=f"故事书生成失败: {e}")
|
||||
|
||||
evaluation = evaluate_storybook_output(
|
||||
storybook,
|
||||
education_theme=request.education_theme,
|
||||
)
|
||||
if evaluation.gate_error is not None:
|
||||
await _record_quality_gate_failure_if_present(
|
||||
db,
|
||||
job=job,
|
||||
error=evaluation.gate_error,
|
||||
)
|
||||
await _record_evaluation_result_if_present(
|
||||
db,
|
||||
job=job,
|
||||
evaluation=evaluation,
|
||||
artifact=ArtifactKind.STORYBOOK_PAGES,
|
||||
)
|
||||
if evaluation.blocking:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"故事书质量检查失败: {evaluation.gate_error or 'evaluation blocked'}",
|
||||
)
|
||||
|
||||
await _record_job_event_if_present(
|
||||
db,
|
||||
job=job,
|
||||
@@ -1025,28 +1092,50 @@ async def _generate_asset_generation_service_with_job(
|
||||
if not requested_assets:
|
||||
raise HTTPException(status_code=400, detail="资源任务缺少 assets。")
|
||||
|
||||
plan = build_asset_plan(
|
||||
output_mode="asset_generation",
|
||||
assets=requested_assets,
|
||||
)
|
||||
await record_workflow_plan(
|
||||
db,
|
||||
job=job,
|
||||
plan=plan,
|
||||
)
|
||||
|
||||
story = await get_story_detail(int(story_id), job.user_id, db)
|
||||
|
||||
if "image" in requested_assets:
|
||||
async def complete_image() -> AssetCompletionResult:
|
||||
if story.mode == "storybook":
|
||||
await _complete_storybook_image_assets(story, db, job=job)
|
||||
else:
|
||||
await _complete_cover_image_asset(
|
||||
story,
|
||||
db,
|
||||
raise_on_failure=True,
|
||||
log_event="cover_generation_failed",
|
||||
job=job,
|
||||
)
|
||||
return await _complete_storybook_image_assets(story, db, job=job)
|
||||
|
||||
if "audio" in requested_assets:
|
||||
await _complete_audio_asset(
|
||||
return await _complete_cover_image_asset(
|
||||
story,
|
||||
db,
|
||||
raise_on_failure=True,
|
||||
log_event="cover_generation_failed",
|
||||
job=job,
|
||||
)
|
||||
|
||||
async def complete_audio() -> AssetCompletionResult:
|
||||
return await _complete_audio_asset(
|
||||
story,
|
||||
db,
|
||||
raise_on_failure=True,
|
||||
job=job,
|
||||
)
|
||||
|
||||
asset_plan_result = await run_asset_plan(
|
||||
plan,
|
||||
image_task=complete_image if "image" in requested_assets else None,
|
||||
audio_task=complete_audio if "audio" in requested_assets else None,
|
||||
)
|
||||
await record_executor_result(
|
||||
db,
|
||||
job=job,
|
||||
plan=plan,
|
||||
result=asset_plan_result,
|
||||
)
|
||||
|
||||
story = await get_story_detail(story.id, job.user_id, db)
|
||||
await finish_generation_job(
|
||||
db,
|
||||
@@ -1096,7 +1185,7 @@ async def retry_generation_job_service(
|
||||
)
|
||||
await _dispatch_generation_job(db, job=retry_job)
|
||||
await db.refresh(retry_job)
|
||||
return generation_job_to_summary(retry_job)
|
||||
return public_generation_job_to_summary(retry_job)
|
||||
|
||||
|
||||
async def _generate_generation_service_with_job(
|
||||
@@ -1109,6 +1198,11 @@ async def _generate_generation_service_with_job(
|
||||
"""Run the unified generation workflow after the tracking job has been created."""
|
||||
|
||||
if request.output_mode == "storybook":
|
||||
await record_workflow_plan(
|
||||
db,
|
||||
job=job,
|
||||
plan=build_storybook_plan(generate_images=request.generate_images),
|
||||
)
|
||||
storybook = await generate_storybook_service(
|
||||
StorybookRequest(
|
||||
keywords=request.data,
|
||||
@@ -1155,6 +1249,9 @@ async def _generate_generation_service_with_job(
|
||||
retryable_assets=saved_story.retryable_assets,
|
||||
)
|
||||
|
||||
if request.output_mode == "story" and not request.generate_images:
|
||||
return await _execute_story_without_assets_plan(request, user_id, db, job=job)
|
||||
|
||||
generate_request = GenerateRequest(
|
||||
type=request.type,
|
||||
data=request.data,
|
||||
@@ -1164,6 +1261,11 @@ async def _generate_generation_service_with_job(
|
||||
)
|
||||
|
||||
if request.generate_images:
|
||||
await record_workflow_plan(
|
||||
db,
|
||||
job=job,
|
||||
plan=build_story_plan(generate_images=True),
|
||||
)
|
||||
story = await generate_full_story_service(generate_request, user_id, db, job=job)
|
||||
saved_story = await get_story_detail(story.id, user_id, db)
|
||||
await _record_postprocessing_event_if_needed(db, job=job, story=saved_story)
|
||||
@@ -1222,6 +1324,54 @@ async def _generate_generation_service_with_job(
|
||||
universe_id=story.universe_id,
|
||||
retryable_assets=story.retryable_assets,
|
||||
)
|
||||
|
||||
|
||||
async def _execute_story_without_assets_plan(
    request: GenerationRequest,
    user_id: str,
    db: AsyncSession,
    *,
    job,
) -> GenerationResponse:
    """Run the text-only story workflow under an explicitly recorded plan.

    Steps, in order: build and record the image-less story plan, generate
    and save the story, record the post-processing event, finish the
    generation job, and map the saved story onto a ``GenerationResponse``.
    """

    text_only_plan = build_story_plan(generate_images=False)
    await record_workflow_plan(db, job=job, plan=text_only_plan)

    story_request = GenerateRequest(
        type=request.type,
        data=request.data,
        education_theme=request.education_theme,
        child_profile_id=request.child_profile_id,
        universe_id=request.universe_id,
    )
    saved = await generate_and_save_story(story_request, user_id, db, job=job)
    await _record_postprocessing_event_if_needed(db, job=job, story=saved)

    await finish_generation_job(
        db,
        job=job,
        story=saved,
        current_step="generation_completed",
        message="Story generation completed with a persisted readable narrative.",
    )

    # The story has a single image field, exposed as both image and cover URL.
    response = GenerationResponse(
        id=saved.id,
        generation_job_id=job.id,
        title=saved.title,
        mode=saved.mode,
        story_text=saved.story_text,
        cover_prompt=saved.cover_prompt,
        image_url=saved.image_url,
        cover_url=saved.image_url,
        generation_status=saved.generation_status,
        text_status=saved.text_status,
        image_status=saved.image_status,
        audio_status=saved.audio_status,
        last_error=saved.last_error,
        child_profile_id=saved.child_profile_id,
        universe_id=saved.universe_id,
        retryable_assets=saved.retryable_assets,
    )
    return response
|
||||
|
||||
|
||||
async def list_stories(
|
||||
@@ -1321,36 +1471,7 @@ async def queue_story_asset_generation(
|
||||
)
|
||||
await _dispatch_generation_job(db, job=job)
|
||||
await db.refresh(job)
|
||||
return generation_job_to_summary(job)
|
||||
|
||||
|
||||
async def _retry_cover_image_asset(story: Story, db: AsyncSession, *, job=None) -> None:
    """Re-run cover image completion for a text story with retry settings.

    Delegates to ``_complete_cover_image_asset`` using the retry-specific
    error prefix and log event name; the delegate's return value is dropped.
    """

    retry_options = {
        "last_error_prefix": "封面生成失败",
        "log_event": "cover_asset_retry_failed",
        "job": job,
    }
    await _complete_cover_image_asset(story, db, **retry_options)
|
||||
|
||||
|
||||
async def _retry_storybook_image_assets(
    story: Story,
    db: AsyncSession,
    *,
    job=None,
) -> None:
    """Re-run storybook image completion for ``story``.

    Thin wrapper over ``_complete_storybook_image_assets``; the delegate's
    return value is dropped.
    """

    completion = _complete_storybook_image_assets(story, db, job=job)
    await completion
|
||||
|
||||
|
||||
async def _retry_audio_asset(story: Story, db: AsyncSession, *, job=None) -> None:
|
||||
"""Retry audio generation while preserving persisted status on provider failure."""
|
||||
|
||||
await _complete_audio_asset(story, db, raise_on_failure=False, job=job)
|
||||
return public_generation_job_to_summary(job)
|
||||
|
||||
|
||||
async def retry_story_assets(
|
||||
@@ -1374,6 +1495,15 @@ async def retry_story_assets(
|
||||
|
||||
try:
|
||||
story = await get_story_detail(story_id, user_id, db)
|
||||
plan = build_asset_plan(
|
||||
output_mode="asset_retry",
|
||||
assets=requested_assets,
|
||||
)
|
||||
await record_workflow_plan(
|
||||
db,
|
||||
job=job,
|
||||
plan=plan,
|
||||
)
|
||||
await record_generation_event(
|
||||
db,
|
||||
job=job,
|
||||
@@ -1384,14 +1514,37 @@ async def retry_story_assets(
|
||||
metadata={"assets": requested_assets},
|
||||
)
|
||||
|
||||
if "image" in requested_assets:
|
||||
async def retry_image() -> AssetCompletionResult:
|
||||
if story.mode == "storybook":
|
||||
await _retry_storybook_image_assets(story, db, job=job)
|
||||
else:
|
||||
await _retry_cover_image_asset(story, db, job=job)
|
||||
return await _complete_storybook_image_assets(story, db, job=job)
|
||||
|
||||
if "audio" in requested_assets:
|
||||
await _retry_audio_asset(story, db, job=job)
|
||||
return await _complete_cover_image_asset(
|
||||
story,
|
||||
db,
|
||||
last_error_prefix="封面生成失败",
|
||||
log_event="cover_asset_retry_failed",
|
||||
job=job,
|
||||
)
|
||||
|
||||
async def retry_audio() -> AssetCompletionResult:
|
||||
return await _complete_audio_asset(
|
||||
story,
|
||||
db,
|
||||
raise_on_failure=False,
|
||||
job=job,
|
||||
)
|
||||
|
||||
asset_plan_result = await run_asset_plan(
|
||||
plan,
|
||||
image_task=retry_image if "image" in requested_assets else None,
|
||||
audio_task=retry_audio if "audio" in requested_assets else None,
|
||||
)
|
||||
await record_executor_result(
|
||||
db,
|
||||
job=job,
|
||||
plan=plan,
|
||||
result=asset_plan_result,
|
||||
)
|
||||
|
||||
story = await get_story_detail(story_id, user_id, db)
|
||||
await finish_generation_job(
|
||||
@@ -1448,13 +1601,29 @@ async def generate_story_cover(
|
||||
|
||||
try:
|
||||
story = await get_story_detail(story_id, user_id, db)
|
||||
image_result = await _complete_cover_image_asset(
|
||||
story,
|
||||
plan = build_asset_plan(output_mode="asset_generation", assets=["image"])
|
||||
await record_workflow_plan(
|
||||
db,
|
||||
raise_on_failure=True,
|
||||
log_event="cover_generation_failed",
|
||||
job=job,
|
||||
plan=plan,
|
||||
)
|
||||
asset_result = await run_asset_plan(
|
||||
plan,
|
||||
image_task=lambda: _complete_cover_image_asset(
|
||||
story,
|
||||
db,
|
||||
raise_on_failure=True,
|
||||
log_event="cover_generation_failed",
|
||||
job=job,
|
||||
),
|
||||
)
|
||||
await record_executor_result(
|
||||
db,
|
||||
job=job,
|
||||
plan=plan,
|
||||
result=asset_result,
|
||||
)
|
||||
image_result = asset_result.task_results[0] if asset_result.task_results else None
|
||||
story = await get_story_detail(story_id, user_id, db)
|
||||
await finish_generation_job(
|
||||
db,
|
||||
@@ -1464,7 +1633,11 @@ async def generate_story_cover(
|
||||
message="Cover image generation completed.",
|
||||
metadata={"assets": ["image"]},
|
||||
)
|
||||
if image_result.succeeded and isinstance(image_result.value, str):
|
||||
if (
|
||||
image_result is not None
|
||||
and image_result.succeeded
|
||||
and isinstance(image_result.value, str)
|
||||
):
|
||||
return image_result.value
|
||||
except HTTPException as exc:
|
||||
await finish_generation_job(
|
||||
@@ -1501,12 +1674,28 @@ async def generate_story_audio(
|
||||
|
||||
try:
|
||||
story = await get_story_detail(story_id, user_id, db)
|
||||
audio_result = await _complete_audio_asset(
|
||||
story,
|
||||
plan = build_asset_plan(output_mode="asset_generation", assets=["audio"])
|
||||
await record_workflow_plan(
|
||||
db,
|
||||
raise_on_failure=True,
|
||||
job=job,
|
||||
plan=plan,
|
||||
)
|
||||
asset_result = await run_asset_plan(
|
||||
plan,
|
||||
audio_task=lambda: _complete_audio_asset(
|
||||
story,
|
||||
db,
|
||||
raise_on_failure=True,
|
||||
job=job,
|
||||
),
|
||||
)
|
||||
await record_executor_result(
|
||||
db,
|
||||
job=job,
|
||||
plan=plan,
|
||||
result=asset_result,
|
||||
)
|
||||
audio_result = asset_result.task_results[0] if asset_result.task_results else None
|
||||
story = await get_story_detail(story_id, user_id, db)
|
||||
await finish_generation_job(
|
||||
db,
|
||||
@@ -1516,7 +1705,11 @@ async def generate_story_audio(
|
||||
message="Story audio generation completed.",
|
||||
metadata={"assets": ["audio"]},
|
||||
)
|
||||
if audio_result.succeeded and isinstance(audio_result.value, bytes):
|
||||
if (
|
||||
audio_result is not None
|
||||
and audio_result.succeeded
|
||||
and isinstance(audio_result.value, bytes)
|
||||
):
|
||||
return audio_result.value
|
||||
except HTTPException as exc:
|
||||
await finish_generation_job(
|
||||
|
||||
400
backend/tests/fixtures/evaluation_golden_cases.json
vendored
Normal file
400
backend/tests/fixtures/evaluation_golden_cases.json
vendored
Normal file
@@ -0,0 +1,400 @@
|
||||
[
|
||||
{
|
||||
"id": "story-safe-theme-pass",
|
||||
"artifact": "story",
|
||||
"description": "完整、儿童安全且清晰包含教育主题的普通故事。",
|
||||
"coverage": {
|
||||
"age_band": "5-6",
|
||||
"content_shape": "short_story",
|
||||
"risk_area": "happy_path",
|
||||
"tags": ["theme_present", "safe", "story"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小兔子, 月光花园",
|
||||
"education_theme": "复盘"
|
||||
},
|
||||
"output": {
|
||||
"mode": "generated",
|
||||
"title": "小兔子的月光花园",
|
||||
"story_text": "小兔子露露在月光花园里照顾一朵会发光的小花。她先给小花浇水,又邀请朋友一起观察花瓣的变化。晚上睡前,露露和朋友们坐在石凳上复盘今天的努力:下次要先分好小水壶,再轮流照顾花朵。大家都觉得,分享和复盘让花园变得更温暖。",
|
||||
"cover_prompt_suggestion": "A gentle watercolor rabbit in a moonlit garden"
|
||||
},
|
||||
"expected": {
|
||||
"passed": true,
|
||||
"blocking": false,
|
||||
"min_overall_score": 0.9,
|
||||
"required_dimensions": [
|
||||
"structure",
|
||||
"safety",
|
||||
"age_fit",
|
||||
"educational_value",
|
||||
"readability"
|
||||
],
|
||||
"quality_gate_codes": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "story-long-safe-pass",
|
||||
"artifact": "story",
|
||||
"description": "较长但仍适合亲子共读的普通故事。",
|
||||
"coverage": {
|
||||
"age_band": "7-8",
|
||||
"content_shape": "long_story",
|
||||
"risk_area": "length_boundary",
|
||||
"tags": ["theme_present", "long_text", "story"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小海豚, 图书馆",
|
||||
"education_theme": "合作"
|
||||
},
|
||||
"output": {
|
||||
"mode": "generated",
|
||||
"title": "小海豚的蓝色图书馆",
|
||||
"story_text": "小海豚多多住在一片安静的海湾里,那里有一座用贝壳和海草搭成的蓝色图书馆。每天傍晚,多多都会把漂来的故事贝壳整理好,放进不同的篮子。可是这一天,风浪把贝壳吹得到处都是,小章鱼、小海马和小螃蟹都赶来帮忙。大家先一起数贝壳,再按颜色排队,最后把每个故事放回合适的位置。多多发现,合作不是一个人做得最快,而是大家把自己的办法放在一起。夜晚来临时,蓝色图书馆重新亮起柔柔的光,小伙伴们围坐在门口,听多多讲今天学到的合作故事。",
|
||||
"cover_prompt_suggestion": "A gentle dolphin organizing a blue underwater library"
|
||||
},
|
||||
"expected": {
|
||||
"passed": true,
|
||||
"blocking": false,
|
||||
"min_overall_score": 0.9,
|
||||
"required_dimensions": [
|
||||
"structure",
|
||||
"safety",
|
||||
"age_fit",
|
||||
"educational_value",
|
||||
"readability"
|
||||
],
|
||||
"quality_gate_codes": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "story-missing-text-blocks",
|
||||
"artifact": "story",
|
||||
"description": "故事正文缺失会被确定性质量门阻断。",
|
||||
"coverage": {
|
||||
"age_band": "unknown",
|
||||
"content_shape": "empty_story",
|
||||
"risk_area": "schema_error",
|
||||
"tags": ["missing_text", "story", "blocking"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小熊, 星星"
|
||||
},
|
||||
"output": {
|
||||
"mode": "generated",
|
||||
"title": "小熊找星星",
|
||||
"story_text": "",
|
||||
"cover_prompt_suggestion": "A bear looking at friendly stars"
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"max_overall_score": 0.0,
|
||||
"quality_gate_codes": [
|
||||
"missing_story_text"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "story-missing-cover-prompt-blocks",
|
||||
"artifact": "story",
|
||||
"description": "故事正文完整但封面提示词缺失会被结构质量门阻断。",
|
||||
"coverage": {
|
||||
"age_band": "5-6",
|
||||
"content_shape": "short_story",
|
||||
"risk_area": "schema_error",
|
||||
"tags": ["missing_cover_prompt", "story", "blocking"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小松鼠, 风筝",
|
||||
"education_theme": "勇敢"
|
||||
},
|
||||
"output": {
|
||||
"mode": "generated",
|
||||
"title": "小松鼠的风筝",
|
||||
"story_text": "小松鼠第一次放风筝时有点紧张。朋友们陪它一起数一二三,它鼓起勇敢的心,终于让风筝飞上蓝天。",
|
||||
"cover_prompt_suggestion": ""
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"max_overall_score": 0.0,
|
||||
"quality_gate_codes": [
|
||||
"missing_cover_prompt"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "story-unsafe-term-blocks",
|
||||
"artifact": "story",
|
||||
"description": "明显不适合儿童的风险词会被安全质量门阻断。",
|
||||
"coverage": {
|
||||
"age_band": "3-4",
|
||||
"content_shape": "short_story",
|
||||
"risk_area": "safety_error",
|
||||
"tags": ["unsafe_term", "story", "blocking"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小猫, 城堡"
|
||||
},
|
||||
"output": {
|
||||
"mode": "generated",
|
||||
"title": "小猫的城堡",
|
||||
"story_text": "小猫在城堡里看到血腥场景,然后感到很害怕。",
|
||||
"cover_prompt_suggestion": "A cat near a castle"
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"max_overall_score": 0.0,
|
||||
"quality_gate_codes": [
|
||||
"unsafe_child_content"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "story-short-high-threshold-blocks",
|
||||
"artifact": "story",
|
||||
"description": "结构合格但阅读体验偏短的故事在高阈值下会被内部评测阻断。",
|
||||
"coverage": {
|
||||
"age_band": "3-4",
|
||||
"content_shape": "very_short_story",
|
||||
"risk_area": "readability_warning",
|
||||
"tags": ["short_text", "threshold_block", "story"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小鹿, 书签",
|
||||
"education_theme": "耐心",
|
||||
"minimum_score": 0.82
|
||||
},
|
||||
"output": {
|
||||
"mode": "generated",
|
||||
"title": "小鹿的书签",
|
||||
"story_text": "小鹿学会了耐心等待。",
|
||||
"cover_prompt_suggestion": "A deer with a golden bookmark"
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"min_overall_score": 0.7,
|
||||
"max_overall_score": 0.8,
|
||||
"required_dimensions": [
|
||||
"structure",
|
||||
"safety",
|
||||
"readability"
|
||||
],
|
||||
"quality_gate_codes": [],
|
||||
"warning_substrings": [
|
||||
"正文长度"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "storybook-safe-theme-pass",
|
||||
"artifact": "storybook",
|
||||
"description": "完整、儿童安全且包含教育主题的绘本分页输出。",
|
||||
"coverage": {
|
||||
"age_band": "5-6",
|
||||
"content_shape": "storybook_3_pages",
|
||||
"risk_area": "happy_path",
|
||||
"tags": ["theme_present", "safe", "storybook"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小狐狸, 彩虹桥",
|
||||
"education_theme": "合作"
|
||||
},
|
||||
"output": {
|
||||
"title": "彩虹桥上的合作",
|
||||
"main_character": "小狐狸米米",
|
||||
"art_style": "温暖水彩",
|
||||
"cover_prompt": "A warm watercolor fox near a rainbow bridge",
|
||||
"pages": [
|
||||
{
|
||||
"page_number": 1,
|
||||
"text": "小狐狸米米在雨后的森林里发现一座亮晶晶的彩虹桥。",
|
||||
"image_prompt": "A little fox finds a rainbow bridge"
|
||||
},
|
||||
{
|
||||
"page_number": 2,
|
||||
"text": "桥边的小伙伴们一起商量办法,决定合作把落叶清理干净。",
|
||||
"image_prompt": "Forest friends work together"
|
||||
},
|
||||
{
|
||||
"page_number": 3,
|
||||
"text": "大家轮流搬叶子、扶篮子,还互相说谢谢,彩虹桥终于露出笑脸。",
|
||||
"image_prompt": "Friends carrying leaves together"
|
||||
}
|
||||
]
|
||||
},
|
||||
"expected": {
|
||||
"passed": true,
|
||||
"blocking": false,
|
||||
"min_overall_score": 0.9,
|
||||
"required_dimensions": [
|
||||
"structure",
|
||||
"safety",
|
||||
"age_fit",
|
||||
"educational_value",
|
||||
"readability"
|
||||
],
|
||||
"quality_gate_codes": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "storybook-duplicate-page-blocks",
|
||||
"artifact": "storybook",
|
||||
"description": "重复页码的绘本结构会被质量门阻断。",
|
||||
"coverage": {
|
||||
"age_band": "5-6",
|
||||
"content_shape": "storybook_invalid_pages",
|
||||
"risk_area": "schema_error",
|
||||
"tags": ["duplicate_page", "storybook", "blocking"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小熊, 森林"
|
||||
},
|
||||
"output": {
|
||||
"title": "森林里的小熊",
|
||||
"main_character": "小熊布布",
|
||||
"art_style": "水彩",
|
||||
"cover_prompt": "A bear in a forest",
|
||||
"pages": [
|
||||
{
|
||||
"page_number": 1,
|
||||
"text": "布布在森林里找到一颗松果。",
|
||||
"image_prompt": "Bear finds a pinecone"
|
||||
},
|
||||
{
|
||||
"page_number": 1,
|
||||
"text": "布布把松果带给朋友一起观察。",
|
||||
"image_prompt": "Bear shares the pinecone"
|
||||
}
|
||||
]
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"max_overall_score": 0.0,
|
||||
"quality_gate_codes": [
|
||||
"invalid_storybook_page_number"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "storybook-missing-page-blocks",
|
||||
"artifact": "storybook",
|
||||
"description": "没有分页内容的绘本会被结构质量门阻断。",
|
||||
"coverage": {
|
||||
"age_band": "unknown",
|
||||
"content_shape": "storybook_empty_pages",
|
||||
"risk_area": "schema_error",
|
||||
"tags": ["missing_page", "storybook", "blocking"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小鸟, 云朵"
|
||||
},
|
||||
"output": {
|
||||
"title": "小鸟和云朵",
|
||||
"main_character": "小鸟啾啾",
|
||||
"art_style": "柔和水彩",
|
||||
"cover_prompt": "A bird near soft clouds",
|
||||
"pages": []
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"max_overall_score": 0.0,
|
||||
"quality_gate_codes": [
|
||||
"missing_storybook_page"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "storybook-unsafe-term-blocks",
|
||||
"artifact": "storybook",
|
||||
"description": "绘本分页文字包含明显不适龄风险词时会被安全质量门阻断。",
|
||||
"coverage": {
|
||||
"age_band": "3-4",
|
||||
"content_shape": "storybook_2_pages",
|
||||
"risk_area": "safety_error",
|
||||
"tags": ["unsafe_term", "storybook", "blocking"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小兔子, 山洞"
|
||||
},
|
||||
"output": {
|
||||
"title": "山洞里的声音",
|
||||
"main_character": "小兔子米粒",
|
||||
"art_style": "温暖水彩",
|
||||
"cover_prompt": "A rabbit near a cave",
|
||||
"pages": [
|
||||
{
|
||||
"page_number": 1,
|
||||
"text": "米粒走到山洞边,听见奇怪的声音。",
|
||||
"image_prompt": "Rabbit near a cave"
|
||||
},
|
||||
{
|
||||
"page_number": 2,
|
||||
"text": "洞里出现血腥画面,米粒吓得跑开。",
|
||||
"image_prompt": "Rabbit running away"
|
||||
}
|
||||
]
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"max_overall_score": 0.0,
|
||||
"quality_gate_codes": [
|
||||
"unsafe_child_content"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "storybook-short-page-warning",
|
||||
"artifact": "storybook",
|
||||
"description": "分页正文过短时保留内部警告,并在高阈值下被内部评测阻断,用于评测回归。",
|
||||
"coverage": {
|
||||
"age_band": "3-4",
|
||||
"content_shape": "storybook_2_pages",
|
||||
"risk_area": "readability_warning",
|
||||
"tags": ["short_page_text", "threshold_block", "storybook"]
|
||||
},
|
||||
"input": {
|
||||
"keywords": "小羊, 风铃",
|
||||
"minimum_score": 0.85
|
||||
},
|
||||
"output": {
|
||||
"title": "风铃响了",
|
||||
"main_character": "小羊团团",
|
||||
"art_style": "柔和蜡笔",
|
||||
"cover_prompt": "A lamb listening to a wind chime",
|
||||
"pages": [
|
||||
{
|
||||
"page_number": 1,
|
||||
"text": "风响。",
|
||||
"image_prompt": "Wind chime rings"
|
||||
},
|
||||
{
|
||||
"page_number": 2,
|
||||
"text": "团团笑。",
|
||||
"image_prompt": "Lamb smiles"
|
||||
}
|
||||
]
|
||||
},
|
||||
"expected": {
|
||||
"passed": false,
|
||||
"blocking": true,
|
||||
"min_overall_score": 0.8,
|
||||
"max_overall_score": 0.82,
|
||||
"required_dimensions": [
|
||||
"structure",
|
||||
"safety",
|
||||
"readability"
|
||||
],
|
||||
"quality_gate_codes": [],
|
||||
"warning_substrings": [
|
||||
"分页正文长度"
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
610
backend/tests/harness-evaluation-test-cases.md
Normal file
610
backend/tests/harness-evaluation-test-cases.md
Normal file
@@ -0,0 +1,610 @@
|
||||
# Test Cases: Harness Evaluation Driven Generation
|
||||
|
||||
## Overview
|
||||
|
||||
- **Feature**: Harness evaluation driven generation
|
||||
- **Requirements Source**: `docs/technical/harness-engineering-modernization.md`
|
||||
- **Test Coverage**: evaluation scoring, blocking quality failures, workflow plan events, trace aggregation, state transitions, internal golden replay, admin-only analytics, admin-only executor coverage summary, admin-only harness readiness
|
||||
- **Last Updated**: 2026-06-23
|
||||
|
||||
## Test Case Categories
|
||||
|
||||
### 1. Functional Tests
|
||||
|
||||
#### TC-F-001: 普通故事无图片生成写入评测事件
|
||||
|
||||
- **Requirement**: H7-3, H7-4
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 用户已登录。
|
||||
- 文本 provider 返回完整、儿童安全的故事。
|
||||
- **Test Steps**:
|
||||
1. 调用 `POST /api/generations`,设置 `output_mode=story`、`generate_images=false`。
|
||||
2. 执行 worker 任务。
|
||||
3. 查询 job detail。
|
||||
- **Expected Results**:
|
||||
- job 状态为 `completed`。
|
||||
- event 顺序包含 `workflow_planned`。
|
||||
- event 顺序包含 `evaluation_completed`。
|
||||
- `evaluation_completed.event_metadata.passed=true`。
|
||||
- `evaluation_completed.event_metadata.overall_score >= 0.7`。
|
||||
- **Postconditions**: 故事已持久化,`story_id` 写入 job。
|
||||
|
||||
#### TC-F-003: 用户 Trace summary 不返回评测摘要
|
||||
|
||||
- **Requirement**: H7-4, H7B-1
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 故事已有 `evaluation_completed` job event。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /api/generations/{story_id}/trace-summary`。
|
||||
2. 检查响应字段。
|
||||
- **Expected Results**:
|
||||
- 响应不包含 `evaluation` 字段。
|
||||
- `by_step` 不包含 `evaluation`。
|
||||
- `by_artifact` 不因 `evaluation_completed` 增加 `story_text` 计数。
|
||||
- `failed_events` 不统计 `evaluation_completed`。
|
||||
- `total_events` 不统计 `evaluation_completed`,避免通过事件数量泄露内部评测步骤。
|
||||
- **Postconditions**: 无数据修改。
|
||||
|
||||
#### TC-F-004: 用户 Job detail 不返回评测事件
|
||||
|
||||
- **Requirement**: H7-4, H7B-2
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- job 已记录 `evaluation_completed` 事件。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /api/generations/jobs/{job_id}`。
|
||||
2. 检查 `events` 列表。
|
||||
- **Expected Results**:
|
||||
- `events` 不包含 `evaluation_completed`。
|
||||
- 响应不包含评测分数、维度分数、通过率或阻断阈值。
|
||||
- **Postconditions**: 内部数据库事件不被删除。
|
||||
|
||||
#### TC-F-002: 完整故事输出获得通过评分
|
||||
|
||||
- **Requirement**: H7-1
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 构造完整 `StoryOutput`。
|
||||
- **Test Steps**:
|
||||
1. 调用 `evaluate_story_output`。
|
||||
2. 读取 `EvaluationResult`。
|
||||
- **Expected Results**:
|
||||
- `passed=true`。
|
||||
- `blocking=false`。
|
||||
- scores 包含 `structure`、`safety`、`age_fit`、`educational_value`、`readability`。
|
||||
- **Postconditions**: 无持久化副作用。
|
||||
|
||||
#### TC-F-005: 完整绘本输出获得通过评分
|
||||
|
||||
- **Requirement**: H7-1, H7C-1
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 构造完整 `Storybook`。
|
||||
- **Test Steps**:
|
||||
1. 调用 `evaluate_storybook_output`。
|
||||
2. 读取 `EvaluationResult`。
|
||||
- **Expected Results**:
|
||||
- `passed=true`。
|
||||
- `blocking=false`。
|
||||
- scores 包含 `structure`、`safety`、`age_fit`、`educational_value`、`readability`。
|
||||
- **Postconditions**: 无持久化副作用。
|
||||
|
||||
#### TC-F-006: 内部 golden cases 可回放且全部符合预期
|
||||
|
||||
- **Requirement**: H7-7, H7-8
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- `backend/app/services/harness/fixtures/evaluation_golden_cases.json` 存在。
|
||||
- fixture 只由后端测试、内部工具或 admin-only readiness 读取。
|
||||
- **Test Steps**:
|
||||
1. 调用 `replay_evaluation_golden_cases`。
|
||||
2. 读取 `EvaluationReplaySuiteResult`。
|
||||
- **Expected Results**:
|
||||
- `passed=true`。
|
||||
- `failed_case_ids` 为空。
|
||||
- 普通故事和绘本样本都被覆盖。
|
||||
- 样本覆盖完整普通故事、较长普通故事、空正文、缺失封面提示词、安全风险词、短文本阈值阻断、绘本重复页码、绘本缺页、绘本安全风险和绘本短分页。
|
||||
- 结果不通过任何用户端 API 返回。
|
||||
- **Postconditions**: 无持久化副作用。
|
||||
|
||||
#### TC-F-007: 内部 golden replay 覆盖摘要稳定
|
||||
|
||||
- **Requirement**: H7-8
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- golden replay suite 已执行。
|
||||
- **Test Steps**:
|
||||
1. 调用 `coverage_summary`。
|
||||
2. 检查 artifact、age_band、risk_area、tags 和 outcome 分布。
|
||||
- **Expected Results**:
|
||||
- artifact 覆盖 `story=6`、`storybook=5`。
|
||||
- age_band 覆盖 `3-4`、`5-6`、`7-8` 和 `unknown`。
|
||||
- risk_area 覆盖 `happy_path`、`schema_error`、`safety_error`、`readability_warning`、`length_boundary`。
|
||||
- outcome 覆盖 `passed=3`、`blocked=8`。
|
||||
- 覆盖摘要不通过任何用户端 API 返回。
|
||||
- **Postconditions**: 无持久化副作用。
|
||||
|
||||
### 2. Edge Case Tests
|
||||
|
||||
#### TC-E-001: 很短故事通过结构但产生低龄阅读体验警告
|
||||
|
||||
- **Requirement**: H7-1
|
||||
- **Priority**: Medium
|
||||
- **Preconditions**:
|
||||
- 构造标题、正文、封面提示词完整但正文很短的 `StoryOutput`。
|
||||
- **Test Steps**:
|
||||
1. 调用 `evaluate_story_output`。
|
||||
2. 读取 warnings 和维度分数。
|
||||
- **Expected Results**:
|
||||
- 不触发质量门异常。
|
||||
- `age_fit` 或 `readability` 分数低于完整故事。
|
||||
- warnings 包含阅读体验提示。
|
||||
- **Postconditions**: 无持久化副作用。
|
||||
|
||||
#### TC-E-002: 内部 golden replay 能报告预期不匹配
|
||||
|
||||
- **Requirement**: H7-7
|
||||
- **Priority**: Medium
|
||||
- **Preconditions**:
|
||||
- 构造一个实际得分低于期望阈值的 `EvaluationReplayCase`。
|
||||
- **Test Steps**:
|
||||
1. 调用 `run_evaluation_replay_cases`。
|
||||
2. 读取 `failure_report`。
|
||||
- **Expected Results**:
|
||||
- `passed=false`。
|
||||
- `failed_case_ids` 包含该 case id。
|
||||
- `failure_report` 包含 `overall_score` 差异。
|
||||
- **Postconditions**: 无持久化副作用。
|
||||
|
||||
### 3. Error Handling Tests
|
||||
|
||||
#### TC-ERR-001: 空正文阻断持久化
|
||||
|
||||
- **Requirement**: H7-4
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 文本 provider 返回空 `story_text`。
|
||||
- **Test Steps**:
|
||||
1. 执行 worker 任务。
|
||||
2. 查询 job 和 story 表。
|
||||
3. 查询 job events。
|
||||
- **Expected Results**:
|
||||
- job 状态为 `failed`。
|
||||
- 没有 story 被持久化。
|
||||
- events 包含 `quality_gate_failed`。
|
||||
- events 包含 `evaluation_completed`。
|
||||
- `evaluation_completed.event_metadata.blocking=true`。
|
||||
- **Postconditions**: 用户可重试该 job。
|
||||
|
||||
#### TC-ERR-002: 不适龄风险词阻断生成
|
||||
|
||||
- **Requirement**: H7-1
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 构造包含明显不适龄风险词的 `StoryOutput`。
|
||||
- **Test Steps**:
|
||||
1. 调用 `evaluate_story_output`。
|
||||
2. 读取 `quality_gate` metadata。
|
||||
- **Expected Results**:
|
||||
- `passed=false`。
|
||||
- `blocking=true`。
|
||||
- `quality_gate.issues[0].failure_category=safety_error`。
|
||||
- **Postconditions**: 无持久化副作用。
|
||||
|
||||
#### TC-ERR-003: 绘本结构错误阻断生成
|
||||
|
||||
- **Requirement**: H7-1, H7C-1
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 构造页码重复或页面缺失的 `Storybook`。
|
||||
- **Test Steps**:
|
||||
1. 调用 `evaluate_storybook_output`。
|
||||
2. 读取 `quality_gate` metadata。
|
||||
- **Expected Results**:
|
||||
- `passed=false`。
|
||||
- `blocking=true`。
|
||||
- 页码重复时 `quality_gate.issues[0].code=invalid_storybook_page_number`;页面缺失时为对应的结构错误码(如 `missing_storybook_page`)。
|
||||
- **Postconditions**: 无持久化副作用。
|
||||
|
||||
### 4. State Transition Tests
|
||||
|
||||
#### TC-ST-001: 普通故事无图片路径事件顺序稳定
|
||||
|
||||
- **Requirement**: H7-3
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- job 初始状态为 `running/request_accepted`。
|
||||
- **Test Steps**:
|
||||
1. 执行 worker 任务。
|
||||
2. 按 id 查询 events。
|
||||
- **Expected Results**:
|
||||
- event 顺序为 `request_accepted`、`worker_started`、`workflow_planned`、`context_prepared`、`evaluation_completed`、`narrative_generated`、`story_saved`、`generation_completed`。
|
||||
- **Postconditions**: job `current_step=generation_completed`。
|
||||
|
||||
#### TC-ST-002: 普通故事带图片路径记录可恢复资产计划
|
||||
|
||||
- **Requirement**: H9-1, H9-3
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- job 初始状态为 `running/request_accepted`。
|
||||
- 请求设置 `output_mode=story`、`generate_images=true`。
|
||||
- 文本 provider 返回合格故事,图片 provider 返回封面 URL。
|
||||
- **Test Steps**:
|
||||
1. 执行 worker 任务。
|
||||
2. 按 id 查询内部 events。
|
||||
3. 读取 `workflow_planned.event_metadata.plan`。
|
||||
- **Expected Results**:
|
||||
- event 顺序为 `request_accepted`、`worker_started`、`workflow_planned`、`context_prepared`、`evaluation_completed`、`narrative_generated`、`story_saved`、`cover_image_started`、`cover_image_succeeded`、`generation_completed`。
|
||||
- `plan.mode=story_with_assets`。
|
||||
- plan tasks 包含 `evaluate_narrative`。
|
||||
- plan tasks 包含 `generate_cover_image`。
|
||||
- `generate_cover_image.required=false`。
|
||||
- `generate_cover_image.recoverable=true`。
|
||||
- **Postconditions**: job `current_step=generation_completed`,故事 `image_status=ready`。
|
||||
|
||||
#### TC-ST-003: 绘本路径记录绘本计划快照
|
||||
|
||||
- **Requirement**: H9-2, H9-3
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- job 初始状态为 `running/request_accepted`。
|
||||
- 请求设置 `output_mode=storybook`。
|
||||
- **Test Steps**:
|
||||
1. 执行 worker 任务。
|
||||
2. 按 id 查询内部 events。
|
||||
3. 读取 `workflow_planned.event_metadata.plan`。
|
||||
- **Expected Results**:
|
||||
- event 顺序包含 `workflow_planned`,且位于 `worker_started` 和 `context_prepared` 之间。
|
||||
- `plan.mode=storybook`。
|
||||
- plan tasks 包含 `generate_storybook_pages`。
|
||||
- plan tasks 包含 `evaluate_storybook_pages`。
|
||||
- 当 `generate_images=true` 时,plan tasks 包含 `generate_storybook_images`。
|
||||
- `generate_storybook_images.required=false`。
|
||||
- `generate_storybook_images.recoverable=true`。
|
||||
- **Postconditions**: job `current_step=generation_completed`。
|
||||
|
||||
#### TC-ST-004: 绘本生成内部记录评测但用户事件脱敏
|
||||
|
||||
- **Requirement**: H7C-1, H7B-2, H9-4
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 绘本生成 job 已执行完成。
|
||||
- **Test Steps**:
|
||||
1. 直接查询内部 `generation_job_events`。
|
||||
2. 调用 `GET /api/generations/jobs/{job_id}`。
|
||||
- **Expected Results**:
|
||||
- 内部事件包含 `evaluation_completed`。
|
||||
- 内部 `evaluation_completed.event_metadata.artifact=storybook_pages`。
|
||||
- 用户 API events 不包含 `evaluation_completed`。
|
||||
- 用户 API 响应不包含 `overall_score`、维度分数、阈值或 golden replay 字段。
|
||||
- **Postconditions**: job 完成,绘本已持久化。
|
||||
|
||||
#### TC-ST-005: 资产生成和重试路径记录资产计划快照
|
||||
|
||||
- **Requirement**: H10-1, H10-2, H10-3
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 故事已有可生成或可重试的图片/音频资源。
|
||||
- **Test Steps**:
|
||||
1. 执行 `asset_generation` worker 任务。
|
||||
2. 调用 `/api/generations/{story_id}/retry-assets`。
|
||||
3. 按 id 查询内部 events。
|
||||
- **Expected Results**:
|
||||
- `asset_generation` 事件顺序包含 `workflow_planned`。
|
||||
- `asset_generation` 的 `plan.mode=asset_generation`。
|
||||
- `asset_retry` 事件顺序包含 `workflow_planned`。
|
||||
- `asset_retry` 的 `plan.mode=asset_retry`。
|
||||
- 图片和音频任务在 plan 中为 `required=false`、`recoverable=true`。
|
||||
- **Postconditions**: 资源状态按原有语义更新。
|
||||
|
||||
#### TC-ST-006: 用户事件 metadata 使用白名单脱敏
|
||||
|
||||
- **Requirement**: H10-4, H10-5
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 内部 job events 包含原始 `plan.tasks`、`result_snapshot`、内部阈值或内部错误详情。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /api/generations/jobs/{job_id}`。
|
||||
2. 检查 `events[*].event_metadata`。
|
||||
- **Expected Results**:
|
||||
- 用户响应保留 `step`、`artifact`、`asset`、`assets`、`failure_category` 等可解释字段。
|
||||
- `workflow_planned` 只返回 `plan_mode`、`planned_task_count`、`recoverable_task_count`。
|
||||
- 用户响应不包含原始 `plan`、`tasks`、`result_snapshot`、内部阈值、内部错误原文。
|
||||
- 用户响应仍不包含 `evaluation_completed`、`overall_score`、维度分数或 golden replay 字段。
|
||||
- **Postconditions**: 内部数据库事件不被修改。
|
||||
|
||||
#### TC-ST-007: 用户 request payload 使用白名单脱敏
|
||||
|
||||
- **Requirement**: H11-1, H11-4
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 生成 job 的 `request_payload` 同时包含用户输入、公开控制字段、内部调度 token、Provider override 和评测策略。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /api/generations/jobs/{job_id}`。
|
||||
2. 检查响应中的 `request_payload`。
|
||||
- **Expected Results**:
|
||||
- 用户响应只保留 `output_mode`、`input_type`、`type`、`story_id`、`assets`、`page_count`、`generate_images` 等安全控制字段。
|
||||
- 用户响应不包含原始 `data`、`education_theme`、内部调度 token、Provider override 或 evaluation policy。
|
||||
- 内部数据库中的完整 request payload 不被修改。
|
||||
- **Postconditions**: 用户端仍可根据公开字段展示任务进度和可用操作。
|
||||
|
||||
#### TC-ST-008: 资产 plan runner 按 WorkflowPlan 顺序执行任务
|
||||
|
||||
- **Requirement**: H12-1, H12-5
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 构造 `asset_generation` 或 `asset_retry` plan,包含图片和音频 task。
|
||||
- **Test Steps**:
|
||||
1. 调用 `run_asset_plan(...)`。
|
||||
2. 记录 image/audio handler 的调用顺序。
|
||||
3. 检查 runner 返回的 executed/ignored task keys。
|
||||
- **Expected Results**:
|
||||
- 图片和音频 handler 按 plan 中 `WorkflowTask` 顺序执行。
|
||||
- `start_asset_*` 和 `complete_asset_*` 这类非资产生产 task 被记录为 ignored,不触发 provider handler。
|
||||
- 未知非资产 task 默认 ignored,不影响已知资产 task。
|
||||
- **Postconditions**: 无数据库修改。
|
||||
|
||||
#### TC-ST-009: 后台资产生成由 plan runner 执行组合资产
|
||||
|
||||
- **Requirement**: H12-2, H12-5
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 已持久化故事同时具备可生成图片和音频的输入。
|
||||
- 创建 `asset_generation` job,`assets=["audio", "image"]`。
|
||||
- **Test Steps**:
|
||||
1. 调用 worker 执行该 job。
|
||||
2. 查询 job events 和 story 状态。
|
||||
- **Expected Results**:
|
||||
- event stream 在 `workflow_planned` 之后依次出现音频生成事件和图片生成事件。
|
||||
- plan tasks 顺序包含 `complete_audio_asset`、`complete_image_asset`。
|
||||
- story 的 `audio_status` 与 `image_status` 均为 `ready`。
|
||||
- 用户 API 仍只暴露 coarse plan metadata,不返回原始 `plan.tasks`。
|
||||
- **Postconditions**: job 完成,资源状态与原有语义一致。
|
||||
|
||||
#### TC-ST-010: 用户侧过滤 executor coverage 内部事件
|
||||
|
||||
- **Requirement**: H13-4, H13-5
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 生成 job 包含内部 `executor_completed` 事件。
|
||||
- `executor_completed.event_metadata` 包含 task keys 和 result assets。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /api/generations/jobs/{job_id}`。
|
||||
2. 调用 `GET /api/generations/{story_id}/jobs`。
|
||||
3. 调用 `GET /api/generations/{story_id}/trace-summary`。
|
||||
- **Expected Results**:
|
||||
- 用户 job detail 不包含 `executor_completed`。
|
||||
- 用户 job detail 不包含 `executed_task_keys`、`ignored_task_keys` 或具体 task key。
|
||||
- 当 job 当前步骤短暂停留在 `executor_completed` 时,用户 summary 显示为安全公开的 `workflow_planned` 进度。
|
||||
- 用户 trace summary 不包含 `executor_completed` 或具体 task key。
|
||||
- 用户 trace summary 的 `total_events` 不统计内部 `executor_completed`。
|
||||
- **Postconditions**: 内部数据库事件不被修改。
|
||||
|
||||
### 5. Admin-Only Analytics Tests
|
||||
|
||||
#### TC-ADM-001: 管理端评测 analytics 聚合内部评测事件
|
||||
|
||||
- **Requirement**: H8-1, H8-2
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 数据库存在多个用户的 `evaluation_completed` 事件。
|
||||
- 请求通过 admin guard。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /admin/evaluations/analytics`。
|
||||
2. 检查聚合结果。
|
||||
- **Expected Results**:
|
||||
- 返回通过数、阻断数、通过率和平均分。
|
||||
- 返回 artifact、output mode、score band、dimension score、quality gate issue、failure category 和 warning 聚合。
|
||||
- 不返回故事正文、prompt、单条 evaluation event 或评分 reason。
|
||||
- **Postconditions**: 无数据修改。
|
||||
|
||||
#### TC-ADM-002: 管理端评测 analytics 支持过滤
|
||||
|
||||
- **Requirement**: H8-3
|
||||
- **Priority**: Medium
|
||||
- **Preconditions**:
|
||||
- 数据库存在新旧评测事件以及不同 artifact。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /admin/evaluations/analytics?days=7`。
|
||||
2. 调用 `GET /admin/evaluations/analytics?artifact=story_text`。
|
||||
3. 使用非法的 `artifact` 参数值调用 `GET /admin/evaluations/analytics`。
|
||||
- **Expected Results**:
|
||||
- `days` 过滤只统计窗口内事件。
|
||||
- `artifact` 过滤只统计对应 artifact。
|
||||
- 非法 artifact 返回 `422`。
|
||||
- **Postconditions**: 无数据修改。
|
||||
|
||||
#### TC-ADM-003: 管理端评测 analytics 需要 admin 鉴权
|
||||
|
||||
- **Requirement**: H8-2
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 未提供 admin Basic Auth。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /admin/evaluations/analytics`。
|
||||
- **Expected Results**:
|
||||
- 返回 `401`。
|
||||
- 不返回任何评测统计。
|
||||
- **Postconditions**: 无数据修改。
|
||||
|
||||
#### TC-ADM-004: 管理端完整生成 trace 返回内部事件流
|
||||
|
||||
- **Requirement**: H11-2, H11-3, H11-4
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 数据库存在包含 `workflow_planned` 与 `evaluation_completed` 的生成 job。
|
||||
- 请求通过 admin guard。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /admin/generations/jobs/{job_id}/trace`。
|
||||
2. 检查 request payload 与 event stream。
|
||||
- **Expected Results**:
|
||||
- 返回完整 request payload,包括原始用户输入和内部调度字段。
|
||||
- 返回完整 `workflow_planned.event_metadata.plan.tasks`。
|
||||
- 返回 `evaluation_completed` 事件及其内部评分 metadata。
|
||||
- 响应包含 `user_id`,便于管理控制面审计。
|
||||
- **Postconditions**: 无数据修改。
|
||||
|
||||
#### TC-ADM-005: 管理端完整生成 trace 需要 admin 鉴权
|
||||
|
||||
- **Requirement**: H11-3
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 未提供 admin Basic Auth。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /admin/generations/jobs/{job_id}/trace`。
|
||||
- **Expected Results**:
|
||||
- 返回 `401`。
|
||||
- 不返回 request payload 或内部 event metadata。
|
||||
- **Postconditions**: 无数据修改。
|
||||
|
||||
#### TC-ADM-006: 管理端 executor coverage 聚合内部执行事件
|
||||
|
||||
- **Requirement**: H13-1, H13-2, H13-3, H13-5
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 数据库存在多个 `executor_completed` 事件。
|
||||
- 请求通过 admin guard。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /admin/executors/coverage`。
|
||||
2. 调用 `GET /admin/executors/coverage?plan_mode=asset_retry`。
|
||||
3. 使用非法 plan mode 调用 `GET /admin/executors/coverage`(例如 `plan_mode=story`)。
|
||||
- **Expected Results**:
|
||||
- 返回 total runs、planned/executed/ignored task counts 和 coverage ratio。
|
||||
- 返回 plan mode、output mode、executed task keys、ignored task keys 和 result assets 聚合。
|
||||
- `plan_mode` 过滤只统计对应 executor run。
|
||||
- 非法 plan mode 返回 `422`。
|
||||
- **Postconditions**: 无数据修改。
|
||||
|
||||
#### TC-ADM-007: 管理端 executor coverage 需要 admin 鉴权
|
||||
|
||||
- **Requirement**: H13-3
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 未提供 admin Basic Auth。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /admin/executors/coverage`。
|
||||
- **Expected Results**:
|
||||
- 返回 `401`。
|
||||
- 不返回 executor task keys 或 coverage metadata。
|
||||
- **Postconditions**: 无数据修改。
|
||||
|
||||
#### TC-ADM-008: 管理端完整生成 trace 返回单 job executor coverage 摘要
|
||||
|
||||
- **Requirement**: H14-1, H14-2, H14-4
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 数据库存在包含 `executor_completed` 事件的生成 job。
|
||||
- 请求通过 admin guard。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /admin/generations/jobs/{job_id}/trace`。
|
||||
2. 检查 `executor_coverage`。
|
||||
- **Expected Results**:
|
||||
- 响应包含 `executor_coverage.scope=admin_internal_job_executor_coverage`。
|
||||
- `executor_coverage` 只统计当前 job 的 runs、planned/executed/ignored task counts 和 coverage ratio。
|
||||
- `executor_coverage.executed_task_keys`、`ignored_task_keys` 和 `result_assets` 与当前 job 的内部 executor event 一致。
|
||||
- 完整 event stream 仍保留 `executor_completed`,便于 admin 调试。
|
||||
- **Postconditions**: 无数据修改。
|
||||
|
||||
#### TC-ADM-009: 管理端 harness readiness 聚合内部质量门
|
||||
|
||||
- **Requirement**: H15-1, H15-2, H15-3, H15-4
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- app 内部 harness fixture 存在 golden replay cases。
|
||||
- 数据库存在至少一条通过的 `evaluation_completed` 事件。
|
||||
- 数据库存在至少一条 `executor_completed` 事件。
|
||||
- 请求通过 admin guard。
|
||||
- **Test Steps**:
|
||||
1. 调用 `GET /admin/harness/readiness`。
|
||||
2. 检查 readiness status、checks 和聚合摘要。
|
||||
- **Expected Results**:
|
||||
- `status=ready`。
|
||||
- checks 包含 `golden_replay`、`runtime_evaluation_samples`、`runtime_evaluation_quality`、`executor_coverage_samples` 和 `executor_coverage_ratio`。
|
||||
- golden replay 显示全部通过。
|
||||
- evaluation analytics 与 executor coverage 只以聚合形式返回。
|
||||
- 响应不包含故事标题、正文、prompt、score reason 或 quality gate message。
|
||||
- **Postconditions**: 无数据修改。
|
||||
|
||||
#### TC-ADM-010: 管理端 harness readiness 阻断低质量运行样本并需要 admin 鉴权
|
||||
|
||||
- **Requirement**: H15-2, H15-3, H15-4, H15-5
|
||||
- **Priority**: High
|
||||
- **Preconditions**:
|
||||
- 数据库存在低质量或 blocking 的 `evaluation_completed` 事件。
|
||||
- executor coverage 运行样本缺失或不足。
|
||||
- **Test Steps**:
|
||||
1. 通过 admin guard 调用 `GET /admin/harness/readiness`。
|
||||
2. 未提供 admin Basic Auth 调用同一路径。
|
||||
- **Expected Results**:
|
||||
- 有 admin 权限时返回 `status=blocked`。
|
||||
- `runtime_evaluation_quality.status=blocked`。
|
||||
- executor 样本缺失时对应 check 为 `needs_attention`。
|
||||
- 无 admin 权限时返回 `401`。
|
||||
- 响应不包含 quality gate message 或单条事件明细。
|
||||
- **Postconditions**: 无数据修改。
|
||||
|
||||
## Test Coverage Matrix
|
||||
|
||||
| Requirement ID | Test Cases | Coverage Status |
|
||||
| --- | --- | --- |
|
||||
| H7-1 | TC-F-002, TC-F-005, TC-E-001, TC-ERR-002, TC-ERR-003 | Complete |
|
||||
| H7-2 | TC-F-001, TC-ST-001 | Complete |
|
||||
| H7-3 | TC-F-001, TC-ST-001 | Complete |
|
||||
| H7-4 | TC-F-003, TC-ERR-001 | Complete |
|
||||
| H7-5 | This document | Complete |
|
||||
| H7-7 | TC-F-006, TC-E-002 | Complete |
|
||||
| H7-8 | TC-F-006, TC-F-007 | Complete |
|
||||
| H7B-1 | TC-F-003 | Complete |
|
||||
| H7B-2 | TC-F-004 | Complete |
|
||||
| H7C-1 | TC-F-005, TC-ERR-003, TC-ST-002 | Complete |
|
||||
| H8-1 | TC-ADM-001 | Complete |
|
||||
| H8-2 | TC-ADM-001, TC-ADM-003 | Complete |
|
||||
| H8-3 | TC-ADM-002 | Complete |
|
||||
| H8-4 | TC-F-003, TC-F-004, TC-ADM-001 | Complete |
|
||||
| H9-1 | TC-ST-002 | Complete |
|
||||
| H9-2 | TC-ST-003 | Complete |
|
||||
| H9-3 | TC-ST-001, TC-ST-002, TC-ST-003 | Complete |
|
||||
| H9-4 | TC-F-003, TC-F-004, TC-ST-004 | Complete |
|
||||
| H10-1 | TC-ST-005 | Complete |
|
||||
| H10-2 | TC-ST-005 | Complete |
|
||||
| H10-3 | TC-ST-005 | Complete |
|
||||
| H10-4 | TC-ST-006 | Complete |
|
||||
| H10-5 | TC-ST-005, TC-ST-006 | Complete |
|
||||
| H11-1 | TC-ST-007 | Complete |
|
||||
| H11-2 | TC-ADM-004 | Complete |
|
||||
| H11-3 | TC-ADM-004, TC-ADM-005 | Complete |
|
||||
| H11-4 | TC-ST-007, TC-ADM-004, TC-ADM-005 | Complete |
|
||||
| H11-5 | This document, `docs/planning/harness-stage-11-report.md` | Complete |
|
||||
| H12-1 | TC-ST-008 | Complete |
|
||||
| H12-2 | TC-ST-009 | Complete |
|
||||
| H12-3 | TC-ST-005, TC-ST-008 | Complete |
|
||||
| H12-4 | TC-ST-005, backend story endpoint regression tests | Complete |
|
||||
| H12-5 | TC-ST-008, TC-ST-009 | Complete |
|
||||
| H13-1 | TC-ADM-006 | Complete |
|
||||
| H13-2 | TC-ST-009, TC-ADM-006 | Complete |
|
||||
| H13-3 | TC-ADM-006, TC-ADM-007 | Complete |
|
||||
| H13-4 | TC-ST-010 | Complete |
|
||||
| H13-5 | TC-ST-010, TC-ADM-006, TC-ADM-007 | Complete |
|
||||
| H14-1 | TC-ADM-006, TC-ADM-008 | Complete |
|
||||
| H14-2 | TC-ADM-008 | Complete |
|
||||
| H14-3 | TC-ST-010 | Complete |
|
||||
| H14-4 | TC-ST-010, TC-ADM-008 | Complete |
|
||||
| H14-5 | This document, `docs/planning/harness-stage-14-report.md` | Complete |
|
||||
| H15-1 | TC-F-006, TC-ADM-009 | Complete |
|
||||
| H15-2 | TC-ADM-009, TC-ADM-010 | Complete |
|
||||
| H15-3 | TC-ADM-009, TC-ADM-010 | Complete |
|
||||
| H15-4 | TC-ADM-009, TC-ADM-010 | Complete |
|
||||
| H15-5 | This document, `docs/planning/harness-stage-15-report.md` | Complete |
|
||||
|
||||
## Notes
|
||||
|
||||
- 当前自动化已覆盖 TC-F-001、TC-F-002、TC-F-003、TC-F-004、TC-F-005、TC-F-006、TC-F-007、TC-E-002、TC-ERR-001、TC-ERR-002、TC-ERR-003、TC-ST-001、TC-ST-002、TC-ST-003、TC-ST-004、TC-ST-005、TC-ST-006、TC-ST-007、TC-ST-008、TC-ST-009、TC-ST-010、TC-ADM-001、TC-ADM-002、TC-ADM-003、TC-ADM-004、TC-ADM-005、TC-ADM-006、TC-ADM-007、TC-ADM-008、TC-ADM-009、TC-ADM-010。
|
||||
- TC-E-001 可在下一轮补成显式单测。
|
||||
- 所有 `evaluation_completed`、golden replay 和评分维度数据均按内部质量资产处理,不应进入用户端接口或用户前端。
|
||||
- `GET /admin/evaluations/analytics` 只允许 admin-only 聚合摘要,不应返回原始内容、prompt、单条事件或评分 reason。
|
||||
- `GET /admin/generations/jobs/{job_id}/trace` 是 admin-only 调试和审查接口,可返回完整内部链路,不应被用户前端调用。
|
||||
- `GET /admin/executors/coverage` 是 admin-only executor 覆盖率接口,可返回 task keys 和 result assets,不应被用户前端调用。
|
||||
- `GET /admin/generations/jobs/{job_id}/trace` 可返回当前 job 的 `executor_coverage` 摘要;该摘要与 task keys 一样属于内部执行资产。
|
||||
- `GET /admin/harness/readiness` 是 admin-only harness 上线前审查摘要,可返回聚合 readiness、thresholds、golden coverage、evaluation analytics 和 executor coverage,不应返回正文、prompt、score reason、quality gate message 或单条事件明细。
|
||||
@@ -27,6 +27,17 @@ def _build_admin_test_app(db_session) -> FastAPI:
|
||||
return app
|
||||
|
||||
|
||||
def _build_admin_auth_required_test_app(db_session) -> FastAPI:
    """Build an admin test app whose only dependency override is the database.

    The admin router is mounted under ``/admin`` and ``get_db`` is replaced so
    that handlers receive ``db_session``. No other dependency is overridden
    here; the tests that use this app expect a 401 for anonymous requests.
    """

    async def _yield_test_session():
        # Hand the shared test session to every request.
        yield db_session

    guarded_app = FastAPI()
    guarded_app.dependency_overrides[get_db] = _yield_test_session
    guarded_app.include_router(admin_providers.router, prefix="/admin")
    return guarded_app
|
||||
|
||||
|
||||
async def _create_story(
|
||||
db_session,
|
||||
*,
|
||||
@@ -51,6 +62,38 @@ async def _create_story(
|
||||
return story
|
||||
|
||||
|
||||
async def _record_evaluation_event(
    db_session,
    *,
    user_id: str,
    story_id: int,
    output_mode: str,
    artifact: str,
    status: str,
    metadata: dict,
):
    """Create a generation job and attach one ``evaluation_completed`` event.

    The event metadata starts with ``step="evaluation"`` and the given
    ``artifact``; entries from ``metadata`` are applied on top, so a caller
    supplied key wins over those two defaults.

    Returns whatever ``record_generation_event`` returns for the new event.
    """
    owning_job = await create_generation_job(
        db_session,
        user_id=user_id,
        output_mode=output_mode,
        input_type="keywords",
        request_payload={"data": "测试"},
        story_id=story_id,
    )

    event_metadata = {"step": "evaluation", "artifact": artifact}
    event_metadata.update(metadata)

    recorded = await record_generation_event(
        db_session,
        job=owning_job,
        story_id=story_id,
        event_type="evaluation_completed",
        status=status,
        metadata=event_metadata,
    )
    return recorded
|
||||
|
||||
|
||||
async def test_admin_provider_analytics_aggregate_across_users(db_session, test_user):
|
||||
second_user = User(
|
||||
id="github:67890",
|
||||
@@ -197,6 +240,616 @@ async def test_admin_provider_analytics_aggregate_across_users(db_session, test_
|
||||
]
|
||||
|
||||
|
||||
async def test_admin_evaluation_analytics_aggregate_internal_events(
    db_session,
    test_user,
):
    """Aggregate evaluation events from two users into one admin summary.

    Records one passing story evaluation and one blocking storybook
    evaluation, then checks the counters and buckets returned by
    ``GET /admin/evaluations/analytics``. Finally verifies that a story title,
    a quality gate message and a score reason are absent from the response.
    """
    other_user = User(
        id="google:evaluation-user",
        name="Evaluation User",
        avatar_url="https://example.com/eval.png",
        provider="google",
    )
    db_session.add(other_user)
    await db_session.commit()

    text_story = await _create_story(db_session, user_id=test_user.id, title="评测故事")
    picture_book = await _create_story(
        db_session,
        user_id=other_user.id,
        title="评测绘本",
        mode="storybook",
    )

    passing_metadata = {
        "overall_score": 0.92,
        "passed": True,
        "blocking": False,
        "scores": [
            {"dimension": "structure", "score": 1.0, "reason": "完整"},
            {"dimension": "readability", "score": 0.84, "reason": "可读"},
        ],
        "warnings": [],
    }
    blocked_metadata = {
        "overall_score": 0.0,
        "passed": False,
        "blocking": True,
        "scores": [
            {"dimension": "structure", "score": 0.0, "reason": "结构失败"},
            {"dimension": "safety", "score": 0.0, "reason": "安全失败"},
        ],
        "quality_gate": {
            "issues": [
                {
                    "code": "unsafe_child_content",
                    "message": "风险词",
                    "failure_category": "safety_error",
                    "field": "pages",
                }
            ]
        },
        "warnings": ["绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。"],
    }

    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=text_story.id,
        output_mode="story",
        artifact="story_text",
        status="succeeded",
        metadata=passing_metadata,
    )
    await _record_evaluation_event(
        db_session,
        user_id=other_user.id,
        story_id=picture_book.id,
        output_mode="storybook",
        artifact="storybook_pages",
        status="failed",
        metadata=blocked_metadata,
    )

    asgi_transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        response = await client.get("/admin/evaluations/analytics")

    assert response.status_code == 200
    payload = response.json()

    # Expected aggregate values, checked in the order they are listed.
    expected_fields = {
        "scope": "admin_internal_evaluations",
        "total_evaluations": 2,
        "passed_evaluations": 1,
        "blocked_evaluations": 1,
        "pass_rate": 0.5,
        "average_score": 0.46,
        "job_count": 2,
        "story_count": 2,
        "user_count": 2,
        "by_artifact": [
            {"artifact": "story_text", "count": 1},
            {"artifact": "storybook_pages", "count": 1},
        ],
        "by_output_mode": [
            {"output_mode": "story", "count": 1},
            {"output_mode": "storybook", "count": 1},
        ],
        "score_bands": [
            {"band": "blocked_quality_gate", "count": 1},
            {"band": "excellent", "count": 1},
        ],
        "dimension_scores": [
            {"dimension": "structure", "average_score": 0.5, "count": 2},
            {"dimension": "readability", "average_score": 0.84, "count": 1},
            {"dimension": "safety", "average_score": 0.0, "count": 1},
        ],
        "quality_gate_issues": [
            {"code": "unsafe_child_content", "count": 1},
        ],
        "failure_categories": [
            {"category": "safety_error", "count": 1},
        ],
        "warnings": [
            {
                "message": "绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。",
                "count": 1,
            },
        ],
    }
    for field, expected in expected_fields.items():
        assert payload[field] == expected, field

    # Story title, gate message and score reason must not appear anywhere.
    rendered = str(payload)
    for internal_text in ("评测故事", "风险词", "完整"):
        assert internal_text not in rendered
|
||||
|
||||
|
||||
async def test_admin_evaluation_analytics_support_days_and_artifact_filters(
    db_session,
    test_user,
):
    """Check the ``days`` and ``artifact`` query filters of the analytics endpoint.

    One evaluation event is back-dated by ten days and a second one is left at
    its recorded time. ``days=7`` must count only the newer event,
    ``artifact=story_text`` must count only the story event, and
    ``artifact=image`` must be rejected with 422.
    """
    aged_story = await _create_story(db_session, user_id=test_user.id, title="旧评测")
    fresh_storybook = await _create_story(
        db_session,
        user_id=test_user.id,
        title="新评测",
        mode="storybook",
    )

    aged_metadata = {
        "overall_score": 0.96,
        "passed": True,
        "blocking": False,
        "scores": [{"dimension": "structure", "score": 1.0, "reason": "完整"}],
        "warnings": [],
    }
    aged_event = await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=aged_story.id,
        output_mode="story",
        artifact="story_text",
        status="succeeded",
        metadata=aged_metadata,
    )
    # Push the first event outside the 7-day window used below.
    aged_event.created_at = datetime.now(timezone.utc) - timedelta(days=10)
    await db_session.commit()

    fresh_metadata = {
        "overall_score": 0.72,
        "passed": False,
        "blocking": True,
        "scores": [{"dimension": "readability", "score": 0.62, "reason": "过短"}],
        "warnings": ["分页正文长度偏短"],
    }
    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=fresh_storybook.id,
        output_mode="storybook",
        artifact="storybook_pages",
        status="failed",
        metadata=fresh_metadata,
    )

    asgi_transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        windowed = await client.get("/admin/evaluations/analytics?days=7")
        assert windowed.status_code == 200
        windowed_body = windowed.json()
        assert windowed_body["window_days"] == 7
        assert windowed_body["total_evaluations"] == 1
        assert windowed_body["artifact"] is None
        assert windowed_body["by_artifact"] == [
            {"artifact": "storybook_pages", "count": 1}
        ]

        by_artifact = await client.get(
            "/admin/evaluations/analytics?artifact=story_text"
        )
        assert by_artifact.status_code == 200
        artifact_body = by_artifact.json()
        assert artifact_body["artifact"] == "story_text"
        assert artifact_body["total_evaluations"] == 1
        assert artifact_body["average_score"] == 0.96

        rejected = await client.get("/admin/evaluations/analytics?artifact=image")
        assert rejected.status_code == 422
|
||||
|
||||
|
||||
async def test_admin_evaluation_analytics_requires_admin_auth(db_session):
    """An unauthenticated analytics request must be answered with 401."""
    guarded_app = _build_admin_auth_required_test_app(db_session)

    async with AsyncClient(
        transport=ASGITransport(app=guarded_app),
        base_url="http://test",
    ) as client:
        anonymous_response = await client.get("/admin/evaluations/analytics")

    assert anonymous_response.status_code == 401
|
||||
|
||||
|
||||
async def test_admin_generation_job_trace_returns_internal_event_stream(
    db_session,
    test_user,
):
    """The admin trace endpoint exposes the full payload, events and coverage.

    Creates a job with internal-only request fields, records
    ``workflow_planned``, ``evaluation_completed`` and ``executor_completed``
    events, then checks that ``GET /admin/generations/jobs/{id}/trace``
    returns the request payload, every event with its metadata, and an
    ``executor_coverage`` summary for this single job.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="内部链路故事")

    submitted_payload = {
        "output_mode": "story",
        "type": "keywords",
        "data": "月亮森林",
        "internal_dispatch_token": "admin-visible-token",
        "provider_override": "internal-provider",
        "evaluation_policy": {"threshold": 0.9},
    }
    job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="story",
        input_type="keywords",
        request_payload=submitted_payload,
        story_id=story.id,
    )

    # (event_type, metadata) pairs, recorded in this order with status "succeeded".
    recorded_events = [
        (
            "workflow_planned",
            {
                "step": "request_acceptance",
                "artifact": "none",
                "plan": {
                    "mode": "story",
                    "tasks": [
                        {
                            "key": "generate_narrative",
                            "step": "text_generation",
                            "artifact": "story_text",
                            "required": True,
                            "recoverable": False,
                        }
                    ],
                },
                "internal_threshold": 0.9,
            },
        ),
        (
            "evaluation_completed",
            {
                "step": "evaluation",
                "artifact": "story_text",
                "overall_score": 0.94,
                "passed": True,
                "blocking": False,
                "scores": [{"dimension": "structure", "score": 1.0}],
            },
        ),
        (
            "executor_completed",
            {
                "plan_mode": "asset_generation",
                "planned_task_count": 3,
                "executed_task_count": 1,
                "ignored_task_count": 2,
                "executed_task_keys": ["complete_image_asset"],
                "ignored_task_keys": [
                    "start_asset_generation",
                    "complete_asset_generation",
                ],
                "result_assets": ["cover_image"],
            },
        ),
    ]
    for event_type, event_metadata in recorded_events:
        await record_generation_event(
            db_session,
            job=job,
            story_id=story.id,
            event_type=event_type,
            status="succeeded",
            metadata=event_metadata,
        )

    asgi_transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        response = await client.get(f"/admin/generations/jobs/{job.id}/trace")

    assert response.status_code == 200
    trace = response.json()
    assert trace["id"] == job.id
    assert trace["user_id"] == test_user.id

    returned_payload = trace["request_payload"]
    assert returned_payload["data"] == "月亮森林"
    assert returned_payload["internal_dispatch_token"] == "admin-visible-token"
    assert returned_payload["evaluation_policy"] == {"threshold": 0.9}

    events = trace["events"]
    assert [event["event_type"] for event in events] == [
        "request_accepted",
        "workflow_planned",
        "evaluation_completed",
        "executor_completed",
    ]

    workflow_metadata = events[1]["event_metadata"]
    assert workflow_metadata["plan"]["tasks"][0]["key"] == "generate_narrative"
    assert workflow_metadata["internal_threshold"] == 0.9

    evaluation_metadata = events[2]["event_metadata"]
    assert evaluation_metadata["overall_score"] == 0.94
    assert evaluation_metadata["scores"] == [
        {"dimension": "structure", "score": 1.0}
    ]

    executor_metadata = events[3]["event_metadata"]
    assert executor_metadata["executed_task_keys"] == ["complete_image_asset"]
    assert executor_metadata["result_assets"] == ["cover_image"]

    # Per-job coverage summary, checked field by field in listed order.
    expected_coverage = {
        "scope": "admin_internal_job_executor_coverage",
        "total_runs": 1,
        "total_planned_tasks": 3,
        "total_executed_tasks": 1,
        "total_ignored_tasks": 2,
        "coverage_ratio": 0.3333,
        "job_count": 1,
        "story_count": 1,
        "user_count": 1,
        "by_plan_mode": [{"plan_mode": "asset_generation", "count": 1}],
        "by_output_mode": [{"output_mode": "story", "count": 1}],
        "executed_task_keys": [{"task_key": "complete_image_asset", "count": 1}],
        "ignored_task_keys": [
            {"task_key": "complete_asset_generation", "count": 1},
            {"task_key": "start_asset_generation", "count": 1},
        ],
        "result_assets": [{"asset": "cover_image", "count": 1}],
    }
    coverage = trace["executor_coverage"]
    for field, expected in expected_coverage.items():
        assert coverage[field] == expected, field
|
||||
|
||||
|
||||
async def test_admin_generation_job_trace_requires_admin_auth(db_session):
    """An unauthenticated trace request must be answered with 401."""
    guarded_app = _build_admin_auth_required_test_app(db_session)

    async with AsyncClient(
        transport=ASGITransport(app=guarded_app),
        base_url="http://test",
    ) as client:
        anonymous_response = await client.get(
            "/admin/generations/jobs/missing-job/trace"
        )

    assert anonymous_response.status_code == 401
|
||||
|
||||
|
||||
async def test_admin_executor_coverage_aggregates_internal_events(
    db_session,
    test_user,
):
    """Aggregate ``executor_completed`` events and filter them by plan mode.

    Two jobs on the same story each record one executor run. The unfiltered
    endpoint must sum both runs, ``plan_mode=asset_retry`` must count only the
    retry run, and ``plan_mode=story`` must be rejected with 422.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="执行器覆盖故事")

    generation_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="asset_generation",
        input_type="audio,image",
        request_payload={"story_id": story.id, "assets": ["audio", "image"]},
        story_id=story.id,
    )
    generation_run = {
        "plan_mode": "asset_generation",
        "planned_task_count": 4,
        "executed_task_count": 2,
        "ignored_task_count": 2,
        "executed_task_keys": ["complete_audio_asset", "complete_image_asset"],
        "ignored_task_keys": [
            "start_asset_generation",
            "complete_asset_generation",
        ],
        "result_assets": ["audio", "cover_image"],
    }
    await record_generation_event(
        db_session,
        job=generation_job,
        story_id=story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata=generation_run,
    )

    retry_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="asset_retry",
        input_type="image",
        request_payload={"story_id": story.id, "assets": ["image"]},
        story_id=story.id,
    )
    retry_run = {
        "plan_mode": "asset_retry",
        "planned_task_count": 3,
        "executed_task_count": 1,
        "ignored_task_count": 2,
        "executed_task_keys": ["complete_image_asset"],
        "ignored_task_keys": ["start_asset_retry", "complete_asset_retry"],
        "result_assets": ["cover_image"],
    }
    await record_generation_event(
        db_session,
        job=retry_job,
        story_id=story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata=retry_run,
    )

    # Expected unfiltered aggregate, checked in listed order.
    expected_overall = {
        "scope": "admin_internal_executor_coverage",
        "total_runs": 2,
        "total_planned_tasks": 7,
        "total_executed_tasks": 3,
        "total_ignored_tasks": 4,
        "coverage_ratio": 0.4286,
        "job_count": 2,
        "story_count": 1,
        "user_count": 1,
        "by_plan_mode": [
            {"plan_mode": "asset_generation", "count": 1},
            {"plan_mode": "asset_retry", "count": 1},
        ],
        "executed_task_keys": [
            {"task_key": "complete_image_asset", "count": 2},
            {"task_key": "complete_audio_asset", "count": 1},
        ],
        "result_assets": [
            {"asset": "cover_image", "count": 2},
            {"asset": "audio", "count": 1},
        ],
    }
    # Expected aggregate when only the retry run is selected.
    expected_retry_only = {
        "plan_mode": "asset_retry",
        "total_runs": 1,
        "total_planned_tasks": 3,
        "total_executed_tasks": 1,
    }

    asgi_transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        overall = await client.get("/admin/executors/coverage")
        assert overall.status_code == 200
        overall_body = overall.json()
        for field, expected in expected_overall.items():
            assert overall_body[field] == expected, field

        retry_only = await client.get("/admin/executors/coverage?plan_mode=asset_retry")
        assert retry_only.status_code == 200
        retry_body = retry_only.json()
        for field, expected in expected_retry_only.items():
            assert retry_body[field] == expected, field

        rejected = await client.get("/admin/executors/coverage?plan_mode=story")
        assert rejected.status_code == 422
|
||||
|
||||
|
||||
async def test_admin_executor_coverage_requires_admin_auth(db_session):
    """An unauthenticated executor coverage request must be answered with 401."""
    guarded_app = _build_admin_auth_required_test_app(db_session)

    async with AsyncClient(
        transport=ASGITransport(app=guarded_app),
        base_url="http://test",
    ) as client:
        anonymous_response = await client.get("/admin/executors/coverage")

    assert anonymous_response.status_code == 401
|
||||
|
||||
|
||||
async def test_admin_harness_readiness_returns_ready_when_internal_gates_pass(
    db_session,
    test_user,
):
    """Readiness reports ``ready`` when evaluation and executor samples pass.

    Seeds one passing evaluation event and one executor run, then checks the
    status, thresholds, per-check statuses and aggregate summaries returned by
    ``GET /admin/harness/readiness``. Also verifies that the score reason and
    the story title are absent from the response.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="readiness 故事")

    passing_metadata = {
        "overall_score": 0.92,
        "passed": True,
        "blocking": False,
        "scores": [
            {"dimension": "structure", "score": 1.0, "reason": "内部 reason"},
            {"dimension": "readability", "score": 0.84, "reason": "内部 reason"},
        ],
        "warnings": [],
    }
    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=story.id,
        output_mode="story",
        artifact="story_text",
        status="succeeded",
        metadata=passing_metadata,
    )

    executor_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="asset_generation",
        input_type="image",
        request_payload={"story_id": story.id, "assets": ["image"]},
        story_id=story.id,
    )
    executor_run = {
        "plan_mode": "asset_generation",
        "planned_task_count": 3,
        "executed_task_count": 1,
        "ignored_task_count": 2,
        "executed_task_keys": ["complete_image_asset"],
        "ignored_task_keys": [
            "start_asset_generation",
            "complete_asset_generation",
        ],
        "result_assets": ["cover_image"],
    }
    await record_generation_event(
        db_session,
        job=executor_job,
        story_id=story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata=executor_run,
    )

    asgi_transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        response = await client.get("/admin/harness/readiness")

    assert response.status_code == 200
    readiness = response.json()
    assert readiness["scope"] == "admin_internal_harness_readiness"
    assert readiness["status"] == "ready"
    assert readiness["thresholds"] == {
        "min_runtime_evaluations": 1,
        "min_executor_runs": 1,
        "min_evaluation_pass_rate": 0.7,
        "min_evaluation_average_score": 0.7,
        "min_executor_coverage_ratio": 0.2,
    }

    status_by_code = {}
    for check in readiness["checks"]:
        status_by_code[check["code"]] = check["status"]
    assert status_by_code == {
        "golden_replay": "ready",
        "runtime_evaluation_samples": "ready",
        "runtime_evaluation_quality": "ready",
        "executor_coverage_samples": "ready",
        "executor_coverage_ratio": "ready",
    }

    golden = readiness["golden_replay"]
    assert golden["passed"] is True
    assert golden["total_cases"] == 11

    evaluation_summary = readiness["evaluation_analytics"]
    assert evaluation_summary["total_evaluations"] == 1
    assert evaluation_summary["pass_rate"] == 1.0

    coverage_summary = readiness["executor_coverage"]
    assert coverage_summary["total_runs"] == 1
    assert coverage_summary["coverage_ratio"] == 0.3333

    # Score reason and story title must not appear anywhere in the response.
    rendered = str(readiness)
    for internal_text in ("内部 reason", "readiness 故事"):
        assert internal_text not in rendered
|
||||
|
||||
|
||||
async def test_admin_harness_readiness_blocks_low_runtime_quality(
    db_session,
    test_user,
):
    """Readiness reports ``blocked`` for a failing evaluation sample.

    Only a blocking evaluation event is seeded and no executor run exists, so
    the quality check must be ``blocked`` and both executor checks must be
    ``needs_attention``. The gate message and the story title must be absent
    from the response.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="低质量 readiness")

    blocked_metadata = {
        "overall_score": 0.0,
        "passed": False,
        "blocking": True,
        "scores": [{"dimension": "structure", "score": 0.0, "reason": "缺失"}],
        "quality_gate": {
            "issues": [
                {
                    "code": "missing_story_text",
                    "message": "正文缺失",
                    "failure_category": "schema_error",
                    "field": "story_text",
                }
            ]
        },
        "warnings": [],
    }
    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=story.id,
        output_mode="story",
        artifact="story_text",
        status="failed",
        metadata=blocked_metadata,
    )

    asgi_transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as client:
        response = await client.get("/admin/harness/readiness")

    assert response.status_code == 200
    readiness = response.json()
    assert readiness["status"] == "blocked"

    checks_by_code = {}
    for check in readiness["checks"]:
        checks_by_code[check["code"]] = check

    expected_statuses = {
        "golden_replay": "ready",
        "runtime_evaluation_samples": "ready",
        "runtime_evaluation_quality": "blocked",
        "executor_coverage_samples": "needs_attention",
        "executor_coverage_ratio": "needs_attention",
    }
    for code, expected_status in expected_statuses.items():
        assert checks_by_code[code]["status"] == expected_status, code

    assert readiness["evaluation_analytics"]["blocked_evaluations"] == 1
    assert readiness["executor_coverage"]["total_runs"] == 0

    # Gate message and story title must not appear anywhere in the response.
    rendered = str(readiness)
    for internal_text in ("正文缺失", "低质量 readiness"):
        assert internal_text not in rendered
|
||||
|
||||
|
||||
async def test_admin_harness_readiness_requires_admin_auth(db_session):
    """An unauthenticated harness readiness request must be answered with 401."""
    guarded_app = _build_admin_auth_required_test_app(db_session)

    async with AsyncClient(
        transport=ASGITransport(app=guarded_app),
        base_url="http://test",
    ) as client:
        anonymous_response = await client.get("/admin/harness/readiness")

    assert anonymous_response.status_code == 401
|
||||
|
||||
|
||||
async def test_admin_provider_analytics_support_days_and_capability_filters(
|
||||
db_session,
|
||||
test_user,
|
||||
|
||||
@@ -123,14 +123,19 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event
|
||||
assert [event.event_type for event in events] == [
|
||||
"request_accepted",
|
||||
"worker_started",
|
||||
"workflow_planned",
|
||||
"context_prepared",
|
||||
"evaluation_completed",
|
||||
"narrative_generated",
|
||||
"story_saved",
|
||||
"generation_completed",
|
||||
]
|
||||
assert events[2].event_metadata["has_memory_context"] is False
|
||||
assert events[3].event_metadata["title"] == "小兔子的冒险"
|
||||
assert events[4].story_id == job.story_id
|
||||
assert events[2].event_metadata["plan"]["mode"] == "story"
|
||||
assert events[3].event_metadata["has_memory_context"] is False
|
||||
assert events[4].event_metadata["passed"] is True
|
||||
assert events[4].event_metadata["overall_score"] >= 0.7
|
||||
assert events[5].event_metadata["title"] == "小兔子的冒险"
|
||||
assert events[6].story_id == job.story_id
|
||||
|
||||
detail_response = await client.get(f"/api/generations/jobs/{job.id}")
|
||||
assert detail_response.status_code == 200
|
||||
@@ -143,11 +148,16 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event
|
||||
assert [event["event_type"] for event in detail["events"]] == [
|
||||
"request_accepted",
|
||||
"worker_started",
|
||||
"workflow_planned",
|
||||
"context_prepared",
|
||||
"narrative_generated",
|
||||
"story_saved",
|
||||
"generation_completed",
|
||||
]
|
||||
assert all(
|
||||
event["event_type"] != "evaluation_completed"
|
||||
for event in detail["events"]
|
||||
)
|
||||
|
||||
story_response = await client.get(f"/api/generations/{job.story_id}")
|
||||
assert story_response.status_code == 200
|
||||
@@ -161,6 +171,13 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event
|
||||
assert [item["id"] for item in job_list] == [job.id]
|
||||
assert job_list[0]["progress_percent"] == 100
|
||||
assert job_list[0]["is_terminal"] is True
|
||||
|
||||
trace_response = await client.get(
|
||||
f"/api/generations/{job.story_id}/trace-summary"
|
||||
)
|
||||
assert trace_response.status_code == 200
|
||||
trace = trace_response.json()
|
||||
assert "evaluation" not in trace
|
||||
finally:
|
||||
app.dependency_overrides.clear()
|
||||
|
||||
@@ -220,13 +237,88 @@ async def test_generation_worker_records_quality_gate_failure_without_persisting
|
||||
assert [event.event_type for event in events] == [
|
||||
"request_accepted",
|
||||
"worker_started",
|
||||
"workflow_planned",
|
||||
"context_prepared",
|
||||
"quality_gate_failed",
|
||||
"evaluation_completed",
|
||||
"generation_failed",
|
||||
]
|
||||
quality_event = events[3]
|
||||
quality_event = events[4]
|
||||
assert quality_event.event_metadata["step"] == "narrative_generation"
|
||||
assert quality_event.event_metadata["issues"][0]["code"] == "missing_story_text"
|
||||
evaluation_event = events[5]
|
||||
assert evaluation_event.event_metadata["step"] == "evaluation"
|
||||
assert evaluation_event.event_metadata["passed"] is False
|
||||
assert evaluation_event.event_metadata["blocking"] is True
|
||||
|
||||
|
||||
async def test_story_with_images_worker_records_plan_before_assets(
    db_session,
    test_user,
    mock_text_provider,
    mock_image_provider,
):
    """Run a story job with images and check the recorded plan and event order.

    The ``workflow_planned`` event (index 2) must carry the full task plan, and
    the cover-image events must appear after ``story_saved``.
    """
    payload = {
        "output_mode": "story",
        "type": "keywords",
        "data": "小兔子, 森林",
        "generate_images": True,
    }
    job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="story",
        input_type="keywords",
        request_payload=payload,
    )

    await run_generation_job_service(job.id, db_session)

    # Re-read the job row to observe what the worker persisted.
    job_query = select(GenerationJob).where(GenerationJob.id == job.id)
    stored_job = (await db_session.execute(job_query)).scalar_one()
    assert stored_job.story_id is not None
    assert stored_job.status == "completed"
    assert stored_job.current_step == "generation_completed"
    assert stored_job.result_snapshot["image_status"] == "ready"

    event_query = (
        select(GenerationJobEvent)
        .where(GenerationJobEvent.job_id == job.id)
        .order_by(GenerationJobEvent.id)
    )
    recorded = (await db_session.execute(event_query)).scalars().all()
    observed_types = [entry.event_type for entry in recorded]
    assert observed_types == [
        "request_accepted",
        "worker_started",
        "workflow_planned",
        "context_prepared",
        "evaluation_completed",
        "narrative_generated",
        "story_saved",
        "cover_image_started",
        "cover_image_succeeded",
        "generation_completed",
    ]

    plan = recorded[2].event_metadata["plan"]
    assert plan["mode"] == "story_with_assets"
    planned_keys = [step["key"] for step in plan["tasks"]]
    assert planned_keys == [
        "prepare_context",
        "generate_narrative",
        "evaluate_narrative",
        "persist_story",
        "generate_cover_image",
        "queue_postprocessing",
        "complete_generation",
    ]
    cover_step = next(
        step for step in plan["tasks"] if step["key"] == "generate_cover_image"
    )
    assert cover_step["required"] is False
    assert cover_step["recoverable"] is True
    assert recorded[4].event_metadata["passed"] is True
    assert recorded[8].event_metadata["asset"] == "cover_image"
    mock_text_provider.assert_called_once()
    mock_image_provider.assert_called_once()
|
||||
|
||||
|
||||
async def test_asset_retry_records_job_events_and_updates_retryable_assets(
|
||||
@@ -279,12 +371,30 @@ async def test_asset_retry_records_job_events_and_updates_retryable_assets(
|
||||
).scalars().all()
|
||||
assert [event.event_type for event in events] == [
|
||||
"request_accepted",
|
||||
"workflow_planned",
|
||||
"asset_retry_started",
|
||||
"cover_image_started",
|
||||
"cover_image_succeeded",
|
||||
"executor_completed",
|
||||
"asset_retry_completed",
|
||||
]
|
||||
assert events[3].event_metadata["asset"] == "cover_image"
|
||||
plan = events[1].event_metadata["plan"]
|
||||
assert plan["mode"] == "asset_retry"
|
||||
assert [task["key"] for task in plan["tasks"]] == [
|
||||
"start_asset_retry",
|
||||
"complete_image_asset",
|
||||
"complete_asset_retry",
|
||||
]
|
||||
image_task = next(
|
||||
task for task in plan["tasks"] if task["key"] == "complete_image_asset"
|
||||
)
|
||||
assert image_task["required"] is False
|
||||
assert image_task["recoverable"] is True
|
||||
assert events[4].event_metadata["asset"] == "cover_image"
|
||||
assert events[5].event_metadata["plan_mode"] == "asset_retry"
|
||||
assert events[5].event_metadata["executed_task_keys"] == [
|
||||
"complete_image_asset"
|
||||
]
|
||||
finally:
|
||||
app.dependency_overrides.clear()
|
||||
|
||||
@@ -365,10 +475,110 @@ async def test_asset_generation_job_worker_completes_cover_image(
|
||||
assert [event.event_type for event in events] == [
|
||||
"request_accepted",
|
||||
"worker_started",
|
||||
"workflow_planned",
|
||||
"cover_image_started",
|
||||
"cover_image_succeeded",
|
||||
"executor_completed",
|
||||
"asset_generation_completed",
|
||||
]
|
||||
plan = events[2].event_metadata["plan"]
|
||||
assert plan["mode"] == "asset_generation"
|
||||
assert [task["key"] for task in plan["tasks"]] == [
|
||||
"start_asset_generation",
|
||||
"complete_image_asset",
|
||||
"complete_asset_generation",
|
||||
]
|
||||
image_task = next(
|
||||
task for task in plan["tasks"] if task["key"] == "complete_image_asset"
|
||||
)
|
||||
assert image_task["required"] is False
|
||||
assert image_task["recoverable"] is True
|
||||
executor_event = events[5]
|
||||
assert executor_event.event_metadata["plan_mode"] == "asset_generation"
|
||||
assert executor_event.event_metadata["executed_task_keys"] == [
|
||||
"complete_image_asset"
|
||||
]
|
||||
assert executor_event.event_metadata["ignored_task_keys"] == [
|
||||
"start_asset_generation",
|
||||
"complete_asset_generation",
|
||||
]
|
||||
assert executor_event.event_metadata["result_assets"] == ["cover_image"]
|
||||
|
||||
|
||||
async def test_asset_generation_job_worker_executes_assets_in_plan_order(
    db_session,
    test_story,
    mock_tts_provider,
):
    """Run an audio+image asset job and check assets complete in plan order.

    The request lists ``audio`` before ``image``; the recorded events and the
    ``executor_completed`` metadata must reflect that same order.
    """
    job = await create_generation_job(
        db_session,
        user_id=test_story.user_id,
        output_mode="asset_generation",
        input_type="audio,image",
        request_payload={"story_id": test_story.id, "assets": ["audio", "image"]},
        story_id=test_story.id,
    )

    # Keyword arguments to ``patch`` are forwarded to the AsyncMock constructor.
    with patch(
        "app.services.story_service.generate_image",
        new_callable=AsyncMock,
        return_value="https://example.com/plan-cover.png",
    ) as mock_generate_image:
        await run_generation_job_service(job.id, db_session)

    job_query = select(GenerationJob).where(GenerationJob.id == job.id)
    stored_job = (await db_session.execute(job_query)).scalar_one()
    assert stored_job.status == "completed"
    assert stored_job.current_step == "asset_generation_completed"
    assert stored_job.result_snapshot["image_status"] == "ready"
    assert stored_job.result_snapshot["audio_status"] == "ready"

    story_query = select(Story).where(Story.id == test_story.id)
    story = (await db_session.execute(story_query)).scalar_one()
    assert story.image_url == "https://example.com/plan-cover.png"
    assert story.audio_status == "ready"
    assert story.audio_path is not None

    event_query = (
        select(GenerationJobEvent)
        .where(GenerationJobEvent.job_id == job.id)
        .order_by(GenerationJobEvent.id)
    )
    recorded = (await db_session.execute(event_query)).scalars().all()
    observed_types = [entry.event_type for entry in recorded]
    assert observed_types == [
        "request_accepted",
        "worker_started",
        "workflow_planned",
        "audio_started",
        "audio_succeeded",
        "cover_image_started",
        "cover_image_succeeded",
        "executor_completed",
        "asset_generation_completed",
    ]

    plan = recorded[2].event_metadata["plan"]
    assert plan["mode"] == "asset_generation"
    planned_keys = [step["key"] for step in plan["tasks"]]
    assert planned_keys == [
        "start_asset_generation",
        "complete_audio_asset",
        "complete_image_asset",
        "complete_asset_generation",
    ]
    assert recorded[4].event_metadata["asset"] == "audio"
    assert recorded[6].event_metadata["asset"] == "cover_image"

    executor_metadata = recorded[7].event_metadata
    assert executor_metadata["executed_task_keys"] == [
        "complete_audio_asset",
        "complete_image_asset",
    ]
    assert executor_metadata["result_assets"] == ["audio", "cover_image"]
    mock_tts_provider.assert_awaited_once()
    mock_generate_image.assert_awaited_once()
|
||||
|
||||
|
||||
async def test_cancel_queued_asset_generation_job_marks_it_canceled(
|
||||
@@ -538,7 +748,9 @@ async def test_storybook_generation_is_queued_then_worker_records_page_image_eve
|
||||
assert [event.event_type for event in events] == [
|
||||
"request_accepted",
|
||||
"worker_started",
|
||||
"workflow_planned",
|
||||
"context_prepared",
|
||||
"evaluation_completed",
|
||||
"narrative_generated",
|
||||
"storybook_images_started",
|
||||
"storybook_cover_image_succeeded",
|
||||
@@ -548,13 +760,45 @@ async def test_storybook_generation_is_queued_then_worker_records_page_image_eve
|
||||
"story_saved",
|
||||
"generation_completed",
|
||||
]
|
||||
plan = events[2].event_metadata["plan"]
|
||||
assert plan["mode"] == "storybook"
|
||||
assert [task["key"] for task in plan["tasks"]] == [
|
||||
"prepare_context",
|
||||
"generate_storybook_pages",
|
||||
"evaluate_storybook_pages",
|
||||
"generate_storybook_images",
|
||||
"persist_storybook",
|
||||
"queue_postprocessing",
|
||||
"complete_generation",
|
||||
]
|
||||
image_task = next(
|
||||
task
|
||||
for task in plan["tasks"]
|
||||
if task["key"] == "generate_storybook_images"
|
||||
)
|
||||
assert image_task["required"] is False
|
||||
assert image_task["recoverable"] is True
|
||||
assert events[4].event_metadata["passed"] is True
|
||||
assert events[4].event_metadata["artifact"] == "storybook_pages"
|
||||
page_events = [
|
||||
event
|
||||
for event in events
|
||||
if event.event_type == "storybook_page_image_succeeded"
|
||||
]
|
||||
assert [event.event_metadata["page_number"] for event in page_events] == [1, 2]
|
||||
assert events[8].event_metadata["completed_pages"] == [1, 2]
|
||||
assert events[10].event_metadata["completed_pages"] == [1, 2]
|
||||
|
||||
async with AsyncClient(transport=transport, base_url="http://test") as client:
|
||||
client.cookies.set("access_token", auth_token)
|
||||
detail_response = await client.get(
|
||||
f"/api/generations/jobs/{job.id}"
|
||||
)
|
||||
|
||||
assert detail_response.status_code == 200
|
||||
detail = detail_response.json()
|
||||
assert "evaluation_completed" not in [
|
||||
event["event_type"] for event in detail["events"]
|
||||
]
|
||||
finally:
|
||||
app.dependency_overrides.clear()
|
||||
|
||||
@@ -716,6 +960,414 @@ async def test_story_provider_stats_aggregate_job_events(
|
||||
app.dependency_overrides.clear()
|
||||
|
||||
|
||||
async def test_story_trace_summary_aggregates_steps_artifacts_and_failure_categories(
    db_session,
    auth_token,
    degraded_story_with_text,
):
    """Check the trace-summary endpoint's per-step/artifact/failure buckets.

    Four events are recorded on an ``asset_retry`` job for the story. The
    response must report the expected counts and must not contain an
    ``evaluation`` key or the text ``overall_score``.
    """

    async def override_get_db():
        yield db_session

    story_id = degraded_story_with_text.id
    # (event_type, status, metadata), recorded in this order.
    seeded_events = [
        (
            "cover_image_started",
            "running",
            {
                "step": "image_generation",
                "artifact": "cover_image",
                "failure_category": None,
            },
        ),
        (
            "cover_image_failed",
            "failed",
            {
                "step": "image_generation",
                "artifact": "cover_image",
                "failure_category": "provider_error",
            },
        ),
        (
            "quality_gate_failed",
            "failed",
            {
                "step": "narrative_generation",
                "artifact": "story_text",
                "failure_category": "schema_error",
            },
        ),
        (
            "evaluation_completed",
            "failed",
            {
                "step": "evaluation",
                "artifact": "story_text",
                "failure_category": "schema_error",
                "overall_score": 0.0,
                "passed": False,
                "blocking": True,
                "scores": [
                    {
                        "dimension": "structure",
                        "score": 0.0,
                        "reason": "故事结构未通过质量门。",
                    },
                    {
                        "dimension": "safety",
                        "score": 0.0,
                        "reason": "内容未通过儿童安全或结构完整性检查。",
                    },
                ],
            },
        ),
    ]

    app.dependency_overrides[get_db] = override_get_db
    # All setup runs inside ``try`` so that a failure while seeding data still
    # clears the override instead of leaving it on the shared app.
    try:
        job = await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_retry",
            input_type="image",
            request_payload={"assets": ["image"]},
            story_id=story_id,
        )
        for event_type, status, metadata in seeded_events:
            await record_generation_event(
                db_session,
                job=job,
                story_id=story_id,
                event_type=event_type,
                status=status,
                metadata=metadata,
            )

        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)

            response = await client.get(
                f"/api/generations/{story_id}/trace-summary"
            )

        assert response.status_code == 200
        data = response.json()
        assert data["story_id"] == story_id
        assert data["total_events"] == 4
        assert data["failed_events"] == 2
        assert data["by_step"] == [
            {"name": "image_generation", "count": 2},
            {"name": "narrative_generation", "count": 1},
        ]
        assert data["by_artifact"] == [
            {"name": "cover_image", "count": 2},
            {"name": "story_text", "count": 1},
        ]
        assert data["failure_categories"] == [
            {"name": "provider_error", "count": 1},
            {"name": "schema_error", "count": 1},
        ]
        assert "evaluation" not in data
        assert "overall_score" not in str(data)
    finally:
        app.dependency_overrides.clear()
|
||||
|
||||
|
||||
async def test_user_generation_job_detail_hides_internal_evaluation_step(
    db_session,
    auth_token,
    test_user,
):
    """Check that job detail omits a recorded ``evaluation_completed`` event.

    After recording a succeeded evaluation event, the detail response must list
    only ``request_accepted`` and must not contain the texts
    ``evaluation_completed`` or ``overall_score``.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # Setup lives inside ``try`` so the override is cleared even when creating
    # the job or recording the event raises.
    try:
        transport = ASGITransport(app=app)

        job = await create_generation_job(
            db_session,
            user_id=test_user.id,
            output_mode="story",
            input_type="keywords",
            request_payload={
                "output_mode": "story",
                "type": "keywords",
                "data": "小兔子",
                "generate_images": False,
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            event_type="evaluation_completed",
            status="succeeded",
            metadata={
                "step": "evaluation",
                "artifact": "story_text",
                "overall_score": 0.96,
                "passed": True,
                "blocking": False,
                "scores": [
                    {"dimension": "structure", "score": 1.0, "reason": "完整。"},
                ],
            },
        )

        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)

            response = await client.get(f"/api/generations/jobs/{job.id}")

        assert response.status_code == 200
        data = response.json()
        assert data["current_step"] == "narrative_generated"
        assert data["progress_label"] == "正文已生成"
        assert [event["event_type"] for event in data["events"]] == [
            "request_accepted"
        ]
        assert "evaluation_completed" not in str(data)
        assert "overall_score" not in str(data)
    finally:
        app.dependency_overrides.clear()
|
||||
|
||||
|
||||
async def test_user_generation_job_detail_sanitizes_request_payload(
    db_session,
    auth_token,
    test_user,
):
    """Check which request-payload keys the job detail endpoint returns.

    The stored payload includes raw input text, an education theme and several
    internal fields; the response must contain exactly the seven keys asserted
    below and none of the withheld values.
    """

    async def override_get_db():
        yield db_session

    stored_payload = {
        "output_mode": "story",
        "input_type": "keywords",
        "type": "keywords",
        "data": "不要回传原始关键词",
        "education_theme": "勇气",
        "generate_images": True,
        "page_count": 6,
        "child_profile_id": "child-public-id",
        "universe_id": "universe-public-id",
        "internal_dispatch_token": "secret-dispatch-token",
        "provider_override": "internal-provider",
        "evaluation_policy": {"threshold": 0.9},
    }

    app.dependency_overrides[get_db] = override_get_db
    # Setup lives inside ``try`` so the override is cleared even when creating
    # the job raises.
    try:
        transport = ASGITransport(app=app)

        job = await create_generation_job(
            db_session,
            user_id=test_user.id,
            output_mode="story",
            input_type="keywords",
            request_payload=stored_payload,
        )

        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)

            response = await client.get(f"/api/generations/jobs/{job.id}")

        assert response.status_code == 200
        data = response.json()
        assert data["request_payload"] == {
            "child_profile_id": "child-public-id",
            "generate_images": True,
            "input_type": "keywords",
            "output_mode": "story",
            "page_count": 6,
            "type": "keywords",
            "universe_id": "universe-public-id",
        }
        payload_dump = str(data["request_payload"])
        assert "不要回传原始关键词" not in payload_dump
        assert "education_theme" not in payload_dump
        assert "secret-dispatch-token" not in payload_dump
        assert "internal-provider" not in payload_dump
        assert "evaluation_policy" not in payload_dump
    finally:
        app.dependency_overrides.clear()
|
||||
|
||||
|
||||
async def test_user_generation_job_detail_sanitizes_public_event_metadata(
    db_session,
    auth_token,
    degraded_story_with_text,
):
    """Check the event metadata returned by the job detail endpoint.

    Three events are recorded with plan details, result snapshots, error text
    and executor data. The response must reduce ``workflow_planned`` to summary
    counts, reduce ``asset_generation_completed`` to its asset list, and contain
    none of the other recorded strings asserted below.
    """

    async def override_get_db():
        yield db_session

    story_id = degraded_story_with_text.id
    # (event_type, status, metadata), recorded in this order.
    seeded_events = [
        (
            "workflow_planned",
            "succeeded",
            {
                "step": "request_acceptance",
                "artifact": "none",
                "plan": {
                    "mode": "asset_generation",
                    "tasks": [
                        {
                            "key": "complete_image_asset",
                            "step": "image_generation",
                            "artifact": "image",
                            "required": False,
                            "recoverable": True,
                        }
                    ],
                },
                "internal_threshold": 0.72,
            },
        ),
        (
            "asset_generation_completed",
            "completed",
            {
                "assets": ["image"],
                "result_snapshot": {
                    "story_id": story_id,
                    "last_error": "internal provider detail",
                },
                "error": "internal provider detail",
            },
        ),
        (
            "executor_completed",
            "succeeded",
            {
                "plan_mode": "asset_generation",
                "planned_task_count": 3,
                "executed_task_keys": ["complete_image_asset"],
                "ignored_task_keys": [
                    "start_asset_generation",
                    "complete_asset_generation",
                ],
                "result_assets": ["cover_image"],
            },
        ),
    ]

    app.dependency_overrides[get_db] = override_get_db
    # Setup lives inside ``try`` so the override is cleared even when seeding
    # the job or its events raises.
    try:
        transport = ASGITransport(app=app)

        job = await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_generation",
            input_type="image",
            request_payload={"story_id": story_id, "assets": ["image"]},
            story_id=story_id,
        )
        for event_type, status, metadata in seeded_events:
            await record_generation_event(
                db_session,
                job=job,
                story_id=story_id,
                event_type=event_type,
                status=status,
                metadata=metadata,
            )

        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)

            response = await client.get(f"/api/generations/jobs/{job.id}")

        assert response.status_code == 200
        data = response.json()
        workflow_event = next(
            event for event in data["events"] if event["event_type"] == "workflow_planned"
        )
        assert workflow_event["event_metadata"] == {
            "artifact": "none",
            "plan_mode": "asset_generation",
            "planned_task_count": 1,
            "recoverable_task_count": 1,
            "step": "request_acceptance",
        }

        completion_event = next(
            event
            for event in data["events"]
            if event["event_type"] == "asset_generation_completed"
        )
        assert completion_event["event_metadata"] == {"assets": ["image"]}
        assert "plan" not in workflow_event["event_metadata"]

        # String-level checks over every returned event.
        events_dump = str(data["events"])
        assert "tasks" not in events_dump
        assert "internal_threshold" not in events_dump
        assert "result_snapshot" not in events_dump
        assert "internal provider detail" not in events_dump
        assert "executor_completed" not in events_dump
        assert "complete_image_asset" not in events_dump
    finally:
        app.dependency_overrides.clear()
|
||||
|
||||
|
||||
async def test_user_generation_job_summary_hides_internal_executor_step(
    db_session,
    auth_token,
    degraded_story_with_text,
):
    """Check that ``executor_completed`` is absent from three user endpoints.

    After recording an executor event, the job detail, the story's job list and
    the trace summary must not mention it, and the job's reported step must be
    ``workflow_planned``.
    """

    async def override_get_db():
        yield db_session

    story_id = degraded_story_with_text.id

    app.dependency_overrides[get_db] = override_get_db
    # Setup lives inside ``try`` so the override is cleared even when creating
    # the job or recording the event raises.
    try:
        transport = ASGITransport(app=app)

        job = await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_generation",
            input_type="image",
            request_payload={"story_id": story_id, "assets": ["image"]},
            story_id=story_id,
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=story_id,
            event_type="executor_completed",
            status="succeeded",
            metadata={
                "plan_mode": "asset_generation",
                "executed_task_keys": ["complete_image_asset"],
            },
        )

        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)

            detail_response = await client.get(f"/api/generations/jobs/{job.id}")
            list_response = await client.get(f"/api/generations/{story_id}/jobs")
            trace_summary_response = await client.get(
                f"/api/generations/{story_id}/trace-summary"
            )

        assert detail_response.status_code == 200
        detail = detail_response.json()
        assert detail["current_step"] == "workflow_planned"
        assert detail["progress_label"] == "工作流已规划"
        assert "executor_completed" not in str(detail)
        assert "complete_image_asset" not in str(detail)

        assert list_response.status_code == 200
        listed_job = next(item for item in list_response.json() if item["id"] == job.id)
        assert listed_job["current_step"] == "workflow_planned"
        assert listed_job["progress_label"] == "工作流已规划"

        assert trace_summary_response.status_code == 200
        trace_summary = trace_summary_response.json()
        assert "executor_completed" not in str(trace_summary)
        assert "complete_image_asset" not in str(trace_summary)
        assert trace_summary["total_events"] == 1
    finally:
        app.dependency_overrides.clear()
|
||||
|
||||
|
||||
async def test_user_provider_analytics_aggregate_across_stories(
|
||||
db_session,
|
||||
auth_token,
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
"""Tests for generation harness runtime support."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from sqlalchemy import select
|
||||
|
||||
@@ -7,8 +9,21 @@ from app.db.models import GenerationJob, GenerationJobEvent
|
||||
from app.services.adapters.storybook.primary import Storybook, StorybookPage
|
||||
from app.services.adapters.text.models import StoryOutput
|
||||
from app.services.generation_jobs import create_generation_job, record_generation_event
|
||||
from app.services.harness.artifacts import AssetCompletionResult
|
||||
from app.services.harness.control import ExecutionControl, GenerationJobCanceledError
|
||||
from app.services.harness.evaluation_replay import (
|
||||
EvaluationReplayArtifact,
|
||||
EvaluationReplayCase,
|
||||
ExpectedEvaluation,
|
||||
replay_evaluation_golden_cases,
|
||||
run_evaluation_replay_cases,
|
||||
)
|
||||
from app.services.harness.evaluators import evaluate_story_output, evaluate_storybook_output
|
||||
from app.services.harness.executor import run_asset_plan
|
||||
from app.services.harness.plans import (
|
||||
WorkflowMode,
|
||||
WorkflowPlan,
|
||||
WorkflowTask,
|
||||
build_asset_plan,
|
||||
build_story_plan,
|
||||
build_storybook_plan,
|
||||
@@ -27,12 +42,18 @@ from app.services.harness.types import (
|
||||
normalize_trace_metadata,
|
||||
step_for_event,
|
||||
)
|
||||
from app.services.story_status import StoryAssetStatus
|
||||
|
||||
# Harness fixture directory, resolved from the parent of this file's directory.
FIXTURES_DIR = Path(__file__).parents[1].joinpath(
    "app", "services", "harness", "fixtures"
)
|
||||
|
||||
|
||||
def test_event_type_maps_to_standard_workflow_step():
|
||||
assert step_for_event("request_accepted") == WorkflowStep.REQUEST_ACCEPTANCE
|
||||
assert step_for_event("context_prepared") == WorkflowStep.CONTEXT_PREPARATION
|
||||
assert step_for_event("narrative_generated") == WorkflowStep.NARRATIVE_GENERATION
|
||||
assert step_for_event("evaluation_completed") == WorkflowStep.EVALUATION
|
||||
assert step_for_event("story_saved") == WorkflowStep.STORY_PERSISTENCE
|
||||
assert step_for_event("provider_call_succeeded") == WorkflowStep.PROVIDER_INVOCATION
|
||||
assert step_for_event("quality_gate_failed") == WorkflowStep.NARRATIVE_GENERATION
|
||||
@@ -46,6 +67,7 @@ def test_event_type_maps_to_standard_workflow_step():
|
||||
def test_event_type_maps_to_standard_artifact():
|
||||
assert artifact_for_event("narrative_generated") == ArtifactKind.STORY_TEXT
|
||||
assert artifact_for_event("quality_gate_failed") == ArtifactKind.STORY_TEXT
|
||||
assert artifact_for_event("evaluation_completed") == ArtifactKind.STORY_TEXT
|
||||
assert artifact_for_event("cover_image_succeeded") == ArtifactKind.COVER_IMAGE
|
||||
assert artifact_for_event("storybook_page_image_failed") == ArtifactKind.PAGE_IMAGE
|
||||
assert artifact_for_event("audio_cache_hit") == ArtifactKind.AUDIO
|
||||
@@ -108,6 +130,13 @@ def test_story_plan_without_assets_snapshot():
|
||||
"required": True,
|
||||
"recoverable": False,
|
||||
},
|
||||
{
|
||||
"key": "evaluate_narrative",
|
||||
"step": "evaluation",
|
||||
"artifact": "story_text",
|
||||
"required": True,
|
||||
"recoverable": False,
|
||||
},
|
||||
{
|
||||
"key": "persist_story",
|
||||
"step": "story_persistence",
|
||||
@@ -137,7 +166,7 @@ def test_story_plan_with_assets_marks_cover_recoverable():
|
||||
plan = build_story_plan(generate_images=True).to_snapshot()
|
||||
|
||||
assert plan["mode"] == "story_with_assets"
|
||||
assert plan["tasks"][3] == {
|
||||
assert plan["tasks"][4] == {
|
||||
"key": "generate_cover_image",
|
||||
"step": "image_generation",
|
||||
"artifact": "cover_image",
|
||||
@@ -153,13 +182,14 @@ def test_storybook_plan_with_images_marks_storybook_images_recoverable():
|
||||
assert [task["key"] for task in plan["tasks"]] == [
|
||||
"prepare_context",
|
||||
"generate_storybook_pages",
|
||||
"evaluate_storybook_pages",
|
||||
"generate_storybook_images",
|
||||
"persist_storybook",
|
||||
"queue_postprocessing",
|
||||
"complete_generation",
|
||||
]
|
||||
assert plan["tasks"][2]["artifact"] == "image"
|
||||
assert plan["tasks"][2]["recoverable"] is True
|
||||
assert plan["tasks"][3]["artifact"] == "image"
|
||||
assert plan["tasks"][3]["recoverable"] is True
|
||||
|
||||
|
||||
def test_asset_retry_plan_deduplicates_assets():
|
||||
@@ -200,6 +230,86 @@ def test_asset_retry_plan_deduplicates_assets():
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_run_asset_plan_executes_asset_tasks_in_plan_order():
    """``run_asset_plan`` invokes the audio and image callables in plan order."""
    invocation_order: list[str] = []

    async def fake_image_task() -> AssetCompletionResult:
        invocation_order.append("image")
        return AssetCompletionResult(
            asset="cover_image",
            status=StoryAssetStatus.READY,
            value="https://example.com/cover.png",
        )

    async def fake_audio_task() -> AssetCompletionResult:
        invocation_order.append("audio")
        return AssetCompletionResult(
            asset="audio",
            status=StoryAssetStatus.READY,
            value=b"audio",
        )

    # Audio is requested first, so it is expected to run first.
    plan = build_asset_plan(output_mode="asset_generation", assets=["audio", "image"])
    outcome = await run_asset_plan(
        plan,
        image_task=fake_image_task,
        audio_task=fake_audio_task,
    )

    assert invocation_order == ["audio", "image"]
    assert outcome.executed_task_keys == (
        "complete_audio_asset",
        "complete_image_asset",
    )
    assert outcome.ignored_task_keys == (
        "start_asset_generation",
        "complete_asset_generation",
    )
    produced_assets = [entry.asset for entry in outcome.task_results]
    assert produced_assets == ["audio", "cover_image"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_run_asset_plan_ignores_unknown_non_asset_tasks():
    """A plan with no recognised asset task runs nothing and ignores every key."""
    invocation_order: list[str] = []

    start_marker = WorkflowTask(
        key="start_asset_retry",
        step=WorkflowStep.ASSET_RETRY,
        artifact=ArtifactKind.NONE,
    )
    # A task key the test expects the executor not to dispatch.
    unknown_asset = WorkflowTask(
        key="complete_video_asset",
        step=WorkflowStep.UNKNOWN,
        artifact=ArtifactKind.UNKNOWN,
        required=False,
        recoverable=True,
    )
    end_marker = WorkflowTask(
        key="complete_asset_retry",
        step=WorkflowStep.ASSET_RETRY,
        artifact=ArtifactKind.NONE,
    )
    plan = WorkflowPlan(
        mode=WorkflowMode.ASSET_RETRY,
        tasks=(start_marker, unknown_asset, end_marker),
    )

    async def fake_image_task() -> AssetCompletionResult:
        invocation_order.append("image")
        return AssetCompletionResult(
            asset="cover_image",
            status=StoryAssetStatus.READY,
        )

    outcome = await run_asset_plan(plan, image_task=fake_image_task)

    assert invocation_order == []
    assert outcome.task_results == ()
    assert outcome.executed_task_keys == ()
    assert outcome.ignored_task_keys == (
        "start_asset_retry",
        "complete_video_asset",
        "complete_asset_retry",
    )
|
||||
|
||||
|
||||
def test_story_quality_gate_accepts_complete_child_safe_story():
|
||||
validate_story_output(
|
||||
StoryOutput(
|
||||
@@ -211,6 +321,166 @@ def test_story_quality_gate_accepts_complete_child_safe_story():
|
||||
)
|
||||
|
||||
|
||||
def test_story_evaluator_scores_complete_child_safe_story():
    """Check that a complete story evaluated with a theme passes with a score of at least 0.9."""
    story = StoryOutput(
        mode="generated",
        title="小兔子的月光花园",
        story_text="小兔子在花园里学会了和朋友轮流分享水壶,也学会了复盘今天的努力。",
        cover_prompt_suggestion="A gentle moonlit garden with a rabbit",
    )

    evaluation = evaluate_story_output(story, education_theme="复盘")

    assert evaluation.passed is True
    assert evaluation.blocking is False
    assert evaluation.overall_score >= 0.9
    # The first score entry in the metadata is expected to be "structure".
    leading_score = evaluation.to_metadata()["scores"][0]
    assert leading_score["dimension"] == "structure"
|
||||
|
||||
|
||||
def test_story_evaluator_blocks_quality_gate_failure():
    """Check that a story with empty text is blocked, scored 0.0, and reports missing_story_text."""
    empty_story = StoryOutput(
        mode="generated",
        title="空白故事",
        story_text="",
        cover_prompt_suggestion="A cover",
    )

    evaluation = evaluate_story_output(empty_story)

    assert evaluation.passed is False
    assert evaluation.blocking is True
    assert evaluation.overall_score == 0.0
    assert evaluation.gate_error is not None
    first_issue = evaluation.to_metadata()["quality_gate"]["issues"][0]
    assert first_issue["code"] == "missing_story_text"
|
||||
|
||||
|
||||
def test_storybook_evaluator_scores_complete_child_safe_storybook():
    """Check that a two-page storybook evaluated with a theme passes with a score of at least 0.9."""
    opening_page = StorybookPage(
        page_number=1,
        text="露露在森林里发现一颗会提醒她复盘的小星星。",
        image_prompt="Lulu finds a star",
    )
    closing_page = StorybookPage(
        page_number=2,
        text="她回想今天的努力,学会下次先和朋友商量。",
        image_prompt="Lulu thinking with friends",
    )
    storybook = Storybook(
        title="森林里的复盘星星",
        main_character="小兔子露露",
        art_style="温暖水彩",
        cover_prompt="A warm watercolor forest cover",
        pages=[opening_page, closing_page],
    )

    evaluation = evaluate_storybook_output(storybook, education_theme="复盘")

    assert evaluation.passed is True
    assert evaluation.blocking is False
    assert evaluation.overall_score >= 0.9
|
||||
|
||||
|
||||
def test_storybook_evaluator_blocks_quality_gate_failure():
    """Check that a storybook with a repeated page number is blocked by the quality gate."""
    # Both pages deliberately carry page_number=1; the first reported gate
    # issue is expected to be "invalid_storybook_page_number".
    first_page = StorybookPage(page_number=1, text="第一页。", image_prompt="page 1")
    repeated_page = StorybookPage(page_number=1, text="第二页。", image_prompt="page 2")
    storybook = Storybook(
        title="森林绘本",
        main_character="小兔子",
        art_style="水彩",
        cover_prompt="A forest cover",
        pages=[first_page, repeated_page],
    )

    evaluation = evaluate_storybook_output(storybook)

    assert evaluation.passed is False
    assert evaluation.blocking is True
    assert evaluation.gate_error is not None
    first_issue = evaluation.to_metadata()["quality_gate"]["issues"][0]
    assert first_issue["code"] == "invalid_storybook_page_number"
|
||||
|
||||
|
||||
def test_evaluation_golden_cases_replay_successfully():
    """Check that the golden fixture replays cleanly with 11 cases covering both artifact kinds."""
    fixture_path = FIXTURES_DIR / "evaluation_golden_cases.json"

    replay = replay_evaluation_golden_cases(fixture_path)

    # The failure report is the assertion message so a red run names the broken cases.
    assert replay.passed is True, replay.failure_report()
    assert replay.failed_case_ids == ()
    assert len(replay.cases) == 11

    artifacts_seen = {case.artifact for case in replay.cases}
    artifacts_expected = {
        EvaluationReplayArtifact.STORY,
        EvaluationReplayArtifact.STORYBOOK,
    }
    assert artifacts_seen == artifacts_expected
|
||||
|
||||
|
||||
def test_evaluation_golden_cases_report_internal_coverage_summary():
    """Pin the per-category counts in the golden fixture's coverage summary.

    The artifact, age_band, risk_area and outcome buckets are compared as
    whole dicts; only four selected keys of the tags bucket are checked.
    """
    fixture_path = FIXTURES_DIR / "evaluation_golden_cases.json"
    summary = replay_evaluation_golden_cases(fixture_path).coverage_summary()

    expected_artifact = {"storybook": 5, "story": 6}
    expected_age_band = {"3-4": 4, "5-6": 4, "unknown": 2, "7-8": 1}
    expected_risk_area = {
        "schema_error": 4,
        "happy_path": 2,
        "readability_warning": 2,
        "safety_error": 2,
        "length_boundary": 1,
    }
    expected_outcome = {"blocked": 8, "passed": 3}

    assert summary["artifact"] == expected_artifact
    assert summary["age_band"] == expected_age_band
    assert summary["risk_area"] == expected_risk_area
    assert summary["outcome"] == expected_outcome

    assert summary["tags"]["story"] == 6
    assert summary["tags"]["storybook"] == 5
    assert summary["tags"]["blocking"] == 6
    assert summary["tags"]["threshold_block"] == 2
|
||||
|
||||
|
||||
def test_evaluation_replay_reports_expectation_mismatch():
    """Check that a case expecting a score of at least 0.99 is reported as failed by the replay."""
    case_id = "expectation-mismatch"
    story_payload = {
        "mode": "generated",
        "title": "小兔子的花园",
        "story_text": "小兔子学会了和朋友分享水壶。",
        "cover_prompt_suggestion": "A rabbit sharing a watering can",
    }
    # The test expects this 0.99 score floor not to be met.
    expectation = ExpectedEvaluation(
        passed=True,
        blocking=False,
        min_overall_score=0.99,
    )
    mismatching_case = EvaluationReplayCase(
        case_id=case_id,
        artifact=EvaluationReplayArtifact.STORY,
        input_payload={"keywords": "小兔子"},
        output_payload=story_payload,
        expected=expectation,
    )

    replay = run_evaluation_replay_cases([mismatching_case])

    assert replay.passed is False
    assert replay.failed_case_ids == (case_id,)
    assert "expected overall_score >=" in replay.failure_report()
|
||||
|
||||
|
||||
def test_story_quality_gate_rejects_missing_story_text():
|
||||
output = StoryOutput(
|
||||
mode="generated",
|
||||
|
||||
Reference in New Issue
Block a user