Expand generation harness observability

This commit is contained in:
2026-06-24 10:48:23 +08:00
parent 459ca9edef
commit 1f34d80083
35 changed files with 8003 additions and 112 deletions

View File

@@ -47,6 +47,21 @@ interface GenerationProviderStats {
estimated_cost_usd: number
}
/** One aggregated trace bucket: a bucket name and the number of events counted under it. */
interface GenerationTraceBucket {
// Raw bucket key; the component translates it with stepLabel / failureCategoryLabel before display.
name: string
count: number
}
/** Payload of `GET /api/generations/{storyId}/trace-summary` as consumed by this component. */
interface GenerationTraceSummary {
story_id: number
// Nullable; NOTE(review): presumably null when no `days` window was requested — confirm against the backend.
window_days: number | null
total_events: number
failed_events: number
// Bucket lists; the component only reads the first entry of by_step and failure_categories.
by_step: GenerationTraceBucket[]
by_artifact: GenerationTraceBucket[]
failure_categories: GenerationTraceBucket[]
}
const props = withDefaults(
defineProps<{
storyId: number | null
@@ -64,6 +79,7 @@ const props = withDefaults(
// Reactive component state; jobs, activeJob, providerStats and traceSummary are reset by refresh().
const jobs = ref<GenerationJobSummary[]>([])
const activeJob = ref<GenerationJobDetail | null>(null)
const providerStats = ref<GenerationProviderStats | null>(null)
// Trace summary for the current story; null until loaded and after a failed load.
const traceSummary = ref<GenerationTraceSummary | null>(null)
const loading = ref(false)
const actionLoading = ref(false)
const error = ref('')
@@ -79,6 +95,8 @@ const providerSuccessRate = computed(() => {
if (!providerStats.value?.total_calls) return null
return Math.round((providerStats.value.successful_calls / providerStats.value.total_calls) * 100)
})
// First bucket of each list, or null when no summary is loaded or the list is empty.
// NOTE(review): treated as the "main" step / failure in the template — assumes the API returns buckets sorted by count; confirm.
const topTraceStep = computed(() => traceSummary.value?.by_step[0] ?? null)
const topFailureCategory = computed(() => traceSummary.value?.failure_categories[0] ?? null)
const mutedClass = computed(() => (isDark.value ? 'text-white/65' : 'text-gray-500'))
const shellClass = computed(() => (
isDark.value ? 'border-white/10 bg-white/10 text-white backdrop-blur' : 'border-gray-100 bg-white/85 text-gray-900'
@@ -117,15 +135,18 @@ function statusLabel(status?: string) {
function eventLabel(eventType: string) {
const labels: Record<string, string> = {
request_accepted: '请求接收',
workflow_planned: '工作流规划',
worker_started: '后台任务开始',
retry_queued: '重新排队',
cancel_requested: '已请求取消',
context_prepared: '上下文准备',
evaluation_completed: '内容评测',
narrative_generated: '正文生成',
story_saved: '故事保存',
provider_call_started: '供应商调用',
provider_call_succeeded: '供应商成功',
provider_call_failed: '供应商失败',
quality_gate_failed: '质量门失败',
cover_image_started: '封面开始',
cover_image_succeeded: '封面就绪',
cover_image_failed: '封面失败',
@@ -147,6 +168,73 @@ function eventLabel(eventType: string) {
return labels[eventType] ?? eventType
}
/**
 * Translates a workflow step key into its display label.
 *
 * @param step - Raw step value from event metadata or a trace bucket; anything
 *   that is not a string is treated as an empty key.
 * @returns The localized label, or the key itself when no label is defined
 *   (an empty string for non-string input).
 */
function stepLabel(step?: unknown) {
  const labels: Record<string, string> = {
    request_acceptance: '请求接收',
    worker_start: '后台启动',
    context_preparation: '上下文准备',
    narrative_generation: '主内容生成',
    evaluation: '内容评测',
    story_persistence: '故事保存',
    provider_invocation: '供应商调用',
    image_generation: '图片生成',
    audio_generation: '音频生成',
    asset_retry: '资源重试',
    asset_generation: '资源生成',
    postprocessing: '后处理',
    completion: '任务完成',
    cancellation: '取消',
    stale_recovery: '超时收敛',
    unknown: '未知步骤',
  }
  const key = typeof step === 'string' ? step : ''
  // Only consult the table's own entries: a plain-object lookup would otherwise
  // resolve keys such as "constructor" or "toString" to Object.prototype members
  // and return a non-string value.
  const label = Object.prototype.hasOwnProperty.call(labels, key) ? labels[key] : undefined
  return label ?? key
}
/**
 * Translates an artifact key into its display label.
 *
 * @param artifact - Raw artifact value from event metadata; anything that is
 *   not a string is treated as an empty key.
 * @returns The localized label, or the key itself when no label is defined
 *   (an empty string for non-string input).
 */
function artifactLabel(artifact?: unknown) {
  const labels: Record<string, string> = {
    story_text: '故事正文',
    storybook_pages: '绘本分页',
    cover_image: '封面图',
    page_image: '分页插图',
    image: '图片资源',
    audio: '音频',
    achievement_memory: '成长记忆',
    none: '无资源',
    unknown: '未知资源',
  }
  const key = typeof artifact === 'string' ? artifact : ''
  // Own-property check keeps inherited Object.prototype members (e.g.
  // "constructor") from being returned as if they were labels.
  const label = Object.prototype.hasOwnProperty.call(labels, key) ? labels[key] : undefined
  return label ?? key
}
/**
 * Translates a failure category key into its display label.
 *
 * @param category - Raw failure category from event metadata or a trace
 *   bucket; anything that is not a string is treated as an empty key.
 * @returns The localized label, or the key itself when no label is defined
 *   (an empty string for non-string input).
 */
function failureCategoryLabel(category?: unknown) {
  const labels: Record<string, string> = {
    provider_error: '供应商失败',
    schema_error: '结构不完整',
    safety_error: '儿童安全风险',
    timeout: '超时',
    canceled: '用户取消',
    stale_job: '任务卡住',
    storage_error: '存储失败',
    validation_error: '输入校验失败',
    unknown_error: '未知失败',
  }
  const key = typeof category === 'string' ? category : ''
  // Own-property check keeps inherited Object.prototype members (e.g.
  // "toString") from being returned as if they were labels.
  const label = Object.prototype.hasOwnProperty.call(labels, key) ? labels[key] : undefined
  return label ?? key
}
/**
 * Builds the one-line trace annotation shown under an event:
 * step label, artifact label and failure-category label joined with " · ".
 *
 * Empty parts are dropped, and the artifact is omitted when its key is `none`.
 * Returns an empty string when nothing is left to show.
 *
 * @param event - The job event whose `event_metadata` is inspected.
 */
function traceMetaText(event: GenerationJobEvent) {
  // Optional chaining keeps an event without metadata from throwing during render.
  const meta = event.event_metadata
  const rawArtifact = meta?.artifact
  const rawFailureCategory = meta?.failure_category

  const parts = [stepLabel(meta?.step)]
  // Decide on the raw key rather than on the translated label, so the filter
  // keeps working if the display text for "none" is ever reworded.
  if (rawArtifact !== 'none') {
    parts.push(artifactLabel(rawArtifact))
  }
  if (rawFailureCategory) {
    parts.push(failureCategoryLabel(rawFailureCategory))
  }
  return parts.filter(Boolean).join(' · ')
}
function formatTime(value: string) {
return new Intl.DateTimeFormat('zh-CN', {
hour: '2-digit',
@@ -188,22 +276,25 @@ async function selectJob(jobId: string) {
async function refresh() {
if (props.storyId === null) {
jobs.value = []
activeJob.value = null
providerStats.value = null
return
jobs.value = []
activeJob.value = null
providerStats.value = null
traceSummary.value = null
return
}
error.value = ''
const selectedJobId = activeJob.value?.id ?? null
try {
const [nextJobs, stats] = await Promise.all([
const [nextJobs, stats, trace] = await Promise.all([
api.get<GenerationJobSummary[]>(`/api/generations/${props.storyId}/jobs`),
api.get<GenerationProviderStats>(`/api/generations/${props.storyId}/provider-stats`),
api.get<GenerationTraceSummary>(`/api/generations/${props.storyId}/trace-summary`),
])
jobs.value = nextJobs
providerStats.value = stats
traceSummary.value = trace
const nextJobId = (
selectedJobId
? jobs.value.find((job) => job.id === selectedJobId)?.id
@@ -218,6 +309,7 @@ async function refresh() {
jobs.value = []
activeJob.value = null
providerStats.value = null
traceSummary.value = null
error.value = e instanceof Error ? e.message : '生成轨迹加载失败'
}
}
@@ -331,6 +423,32 @@ defineExpose({ refresh })
</div>
</div>
<div
v-if="traceSummary?.total_events"
class="grid gap-3 md:grid-cols-4"
>
<div class="rounded-lg border p-3" :class="panelClass">
<div class="text-xs" :class="mutedClass">流程事件</div>
<div class="mt-1 text-xl font-semibold">{{ traceSummary.total_events }}</div>
</div>
<div class="rounded-lg border p-3" :class="panelClass">
<div class="text-xs" :class="mutedClass">失败事件</div>
<div class="mt-1 text-xl font-semibold">{{ traceSummary.failed_events }}</div>
</div>
<div class="rounded-lg border p-3" :class="panelClass">
<div class="text-xs" :class="mutedClass">主要步骤</div>
<div class="mt-1 text-base font-semibold">
{{ topTraceStep ? `${stepLabel(topTraceStep.name)} · ${topTraceStep.count}` : '暂无' }}
</div>
</div>
<div class="rounded-lg border p-3" :class="panelClass">
<div class="text-xs" :class="mutedClass">主要失败</div>
<div class="mt-1 text-base font-semibold">
{{ topFailureCategory ? `${failureCategoryLabel(topFailureCategory.name)} · ${topFailureCategory.count}` : '暂无' }}
</div>
</div>
</div>
<div v-if="!jobs.length" class="rounded-lg border border-dashed border-gray-200 p-4 text-sm" :class="mutedClass">
暂无生成轨迹旧数据会在下一次资源补全后开始记录
</div>
@@ -445,6 +563,9 @@ defineExpose({ refresh })
<p v-else-if="event.message" class="mt-1 text-xs text-gray-500">
{{ event.message }}
</p>
<p v-if="traceMetaText(event)" class="mt-1 text-xs text-gray-500">
{{ traceMetaText(event) }}
</p>
</div>
</li>
</ol>

View File

@@ -1,4 +1,5 @@
from typing import Literal
from datetime import datetime
from typing import Any, Literal
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel, ConfigDict, Field
@@ -9,6 +10,10 @@ from app.core.admin_auth import admin_guard
from app.db.admin_models import Provider
from app.db.database import get_db
from app.services.adapters.registry import AdapterRegistry
from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics
from app.services.admin_executor_coverage import get_admin_executor_coverage
from app.services.admin_generation_trace import get_admin_generation_job_trace
from app.services.admin_harness_readiness import get_admin_harness_readiness
from app.services.admin_provider_analytics import get_admin_provider_analytics
from app.services.cost_tracker import cost_tracker
from app.services.provider_policy import DEFAULT_PROVIDERS, list_capability_policies
@@ -103,6 +108,169 @@ class ProviderAnalyticsResponse(BaseModel):
by_user: list[ProviderAnalyticsUserBucket]
failure_reasons: list[ProviderAnalyticsFailureReason]
# One `by_artifact` row of the evaluation analytics response: artifact name and its evaluation count.
class EvaluationAnalyticsArtifactBucket(BaseModel):
artifact: str
count: int
# One `by_output_mode` row of the evaluation analytics response: job output mode and its evaluation count.
class EvaluationAnalyticsOutputModeBucket(BaseModel):
output_mode: str
count: int
# One `score_bands` row of the evaluation analytics response: band label and number of evaluations in it.
class EvaluationAnalyticsScoreBandBucket(BaseModel):
band: str
count: int
# One `dimension_scores` row: a scoring dimension, its average score, and the number of samples averaged.
class EvaluationAnalyticsDimensionScore(BaseModel):
dimension: str
average_score: float
count: int
# One `quality_gate_issues` row: a quality-gate issue code and how often it occurred.
class EvaluationAnalyticsQualityGateIssue(BaseModel):
code: str
count: int
# One `failure_categories` row: a failure category and how often it occurred.
class EvaluationAnalyticsFailureCategory(BaseModel):
category: str
count: int
# One `warnings` row: a warning message and how often it occurred.
class EvaluationAnalyticsWarning(BaseModel):
message: str
count: int
# Response model of GET /evaluations/analytics; also embedded in HarnessReadinessResponse.
class EvaluationAnalyticsResponse(BaseModel):
scope: str
# Echo of the optional `days` / `artifact` query filters; None when the filter was not supplied.
window_days: int | None = None
artifact: str | None = None
total_evaluations: int
passed_evaluations: int
blocked_evaluations: int
pass_rate: float
# Optional: may be None when no average is available.
average_score: float | None = None
# Distinct jobs / stories / users among the counted evaluations.
job_count: int
story_count: int
user_count: int
by_artifact: list[EvaluationAnalyticsArtifactBucket]
by_output_mode: list[EvaluationAnalyticsOutputModeBucket]
score_bands: list[EvaluationAnalyticsScoreBandBucket]
dimension_scores: list[EvaluationAnalyticsDimensionScore]
quality_gate_issues: list[EvaluationAnalyticsQualityGateIssue]
failure_categories: list[EvaluationAnalyticsFailureCategory]
warnings: list[EvaluationAnalyticsWarning]
# One `by_plan_mode` row of the executor coverage response: plan mode and its run count.
class ExecutorCoveragePlanModeBucket(BaseModel):
plan_mode: str
count: int
# One `by_output_mode` row of the executor coverage response: job output mode and its run count.
class ExecutorCoverageOutputModeBucket(BaseModel):
output_mode: str
count: int
# Row type shared by `executed_task_keys` and `ignored_task_keys`: a task key and its occurrence count.
class ExecutorCoverageTaskKeyBucket(BaseModel):
task_key: str
count: int
# One `result_assets` row: an asset name and its occurrence count.
class ExecutorCoverageAssetBucket(BaseModel):
asset: str
count: int
# Response model of GET /executors/coverage; also embedded in the job trace and harness readiness responses.
class ExecutorCoverageResponse(BaseModel):
scope: str
# Echo of the optional `days` / `plan_mode` filters; None when the filter was not supplied.
window_days: int | None = None
plan_mode: str | None = None
total_runs: int
total_planned_tasks: int
total_executed_tasks: int
total_ignored_tasks: int
coverage_ratio: float
# Distinct jobs / stories / users among the counted runs.
job_count: int
story_count: int
user_count: int
by_plan_mode: list[ExecutorCoveragePlanModeBucket]
by_output_mode: list[ExecutorCoverageOutputModeBucket]
executed_task_keys: list[ExecutorCoverageTaskKeyBucket]
ignored_task_keys: list[ExecutorCoverageTaskKeyBucket]
result_assets: list[ExecutorCoverageAssetBucket]
# One entry of `AdminGenerationJobTraceResponse.events`.
class AdminGenerationJobEventResponse(BaseModel):
id: int
job_id: str
story_id: int | None = None
event_type: str
status: str
message: str | None = None
# Free-form metadata; defaults to an empty dict when absent.
event_metadata: dict[str, Any] = Field(default_factory=dict)
created_at: datetime
# Response model of GET /generations/jobs/{job_id}/trace: job fields plus its events and executor coverage.
class AdminGenerationJobTraceResponse(BaseModel):
id: str
user_id: str
story_id: int | None = None
output_mode: str
input_type: str
status: str
current_step: str
progress_percent: int
progress_label: str
is_terminal: bool
can_cancel: bool = False
can_retry: bool = False
result_snapshot: dict[str, Any] = Field(default_factory=dict)
error_message: str | None = None
request_payload: dict[str, Any] = Field(default_factory=dict)
# Executor coverage summary for this job.
executor_coverage: ExecutorCoverageResponse
events: list[AdminGenerationJobEventResponse] = Field(default_factory=list)
created_at: datetime
updated_at: datetime
# One entry of `HarnessReadinessResponse.checks`.
class HarnessReadinessCheck(BaseModel):
code: str
# Restricted to the three readiness levels.
status: Literal["ready", "needs_attention", "blocked"]
message: str
details: dict[str, Any] = Field(default_factory=dict)
# Golden replay result embedded in the harness readiness response.
class HarnessReadinessGoldenReplay(BaseModel):
passed: bool
total_cases: int
failed_case_ids: list[str]
# Two-level mapping of string keys to integer counts; defaults to empty.
coverage_summary: dict[str, dict[str, int]] = Field(default_factory=dict)
# Threshold values reported alongside the readiness checks.
class HarnessReadinessThresholds(BaseModel):
min_runtime_evaluations: int
min_executor_runs: int
min_evaluation_pass_rate: float
min_evaluation_average_score: float
min_executor_coverage_ratio: float
# Response model of GET /harness/readiness.
class HarnessReadinessResponse(BaseModel):
scope: str
window_days: int | None = None
# Overall readiness level.
status: Literal["ready", "needs_attention", "blocked"]
thresholds: HarnessReadinessThresholds
checks: list[HarnessReadinessCheck]
golden_replay: HarnessReadinessGoldenReplay
evaluation_analytics: EvaluationAnalyticsResponse
executor_coverage: ExecutorCoverageResponse
@router.get("/providers/adapters")
async def list_available_adapters():
"""获取所有可用的适配器类型 (定义的类)。"""
@@ -137,6 +305,55 @@ async def get_provider_analytics(
)
@router.get("/evaluations/analytics", response_model=EvaluationAnalyticsResponse)
async def get_evaluation_analytics(
# Optional look-back window in days (1-365); omitted means no time filter is passed on.
days: int | None = Query(default=None, ge=1, le=365),
# Optional artifact filter, limited to the two listed artifact names.
artifact: Literal["story_text", "storybook_pages"] | None = Query(default=None),
db: AsyncSession = Depends(get_db),
):
"""Return the internal content-evaluation summary; for the admin control plane only."""
# Thin wrapper: all aggregation happens in the service function.
return await get_admin_evaluation_analytics(
db,
days=days,
artifact=artifact,
)
@router.get("/executors/coverage", response_model=ExecutorCoverageResponse)
async def get_executor_coverage(
# Optional look-back window in days (1-365); omitted means no time filter is passed on.
days: int | None = Query(default=None, ge=1, le=365),
# Optional plan-mode filter, limited to the two listed plan modes.
plan_mode: Literal["asset_generation", "asset_retry"] | None = Query(default=None),
db: AsyncSession = Depends(get_db),
):
"""Return internal executor coverage statistics; for the admin control plane only."""
# Thin wrapper: all aggregation happens in the service function.
return await get_admin_executor_coverage(
db,
days=days,
plan_mode=plan_mode,
)
@router.get("/harness/readiness", response_model=HarnessReadinessResponse)
async def get_harness_readiness(
# Optional look-back window in days (1-365), forwarded unchanged to the service.
days: int | None = Query(default=None, ge=1, le=365),
db: AsyncSession = Depends(get_db),
):
"""Return the internal harness readiness audit summary; for the admin control plane only."""
return await get_admin_harness_readiness(db, days=days)
@router.get(
"/generations/jobs/{job_id}/trace",
response_model=AdminGenerationJobTraceResponse,
)
async def get_generation_job_trace(
job_id: str,
db: AsyncSession = Depends(get_db),
):
"""Return the complete internal generation trace; for admin troubleshooting and review only."""
# The service raises HTTPException(404) when the job does not exist.
return await get_admin_generation_job_trace(db, job_id=job_id)
@router.get("/providers", response_model=list[ProviderResponse])
async def list_providers(db: AsyncSession = Depends(get_db)):
result = await db.execute(select(Provider))

View File

@@ -24,6 +24,7 @@ from app.schemas.story_schemas import (
GenerationProviderStatsResponse,
GenerationRequest,
GenerationResponse,
GenerationTraceSummaryResponse,
StoryAssetRetryRequest,
StoryAudioStatusResponse,
StorybookRequest,
@@ -37,6 +38,7 @@ from app.services import story_service
from app.services.generation_jobs import (
get_generation_job_detail,
get_story_provider_stats,
get_story_trace_summary,
get_user_generation_ops_summary,
get_user_provider_analytics,
list_story_generation_jobs,
@@ -181,6 +183,25 @@ async def get_generation_provider_stats(
)
@router.get(
"/generations/{story_id}/trace-summary",
response_model=GenerationTraceSummaryResponse,
)
async def get_generation_trace_summary(
story_id: int,
# Optional look-back window in days (1-365), forwarded unchanged to the service.
days: int | None = Query(default=None, ge=1, le=365),
user: User = Depends(require_user),
db: AsyncSession = Depends(get_db),
):
"""Get workflow trace summary aggregated from generation job events."""
# The authenticated user's id is passed along with the story id.
# NOTE(review): presumably used by the service to scope the lookup to the caller — confirm.
return await get_story_trace_summary(
db,
story_id=story_id,
user_id=user.id,
days=days,
)
@router.get("/generations/{story_id}", response_model=StoryDetailResponse)
async def get_generation(
story_id: int,

View File

@@ -244,6 +244,25 @@ class GenerationProviderStatsResponse(BaseModel):
failure_reasons: list[GenerationProviderFailureReasonResponse] = Field(default_factory=list)
class GenerationTraceBucketResponse(BaseModel):
"""Aggregated generation trace bucket."""
# Bucket key and the number of events counted under it.
name: str
count: int
class GenerationTraceSummaryResponse(BaseModel):
"""Workflow trace summary aggregated from generation job events."""
story_id: int
# Optional window size in days; None by default.
window_days: int | None = None
total_events: int
failed_events: int
# Bucket lists default to empty so the fields are always present in the response.
by_step: list[GenerationTraceBucketResponse] = Field(default_factory=list)
by_artifact: list[GenerationTraceBucketResponse] = Field(default_factory=list)
failure_categories: list[GenerationTraceBucketResponse] = Field(default_factory=list)
class GenerationProviderAnalyticsResponse(BaseModel):
"""Provider call stats aggregated across one user's generation history."""

View File

@@ -0,0 +1,204 @@
"""Admin-only analytics for internal generation evaluation events."""
from __future__ import annotations
from datetime import datetime, timedelta, timezone
from typing import Any
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.models import GenerationJob, GenerationJobEvent
def _as_float(value: Any) -> float | None:
if isinstance(value, int | float):
return float(value)
return None
def _sorted_count_buckets(counts: dict[str, int], *, key_name: str) -> list[dict[str, Any]]:
return [
{key_name: name, "count": count}
for name, count in sorted(
counts.items(),
key=lambda item: (-item[1], item[0]),
)
]
def _average_bucket(
totals: dict[str, float],
counts: dict[str, int],
*,
key_name: str,
) -> list[dict[str, Any]]:
rows = [
{
key_name: name,
"average_score": round(totals[name] / counts[name], 4),
"count": counts[name],
}
for name in totals
if counts.get(name)
]
rows.sort(key=lambda item: (-int(item["count"]), str(item[key_name])))
return rows
def _score_band(score: float) -> str:
if score >= 0.9:
return "excellent"
if score >= 0.8:
return "good"
if score >= 0.7:
return "pass"
if score > 0:
return "blocked_low_score"
return "blocked_quality_gate"
def _metadata_scores(metadata: dict[str, Any]) -> list[dict[str, Any]]:
raw_scores = metadata.get("scores")
if not isinstance(raw_scores, list):
return []
return [score for score in raw_scores if isinstance(score, dict)]
def _quality_gate_issues(metadata: dict[str, Any]) -> list[dict[str, Any]]:
quality_gate = metadata.get("quality_gate")
if not isinstance(quality_gate, dict):
return []
raw_issues = quality_gate.get("issues")
if not isinstance(raw_issues, list):
return []
return [issue for issue in raw_issues if isinstance(issue, dict)]
async def get_admin_evaluation_analytics(
db: AsyncSession,
*,
days: int | None = None,
artifact: str | None = None,
) -> dict[str, Any]:
"""Aggregate internal evaluation results for the admin control plane.

Loads every ``evaluation_completed`` event joined with its job, optionally
limited to events created within the last ``days`` days, and folds them into
counters. ``artifact`` filters on the event metadata's ``artifact`` value and
is applied in Python after the rows are fetched.

Returns a plain dict with totals, pass rate, average score, distinct
job/story/user counts and the sorted bucket lists.
"""
# Time window: only computed when `days` is given.
cutoff = datetime.now(timezone.utc) - timedelta(days=days) if days is not None else None
query = (
select(GenerationJobEvent, GenerationJob)
.join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
.where(GenerationJobEvent.event_type == "evaluation_completed")
.order_by(GenerationJobEvent.id)
)
if cutoff is not None:
query = query.where(GenerationJobEvent.created_at >= cutoff)
rows = (await db.execute(query)).all()
# Running counters filled by the loop below.
total_evaluations = 0
passed_evaluations = 0
blocked_evaluations = 0
score_total = 0.0
score_count = 0
job_ids: set[str] = set()
story_ids: set[int] = set()
user_ids: set[str] = set()
artifacts: dict[str, int] = {}
output_modes: dict[str, int] = {}
score_bands: dict[str, int] = {}
dimension_totals: dict[str, float] = {}
dimension_counts: dict[str, int] = {}
quality_gate_codes: dict[str, int] = {}
failure_categories: dict[str, int] = {}
warning_counts: dict[str, int] = {}
for event, job in rows:
metadata = event.event_metadata or {}
# Missing or empty artifact values are bucketed as "unknown".
event_artifact = str(metadata.get("artifact") or "unknown")
if artifact is not None and event_artifact != artifact:
continue
total_evaluations += 1
job_ids.add(job.id)
user_ids.add(job.user_id)
# Prefer the story id recorded on the event; fall back to the job's.
if event.story_id is not None:
story_ids.add(int(event.story_id))
elif job.story_id is not None:
story_ids.add(int(job.story_id))
artifacts[event_artifact] = artifacts.get(event_artifact, 0) + 1
output_modes[job.output_mode] = output_modes.get(job.output_mode, 0) + 1
# Only a literal True counts; the two flags are tallied independently of each other.
passed = metadata.get("passed") is True
blocking = metadata.get("blocking") is True
if passed:
passed_evaluations += 1
if blocking:
blocked_evaluations += 1
# Events without a numeric overall score contribute to neither the average nor the bands.
overall_score = _as_float(metadata.get("overall_score"))
if overall_score is not None:
score_total += overall_score
score_count += 1
band = _score_band(overall_score)
score_bands[band] = score_bands.get(band, 0) + 1
for score in _metadata_scores(metadata):
dimension = score.get("dimension")
dimension_score = _as_float(score.get("score"))
# Skip malformed per-dimension entries.
if not isinstance(dimension, str) or dimension_score is None:
continue
dimension_totals[dimension] = dimension_totals.get(dimension, 0.0) + dimension_score
dimension_counts[dimension] = dimension_counts.get(dimension, 0) + 1
for issue in _quality_gate_issues(metadata):
code = issue.get("code")
if isinstance(code, str) and code:
quality_gate_codes[code] = quality_gate_codes.get(code, 0) + 1
failure_category = issue.get("failure_category")
if isinstance(failure_category, str) and failure_category:
failure_categories[failure_category] = (
failure_categories.get(failure_category, 0) + 1
)
warnings = metadata.get("warnings")
if isinstance(warnings, list):
for warning in warnings:
if isinstance(warning, str) and warning:
warning_counts[warning] = warning_counts.get(warning, 0) + 1
return {
"scope": "admin_internal_evaluations",
"window_days": days,
"artifact": artifact,
"total_evaluations": total_evaluations,
"passed_evaluations": passed_evaluations,
"blocked_evaluations": blocked_evaluations,
# 0.0 rather than None when there are no evaluations.
"pass_rate": (
round(passed_evaluations / total_evaluations, 4)
if total_evaluations
else 0.0
),
# None when no event carried a numeric overall score.
"average_score": round(score_total / score_count, 4) if score_count else None,
"job_count": len(job_ids),
"story_count": len(story_ids),
"user_count": len(user_ids),
"by_artifact": _sorted_count_buckets(artifacts, key_name="artifact"),
"by_output_mode": _sorted_count_buckets(output_modes, key_name="output_mode"),
"score_bands": _sorted_count_buckets(score_bands, key_name="band"),
"dimension_scores": _average_bucket(
dimension_totals,
dimension_counts,
key_name="dimension",
),
"quality_gate_issues": _sorted_count_buckets(
quality_gate_codes,
key_name="code",
),
"failure_categories": _sorted_count_buckets(
failure_categories,
key_name="category",
),
"warnings": _sorted_count_buckets(warning_counts, key_name="message"),
}

View File

@@ -0,0 +1,147 @@
"""Admin-only analytics for internal workflow executor coverage."""
from __future__ import annotations
from collections.abc import Iterable
from datetime import datetime, timedelta, timezone
from typing import Any
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.models import GenerationJob, GenerationJobEvent
def _as_int(value: Any) -> int:
if isinstance(value, bool):
return int(value)
if isinstance(value, int):
return value
if isinstance(value, float):
return int(value)
return 0
def _sorted_count_buckets(counts: dict[str, int], *, key_name: str) -> list[dict[str, Any]]:
return [
{key_name: name, "count": count}
for name, count in sorted(
counts.items(),
key=lambda item: (-item[1], item[0]),
)
]
def _iter_strings(value: Any) -> Iterable[str]:
if not isinstance(value, list | tuple | set):
return
for item in value:
if isinstance(item, str) and item:
yield item
def summarize_executor_coverage_rows(
    rows: Iterable[tuple[GenerationJobEvent, GenerationJob]],
    *,
    days: int | None = None,
    plan_mode: str | None = None,
    scope: str = "admin_internal_executor_coverage",
) -> dict[str, Any]:
    """Fold ``(event, job)`` pairs into an executor coverage summary dict.

    Rows whose metadata ``plan_mode`` differs from the ``plan_mode`` filter are
    ignored. ``days`` and ``scope`` are only echoed into the result. The
    coverage ratio is executed / planned tasks rounded to four decimals, or
    ``0.0`` when nothing was planned.
    """

    def bump(counter: dict[str, int], label: str) -> None:
        counter[label] = counter.get(label, 0) + 1

    run_count = 0
    planned = 0
    executed = 0
    ignored = 0
    seen_jobs: set[str] = set()
    seen_stories: set[int] = set()
    seen_users: set[str] = set()
    plan_mode_counts: dict[str, int] = {}
    output_mode_counts: dict[str, int] = {}
    executed_key_counts: dict[str, int] = {}
    ignored_key_counts: dict[str, int] = {}
    asset_counts: dict[str, int] = {}

    # Metadata list fields and the counter each one feeds.
    list_fields = (
        ("executed_task_keys", executed_key_counts),
        ("ignored_task_keys", ignored_key_counts),
        ("result_assets", asset_counts),
    )

    for event, job in rows:
        meta = event.event_metadata or {}
        mode = str(meta.get("plan_mode") or "unknown")
        if plan_mode is not None and mode != plan_mode:
            continue

        run_count += 1
        seen_jobs.add(job.id)
        seen_users.add(job.user_id)

        # The event's own story id wins; the job's is the fallback.
        story_ref = event.story_id if event.story_id is not None else job.story_id
        if story_ref is not None:
            seen_stories.add(int(story_ref))

        bump(plan_mode_counts, mode)
        bump(output_mode_counts, job.output_mode)

        planned += _as_int(meta.get("planned_task_count"))
        executed += _as_int(meta.get("executed_task_count"))
        ignored += _as_int(meta.get("ignored_task_count"))

        for field, counter in list_fields:
            for label in _iter_strings(meta.get(field)):
                bump(counter, label)

    if planned:
        ratio = round(executed / planned, 4)
    else:
        ratio = 0.0

    return {
        "scope": scope,
        "window_days": days,
        "plan_mode": plan_mode,
        "total_runs": run_count,
        "total_planned_tasks": planned,
        "total_executed_tasks": executed,
        "total_ignored_tasks": ignored,
        "coverage_ratio": ratio,
        "job_count": len(seen_jobs),
        "story_count": len(seen_stories),
        "user_count": len(seen_users),
        "by_plan_mode": _sorted_count_buckets(plan_mode_counts, key_name="plan_mode"),
        "by_output_mode": _sorted_count_buckets(output_mode_counts, key_name="output_mode"),
        "executed_task_keys": _sorted_count_buckets(executed_key_counts, key_name="task_key"),
        "ignored_task_keys": _sorted_count_buckets(ignored_key_counts, key_name="task_key"),
        "result_assets": _sorted_count_buckets(asset_counts, key_name="asset"),
    }
async def get_admin_executor_coverage(
    db: AsyncSession,
    *,
    days: int | None = None,
    plan_mode: str | None = None,
) -> dict[str, Any]:
    """Load ``executor_completed`` events and summarize executor coverage.

    When ``days`` is given, only events created within that many days before
    now (UTC) are considered. The ``plan_mode`` filter is applied by the
    summarizer, not by the query.
    """
    stmt = (
        select(GenerationJobEvent, GenerationJob)
        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
        .where(GenerationJobEvent.event_type == "executor_completed")
        .order_by(GenerationJobEvent.id)
    )
    if days is not None:
        window_start = datetime.now(timezone.utc) - timedelta(days=days)
        stmt = stmt.where(GenerationJobEvent.created_at >= window_start)

    result = await db.execute(stmt)
    return summarize_executor_coverage_rows(result.all(), days=days, plan_mode=plan_mode)

View File

@@ -0,0 +1,52 @@
"""Admin-only generation trace detail service."""
from __future__ import annotations
from typing import Any
from fastapi import HTTPException
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.models import GenerationJob, GenerationJobEvent
from app.services.admin_executor_coverage import summarize_executor_coverage_rows
from app.services.generation_jobs import (
generation_event_to_response,
generation_job_to_summary,
)
async def get_admin_generation_job_trace(
    db: AsyncSession,
    *,
    job_id: str,
) -> dict[str, Any]:
    """Assemble the full internal trace of one generation job.

    The result is the job summary extended with the owning user id, the
    request payload, an executor coverage summary built from the job's
    ``executor_completed`` events, and every event of the job in id order.

    Raises:
        HTTPException: 404 when no job with ``job_id`` exists.
    """
    job_result = await db.execute(select(GenerationJob).where(GenerationJob.id == job_id))
    job = job_result.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Generation job not found")

    event_stmt = (
        select(GenerationJobEvent)
        .where(GenerationJobEvent.job_id == job.id)
        .order_by(GenerationJobEvent.id)
    )
    events = (await db.execute(event_stmt)).scalars().all()

    trace: dict[str, Any] = {**generation_job_to_summary(job)}
    trace["user_id"] = job.user_id
    trace["request_payload"] = job.request_payload or {}
    # Coverage is computed from this job's executor events only.
    trace["executor_coverage"] = summarize_executor_coverage_rows(
        [(event, job) for event in events if event.event_type == "executor_completed"],
        scope="admin_internal_job_executor_coverage",
    )
    trace["events"] = [generation_event_to_response(event) for event in events]
    return trace

View File

@@ -0,0 +1,262 @@
"""Admin-only readiness audit for harness-driven generation."""
from __future__ import annotations
from pathlib import Path
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics
from app.services.admin_executor_coverage import get_admin_executor_coverage
from app.services.harness.evaluation_replay import replay_evaluation_golden_cases
# Golden-case fixture file, resolved relative to this module's directory.
_GOLDEN_CASES_PATH = (
Path(__file__).resolve().parent
/ "harness"
/ "fixtures"
/ "evaluation_golden_cases.json"
)
# Readiness thresholds compared against the analytics by the check functions below
# and echoed back in the response's "thresholds" section.
_MIN_RUNTIME_EVALUATIONS = 1
_MIN_EXECUTOR_RUNS = 1
_MIN_EVALUATION_PASS_RATE = 0.7
_MIN_EVALUATION_AVERAGE_SCORE = 0.7
_MIN_EXECUTOR_COVERAGE_RATIO = 0.2
def _check(
*,
code: str,
status: str,
message: str,
details: dict[str, Any] | None = None,
) -> dict[str, Any]:
return {
"code": code,
"status": status,
"message": message,
"details": details or {},
}
def _overall_status(checks: list[dict[str, Any]]) -> str:
statuses = {check["status"] for check in checks}
if "blocked" in statuses:
return "blocked"
if "needs_attention" in statuses:
return "needs_attention"
return "ready"
def _run_golden_replay() -> dict[str, Any]:
    """Replay the golden evaluation cases and return a plain-dict result.

    When the fixture file does not exist, a failed result with zero cases and
    the single failed id ``"fixture_missing"`` is returned instead.
    """
    if _GOLDEN_CASES_PATH.exists():
        outcome = replay_evaluation_golden_cases(_GOLDEN_CASES_PATH)
        return {
            "passed": outcome.passed,
            "total_cases": len(outcome.cases),
            "failed_case_ids": list(outcome.failed_case_ids),
            "coverage_summary": outcome.coverage_summary(),
        }

    return {
        "passed": False,
        "total_cases": 0,
        "failed_case_ids": ["fixture_missing"],
        "coverage_summary": {},
    }
def _golden_replay_check(golden_replay: dict[str, Any]) -> dict[str, Any]:
    """Turn the golden replay result into a readiness check.

    The check is "ready" only when the replay passed and covered at least one
    case; otherwise it is "blocked" and also lists the failed case ids.
    """
    replay_ok = golden_replay["passed"] and golden_replay["total_cases"] > 0
    details: dict[str, Any] = {
        "total_cases": golden_replay["total_cases"],
        "failed_case_count": len(golden_replay["failed_case_ids"]),
    }
    if replay_ok:
        return _check(
            code="golden_replay",
            status="ready",
            message="内部 golden replay 全部通过。",
            details=details,
        )

    details["failed_case_ids"] = golden_replay["failed_case_ids"]
    return _check(
        code="golden_replay",
        status="blocked",
        message="内部 golden replay 未通过,暂停扩大 harness 接管范围。",
        details=details,
    )
def _evaluation_sample_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Check that the window contains enough runtime evaluation samples.

    "ready" when ``total_evaluations`` reaches the minimum, otherwise
    "needs_attention". Both outcomes report the same details.
    """
    sample_count = int(evaluation_analytics["total_evaluations"])
    enough = sample_count >= _MIN_RUNTIME_EVALUATIONS

    if enough:
        status = "ready"
        message = "当前窗口已有内部 evaluation 运行样本。"
    else:
        status = "needs_attention"
        message = "当前窗口缺少内部 evaluation 运行样本,建议先跑生成烟测。"

    return _check(
        code="runtime_evaluation_samples",
        status=status,
        message=message,
        details={
            "total_evaluations": sample_count,
            "min_required": _MIN_RUNTIME_EVALUATIONS,
        },
    )
def _evaluation_quality_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Check runtime evaluation quality against the readiness thresholds.

    - No evaluations at all: "needs_attention".
    - Pass rate below the minimum, or a known average score below the
      minimum: "blocked".
    - Otherwise: "ready" (a missing average score does not block).
    """
    total = int(evaluation_analytics["total_evaluations"])
    pass_rate = float(evaluation_analytics["pass_rate"])
    average_score = evaluation_analytics["average_score"]

    if total == 0:
        return _check(
            code="runtime_evaluation_quality",
            status="needs_attention",
            message="暂无运行期 evaluation 质量样本。",
            details={
                "total_evaluations": total,
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )

    # The average score is only inspected when the pass rate itself is acceptable.
    below_threshold = pass_rate < _MIN_EVALUATION_PASS_RATE or (
        average_score is not None
        and float(average_score) < _MIN_EVALUATION_AVERAGE_SCORE
    )
    observed: dict[str, Any] = {
        "pass_rate": pass_rate,
        "average_score": average_score,
        "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
    }

    if below_threshold:
        return _check(
            code="runtime_evaluation_quality",
            status="blocked",
            message="运行期 evaluation 质量未达到内部 readiness 门槛。",
            details={
                **observed,
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )

    return _check(
        code="runtime_evaluation_quality",
        status="ready",
        message="运行期 evaluation 通过率和平均分达到内部 readiness 门槛。",
        details=observed,
    )
def _executor_sample_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Check that the window contains enough executor coverage runs.

    "ready" when ``total_runs`` reaches the minimum, otherwise
    "needs_attention". Both outcomes report the same details.
    """
    run_count = int(executor_coverage["total_runs"])
    enough = run_count >= _MIN_EXECUTOR_RUNS

    if enough:
        status = "ready"
        message = "当前窗口已有 executor coverage 运行样本。"
    else:
        status = "needs_attention"
        message = "当前窗口缺少 executor coverage 样本,建议先跑资产生成或重试烟测。"

    return _check(
        code="executor_coverage_samples",
        status=status,
        message=message,
        details={
            "total_runs": run_count,
            "min_required": _MIN_EXECUTOR_RUNS,
        },
    )
def _executor_ratio_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Check the executor coverage ratio against the readiness threshold.

    - No runs at all: "needs_attention".
    - Ratio below the minimum: "blocked".
    - Otherwise: "ready".
    """
    run_count = int(executor_coverage["total_runs"])
    ratio = float(executor_coverage["coverage_ratio"])

    if run_count == 0:
        return _check(
            code="executor_coverage_ratio",
            status="needs_attention",
            message="暂无 executor coverage 运行样本。",
            details={
                "total_runs": run_count,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
            },
        )

    if ratio < _MIN_EXECUTOR_COVERAGE_RATIO:
        return _check(
            code="executor_coverage_ratio",
            status="blocked",
            message="executor coverage ratio 未达到内部 readiness 门槛。",
            details={
                "coverage_ratio": ratio,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
                "total_planned_tasks": executor_coverage["total_planned_tasks"],
                "total_executed_tasks": executor_coverage["total_executed_tasks"],
            },
        )

    return _check(
        code="executor_coverage_ratio",
        status="ready",
        message="executor coverage ratio 达到内部 readiness 门槛。",
        details={
            "coverage_ratio": ratio,
            "total_planned_tasks": executor_coverage["total_planned_tasks"],
            "total_executed_tasks": executor_coverage["total_executed_tasks"],
        },
    )
async def get_admin_harness_readiness(
    db: AsyncSession,
    *,
    days: int | None = None,
) -> dict[str, Any]:
    """Build the admin-only harness readiness audit.

    Runs the golden replay, loads evaluation analytics and executor coverage
    for the optional ``days`` window, derives the individual checks and
    returns them together with the thresholds and the raw inputs.
    """
    replay_report = _run_golden_replay()
    # The two analytics queries share ``db`` and are awaited one after another.
    evaluation_report = await get_admin_evaluation_analytics(db, days=days)
    coverage_report = await get_admin_executor_coverage(db, days=days)

    readiness_checks = [
        _golden_replay_check(replay_report),
        _evaluation_sample_check(evaluation_report),
        _evaluation_quality_check(evaluation_report),
        _executor_sample_check(coverage_report),
        _executor_ratio_check(coverage_report),
    ]
    overall = _overall_status(readiness_checks)
    thresholds = {
        "min_runtime_evaluations": _MIN_RUNTIME_EVALUATIONS,
        "min_executor_runs": _MIN_EXECUTOR_RUNS,
        "min_evaluation_pass_rate": _MIN_EVALUATION_PASS_RATE,
        "min_evaluation_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
        "min_executor_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
    }
    return {
        "scope": "admin_internal_harness_readiness",
        "window_days": days,
        "status": overall,
        "thresholds": thresholds,
        "checks": readiness_checks,
        "golden_replay": replay_report,
        "evaluation_analytics": evaluation_report,
        "executor_coverage": coverage_report,
    }

View File

@@ -90,11 +90,13 @@ def _job_progress(job: GenerationJob) -> dict[str, Any]:
progress_map: dict[str, tuple[int, str]] = {
"request_accepted": (5, "已接收请求"),
"workflow_planned": (8, "工作流已规划"),
"retry_queued": (8, "重新排队中"),
"worker_started": (12, "后台任务已开始"),
"cancel_requested": (15, "已请求取消"),
"context_prepared": (20, "上下文已准备"),
"narrative_generated": (45, "正文已生成"),
"evaluation_completed": (52, "内容评测已完成"),
"story_saved": (60, "主记录已保存"),
"provider_call_started": (65, "Provider 调用中"),
"provider_call_succeeded": (72, "Provider 调用成功"),
@@ -307,6 +309,137 @@ def generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any]:
}
# Allow-list of event metadata keys that public_generation_event_metadata may
# copy into user-facing event responses. Keys not listed here are dropped.
_PUBLIC_EVENT_METADATA_KEYS = {
"adapter",
"artifact",
"asset",
"assets",
"attempted_cover",
"audio_status",
"blocks_main_result",
"capability",
"completed_pages",
"cover_prompt_present",
"estimated_cost_usd",
"failed_pages",
"failure_category",
"generation_status",
"has_memory_context",
"image_status",
"input_type",
"latency_ms",
"mode",
"output_mode",
"page_count",
"page_number",
"recoverable",
"requested_from_step",
"retryable",
"scope",
"stale_after_minutes",
"status",
"step",
"strategy",
"text_status",
}
# Allow-list of request payload keys that public_generation_request_payload
# may expose in user-facing job details. Keys not listed here are dropped.
_PUBLIC_REQUEST_PAYLOAD_KEYS = {
"assets",
"child_profile_id",
"generate_images",
"input_type",
"output_mode",
"page_count",
"story_id",
"type",
"universe_id",
}
def _public_metadata_value(value: Any) -> Any:
"""Return a JSON-safe public value or None when the value is internal."""
if isinstance(value, str | int | float | bool) or value is None:
return value
if isinstance(value, list):
public_items = [
item
for item in value
if isinstance(item, str | int | float | bool) or item is None
]
return public_items
return None
def public_generation_request_payload(job: GenerationJob) -> dict[str, Any]:
    """Project a job's request payload onto its user-safe fields.

    Only keys in ``_PUBLIC_REQUEST_PAYLOAD_KEYS`` are considered, in sorted
    order. Each value goes through ``_public_metadata_value``, and entries
    that come back as ``None`` are omitted.
    """
    raw_payload = job.request_payload or {}
    return {
        key: safe_value
        for key in sorted(_PUBLIC_REQUEST_PAYLOAD_KEYS)
        if key in raw_payload
        and (safe_value := _public_metadata_value(raw_payload[key])) is not None
    }
def _public_plan_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
"""Expose only coarse workflow plan metadata to user-facing responses."""
plan = metadata.get("plan")
if not isinstance(plan, dict):
return {}
public: dict[str, Any] = {}
mode = plan.get("mode")
if isinstance(mode, str):
public["plan_mode"] = mode
tasks = plan.get("tasks")
if isinstance(tasks, list):
public["planned_task_count"] = len(tasks)
public["recoverable_task_count"] = sum(
1
for task in tasks
if isinstance(task, dict) and task.get("recoverable") is True
)
return public
def public_generation_event_metadata(event: GenerationJobEvent) -> dict[str, Any]:
    """Filter an event's metadata down to the user-safe subset.

    Keys from ``_PUBLIC_EVENT_METADATA_KEYS`` are copied in sorted order when
    ``_public_metadata_value`` yields a non-``None`` value. For
    ``workflow_planned`` events the coarse plan summary from
    ``_public_plan_metadata`` is merged in afterwards.
    """
    raw_metadata = event.event_metadata or {}
    safe_metadata: dict[str, Any] = {
        key: safe_value
        for key in sorted(_PUBLIC_EVENT_METADATA_KEYS)
        if key in raw_metadata
        and (safe_value := _public_metadata_value(raw_metadata[key])) is not None
    }
    if event.event_type == "workflow_planned":
        safe_metadata.update(_public_plan_metadata(raw_metadata))
    return safe_metadata
def public_generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any] | None:
    """Serialise an event for user-facing APIs.

    Returns ``None`` for ``evaluation_completed`` and ``executor_completed``
    events, which are hidden from users. For any other event the regular
    response is produced and its ``event_metadata`` is replaced by the
    filtered public metadata.
    """
    hidden_event_types = {"evaluation_completed", "executor_completed"}
    if event.event_type not in hidden_event_types:
        payload = generation_event_to_response(event)
        payload["event_metadata"] = public_generation_event_metadata(event)
        return payload
    return None
def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
"""Convert a generation job ORM object to an API summary dict."""
@@ -328,6 +461,23 @@ def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
}
def public_generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
    """Build a job summary for user-facing APIs with internal steps masked.

    When the job's current step is ``evaluation_completed`` or
    ``executor_completed``, the summary is rewritten to show a public step
    with a fixed progress percentage and label, and ``is_terminal`` is set to
    ``False``. All other summaries are returned unchanged.
    """
    # internal step -> (public step, progress percent, progress label)
    public_step_overrides = {
        "evaluation_completed": ("narrative_generated", 45, "正文已生成"),
        "executor_completed": ("workflow_planned", 8, "工作流已规划"),
    }
    summary = generation_job_to_summary(job)
    override = public_step_overrides.get(summary["current_step"])
    if override is not None:
        public_step, percent, label = override
        summary["current_step"] = public_step
        summary["progress_percent"] = percent
        summary["progress_label"] = label
        # NOTE(review): is_terminal is forced to False here; confirm a job can
        # never be in a terminal status while its current step is masked.
        summary["is_terminal"] = False
    return summary
async def get_generation_job_for_user(
db: AsyncSession,
*,
@@ -362,13 +512,13 @@ async def request_generation_job_cancel(
raise HTTPException(status_code=409, detail="当前任务不支持取消")
if job.status == "canceled":
return generation_job_to_summary(job)
return public_generation_job_to_summary(job)
if _is_terminal_status(job.status):
raise HTTPException(status_code=409, detail="当前任务已终止,无法取消")
if job.current_step == "cancel_requested":
return generation_job_to_summary(job)
return public_generation_job_to_summary(job)
if job.current_step in {"request_accepted", "retry_queued"}:
story = None
@@ -391,7 +541,7 @@ async def request_generation_job_cancel(
error_message="Generation canceled by user before worker execution started.",
message="Generation job was canceled before worker execution started.",
)
return generation_job_to_summary(job)
return public_generation_job_to_summary(job)
previous_step = job.current_step
job.error_message = "Cancellation requested by user."
@@ -407,7 +557,7 @@ async def request_generation_job_cancel(
)
await db.commit()
await db.refresh(job)
return generation_job_to_summary(job)
return public_generation_job_to_summary(job)
async def get_generation_job_detail(
@@ -437,9 +587,13 @@ async def get_generation_job_detail(
).scalars().all()
return {
**generation_job_to_summary(job),
"request_payload": job.request_payload or {},
"events": [generation_event_to_response(event) for event in events],
**public_generation_job_to_summary(job),
"request_payload": public_generation_request_payload(job),
"events": [
response
for event in events
if (response := public_generation_event_to_response(event)) is not None
],
}
@@ -461,7 +615,7 @@ async def list_story_generation_jobs(
.order_by(desc(GenerationJob.created_at), desc(GenerationJob.id))
)
).scalars().all()
return [generation_job_to_summary(job) for job in jobs]
return [public_generation_job_to_summary(job) for job in jobs]
async def get_active_story_generation_job(
@@ -513,6 +667,59 @@ def _as_float(value: Any) -> float | None:
return None
def _sorted_buckets(counts: dict[str, int]) -> list[dict[str, Any]]:
return [
{"name": name, "count": count}
for name, count in sorted(
counts.items(),
key=lambda item: (-item[1], item[0]),
)
]
def _aggregate_trace_events(events: list[GenerationJobEvent]) -> dict[str, Any]:
    """Count trace metadata across job events.

    ``evaluation_completed`` and ``executor_completed`` events are skipped
    entirely. For the rest, non-empty string ``step`` values and non-empty
    string ``artifact`` values other than ``"none"`` are tallied. Events whose
    status is ``"failed"`` are counted and grouped by ``failure_category``,
    falling back to ``"unknown_error"`` when no usable category is present.
    """
    hidden_event_types = {"evaluation_completed", "executor_completed"}
    step_counts: dict[str, int] = {}
    artifact_counts: dict[str, int] = {}
    failure_counts: dict[str, int] = {}
    counted_events = 0
    failed_count = 0

    def _bump(tally: dict[str, int], key: str) -> None:
        tally[key] = tally.get(key, 0) + 1

    for event in events:
        if event.event_type in hidden_event_types:
            continue
        counted_events += 1
        trace = event.event_metadata or {}

        step_name = trace.get("step")
        if isinstance(step_name, str) and step_name:
            _bump(step_counts, step_name)

        artifact_name = trace.get("artifact")
        if isinstance(artifact_name, str) and artifact_name not in ("", "none"):
            _bump(artifact_counts, artifact_name)

        if event.status != "failed":
            continue
        failed_count += 1
        category = trace.get("failure_category")
        if not (isinstance(category, str) and category):
            category = "unknown_error"
        _bump(failure_counts, category)

    return {
        "total_events": counted_events,
        "failed_events": failed_count,
        "by_step": _sorted_buckets(step_counts),
        "by_artifact": _sorted_buckets(artifact_counts),
        "failure_categories": _sorted_buckets(failure_counts),
    }
def _aggregate_provider_events(
events: list[GenerationJobEvent],
*,
@@ -679,6 +886,38 @@ async def get_story_provider_stats(
}
async def get_story_trace_summary(
    db: AsyncSession,
    *,
    story_id: int,
    user_id: str,
    days: int | None = None,
) -> dict[str, Any]:
    """Summarise trace events for one story's jobs owned by ``user_id``.

    Events are selected through their job, restricted to the given story and
    user, and ordered by event id. When ``days`` is given, only events created
    within that many days before the current UTC time are included. The
    result carries ``story_id``, ``window_days`` and the aggregated counts.
    """
    filters = [
        GenerationJob.story_id == story_id,
        GenerationJob.user_id == user_id,
    ]
    if days is not None:
        window_start = datetime.now(timezone.utc) - timedelta(days=days)
        filters.append(GenerationJobEvent.created_at >= window_start)

    statement = (
        select(GenerationJobEvent)
        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
        .where(*filters)
        .order_by(GenerationJobEvent.id)
    )
    result = await db.execute(statement)
    trace_events = result.scalars().all()

    summary: dict[str, Any] = {"story_id": story_id, "window_days": days}
    summary.update(_aggregate_trace_events(trace_events))
    return summary
async def get_user_provider_analytics(
db: AsyncSession,
*,

View File

@@ -0,0 +1,322 @@
"""Internal golden-case replay support for harness evaluations.
The replay helpers are intentionally not wired to user-facing APIs. They exist
to make evaluation behavior reproducible in tests and internal tooling.
"""
import json
from collections import Counter
from dataclasses import dataclass, field
from enum import StrEnum
from pathlib import Path
from typing import Any, Iterable
from app.services.adapters.storybook.primary import Storybook, StorybookPage
from app.services.adapters.text.models import StoryOutput
from app.services.harness.evaluators import (
EvaluationDimension,
EvaluationResult,
evaluate_story_output,
evaluate_storybook_output,
)
class EvaluationReplayArtifact(StrEnum):
"""Artifacts supported by deterministic evaluation replay."""
STORY = "story"
STORYBOOK = "storybook"
@dataclass(frozen=True)
class ExpectedEvaluation:
    """Outcome that a golden case expects from the evaluator."""

    # Expected pass/fail verdict and blocking flag.
    passed: bool
    blocking: bool
    # Optional inclusive bounds on the overall score.
    min_overall_score: float | None = None
    max_overall_score: float | None = None
    # Dimensions that must appear among the evaluation scores.
    required_dimensions: tuple[EvaluationDimension, ...] = field(default_factory=tuple)
    # Quality gate issue codes, compared as an ordered tuple.
    quality_gate_codes: tuple[str, ...] = field(default_factory=tuple)
    # Substrings that must each occur in at least one warning.
    warning_substrings: tuple[str, ...] = field(default_factory=tuple)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "ExpectedEvaluation":
        """Create expectations from a JSON-style dict.

        ``passed`` and ``blocking`` are required; all other keys are optional.
        Raises ``KeyError`` for a missing required key and ``ValueError`` for
        an unrecognised dimension name.
        """

        def _items(key: str) -> list[Any]:
            return payload.get(key, [])

        return cls(
            passed=bool(payload["passed"]),
            blocking=bool(payload["blocking"]),
            min_overall_score=payload.get("min_overall_score"),
            max_overall_score=payload.get("max_overall_score"),
            required_dimensions=tuple(
                map(EvaluationDimension, _items("required_dimensions"))
            ),
            quality_gate_codes=tuple(_items("quality_gate_codes")),
            warning_substrings=tuple(_items("warning_substrings")),
        )
@dataclass(frozen=True)
class EvaluationReplayCoverage:
"""Internal coverage labels for one golden replay case."""
age_band: str = "unknown"
content_shape: str = "unknown"
risk_area: str = "unknown"
tags: tuple[str, ...] = field(default_factory=tuple)
@classmethod
def from_payload(cls, payload: dict[str, Any] | None) -> "EvaluationReplayCoverage":
"""Build coverage labels from a JSON-safe payload."""
payload = payload or {}
return cls(
age_band=str(payload.get("age_band", "unknown")),
content_shape=str(payload.get("content_shape", "unknown")),
risk_area=str(payload.get("risk_area", "unknown")),
tags=tuple(str(tag) for tag in payload.get("tags", [])),
)
@dataclass(frozen=True)
class EvaluationReplayCase:
    """A single golden evaluation case: stored output plus expected outcome."""

    case_id: str
    artifact: EvaluationReplayArtifact
    # Raw generated output that is rebuilt into a story or storybook.
    output_payload: dict[str, Any]
    expected: ExpectedEvaluation
    # Evaluator options.
    education_theme: str | None = None
    minimum_score: float = 0.7
    description: str = ""
    input_payload: dict[str, Any] = field(default_factory=dict)
    coverage: EvaluationReplayCoverage = field(default_factory=EvaluationReplayCoverage)

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> "EvaluationReplayCase":
        """Create a case from a JSON-style dict.

        ``minimum_score`` and ``education_theme`` are read from the ``input``
        section first and from the top level of the payload second.
        """
        case_input = dict(payload.get("input", {}))

        def _setting(key: str, fallback: Any = None) -> Any:
            return case_input.get(key, payload.get(key, fallback))

        score_threshold = _setting("minimum_score", 0.7)
        theme = _setting("education_theme")
        return cls(
            case_id=str(payload["id"]),
            artifact=EvaluationReplayArtifact(payload["artifact"]),
            description=str(payload.get("description", "")),
            input_payload=case_input,
            output_payload=dict(payload["output"]),
            education_theme=theme,
            minimum_score=float(score_threshold),
            expected=ExpectedEvaluation.from_payload(payload["expected"]),
            coverage=EvaluationReplayCoverage.from_payload(payload.get("coverage")),
        )

    def evaluate(self) -> EvaluationResult:
        """Evaluate the stored output with the evaluator matching the artifact."""
        evaluator_options = {
            "education_theme": self.education_theme,
            "minimum_score": self.minimum_score,
        }
        if self.artifact == EvaluationReplayArtifact.STORY:
            story = _story_output_from_payload(self.output_payload)
            return evaluate_story_output(story, **evaluator_options)
        storybook = _storybook_from_payload(self.output_payload)
        return evaluate_storybook_output(storybook, **evaluator_options)

    def replay(self) -> "EvaluationReplayCaseResult":
        """Evaluate the case and record any mismatch with its expectations."""
        outcome = self.evaluate()
        mismatches = tuple(_compare_evaluation(self, outcome))
        return EvaluationReplayCaseResult(
            case_id=self.case_id,
            artifact=self.artifact,
            coverage=self.coverage,
            evaluation=outcome,
            failures=mismatches,
        )
@dataclass(frozen=True)
class EvaluationReplayCaseResult:
    """Outcome of replaying one golden case."""

    case_id: str
    artifact: EvaluationReplayArtifact
    coverage: EvaluationReplayCoverage
    evaluation: EvaluationResult
    # Human-readable expectation mismatches; empty when the case matched.
    failures: tuple[str, ...] = field(default_factory=tuple)

    @property
    def expectations_met(self) -> bool:
        """True when no expectation mismatch was recorded."""
        return len(self.failures) == 0
@dataclass(frozen=True)
class EvaluationReplaySuiteResult:
    """Outcome of replaying a collection of golden cases."""

    cases: tuple[EvaluationReplayCaseResult, ...]

    @property
    def passed(self) -> bool:
        """True when every case met its expectations (also for an empty suite)."""
        for case in self.cases:
            if not case.expectations_met:
                return False
        return True

    @property
    def failed_case_ids(self) -> tuple[str, ...]:
        """IDs of the cases with at least one expectation mismatch, in order."""
        mismatched: list[str] = []
        for case in self.cases:
            if not case.expectations_met:
                mismatched.append(case.case_id)
        return tuple(mismatched)

    def failure_report(self) -> str:
        """Join all mismatches as ``"<case_id>: <failure>"`` lines."""
        return "\n".join(
            f"{case.case_id}: {failure}"
            for case in self.cases
            for failure in case.failures
        )

    def coverage_summary(self) -> dict[str, dict[str, int]]:
        """Count cases per artifact, coverage label, tag and pass/blocked outcome."""
        results = self.cases
        labels = [case.coverage for case in results]
        return {
            "artifact": _count_values([case.artifact.value for case in results]),
            "age_band": _count_values([label.age_band for label in labels]),
            "content_shape": _count_values([label.content_shape for label in labels]),
            "risk_area": _count_values([label.risk_area for label in labels]),
            "tags": _count_values([tag for label in labels for tag in label.tags]),
            "outcome": _count_values(
                ["passed" if case.evaluation.passed else "blocked" for case in results]
            ),
        }
def load_evaluation_replay_cases(path: str | Path) -> tuple[EvaluationReplayCase, ...]:
    """Read golden replay cases from a UTF-8 JSON fixture.

    Raises ``ValueError`` when the top-level JSON value is not an array.
    """
    fixture_text = Path(path).read_text(encoding="utf-8")
    decoded = json.loads(fixture_text)
    if isinstance(decoded, list):
        return tuple(map(EvaluationReplayCase.from_payload, decoded))
    raise ValueError("Evaluation replay fixture must be a JSON array.")
def run_evaluation_replay_cases(
    cases: Iterable[EvaluationReplayCase],
) -> EvaluationReplaySuiteResult:
    """Replay each case in order and collect the results into a suite result."""
    replayed = [case.replay() for case in cases]
    return EvaluationReplaySuiteResult(cases=tuple(replayed))
def replay_evaluation_golden_cases(path: str | Path) -> EvaluationReplaySuiteResult:
    """Load the golden cases stored at ``path`` and replay all of them."""
    golden_cases = load_evaluation_replay_cases(path)
    return run_evaluation_replay_cases(golden_cases)
def _story_output_from_payload(payload: dict[str, Any]) -> StoryOutput:
    """Rebuild a ``StoryOutput`` from a fixture dict.

    A missing ``mode`` defaults to ``"generated"``; the other text fields
    default to an empty string.
    """
    read = payload.get
    return StoryOutput(
        mode=read("mode", "generated"),
        title=read("title", ""),
        story_text=read("story_text", ""),
        cover_prompt_suggestion=read("cover_prompt_suggestion", ""),
    )
def _storybook_from_payload(payload: dict[str, Any]) -> Storybook:
    """Rebuild a ``Storybook`` from a fixture dict.

    Pages without an explicit ``page_number`` are numbered by their 1-based
    position. Missing text fields default to an empty string and missing
    URLs to ``None``.
    """
    rebuilt_pages = []
    for position, raw_page in enumerate(payload.get("pages", []), start=1):
        rebuilt_pages.append(
            StorybookPage(
                page_number=raw_page.get("page_number", position),
                text=raw_page.get("text", ""),
                image_prompt=raw_page.get("image_prompt", ""),
                image_url=raw_page.get("image_url"),
            )
        )
    return Storybook(
        title=payload.get("title", ""),
        main_character=payload.get("main_character", ""),
        art_style=payload.get("art_style", ""),
        pages=rebuilt_pages,
        cover_prompt=payload.get("cover_prompt", ""),
        cover_url=payload.get("cover_url"),
    )
def _count_values(values: Iterable[str]) -> dict[str, int]:
counts = Counter(value for value in values if value)
return dict(sorted(counts.items(), key=lambda item: (-item[1], item[0])))
def _compare_evaluation(
    case: EvaluationReplayCase,
    evaluation: EvaluationResult,
) -> list[str]:
    """List every way ``evaluation`` differs from ``case.expected``.

    Checked in order: the ``passed`` and ``blocking`` flags, the optional
    overall-score bounds, required dimensions, quality gate codes (compared
    as ordered tuples) and required warning substrings. An empty list means
    the evaluation matched.
    """
    expected = case.expected
    failures: list[str] = []

    for flag in ("passed", "blocking"):
        wanted = getattr(expected, flag)
        actual = getattr(evaluation, flag)
        if actual != wanted:
            failures.append(f"expected {flag}={wanted}, got {actual}")

    overall = evaluation.overall_score
    lower_bound = expected.min_overall_score
    upper_bound = expected.max_overall_score
    if lower_bound is not None and overall < lower_bound:
        failures.append(f"expected overall_score >= {lower_bound}, got {overall}")
    if upper_bound is not None and overall > upper_bound:
        failures.append(f"expected overall_score <= {upper_bound}, got {overall}")

    scored_dimensions = {entry.dimension for entry in evaluation.scores}
    absent = [
        dimension.value
        for dimension in expected.required_dimensions
        if dimension not in scored_dimensions
    ]
    if absent:
        failures.append(f"missing dimensions: {', '.join(absent)}")

    gate_error = evaluation.gate_error
    if gate_error is None:
        actual_codes: tuple[str, ...] = ()
    else:
        actual_codes = tuple(issue.code.value for issue in gate_error.issues)
    if actual_codes != expected.quality_gate_codes:
        failures.append(
            f"expected quality_gate_codes={list(expected.quality_gate_codes)}, "
            f"got {list(actual_codes)}"
        )

    failures.extend(
        f"missing warning containing: {needle}"
        for needle in expected.warning_substrings
        if not any(needle in warning for warning in evaluation.warnings)
    )
    return failures

View File

@@ -0,0 +1,267 @@
"""Deterministic evaluation helpers for generated child-facing content."""
from dataclasses import dataclass, field
from enum import StrEnum
from typing import Any
from app.services.adapters.storybook.primary import Storybook
from app.services.adapters.text.models import StoryOutput
from app.services.harness.quality_gates import (
QualityGateError,
validate_story_output,
validate_storybook_output,
)
class EvaluationDimension(StrEnum):
"""Stable dimensions used by harness evaluations."""
STRUCTURE = "structure"
SAFETY = "safety"
AGE_FIT = "age_fit"
EDUCATIONAL_VALUE = "educational_value"
READABILITY = "readability"
@dataclass(frozen=True)
class EvaluationScore:
    """Score and explanation for a single evaluation dimension."""

    dimension: EvaluationDimension
    score: float
    reason: str

    def to_metadata(self) -> dict[str, Any]:
        """Serialise the score, using the dimension's string value."""
        return dict(
            dimension=self.dimension.value,
            score=self.score,
            reason=self.reason,
        )
@dataclass(frozen=True)
class EvaluationResult:
    """Evaluation outcome for one generated artifact."""

    overall_score: float
    passed: bool
    blocking: bool
    scores: tuple[EvaluationScore, ...]
    # Set when the quality gate rejected the artifact.
    gate_error: QualityGateError | None = None
    warnings: tuple[str, ...] = field(default_factory=tuple)

    def to_metadata(self) -> dict[str, Any]:
        """Serialise the result.

        The ``quality_gate`` key is present only when a gate error was
        recorded.
        """
        serialized_scores = [entry.to_metadata() for entry in self.scores]
        payload: dict[str, Any] = {
            "overall_score": self.overall_score,
            "passed": self.passed,
            "blocking": self.blocking,
            "scores": serialized_scores,
            "warnings": list(self.warnings),
        }
        if self.gate_error is None:
            return payload
        payload["quality_gate"] = self.gate_error.to_metadata()
        return payload
def _clamp_score(value: float) -> float:
return max(0.0, min(1.0, round(value, 2)))
def _story_text_readability_score(story_text: str) -> float:
"""Score text length with a conservative 3-8 age readability heuristic."""
normalized_length = len(story_text.strip())
if normalized_length < 30:
return 0.45
if normalized_length > 2500:
return 0.72
if normalized_length > 1800:
return 0.84
return 0.96
def _educational_value_score(story_text: str, education_theme: str | None) -> float:
if not education_theme:
return 0.82
return 0.96 if education_theme.strip() in story_text else 0.88
def _storybook_readability_score(page_texts: list[str]) -> float:
if not page_texts:
return 0.0
page_lengths = [len(text.strip()) for text in page_texts]
if any(length < 8 for length in page_lengths):
return 0.62
if any(length > 320 for length in page_lengths):
return 0.78
if any(length > 220 for length in page_lengths):
return 0.88
return 0.96
def _storybook_educational_value_score(
page_texts: list[str],
education_theme: str | None,
) -> float:
if not education_theme:
return 0.82
combined_text = " ".join(page_texts)
return 0.96 if education_theme.strip() in combined_text else 0.88
def evaluate_story_output(
    output: StoryOutput,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Evaluate a generated text story.

    The story first goes through ``validate_story_output``. A
    ``QualityGateError`` produces a blocking result with an overall score of
    0.0 that carries the gate error. Otherwise five dimensions are scored,
    their mean is rounded and clamped, and the result passes when that
    overall score reaches ``minimum_score``.
    """
    try:
        validate_story_output(output)
    except QualityGateError as gate_failure:
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=(
                EvaluationScore(
                    dimension=EvaluationDimension.STRUCTURE,
                    score=0.0,
                    reason="故事结构未通过质量门。",
                ),
                EvaluationScore(
                    dimension=EvaluationDimension.SAFETY,
                    score=0.0,
                    reason="内容未通过儿童安全或结构完整性检查。",
                ),
            ),
            gate_error=gate_failure,
        )

    length_score = _story_text_readability_score(output.story_text)
    theme_score = _educational_value_score(output.story_text, education_theme)

    # (dimension, score, reason) rows; age fit and readability share the
    # length-based score.
    dimension_rows = (
        (EvaluationDimension.STRUCTURE, 1.0, "标题、正文和封面提示词完整。"),
        (EvaluationDimension.SAFETY, 1.0, "未命中确定性儿童安全风险词。"),
        (EvaluationDimension.AGE_FIT, length_score, "根据正文长度估算低龄儿童阅读适配度。"),
        (
            EvaluationDimension.EDUCATIONAL_VALUE,
            theme_score,
            "根据教育主题是否清晰融入正文估算。",
        ),
        (
            EvaluationDimension.READABILITY,
            length_score,
            "根据正文长度估算朗读和亲子共读流畅度。",
        ),
    )
    scores = tuple(
        EvaluationScore(dimension=dimension, score=value, reason=reason)
        for dimension, value, reason in dimension_rows
    )
    overall = _clamp_score(sum(entry.score for entry in scores) / len(scores))

    if length_score < 0.8:
        warnings: tuple[str, ...] = (
            "故事正文长度可能不适合 3-8 岁儿童的完整阅读体验。",
        )
    else:
        warnings = ()
    return EvaluationResult(
        overall_score=overall,
        passed=overall >= minimum_score,
        blocking=overall < minimum_score,
        scores=scores,
        warnings=warnings,
    )
def evaluate_storybook_output(
    output: Storybook,
    *,
    education_theme: str | None = None,
    minimum_score: float = 0.7,
) -> EvaluationResult:
    """Evaluate a generated storybook.

    The storybook first goes through ``validate_storybook_output``. A
    ``QualityGateError`` produces a blocking result with an overall score of
    0.0 that carries the gate error. Otherwise five dimensions are scored
    from the page texts, their mean is rounded and clamped, and the result
    passes when that overall score reaches ``minimum_score``.
    """
    try:
        validate_storybook_output(output)
    except QualityGateError as gate_failure:
        return EvaluationResult(
            overall_score=0.0,
            passed=False,
            blocking=True,
            scores=(
                EvaluationScore(
                    dimension=EvaluationDimension.STRUCTURE,
                    score=0.0,
                    reason="绘本结构未通过质量门。",
                ),
                EvaluationScore(
                    dimension=EvaluationDimension.SAFETY,
                    score=0.0,
                    reason="绘本内容未通过儿童安全或结构完整性检查。",
                ),
            ),
            gate_error=gate_failure,
        )

    page_texts = [page.text for page in output.pages]
    length_score = _storybook_readability_score(page_texts)
    theme_score = _storybook_educational_value_score(page_texts, education_theme)

    # (dimension, score, reason) rows; age fit and readability share the
    # length-based score.
    dimension_rows = (
        (EvaluationDimension.STRUCTURE, 1.0, "绘本标题、分页和页码结构完整。"),
        (EvaluationDimension.SAFETY, 1.0, "未命中确定性儿童安全风险词。"),
        (
            EvaluationDimension.AGE_FIT,
            length_score,
            "根据每页正文长度估算低龄儿童翻页阅读适配度。",
        ),
        (
            EvaluationDimension.EDUCATIONAL_VALUE,
            theme_score,
            "根据教育主题是否清晰融入分页正文估算。",
        ),
        (
            EvaluationDimension.READABILITY,
            length_score,
            "根据分页正文长度估算亲子共读流畅度。",
        ),
    )
    scores = tuple(
        EvaluationScore(dimension=dimension, score=value, reason=reason)
        for dimension, value, reason in dimension_rows
    )
    overall = _clamp_score(sum(entry.score for entry in scores) / len(scores))

    if length_score < 0.8:
        warnings: tuple[str, ...] = (
            "绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。",
        )
    else:
        warnings = ()
    return EvaluationResult(
        overall_score=overall,
        passed=overall >= minimum_score,
        blocking=overall < minimum_score,
        scores=scores,
        warnings=warnings,
    )

View File

@@ -0,0 +1,150 @@
"""Small-step workflow executor helpers for generation harness adoption."""
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any
from sqlalchemy.ext.asyncio import AsyncSession
from app.services.harness.artifacts import AssetCompletionResult
from app.services.harness.plans import WorkflowPlan
from app.services.harness.trace import TraceRecorder
from app.services.harness.types import ArtifactKind, WorkflowStep
if TYPE_CHECKING:
from app.db.models import GenerationJob
# Zero-argument async callable resolving to an AssetCompletionResult; this is
# the handler type accepted by run_asset_plan for image and audio tasks.
AssetTask = Callable[[], Awaitable[AssetCompletionResult]]
@dataclass(frozen=True)
class AssetPlanRunResult:
    """Outcome of running the asset tasks declared by one workflow plan."""

    # Results returned by the handlers that ran, in execution order.
    task_results: tuple[AssetCompletionResult, ...]
    # Keys of plan tasks that were dispatched to a handler.
    executed_task_keys: tuple[str, ...]
    # Keys of plan tasks that matched no handler.
    ignored_task_keys: tuple[str, ...]

    @property
    def result_assets(self) -> tuple[str, ...]:
        """The ``asset`` value of each task result, in execution order."""
        return tuple(outcome.asset for outcome in self.task_results)

    def to_metadata(self, plan: WorkflowPlan) -> dict[str, Any]:
        """Serialise executor coverage for ``plan`` as a plain dict."""
        executed = list(self.executed_task_keys)
        ignored = list(self.ignored_task_keys)
        return {
            "plan_mode": plan.mode.value,
            "planned_task_count": len(plan.tasks),
            "executed_task_count": len(executed),
            "ignored_task_count": len(ignored),
            "result_count": len(self.task_results),
            "executed_task_keys": executed,
            "ignored_task_keys": ignored,
            "result_assets": list(self.result_assets),
        }
async def record_workflow_plan(
    db: AsyncSession,
    *,
    job: "GenerationJob | None",
    plan: WorkflowPlan,
) -> None:
    """Record a ``workflow_planned`` trace step holding the plan snapshot."""
    recorder = TraceRecorder(db)
    plan_metadata = {"plan": plan.to_snapshot()}
    await recorder.record_step(
        job=job,
        event_type="workflow_planned",
        status="succeeded",
        message="Workflow plan selected for this generation request.",
        metadata=plan_metadata,
        step=WorkflowStep.REQUEST_ACCEPTANCE,
        artifact=ArtifactKind.NONE,
        blocks_main_result=True,
    )
async def record_evaluation_result(
    db: AsyncSession,
    *,
    job: "GenerationJob | None",
    story_id: int | None = None,
    metadata: dict[str, Any],
    status: str,
    artifact: ArtifactKind | str = ArtifactKind.STORY_TEXT,
) -> None:
    """Record an ``evaluation_completed`` trace step.

    The step is marked as blocking the main result whenever ``status`` is
    anything other than ``"succeeded"``.
    """
    recorder = TraceRecorder(db)
    evaluation_blocks_result = status != "succeeded"
    await recorder.record_step(
        job=job,
        story_id=story_id,
        event_type="evaluation_completed",
        status=status,
        message="Generated content evaluation completed.",
        metadata=metadata,
        step=WorkflowStep.EVALUATION,
        artifact=artifact,
        blocks_main_result=evaluation_blocks_result,
    )
async def record_executor_result(
    db: AsyncSession,
    *,
    job: "GenerationJob | None",
    plan: WorkflowPlan,
    result: AssetPlanRunResult,
) -> None:
    """Record an ``executor_completed`` trace step with executor coverage metadata."""
    recorder = TraceRecorder(db)
    coverage_metadata = result.to_metadata(plan)
    await recorder.record_step(
        job=job,
        event_type="executor_completed",
        status="succeeded",
        message="Workflow executor completed planned asset tasks.",
        metadata=coverage_metadata,
        step=WorkflowStep.UNKNOWN,
        artifact=ArtifactKind.NONE,
        blocks_main_result=False,
    )
async def run_asset_plan(
    plan: WorkflowPlan,
    *,
    image_task: AssetTask | None = None,
    audio_task: AssetTask | None = None,
) -> AssetPlanRunResult:
    """Run the plan's asset tasks one at a time, in plan order.

    ``complete_image_asset`` is dispatched to ``image_task`` and
    ``complete_audio_asset`` to ``audio_task``. Any other task key is
    recorded as ignored.

    Raises:
        ValueError: if the plan mode is neither ``asset_generation`` nor
            ``asset_retry``, or if a planned task has no handler supplied.
    """
    if plan.mode.value not in {"asset_generation", "asset_retry"}:
        raise ValueError("run_asset_plan only supports asset workflow plans")

    # task key -> (handler, error message used when the handler is missing)
    dispatch = {
        "complete_image_asset": (
            image_task,
            "Asset workflow plan requires an image task handler",
        ),
        "complete_audio_asset": (
            audio_task,
            "Asset workflow plan requires an audio task handler",
        ),
    }
    outcomes: list[AssetCompletionResult] = []
    executed: list[str] = []
    ignored: list[str] = []

    for planned in plan.tasks:
        entry = dispatch.get(planned.key)
        if entry is None:
            ignored.append(planned.key)
            continue
        handler, missing_message = entry
        if handler is None:
            raise ValueError(missing_message)
        outcomes.append(await handler())
        executed.append(planned.key)

    return AssetPlanRunResult(
        task_results=tuple(outcomes),
        executed_task_keys=tuple(executed),
        ignored_task_keys=tuple(ignored),
    )

View File

@@ -0,0 +1,400 @@
[
{
"id": "story-safe-theme-pass",
"artifact": "story",
"description": "完整、儿童安全且清晰包含教育主题的普通故事。",
"coverage": {
"age_band": "5-6",
"content_shape": "short_story",
"risk_area": "happy_path",
"tags": ["theme_present", "safe", "story"]
},
"input": {
"keywords": "小兔子, 月光花园",
"education_theme": "复盘"
},
"output": {
"mode": "generated",
"title": "小兔子的月光花园",
"story_text": "小兔子露露在月光花园里照顾一朵会发光的小花。她先给小花浇水,又邀请朋友一起观察花瓣的变化。晚上睡前,露露和朋友们坐在石凳上复盘今天的努力:下次要先分好小水壶,再轮流照顾花朵。大家都觉得,分享和复盘让花园变得更温暖。",
"cover_prompt_suggestion": "A gentle watercolor rabbit in a moonlit garden"
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "story-long-safe-pass",
"artifact": "story",
"description": "较长但仍适合亲子共读的普通故事。",
"coverage": {
"age_band": "7-8",
"content_shape": "long_story",
"risk_area": "length_boundary",
"tags": ["theme_present", "long_text", "story"]
},
"input": {
"keywords": "小海豚, 图书馆",
"education_theme": "合作"
},
"output": {
"mode": "generated",
"title": "小海豚的蓝色图书馆",
"story_text": "小海豚多多住在一片安静的海湾里,那里有一座用贝壳和海草搭成的蓝色图书馆。每天傍晚,多多都会把漂来的故事贝壳整理好,放进不同的篮子。可是这一天,风浪把贝壳吹得到处都是,小章鱼、小海马和小螃蟹都赶来帮忙。大家先一起数贝壳,再按颜色排队,最后把每个故事放回合适的位置。多多发现,合作不是一个人做得最快,而是大家把自己的办法放在一起。夜晚来临时,蓝色图书馆重新亮起柔柔的光,小伙伴们围坐在门口,听多多讲今天学到的合作故事。",
"cover_prompt_suggestion": "A gentle dolphin organizing a blue underwater library"
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "story-missing-text-blocks",
"artifact": "story",
"description": "故事正文缺失会被确定性质量门阻断。",
"coverage": {
"age_band": "unknown",
"content_shape": "empty_story",
"risk_area": "schema_error",
"tags": ["missing_text", "story", "blocking"]
},
"input": {
"keywords": "小熊, 星星"
},
"output": {
"mode": "generated",
"title": "小熊找星星",
"story_text": "",
"cover_prompt_suggestion": "A bear looking at friendly stars"
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_story_text"
]
}
},
{
"id": "story-missing-cover-prompt-blocks",
"artifact": "story",
"description": "故事正文完整但封面提示词缺失会被结构质量门阻断。",
"coverage": {
"age_band": "5-6",
"content_shape": "short_story",
"risk_area": "schema_error",
"tags": ["missing_cover_prompt", "story", "blocking"]
},
"input": {
"keywords": "小松鼠, 风筝",
"education_theme": "勇敢"
},
"output": {
"mode": "generated",
"title": "小松鼠的风筝",
"story_text": "小松鼠第一次放风筝时有点紧张。朋友们陪它一起数一二三,它鼓起勇敢的心,终于让风筝飞上蓝天。",
"cover_prompt_suggestion": ""
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_cover_prompt"
]
}
},
{
"id": "story-unsafe-term-blocks",
"artifact": "story",
"description": "明显不适合儿童的风险词会被安全质量门阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "short_story",
"risk_area": "safety_error",
"tags": ["unsafe_term", "story", "blocking"]
},
"input": {
"keywords": "小猫, 城堡"
},
"output": {
"mode": "generated",
"title": "小猫的城堡",
"story_text": "小猫在城堡里看到血腥场景,然后感到很害怕。",
"cover_prompt_suggestion": "A cat near a castle"
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"unsafe_child_content"
]
}
},
{
"id": "story-short-high-threshold-blocks",
"artifact": "story",
"description": "结构合格但阅读体验偏短的故事在高阈值下会被内部评测阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "very_short_story",
"risk_area": "readability_warning",
"tags": ["short_text", "threshold_block", "story"]
},
"input": {
"keywords": "小鹿, 书签",
"education_theme": "耐心",
"minimum_score": 0.82
},
"output": {
"mode": "generated",
"title": "小鹿的书签",
"story_text": "小鹿学会了耐心等待。",
"cover_prompt_suggestion": "A deer with a golden bookmark"
},
"expected": {
"passed": false,
"blocking": true,
"min_overall_score": 0.7,
"max_overall_score": 0.8,
"required_dimensions": [
"structure",
"safety",
"readability"
],
"quality_gate_codes": [],
"warning_substrings": [
"正文长度"
]
}
},
{
"id": "storybook-safe-theme-pass",
"artifact": "storybook",
"description": "完整、儿童安全且包含教育主题的绘本分页输出。",
"coverage": {
"age_band": "5-6",
"content_shape": "storybook_3_pages",
"risk_area": "happy_path",
"tags": ["theme_present", "safe", "storybook"]
},
"input": {
"keywords": "小狐狸, 彩虹桥",
"education_theme": "合作"
},
"output": {
"title": "彩虹桥上的合作",
"main_character": "小狐狸米米",
"art_style": "温暖水彩",
"cover_prompt": "A warm watercolor fox near a rainbow bridge",
"pages": [
{
"page_number": 1,
"text": "小狐狸米米在雨后的森林里发现一座亮晶晶的彩虹桥。",
"image_prompt": "A little fox finds a rainbow bridge"
},
{
"page_number": 2,
"text": "桥边的小伙伴们一起商量办法,决定合作把落叶清理干净。",
"image_prompt": "Forest friends work together"
},
{
"page_number": 3,
"text": "大家轮流搬叶子、扶篮子,还互相说谢谢,彩虹桥终于露出笑脸。",
"image_prompt": "Friends carrying leaves together"
}
]
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "storybook-duplicate-page-blocks",
"artifact": "storybook",
"description": "重复页码的绘本结构会被质量门阻断。",
"coverage": {
"age_band": "5-6",
"content_shape": "storybook_invalid_pages",
"risk_area": "schema_error",
"tags": ["duplicate_page", "storybook", "blocking"]
},
"input": {
"keywords": "小熊, 森林"
},
"output": {
"title": "森林里的小熊",
"main_character": "小熊布布",
"art_style": "水彩",
"cover_prompt": "A bear in a forest",
"pages": [
{
"page_number": 1,
"text": "布布在森林里找到一颗松果。",
"image_prompt": "Bear finds a pinecone"
},
{
"page_number": 1,
"text": "布布把松果带给朋友一起观察。",
"image_prompt": "Bear shares the pinecone"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"invalid_storybook_page_number"
]
}
},
{
"id": "storybook-missing-page-blocks",
"artifact": "storybook",
"description": "没有分页内容的绘本会被结构质量门阻断。",
"coverage": {
"age_band": "unknown",
"content_shape": "storybook_empty_pages",
"risk_area": "schema_error",
"tags": ["missing_page", "storybook", "blocking"]
},
"input": {
"keywords": "小鸟, 云朵"
},
"output": {
"title": "小鸟和云朵",
"main_character": "小鸟啾啾",
"art_style": "柔和水彩",
"cover_prompt": "A bird near soft clouds",
"pages": []
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_storybook_page"
]
}
},
{
"id": "storybook-unsafe-term-blocks",
"artifact": "storybook",
"description": "绘本分页文字包含明显不适龄风险词时会被安全质量门阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "storybook_2_pages",
"risk_area": "safety_error",
"tags": ["unsafe_term", "storybook", "blocking"]
},
"input": {
"keywords": "小兔子, 山洞"
},
"output": {
"title": "山洞里的声音",
"main_character": "小兔子米粒",
"art_style": "温暖水彩",
"cover_prompt": "A rabbit near a cave",
"pages": [
{
"page_number": 1,
"text": "米粒走到山洞边,听见奇怪的声音。",
"image_prompt": "Rabbit near a cave"
},
{
"page_number": 2,
"text": "洞里出现血腥画面,米粒吓得跑开。",
"image_prompt": "Rabbit running away"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"unsafe_child_content"
]
}
},
{
"id": "storybook-short-page-warning",
"artifact": "storybook",
"description": "分页正文过短时保留内部警告,用于评测回归。",
"coverage": {
"age_band": "3-4",
"content_shape": "storybook_2_pages",
"risk_area": "readability_warning",
"tags": ["short_page_text", "threshold_block", "storybook"]
},
"input": {
"keywords": "小羊, 风铃",
"minimum_score": 0.85
},
"output": {
"title": "风铃响了",
"main_character": "小羊团团",
"art_style": "柔和蜡笔",
"cover_prompt": "A lamb listening to a wind chime",
"pages": [
{
"page_number": 1,
"text": "风响。",
"image_prompt": "Wind chime rings"
},
{
"page_number": 2,
"text": "团团笑。",
"image_prompt": "Lamb smiles"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"min_overall_score": 0.8,
"max_overall_score": 0.82,
"required_dimensions": [
"structure",
"safety",
"readability"
],
"quality_gate_codes": [],
"warning_substrings": [
"分页正文长度"
]
}
}
]

View File

@@ -69,6 +69,11 @@ def build_story_plan(*, generate_images: bool) -> WorkflowPlan:
step=WorkflowStep.NARRATIVE_GENERATION,
artifact=ArtifactKind.STORY_TEXT,
),
WorkflowTask(
key="evaluate_narrative",
step=WorkflowStep.EVALUATION,
artifact=ArtifactKind.STORY_TEXT,
),
WorkflowTask(
key="persist_story",
step=WorkflowStep.STORY_PERSISTENCE,
@@ -124,6 +129,11 @@ def build_storybook_plan(*, generate_images: bool) -> WorkflowPlan:
step=WorkflowStep.NARRATIVE_GENERATION,
artifact=ArtifactKind.STORYBOOK_PAGES,
),
WorkflowTask(
key="evaluate_storybook_pages",
step=WorkflowStep.EVALUATION,
artifact=ArtifactKind.STORYBOOK_PAGES,
),
]
if generate_images:

View File

@@ -11,6 +11,7 @@ class WorkflowStep(StrEnum):
WORKER_START = "worker_start"
CONTEXT_PREPARATION = "context_preparation"
NARRATIVE_GENERATION = "narrative_generation"
EVALUATION = "evaluation"
STORY_PERSISTENCE = "story_persistence"
PROVIDER_INVOCATION = "provider_invocation"
IMAGE_GENERATION = "image_generation"
@@ -64,6 +65,8 @@ class StepStatus(StrEnum):
EVENT_STEP_MAP: dict[str, WorkflowStep] = {
"request_accepted": WorkflowStep.REQUEST_ACCEPTANCE,
"workflow_planned": WorkflowStep.REQUEST_ACCEPTANCE,
"executor_completed": WorkflowStep.UNKNOWN,
"retry_queued": WorkflowStep.REQUEST_ACCEPTANCE,
"worker_started": WorkflowStep.WORKER_START,
"context_prepared": WorkflowStep.CONTEXT_PREPARATION,
@@ -73,6 +76,7 @@ EVENT_STEP_MAP: dict[str, WorkflowStep] = {
"provider_call_succeeded": WorkflowStep.PROVIDER_INVOCATION,
"provider_call_failed": WorkflowStep.PROVIDER_INVOCATION,
"quality_gate_failed": WorkflowStep.NARRATIVE_GENERATION,
"evaluation_completed": WorkflowStep.EVALUATION,
"cover_image_started": WorkflowStep.IMAGE_GENERATION,
"cover_image_succeeded": WorkflowStep.IMAGE_GENERATION,
"cover_image_failed": WorkflowStep.IMAGE_GENERATION,
@@ -100,6 +104,7 @@ EVENT_STEP_MAP: dict[str, WorkflowStep] = {
EVENT_ARTIFACT_MAP: dict[str, ArtifactKind] = {
"narrative_generated": ArtifactKind.STORY_TEXT,
"quality_gate_failed": ArtifactKind.STORY_TEXT,
"evaluation_completed": ArtifactKind.STORY_TEXT,
"cover_image_started": ArtifactKind.COVER_IMAGE,
"cover_image_succeeded": ArtifactKind.COVER_IMAGE,
"cover_image_failed": ArtifactKind.COVER_IMAGE,

View File

@@ -36,8 +36,8 @@ from app.services.generation_jobs import (
ensure_no_active_story_generation_job,
finish_generation_job,
generation_job_can_retry,
generation_job_to_summary,
get_generation_job_for_user,
public_generation_job_to_summary,
record_generation_event,
)
from app.services.harness.artifacts import (
@@ -57,12 +57,27 @@ from app.services.harness.control import (
ExecutionControl,
GenerationJobCanceledError,
)
from app.services.harness.evaluators import (
EvaluationResult,
evaluate_story_output,
evaluate_storybook_output,
)
from app.services.harness.executor import (
record_evaluation_result,
record_executor_result,
record_workflow_plan,
run_asset_plan,
)
from app.services.harness.plans import (
build_asset_plan,
build_story_plan,
build_storybook_plan,
)
from app.services.harness.quality_gates import (
QualityGateError,
validate_story_output,
validate_storybook_output,
)
from app.services.harness.trace import TraceRecorder
from app.services.harness.types import ArtifactKind
from app.services.memory_service import build_enhanced_memory_context
from app.services.provider_router import (
generate_image,
@@ -129,6 +144,24 @@ async def _record_quality_gate_failure_if_present(
)
async def _record_evaluation_result_if_present(
    db: AsyncSession,
    *,
    job,
    evaluation: EvaluationResult,
    artifact: ArtifactKind | str = ArtifactKind.STORY_TEXT,
) -> None:
    """Forward an evaluation outcome to ``record_evaluation_result``.

    The event metadata comes from ``evaluation.to_metadata()``. The recorded
    status is ``"succeeded"`` when ``evaluation.passed`` is truthy and
    ``"failed"`` otherwise. ``job`` and ``artifact`` are passed through
    unchanged.
    """
    outcome_metadata = evaluation.to_metadata()
    if evaluation.passed:
        outcome_status = "succeeded"
    else:
        outcome_status = "failed"
    await record_evaluation_result(
        db,
        job=job,
        metadata=outcome_metadata,
        status=outcome_status,
        artifact=artifact,
    )
def _asset_result_metadata(result: AssetCompletionResult) -> dict:
"""Build JSON-safe metadata for asset workflow events."""
@@ -643,18 +676,33 @@ async def generate_and_save_story(
user_id=user_id,
generation_job=job,
)
validate_story_output(result)
except QualityGateError as exc:
await _record_quality_gate_failure_if_present(db, job=job, error=exc)
raise HTTPException(
status_code=502,
detail="Story generation failed quality checks, please try again.",
) from exc
except Exception as exc:
raise HTTPException(
status_code=502,
detail="Story generation failed, please try again.",
) from exc
evaluation = evaluate_story_output(
result,
education_theme=request.education_theme,
)
if evaluation.gate_error is not None:
await _record_quality_gate_failure_if_present(
db,
job=job,
error=evaluation.gate_error,
)
await _record_evaluation_result_if_present(
db,
job=job,
evaluation=evaluation,
)
if evaluation.blocking:
raise HTTPException(
status_code=502,
detail="Story generation failed quality checks, please try again.",
)
await _record_job_event_if_present(
db,
job=job,
@@ -758,13 +806,32 @@ async def generate_storybook_service(
user_id=user_id,
generation_job=job,
)
validate_storybook_output(storybook)
except QualityGateError as exc:
await _record_quality_gate_failure_if_present(db, job=job, error=exc)
raise HTTPException(status_code=500, detail=f"故事书质量检查失败: {exc}") from exc
except Exception as e:
logger.error("storybook_generation_failed", error=str(e))
raise HTTPException(status_code=500, detail=f"故事书生成失败: {e}")
evaluation = evaluate_storybook_output(
storybook,
education_theme=request.education_theme,
)
if evaluation.gate_error is not None:
await _record_quality_gate_failure_if_present(
db,
job=job,
error=evaluation.gate_error,
)
await _record_evaluation_result_if_present(
db,
job=job,
evaluation=evaluation,
artifact=ArtifactKind.STORYBOOK_PAGES,
)
if evaluation.blocking:
raise HTTPException(
status_code=500,
detail=f"故事书质量检查失败: {evaluation.gate_error or 'evaluation blocked'}",
)
await _record_job_event_if_present(
db,
job=job,
@@ -1025,28 +1092,50 @@ async def _generate_asset_generation_service_with_job(
if not requested_assets:
raise HTTPException(status_code=400, detail="资源任务缺少 assets。")
plan = build_asset_plan(
output_mode="asset_generation",
assets=requested_assets,
)
await record_workflow_plan(
db,
job=job,
plan=plan,
)
story = await get_story_detail(int(story_id), job.user_id, db)
if "image" in requested_assets:
async def complete_image() -> AssetCompletionResult:
if story.mode == "storybook":
await _complete_storybook_image_assets(story, db, job=job)
else:
await _complete_cover_image_asset(
story,
db,
raise_on_failure=True,
log_event="cover_generation_failed",
job=job,
)
return await _complete_storybook_image_assets(story, db, job=job)
if "audio" in requested_assets:
await _complete_audio_asset(
return await _complete_cover_image_asset(
story,
db,
raise_on_failure=True,
log_event="cover_generation_failed",
job=job,
)
async def complete_audio() -> AssetCompletionResult:
return await _complete_audio_asset(
story,
db,
raise_on_failure=True,
job=job,
)
asset_plan_result = await run_asset_plan(
plan,
image_task=complete_image if "image" in requested_assets else None,
audio_task=complete_audio if "audio" in requested_assets else None,
)
await record_executor_result(
db,
job=job,
plan=plan,
result=asset_plan_result,
)
story = await get_story_detail(story.id, job.user_id, db)
await finish_generation_job(
db,
@@ -1096,7 +1185,7 @@ async def retry_generation_job_service(
)
await _dispatch_generation_job(db, job=retry_job)
await db.refresh(retry_job)
return generation_job_to_summary(retry_job)
return public_generation_job_to_summary(retry_job)
async def _generate_generation_service_with_job(
@@ -1109,6 +1198,11 @@ async def _generate_generation_service_with_job(
"""Run the unified generation workflow after the tracking job has been created."""
if request.output_mode == "storybook":
await record_workflow_plan(
db,
job=job,
plan=build_storybook_plan(generate_images=request.generate_images),
)
storybook = await generate_storybook_service(
StorybookRequest(
keywords=request.data,
@@ -1155,6 +1249,9 @@ async def _generate_generation_service_with_job(
retryable_assets=saved_story.retryable_assets,
)
if request.output_mode == "story" and not request.generate_images:
return await _execute_story_without_assets_plan(request, user_id, db, job=job)
generate_request = GenerateRequest(
type=request.type,
data=request.data,
@@ -1164,6 +1261,11 @@ async def _generate_generation_service_with_job(
)
if request.generate_images:
await record_workflow_plan(
db,
job=job,
plan=build_story_plan(generate_images=True),
)
story = await generate_full_story_service(generate_request, user_id, db, job=job)
saved_story = await get_story_detail(story.id, user_id, db)
await _record_postprocessing_event_if_needed(db, job=job, story=saved_story)
@@ -1222,6 +1324,54 @@ async def _generate_generation_service_with_job(
universe_id=story.universe_id,
retryable_assets=story.retryable_assets,
)
async def _execute_story_without_assets_plan(
    request: GenerationRequest,
    user_id: str,
    db: AsyncSession,
    *,
    job,
) -> GenerationResponse:
    """Run the text-only story workflow under an explicitly recorded plan.

    Steps, in order: record the image-less story plan, generate and persist
    the story, record the post-processing event if needed, mark the job as
    finished, and map the persisted story onto a ``GenerationResponse``.
    """
    await record_workflow_plan(
        db,
        job=job,
        plan=build_story_plan(generate_images=False),
    )

    text_request = GenerateRequest(
        type=request.type,
        data=request.data,
        education_theme=request.education_theme,
        child_profile_id=request.child_profile_id,
        universe_id=request.universe_id,
    )
    saved = await generate_and_save_story(text_request, user_id, db, job=job)

    await _record_postprocessing_event_if_needed(db, job=job, story=saved)
    await finish_generation_job(
        db,
        job=job,
        story=saved,
        current_step="generation_completed",
        message="Story generation completed with a persisted readable narrative.",
    )

    # ``image_url`` backs both ``image_url`` and ``cover_url`` in the response.
    response_fields = {
        "id": saved.id,
        "generation_job_id": job.id,
        "title": saved.title,
        "mode": saved.mode,
        "story_text": saved.story_text,
        "cover_prompt": saved.cover_prompt,
        "image_url": saved.image_url,
        "cover_url": saved.image_url,
        "generation_status": saved.generation_status,
        "text_status": saved.text_status,
        "image_status": saved.image_status,
        "audio_status": saved.audio_status,
        "last_error": saved.last_error,
        "child_profile_id": saved.child_profile_id,
        "universe_id": saved.universe_id,
        "retryable_assets": saved.retryable_assets,
    }
    return GenerationResponse(**response_fields)
async def list_stories(
@@ -1321,36 +1471,7 @@ async def queue_story_asset_generation(
)
await _dispatch_generation_job(db, job=job)
await db.refresh(job)
return generation_job_to_summary(job)
async def _retry_cover_image_asset(story: Story, db: AsyncSession, *, job=None) -> None:
"""Retry cover generation for a text story."""
await _complete_cover_image_asset(
story,
db,
last_error_prefix="封面生成失败",
log_event="cover_asset_retry_failed",
job=job,
)
async def _retry_storybook_image_assets(
story: Story,
db: AsyncSession,
*,
job=None,
) -> None:
"""Retry missing storybook cover/page images."""
await _complete_storybook_image_assets(story, db, job=job)
async def _retry_audio_asset(story: Story, db: AsyncSession, *, job=None) -> None:
"""Retry audio generation while preserving persisted status on provider failure."""
await _complete_audio_asset(story, db, raise_on_failure=False, job=job)
return public_generation_job_to_summary(job)
async def retry_story_assets(
@@ -1374,6 +1495,15 @@ async def retry_story_assets(
try:
story = await get_story_detail(story_id, user_id, db)
plan = build_asset_plan(
output_mode="asset_retry",
assets=requested_assets,
)
await record_workflow_plan(
db,
job=job,
plan=plan,
)
await record_generation_event(
db,
job=job,
@@ -1384,14 +1514,37 @@ async def retry_story_assets(
metadata={"assets": requested_assets},
)
if "image" in requested_assets:
async def retry_image() -> AssetCompletionResult:
if story.mode == "storybook":
await _retry_storybook_image_assets(story, db, job=job)
else:
await _retry_cover_image_asset(story, db, job=job)
return await _complete_storybook_image_assets(story, db, job=job)
if "audio" in requested_assets:
await _retry_audio_asset(story, db, job=job)
return await _complete_cover_image_asset(
story,
db,
last_error_prefix="封面生成失败",
log_event="cover_asset_retry_failed",
job=job,
)
async def retry_audio() -> AssetCompletionResult:
return await _complete_audio_asset(
story,
db,
raise_on_failure=False,
job=job,
)
asset_plan_result = await run_asset_plan(
plan,
image_task=retry_image if "image" in requested_assets else None,
audio_task=retry_audio if "audio" in requested_assets else None,
)
await record_executor_result(
db,
job=job,
plan=plan,
result=asset_plan_result,
)
story = await get_story_detail(story_id, user_id, db)
await finish_generation_job(
@@ -1448,13 +1601,29 @@ async def generate_story_cover(
try:
story = await get_story_detail(story_id, user_id, db)
image_result = await _complete_cover_image_asset(
story,
plan = build_asset_plan(output_mode="asset_generation", assets=["image"])
await record_workflow_plan(
db,
raise_on_failure=True,
log_event="cover_generation_failed",
job=job,
plan=plan,
)
asset_result = await run_asset_plan(
plan,
image_task=lambda: _complete_cover_image_asset(
story,
db,
raise_on_failure=True,
log_event="cover_generation_failed",
job=job,
),
)
await record_executor_result(
db,
job=job,
plan=plan,
result=asset_result,
)
image_result = asset_result.task_results[0] if asset_result.task_results else None
story = await get_story_detail(story_id, user_id, db)
await finish_generation_job(
db,
@@ -1464,7 +1633,11 @@ async def generate_story_cover(
message="Cover image generation completed.",
metadata={"assets": ["image"]},
)
if image_result.succeeded and isinstance(image_result.value, str):
if (
image_result is not None
and image_result.succeeded
and isinstance(image_result.value, str)
):
return image_result.value
except HTTPException as exc:
await finish_generation_job(
@@ -1501,12 +1674,28 @@ async def generate_story_audio(
try:
story = await get_story_detail(story_id, user_id, db)
audio_result = await _complete_audio_asset(
story,
plan = build_asset_plan(output_mode="asset_generation", assets=["audio"])
await record_workflow_plan(
db,
raise_on_failure=True,
job=job,
plan=plan,
)
asset_result = await run_asset_plan(
plan,
audio_task=lambda: _complete_audio_asset(
story,
db,
raise_on_failure=True,
job=job,
),
)
await record_executor_result(
db,
job=job,
plan=plan,
result=asset_result,
)
audio_result = asset_result.task_results[0] if asset_result.task_results else None
story = await get_story_detail(story_id, user_id, db)
await finish_generation_job(
db,
@@ -1516,7 +1705,11 @@ async def generate_story_audio(
message="Story audio generation completed.",
metadata={"assets": ["audio"]},
)
if audio_result.succeeded and isinstance(audio_result.value, bytes):
if (
audio_result is not None
and audio_result.succeeded
and isinstance(audio_result.value, bytes)
):
return audio_result.value
except HTTPException as exc:
await finish_generation_job(

View File

@@ -0,0 +1,400 @@
[
{
"id": "story-safe-theme-pass",
"artifact": "story",
"description": "完整、儿童安全且清晰包含教育主题的普通故事。",
"coverage": {
"age_band": "5-6",
"content_shape": "short_story",
"risk_area": "happy_path",
"tags": ["theme_present", "safe", "story"]
},
"input": {
"keywords": "小兔子, 月光花园",
"education_theme": "复盘"
},
"output": {
"mode": "generated",
"title": "小兔子的月光花园",
"story_text": "小兔子露露在月光花园里照顾一朵会发光的小花。她先给小花浇水,又邀请朋友一起观察花瓣的变化。晚上睡前,露露和朋友们坐在石凳上复盘今天的努力:下次要先分好小水壶,再轮流照顾花朵。大家都觉得,分享和复盘让花园变得更温暖。",
"cover_prompt_suggestion": "A gentle watercolor rabbit in a moonlit garden"
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "story-long-safe-pass",
"artifact": "story",
"description": "较长但仍适合亲子共读的普通故事。",
"coverage": {
"age_band": "7-8",
"content_shape": "long_story",
"risk_area": "length_boundary",
"tags": ["theme_present", "long_text", "story"]
},
"input": {
"keywords": "小海豚, 图书馆",
"education_theme": "合作"
},
"output": {
"mode": "generated",
"title": "小海豚的蓝色图书馆",
"story_text": "小海豚多多住在一片安静的海湾里,那里有一座用贝壳和海草搭成的蓝色图书馆。每天傍晚,多多都会把漂来的故事贝壳整理好,放进不同的篮子。可是这一天,风浪把贝壳吹得到处都是,小章鱼、小海马和小螃蟹都赶来帮忙。大家先一起数贝壳,再按颜色排队,最后把每个故事放回合适的位置。多多发现,合作不是一个人做得最快,而是大家把自己的办法放在一起。夜晚来临时,蓝色图书馆重新亮起柔柔的光,小伙伴们围坐在门口,听多多讲今天学到的合作故事。",
"cover_prompt_suggestion": "A gentle dolphin organizing a blue underwater library"
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "story-missing-text-blocks",
"artifact": "story",
"description": "故事正文缺失会被确定性质量门阻断。",
"coverage": {
"age_band": "unknown",
"content_shape": "empty_story",
"risk_area": "schema_error",
"tags": ["missing_text", "story", "blocking"]
},
"input": {
"keywords": "小熊, 星星"
},
"output": {
"mode": "generated",
"title": "小熊找星星",
"story_text": "",
"cover_prompt_suggestion": "A bear looking at friendly stars"
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_story_text"
]
}
},
{
"id": "story-missing-cover-prompt-blocks",
"artifact": "story",
"description": "故事正文完整但封面提示词缺失会被结构质量门阻断。",
"coverage": {
"age_band": "5-6",
"content_shape": "short_story",
"risk_area": "schema_error",
"tags": ["missing_cover_prompt", "story", "blocking"]
},
"input": {
"keywords": "小松鼠, 风筝",
"education_theme": "勇敢"
},
"output": {
"mode": "generated",
"title": "小松鼠的风筝",
"story_text": "小松鼠第一次放风筝时有点紧张。朋友们陪它一起数一二三,它鼓起勇敢的心,终于让风筝飞上蓝天。",
"cover_prompt_suggestion": ""
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_cover_prompt"
]
}
},
{
"id": "story-unsafe-term-blocks",
"artifact": "story",
"description": "明显不适合儿童的风险词会被安全质量门阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "short_story",
"risk_area": "safety_error",
"tags": ["unsafe_term", "story", "blocking"]
},
"input": {
"keywords": "小猫, 城堡"
},
"output": {
"mode": "generated",
"title": "小猫的城堡",
"story_text": "小猫在城堡里看到血腥场景,然后感到很害怕。",
"cover_prompt_suggestion": "A cat near a castle"
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"unsafe_child_content"
]
}
},
{
"id": "story-short-high-threshold-blocks",
"artifact": "story",
"description": "结构合格但阅读体验偏短的故事在高阈值下会被内部评测阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "very_short_story",
"risk_area": "readability_warning",
"tags": ["short_text", "threshold_block", "story"]
},
"input": {
"keywords": "小鹿, 书签",
"education_theme": "耐心",
"minimum_score": 0.82
},
"output": {
"mode": "generated",
"title": "小鹿的书签",
"story_text": "小鹿学会了耐心等待。",
"cover_prompt_suggestion": "A deer with a golden bookmark"
},
"expected": {
"passed": false,
"blocking": true,
"min_overall_score": 0.7,
"max_overall_score": 0.8,
"required_dimensions": [
"structure",
"safety",
"readability"
],
"quality_gate_codes": [],
"warning_substrings": [
"正文长度"
]
}
},
{
"id": "storybook-safe-theme-pass",
"artifact": "storybook",
"description": "完整、儿童安全且包含教育主题的绘本分页输出。",
"coverage": {
"age_band": "5-6",
"content_shape": "storybook_3_pages",
"risk_area": "happy_path",
"tags": ["theme_present", "safe", "storybook"]
},
"input": {
"keywords": "小狐狸, 彩虹桥",
"education_theme": "合作"
},
"output": {
"title": "彩虹桥上的合作",
"main_character": "小狐狸米米",
"art_style": "温暖水彩",
"cover_prompt": "A warm watercolor fox near a rainbow bridge",
"pages": [
{
"page_number": 1,
"text": "小狐狸米米在雨后的森林里发现一座亮晶晶的彩虹桥。",
"image_prompt": "A little fox finds a rainbow bridge"
},
{
"page_number": 2,
"text": "桥边的小伙伴们一起商量办法,决定合作把落叶清理干净。",
"image_prompt": "Forest friends work together"
},
{
"page_number": 3,
"text": "大家轮流搬叶子、扶篮子,还互相说谢谢,彩虹桥终于露出笑脸。",
"image_prompt": "Friends carrying leaves together"
}
]
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "storybook-duplicate-page-blocks",
"artifact": "storybook",
"description": "重复页码的绘本结构会被质量门阻断。",
"coverage": {
"age_band": "5-6",
"content_shape": "storybook_invalid_pages",
"risk_area": "schema_error",
"tags": ["duplicate_page", "storybook", "blocking"]
},
"input": {
"keywords": "小熊, 森林"
},
"output": {
"title": "森林里的小熊",
"main_character": "小熊布布",
"art_style": "水彩",
"cover_prompt": "A bear in a forest",
"pages": [
{
"page_number": 1,
"text": "布布在森林里找到一颗松果。",
"image_prompt": "Bear finds a pinecone"
},
{
"page_number": 1,
"text": "布布把松果带给朋友一起观察。",
"image_prompt": "Bear shares the pinecone"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"invalid_storybook_page_number"
]
}
},
{
"id": "storybook-missing-page-blocks",
"artifact": "storybook",
"description": "没有分页内容的绘本会被结构质量门阻断。",
"coverage": {
"age_band": "unknown",
"content_shape": "storybook_empty_pages",
"risk_area": "schema_error",
"tags": ["missing_page", "storybook", "blocking"]
},
"input": {
"keywords": "小鸟, 云朵"
},
"output": {
"title": "小鸟和云朵",
"main_character": "小鸟啾啾",
"art_style": "柔和水彩",
"cover_prompt": "A bird near soft clouds",
"pages": []
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_storybook_page"
]
}
},
{
"id": "storybook-unsafe-term-blocks",
"artifact": "storybook",
"description": "绘本分页文字包含明显不适龄风险词时会被安全质量门阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "storybook_2_pages",
"risk_area": "safety_error",
"tags": ["unsafe_term", "storybook", "blocking"]
},
"input": {
"keywords": "小兔子, 山洞"
},
"output": {
"title": "山洞里的声音",
"main_character": "小兔子米粒",
"art_style": "温暖水彩",
"cover_prompt": "A rabbit near a cave",
"pages": [
{
"page_number": 1,
"text": "米粒走到山洞边,听见奇怪的声音。",
"image_prompt": "Rabbit near a cave"
},
{
"page_number": 2,
"text": "洞里出现血腥画面,米粒吓得跑开。",
"image_prompt": "Rabbit running away"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"unsafe_child_content"
]
}
},
{
"id": "storybook-short-page-warning",
"artifact": "storybook",
"description": "分页正文过短时保留内部警告,用于评测回归。",
"coverage": {
"age_band": "3-4",
"content_shape": "storybook_2_pages",
"risk_area": "readability_warning",
"tags": ["short_page_text", "threshold_block", "storybook"]
},
"input": {
"keywords": "小羊, 风铃",
"minimum_score": 0.85
},
"output": {
"title": "风铃响了",
"main_character": "小羊团团",
"art_style": "柔和蜡笔",
"cover_prompt": "A lamb listening to a wind chime",
"pages": [
{
"page_number": 1,
"text": "风响。",
"image_prompt": "Wind chime rings"
},
{
"page_number": 2,
"text": "团团笑。",
"image_prompt": "Lamb smiles"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"min_overall_score": 0.8,
"max_overall_score": 0.82,
"required_dimensions": [
"structure",
"safety",
"readability"
],
"quality_gate_codes": [],
"warning_substrings": [
"分页正文长度"
]
}
}
]

View File

@@ -0,0 +1,610 @@
# Test Cases: Harness Evaluation Driven Generation
## Overview
- **Feature**: Harness evaluation driven generation
- **Requirements Source**: `docs/technical/harness-engineering-modernization.md`
- **Test Coverage**: evaluation scoring, blocking quality failures, workflow plan events, trace aggregation, state transitions, internal golden replay, admin-only analytics, admin-only executor coverage summary, admin-only harness readiness
- **Last Updated**: 2026-06-23
## Test Case Categories
### 1. Functional Tests
#### TC-F-001: 普通故事无图片生成写入评测事件
- **Requirement**: H7-3, H7-4
- **Priority**: High
- **Preconditions**:
- 用户已登录。
- 文本 provider 返回完整、儿童安全的故事。
- **Test Steps**:
1. 调用 `POST /api/generations`,设置 `output_mode=story`、`generate_images=false`。
2. 执行 worker 任务。
3. 查询 job detail。
- **Expected Results**:
- job 状态为 `completed`
- event 顺序包含 `workflow_planned`
- event 顺序包含 `evaluation_completed`
- `evaluation_completed.event_metadata.passed=true`
- `evaluation_completed.event_metadata.overall_score >= 0.7`
- **Postconditions**: 故事已持久化,`story_id` 写入 job。
#### TC-F-003: 用户 Trace summary 不返回评测摘要
- **Requirement**: H7-4, H7B-1
- **Priority**: High
- **Preconditions**:
- 故事已有 `evaluation_completed` job event。
- **Test Steps**:
1. 调用 `GET /api/generations/{story_id}/trace-summary`
2. 检查响应字段。
- **Expected Results**:
- 响应不包含 `evaluation` 字段。
- `by_step` 不包含 `evaluation`
- `by_artifact` 不因 `evaluation_completed` 增加 `story_text` 计数。
- `failed_events` 不统计 `evaluation_completed`
- `total_events` 不统计 `evaluation_completed`,避免通过事件数量泄露内部评测步骤。
- **Postconditions**: 无数据修改。
#### TC-F-004: 用户 Job detail 不返回评测事件
- **Requirement**: H7-4, H7B-2
- **Priority**: High
- **Preconditions**:
- job 已记录 `evaluation_completed` 事件。
- **Test Steps**:
1. 调用 `GET /api/generations/jobs/{job_id}`
2. 检查 `events` 列表。
- **Expected Results**:
- `events` 不包含 `evaluation_completed`
- 响应不包含评测分数、维度分数、通过率或阻断阈值。
- **Postconditions**: 内部数据库事件不被删除。
#### TC-F-002: 完整故事输出获得通过评分
- **Requirement**: H7-1
- **Priority**: High
- **Preconditions**:
- 构造完整 `StoryOutput`
- **Test Steps**:
1. 调用 `evaluate_story_output`
2. 读取 `EvaluationResult`
- **Expected Results**:
- `passed=true`
- `blocking=false`
- scores 包含 `structure`、`safety`、`age_fit`、`educational_value`、`readability`。
- **Postconditions**: 无持久化副作用。
#### TC-F-005: 完整绘本输出获得通过评分
- **Requirement**: H7-1, H7C-1
- **Priority**: High
- **Preconditions**:
- 构造完整 `Storybook`
- **Test Steps**:
1. 调用 `evaluate_storybook_output`
2. 读取 `EvaluationResult`
- **Expected Results**:
- `passed=true`
- `blocking=false`
- scores 包含 `structure`、`safety`、`age_fit`、`educational_value`、`readability`。
- **Postconditions**: 无持久化副作用。
#### TC-F-006: 内部 golden cases 可回放且全部符合预期
- **Requirement**: H7-7, H7-8
- **Priority**: High
- **Preconditions**:
- `backend/app/services/harness/fixtures/evaluation_golden_cases.json` 存在。
- fixture 只由后端测试、内部工具或 admin-only readiness 读取。
- **Test Steps**:
1. 调用 `replay_evaluation_golden_cases`
2. 读取 `EvaluationReplaySuiteResult`
- **Expected Results**:
- `passed=true`
- `failed_case_ids` 为空。
- 普通故事和绘本样本都被覆盖。
- 样本覆盖完整普通故事、较长普通故事、空正文、缺失封面提示词、安全风险词、短文本阈值阻断、绘本重复页码、绘本缺页、绘本安全风险和绘本短分页。
- 结果不通过任何用户端 API 返回。
- **Postconditions**: 无持久化副作用。
#### TC-F-007: 内部 golden replay 覆盖摘要稳定
- **Requirement**: H7-8
- **Priority**: High
- **Preconditions**:
- golden replay suite 已执行。
- **Test Steps**:
1. 调用 `coverage_summary`
2. 检查 artifact、age_band、risk_area、tags 和 outcome 分布。
- **Expected Results**:
- artifact 覆盖 `story=6`、`storybook=5`。
- age_band 覆盖 `3-4`、`5-6`、`7-8`、`unknown`。
- risk_area 覆盖 `happy_path`、`schema_error`、`safety_error`、`readability_warning`、`length_boundary`。
- outcome 覆盖 `passed=3`、`blocked=8`。
- 覆盖摘要不通过任何用户端 API 返回。
- **Postconditions**: 无持久化副作用。
### 2. Edge Case Tests
#### TC-E-001: 很短故事通过结构但产生低龄阅读体验警告
- **Requirement**: H7-1
- **Priority**: Medium
- **Preconditions**:
- 构造标题、正文、封面提示词完整但正文很短的 `StoryOutput`
- **Test Steps**:
1. 调用 `evaluate_story_output`
2. 读取 warnings 和维度分数。
- **Expected Results**:
- 不触发质量门异常。
- `age_fit`、`readability` 分数低于完整故事。
- warnings 包含阅读体验提示。
- **Postconditions**: 无持久化副作用。
#### TC-E-002: 内部 golden replay 能报告预期不匹配
- **Requirement**: H7-7
- **Priority**: Medium
- **Preconditions**:
- 构造一个实际得分低于期望阈值的 `EvaluationReplayCase`
- **Test Steps**:
1. 调用 `run_evaluation_replay_cases`
2. 读取 `failure_report`
- **Expected Results**:
- `passed=false`
- `failed_case_ids` 包含该 case id。
- `failure_report` 包含 `overall_score` 差异。
- **Postconditions**: 无持久化副作用。
### 3. Error Handling Tests
#### TC-ERR-001: 空正文阻断持久化
- **Requirement**: H7-4
- **Priority**: High
- **Preconditions**:
- 文本 provider 返回空 `story_text`
- **Test Steps**:
1. 执行 worker 任务。
2. 查询 job 和 story 表。
3. 查询 job events。
- **Expected Results**:
- job 状态为 `failed`
- 没有 story 被持久化。
- events 包含 `quality_gate_failed`
- events 包含 `evaluation_completed`
- `evaluation_completed.event_metadata.blocking=true`
- **Postconditions**: 用户可重试该 job。
#### TC-ERR-002: 不适龄风险词阻断生成
- **Requirement**: H7-1
- **Priority**: High
- **Preconditions**:
- 构造包含明显不适龄风险词的 `StoryOutput`
- **Test Steps**:
1. 调用 `evaluate_story_output`
2. 读取 `quality_gate` metadata。
- **Expected Results**:
- `passed=false`
- `blocking=true`
- `quality_gate.issues[0].failure_category=safety_error`
- **Postconditions**: 无持久化副作用。
#### TC-ERR-003: 绘本结构错误阻断生成
- **Requirement**: H7-1, H7C-1
- **Priority**: High
- **Preconditions**:
- 构造页码重复或页面缺失的 `Storybook`
- **Test Steps**:
1. 调用 `evaluate_storybook_output`
2. 读取 `quality_gate` metadata。
- **Expected Results**:
- `passed=false`
- `blocking=true`
- `quality_gate.issues[0].code=invalid_storybook_page_number` 或对应结构错误。
- **Postconditions**: 无持久化副作用。
### 4. State Transition Tests
#### TC-ST-001: 普通故事无图片路径事件顺序稳定
- **Requirement**: H7-3
- **Priority**: High
- **Preconditions**:
- job 初始状态为 `running/request_accepted`
- **Test Steps**:
1. 执行 worker 任务。
2. 按 id 查询 events。
- **Expected Results**:
- event 顺序为 `request_accepted`、`worker_started`、`workflow_planned`、`context_prepared`、`evaluation_completed`、`narrative_generated`、`story_saved`、`generation_completed`。
- **Postconditions**: job `current_step=generation_completed`
#### TC-ST-002: 普通故事带图片路径记录可恢复资产计划
- **Requirement**: H9-1, H9-3
- **Priority**: High
- **Preconditions**:
- job 初始状态为 `running/request_accepted`
- 请求设置 `output_mode=story`、`generate_images=true`。
- 文本 provider 返回合格故事,图片 provider 返回封面 URL。
- **Test Steps**:
1. 执行 worker 任务。
2. 按 id 查询内部 events。
3. 读取 `workflow_planned.event_metadata.plan`
- **Expected Results**:
- event 顺序为 `request_accepted`、`worker_started`、`workflow_planned`、`context_prepared`、`evaluation_completed`、`narrative_generated`、`story_saved`、`cover_image_started`、`cover_image_succeeded`、`generation_completed`。
- `plan.mode=story_with_assets`
- plan tasks 包含 `evaluate_narrative`
- plan tasks 包含 `generate_cover_image`
- `generate_cover_image.required=false`
- `generate_cover_image.recoverable=true`
- **Postconditions**: job `current_step=generation_completed`,故事 `image_status=ready`
#### TC-ST-003: 绘本路径记录绘本计划快照
- **Requirement**: H9-2, H9-3
- **Priority**: High
- **Preconditions**:
- job 初始状态为 `running/request_accepted`
- 请求设置 `output_mode=storybook`
- **Test Steps**:
1. 执行 worker 任务。
2. 按 id 查询内部 events。
3. 读取 `workflow_planned.event_metadata.plan`
- **Expected Results**:
- event 顺序包含 `workflow_planned`,且位于 `worker_started` 与 `context_prepared` 之间。
- `plan.mode=storybook`
- plan tasks 包含 `generate_storybook_pages`
- plan tasks 包含 `evaluate_storybook_pages`
- 当 `generate_images=true` 时,plan tasks 包含 `generate_storybook_images`。
- `generate_storybook_images.required=false`
- `generate_storybook_images.recoverable=true`
- **Postconditions**: job `current_step=generation_completed`
#### TC-ST-004: 绘本生成内部记录评测但用户事件脱敏
- **Requirement**: H7C-1, H7B-2, H9-4
- **Priority**: High
- **Preconditions**:
- 绘本生成 job 已执行完成。
- **Test Steps**:
1. 直接查询内部 `generation_job_events`
2. 调用 `GET /api/generations/jobs/{job_id}`
- **Expected Results**:
- 内部事件包含 `evaluation_completed`
- 内部 `evaluation_completed.event_metadata.artifact=storybook_pages`
- 用户 API events 不包含 `evaluation_completed`
- 用户 API 响应不包含 `overall_score`、维度分数、阈值或 golden replay 字段。
- **Postconditions**: job 完成,绘本已持久化。
#### TC-ST-005: 资产生成和重试路径记录资产计划快照
- **Requirement**: H10-1, H10-2, H10-3
- **Priority**: High
- **Preconditions**:
- 故事已有可生成或可重试的图片/音频资源。
- **Test Steps**:
1. 执行 `asset_generation` worker 任务。
2. 调用 `/api/generations/{story_id}/retry-assets`
3. 按 id 查询内部 events。
- **Expected Results**:
- `asset_generation` 事件顺序包含 `workflow_planned`
- `asset_generation` 的 `plan.mode=asset_generation`。
- `asset_retry` 事件顺序包含 `workflow_planned`。
- `asset_retry` 的 `plan.mode=asset_retry`。
- 图片和音频任务在 plan 中为 `required=false`、`recoverable=true`。
- **Postconditions**: 资源状态按原有语义更新。
#### TC-ST-006: 用户事件 metadata 使用白名单脱敏
- **Requirement**: H10-4, H10-5
- **Priority**: High
- **Preconditions**:
- 内部 job events 包含原始 `plan.tasks`、`result_snapshot`、内部阈值或内部错误详情。
- **Test Steps**:
1. 调用 `GET /api/generations/jobs/{job_id}`
2. 检查 `events[*].event_metadata`
- **Expected Results**:
- 用户响应保留 `step`、`artifact`、`asset`、`assets`、`failure_category` 等可解释字段。
- `workflow_planned` 只返回 `plan_mode`、`planned_task_count`、`recoverable_task_count`。
- 用户响应不包含原始 `plan`、`tasks`、`result_snapshot`、内部阈值、内部错误原文。
- 用户响应仍不包含 `evaluation_completed`、`overall_score`、维度分数或 golden replay 字段。
- **Postconditions**: 内部数据库事件不被修改。
#### TC-ST-007: 用户 request payload 使用白名单脱敏
- **Requirement**: H11-1, H11-4
- **Priority**: High
- **Preconditions**:
- 生成 job 的 `request_payload` 同时包含用户输入、公开控制字段、内部调度 token、Provider override 和评测策略。
- **Test Steps**:
1. 调用 `GET /api/generations/jobs/{job_id}`
2. 检查响应中的 `request_payload`
- **Expected Results**:
- 用户响应只保留 `output_mode`、`input_type`、`type`、`story_id`、`assets`、`page_count`、`generate_images` 等安全控制字段。
- 用户响应不包含原始 `data`、`education_theme`、内部调度 token、Provider override 或 evaluation policy。
- 内部数据库中的完整 request payload 不被修改。
- **Postconditions**: 用户端仍可根据公开字段展示任务进度和可用操作。
#### TC-ST-008: 资产 plan runner 按 WorkflowPlan 顺序执行任务
- **Requirement**: H12-1, H12-5
- **Priority**: High
- **Preconditions**:
- 构造 `asset_generation` 或 `asset_retry` plan,包含图片和音频 task。
- **Test Steps**:
1. 调用 `run_asset_plan(...)`
2. 记录 image/audio handler 的调用顺序。
3. 检查 runner 返回的 executed/ignored task keys。
- **Expected Results**:
- 图片和音频 handler 按 plan 中 `WorkflowTask` 顺序执行。
- `start_asset_*`、`complete_asset_*` 这类非资产生产 task 被记录为 ignored,不触发 provider handler。
- 未知非资产 task 默认 ignored,不影响已知资产 task。
- **Postconditions**: 无数据库修改。
#### TC-ST-009: 后台资产生成由 plan runner 执行组合资产
- **Requirement**: H12-2, H12-5
- **Priority**: High
- **Preconditions**:
- 已持久化故事同时具备可生成图片和音频的输入。
- 创建 `asset_generation` job,`assets=["audio", "image"]`。
- **Test Steps**:
1. 调用 worker 执行该 job。
2. 查询 job events 和 story 状态。
- **Expected Results**:
- event stream 为 `workflow_planned` 后依次出现音频和图片生成事件。
- plan tasks 顺序包含 `complete_audio_asset`、`complete_image_asset`。
- story 的 `audio_status` 和 `image_status` 均为 `ready`。
- 用户 API 仍只暴露 coarse plan metadata,不返回原始 `plan.tasks`。
- **Postconditions**: job 完成,资源状态与原有语义一致。
#### TC-ST-010: 用户侧过滤 executor coverage 内部事件
- **Requirement**: H13-4, H13-5
- **Priority**: High
- **Preconditions**:
- 生成 job 包含内部 `executor_completed` 事件。
- `executor_completed.event_metadata` 包含 task keys 和 result assets。
- **Test Steps**:
1. 调用 `GET /api/generations/jobs/{job_id}`
2. 调用 `GET /api/generations/{story_id}/jobs`
3. 调用 `GET /api/generations/{story_id}/trace-summary`
- **Expected Results**:
- 用户 job detail 不包含 `executor_completed`
- 用户 job detail 不包含 `executed_task_keys`、`ignored_task_keys` 或具体 task key。
- 当 job 当前步骤短暂停留在 `executor_completed` 时,用户 summary 显示为安全公开的 `workflow_planned` 进度。
- 用户 trace summary 不包含 `executor_completed` 或具体 task key。
- 用户 trace summary 的 `total_events` 不统计内部 `executor_completed`
- **Postconditions**: 内部数据库事件不被修改。
### 5. Admin-Only Analytics Tests
#### TC-ADM-001: 管理端评测 analytics 聚合内部评测事件
- **Requirement**: H8-1, H8-2
- **Priority**: High
- **Preconditions**:
- 数据库存在多个用户的 `evaluation_completed` 事件。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/evaluations/analytics`
2. 检查聚合结果。
- **Expected Results**:
- 返回通过数、阻断数、通过率和平均分。
- 返回 artifact、output mode、score band、dimension score、quality gate issue、failure category 和 warning 聚合。
- 不返回故事正文、prompt、单条 evaluation event 或评分 reason。
- **Postconditions**: 无数据修改。
#### TC-ADM-002: 管理端评测 analytics 支持过滤
- **Requirement**: H8-3
- **Priority**: Medium
- **Preconditions**:
- 数据库存在新旧评测事件以及不同 artifact。
- **Test Steps**:
1. 调用 `GET /admin/evaluations/analytics?days=7`
2. 调用 `GET /admin/evaluations/analytics?artifact=story_text`
3. 调用非法 artifact。
- **Expected Results**:
- `days` 过滤只统计窗口内事件。
- `artifact` 过滤只统计对应 artifact。
- 非法 artifact 返回 `422`
- **Postconditions**: 无数据修改。
#### TC-ADM-003: 管理端评测 analytics 需要 admin 鉴权
- **Requirement**: H8-2
- **Priority**: High
- **Preconditions**:
- 未提供 admin Basic Auth。
- **Test Steps**:
1. 调用 `GET /admin/evaluations/analytics`
- **Expected Results**:
- 返回 `401`
- 不返回任何评测统计。
- **Postconditions**: 无数据修改。
#### TC-ADM-004: 管理端完整生成 trace 返回内部事件流
- **Requirement**: H11-2, H11-3, H11-4
- **Priority**: High
- **Preconditions**:
- 数据库存在包含 `workflow_planned` 和 `evaluation_completed` 的生成 job。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/generations/jobs/{job_id}/trace`
2. 检查 request payload 与 event stream。
- **Expected Results**:
- 返回完整 request payload,包括原始用户输入和内部调度字段。
- 返回完整 `workflow_planned.event_metadata.plan.tasks`
- 返回 `evaluation_completed` 事件及其内部评分 metadata。
- 响应包含 `user_id`,便于管理控制面审计。
- **Postconditions**: 无数据修改。
#### TC-ADM-005: 管理端完整生成 trace 需要 admin 鉴权
- **Requirement**: H11-3
- **Priority**: High
- **Preconditions**:
- 未提供 admin Basic Auth。
- **Test Steps**:
1. 调用 `GET /admin/generations/jobs/{job_id}/trace`
- **Expected Results**:
- 返回 `401`
- 不返回 request payload 或内部 event metadata。
- **Postconditions**: 无数据修改。
#### TC-ADM-006: 管理端 executor coverage 聚合内部执行事件
- **Requirement**: H13-1, H13-2, H13-3, H13-5
- **Priority**: High
- **Preconditions**:
- 数据库存在多个 `executor_completed` 事件。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/executors/coverage`
2. 调用 `GET /admin/executors/coverage?plan_mode=asset_retry`
3. 调用非法 plan mode。
- **Expected Results**:
- 返回 total runs、planned/executed/ignored task counts 和 coverage ratio。
- 返回 plan mode、output mode、executed task keys、ignored task keys 和 result assets 聚合。
- `plan_mode` 过滤只统计对应 executor run。
- 非法 plan mode 返回 `422`
- **Postconditions**: 无数据修改。
#### TC-ADM-007: 管理端 executor coverage 需要 admin 鉴权
- **Requirement**: H13-3
- **Priority**: High
- **Preconditions**:
- 未提供 admin Basic Auth。
- **Test Steps**:
1. 调用 `GET /admin/executors/coverage`
- **Expected Results**:
- 返回 `401`
- 不返回 executor task keys 或 coverage metadata。
- **Postconditions**: 无数据修改。
#### TC-ADM-008: 管理端完整生成 trace 返回单 job executor coverage 摘要
- **Requirement**: H14-1, H14-2, H14-4
- **Priority**: High
- **Preconditions**:
- 数据库存在包含 `executor_completed` 事件的生成 job。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/generations/jobs/{job_id}/trace`
2. 检查 `executor_coverage`
- **Expected Results**:
- 响应包含 `executor_coverage.scope=admin_internal_job_executor_coverage`
- `executor_coverage` 只统计当前 job 的 runs、planned/executed/ignored task counts 和 coverage ratio。
- `executor_coverage.executed_task_keys`、`ignored_task_keys`、`result_assets` 与当前 job 的内部 executor event 一致。
- 完整 event stream 仍保留 `executor_completed`,便于 admin 调试。
- **Postconditions**: 无数据修改。
#### TC-ADM-009: 管理端 harness readiness 聚合内部质量门
- **Requirement**: H15-1, H15-2, H15-3, H15-4
- **Priority**: High
- **Preconditions**:
- app 内部 harness fixture 存在 golden replay cases。
- 数据库存在至少一条通过的 `evaluation_completed` 事件。
- 数据库存在至少一条 `executor_completed` 事件。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/harness/readiness`
2. 检查 readiness status、checks 和聚合摘要。
- **Expected Results**:
- `status=ready`
- checks 包含 `golden_replay`、`runtime_evaluation_samples`、`runtime_evaluation_quality`、`executor_coverage_samples`、`executor_coverage_ratio`。
- golden replay 显示全部通过。
- evaluation analytics 与 executor coverage 只以聚合形式返回。
- 响应不包含故事标题、正文、prompt、score reason 或 quality gate message。
- **Postconditions**: 无数据修改。
#### TC-ADM-010: 管理端 harness readiness 阻断低质量运行样本并需要 admin 鉴权
- **Requirement**: H15-2, H15-3, H15-4, H15-5
- **Priority**: High
- **Preconditions**:
- 数据库存在低质量或 blocking 的 `evaluation_completed` 事件。
- executor coverage 运行样本缺失或不足。
- **Test Steps**:
1. 通过 admin guard 调用 `GET /admin/harness/readiness`
2. 未提供 admin Basic Auth 调用同一路径。
- **Expected Results**:
- 有 admin 权限时返回 `status=blocked`
- `runtime_evaluation_quality.status=blocked`
- executor 样本缺失时对应 check 为 `needs_attention`
- 无 admin 权限时返回 `401`
- 响应不包含 quality gate message 或单条事件明细。
- **Postconditions**: 无数据修改。
## Test Coverage Matrix
| Requirement ID | Test Cases | Coverage Status |
| --- | --- | --- |
| H7-1 | TC-F-002, TC-F-005, TC-E-001, TC-ERR-002, TC-ERR-003 | Complete |
| H7-2 | TC-F-001, TC-ST-001 | Complete |
| H7-3 | TC-F-001, TC-ST-001 | Complete |
| H7-4 | TC-F-003, TC-ERR-001 | Complete |
| H7-5 | This document | Complete |
| H7-7 | TC-F-006, TC-E-002 | Complete |
| H7-8 | TC-F-006, TC-F-007 | Complete |
| H7B-1 | TC-F-003 | Complete |
| H7B-2 | TC-F-004 | Complete |
| H7C-1 | TC-F-005, TC-ERR-003, TC-ST-004 | Complete |
| H8-1 | TC-ADM-001 | Complete |
| H8-2 | TC-ADM-001, TC-ADM-003 | Complete |
| H8-3 | TC-ADM-002 | Complete |
| H8-4 | TC-F-003, TC-F-004, TC-ADM-001 | Complete |
| H9-1 | TC-ST-002 | Complete |
| H9-2 | TC-ST-003 | Complete |
| H9-3 | TC-ST-001, TC-ST-002, TC-ST-003 | Complete |
| H9-4 | TC-F-003, TC-F-004, TC-ST-004 | Complete |
| H10-1 | TC-ST-005 | Complete |
| H10-2 | TC-ST-005 | Complete |
| H10-3 | TC-ST-005 | Complete |
| H10-4 | TC-ST-006 | Complete |
| H10-5 | TC-ST-005, TC-ST-006 | Complete |
| H11-1 | TC-ST-007 | Complete |
| H11-2 | TC-ADM-004 | Complete |
| H11-3 | TC-ADM-004, TC-ADM-005 | Complete |
| H11-4 | TC-ST-007, TC-ADM-004, TC-ADM-005 | Complete |
| H11-5 | This document, `docs/planning/harness-stage-11-report.md` | Complete |
| H12-1 | TC-ST-008 | Complete |
| H12-2 | TC-ST-009 | Complete |
| H12-3 | TC-ST-005, TC-ST-008 | Complete |
| H12-4 | TC-ST-005, backend story endpoint regression tests | Complete |
| H12-5 | TC-ST-008, TC-ST-009 | Complete |
| H13-1 | TC-ADM-006 | Complete |
| H13-2 | TC-ST-009, TC-ADM-006 | Complete |
| H13-3 | TC-ADM-006, TC-ADM-007 | Complete |
| H13-4 | TC-ST-010 | Complete |
| H13-5 | TC-ST-010, TC-ADM-006, TC-ADM-007 | Complete |
| H14-1 | TC-ADM-006, TC-ADM-008 | Complete |
| H14-2 | TC-ADM-008 | Complete |
| H14-3 | TC-ST-010 | Complete |
| H14-4 | TC-ST-010, TC-ADM-008 | Complete |
| H14-5 | This document, `docs/planning/harness-stage-14-report.md` | Complete |
| H15-1 | TC-F-006, TC-ADM-009 | Complete |
| H15-2 | TC-ADM-009, TC-ADM-010 | Complete |
| H15-3 | TC-ADM-009, TC-ADM-010 | Complete |
| H15-4 | TC-ADM-009, TC-ADM-010 | Complete |
| H15-5 | This document, `docs/planning/harness-stage-15-report.md` | Complete |
## Notes
- 当前自动化已覆盖 TC-F-001、TC-F-002、TC-F-003、TC-F-004、TC-F-005、TC-F-006、TC-F-007、TC-E-002、TC-ERR-001、TC-ERR-002、TC-ERR-003、TC-ST-001、TC-ST-002、TC-ST-003、TC-ST-004、TC-ST-005、TC-ST-006、TC-ST-007、TC-ST-008、TC-ST-009、TC-ST-010、TC-ADM-001、TC-ADM-002、TC-ADM-003、TC-ADM-004、TC-ADM-005、TC-ADM-006、TC-ADM-007、TC-ADM-008、TC-ADM-009、TC-ADM-010。
- TC-E-001 可在下一轮补成显式单测。
- 所有 `evaluation_completed`、golden replay 和评分维度数据均按内部质量资产处理,不应进入用户端接口或用户前端。
- `GET /admin/evaluations/analytics` 只允许 admin-only 聚合摘要,不应返回原始内容、prompt、单条事件或评分 reason。
- `GET /admin/generations/jobs/{job_id}/trace` 是 admin-only 调试和审查接口,可返回完整内部链路,不应被用户前端调用。
- `GET /admin/executors/coverage` 是 admin-only executor 覆盖率接口,可返回 task keys 和 result assets,不应被用户前端调用。
- `GET /admin/generations/jobs/{job_id}/trace` 可返回当前 job 的 `executor_coverage` 摘要;该摘要与 task keys 一样属于内部执行资产。
- `GET /admin/harness/readiness` 是 admin-only harness 上线前审查摘要,可返回聚合 readiness、thresholds、golden coverage、evaluation analytics 和 executor coverage,不应返回正文、prompt、score reason、quality gate message 或单条事件明细。

View File

@@ -27,6 +27,17 @@ def _build_admin_test_app(db_session) -> FastAPI:
return app
def _build_admin_auth_required_test_app(db_session) -> FastAPI:
    """Build a FastAPI app exposing the admin router with only the DB overridden.

    The admin provider router is mounted under ``/admin`` and ``get_db`` is
    replaced so requests run against the test session. No other dependency is
    overridden here, so whatever guard the router declares stays active.
    """

    async def _yield_test_session():
        # Hand the fixture session to every request instead of opening a new one.
        yield db_session

    application = FastAPI()
    application.include_router(admin_providers.router, prefix="/admin")
    application.dependency_overrides[get_db] = _yield_test_session
    return application
async def _create_story(
db_session,
*,
@@ -51,6 +62,38 @@ async def _create_story(
return story
async def _record_evaluation_event(
    db_session,
    *,
    user_id: str,
    story_id: int,
    output_mode: str,
    artifact: str,
    status: str,
    metadata: dict,
):
    """Create a generation job and attach one ``evaluation_completed`` event.

    Args:
        db_session: Session passed through to the job/event helpers.
        user_id: Owner of the newly created job.
        story_id: Story linked to both the job and the event.
        output_mode: Output mode stored on the job.
        artifact: Artifact name written into the event metadata.
        status: Status recorded on the event.
        metadata: Extra metadata; its keys override the default
            ``step``/``artifact`` entries on collision.

    Returns:
        Whatever ``record_generation_event`` returns for the new event.
    """
    evaluation_job = await create_generation_job(
        db_session,
        user_id=user_id,
        output_mode=output_mode,
        input_type="keywords",
        request_payload={"data": "测试"},
        story_id=story_id,
    )

    # Defaults first, caller-supplied metadata layered on top.
    event_metadata = {"step": "evaluation", "artifact": artifact}
    event_metadata.update(metadata)

    return await record_generation_event(
        db_session,
        job=evaluation_job,
        story_id=story_id,
        event_type="evaluation_completed",
        status=status,
        metadata=event_metadata,
    )
async def test_admin_provider_analytics_aggregate_across_users(db_session, test_user):
second_user = User(
id="github:67890",
@@ -197,6 +240,616 @@ async def test_admin_provider_analytics_aggregate_across_users(db_session, test_
]
async def test_admin_evaluation_analytics_aggregate_internal_events(
    db_session,
    test_user,
):
    """Admin evaluation analytics aggregate events recorded for different users.

    Seeds one passing story evaluation for ``test_user`` and one blocking
    storybook evaluation for a second user, then checks the aggregate counters,
    bucket breakdowns and that raw content strings are absent from the response.
    """
    second_user = User(
        id="google:evaluation-user",
        name="Evaluation User",
        avatar_url="https://example.com/eval.png",
        provider="google",
    )
    db_session.add(second_user)
    await db_session.commit()

    story = await _create_story(db_session, user_id=test_user.id, title="评测故事")
    storybook = await _create_story(
        db_session,
        user_id=second_user.id,
        title="评测绘本",
        mode="storybook",
    )

    # Passing evaluation owned by the first user.
    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=story.id,
        output_mode="story",
        artifact="story_text",
        status="succeeded",
        metadata={
            "overall_score": 0.92,
            "passed": True,
            "blocking": False,
            "scores": [
                {"dimension": "structure", "score": 1.0, "reason": "完整"},
                {"dimension": "readability", "score": 0.84, "reason": "可读"},
            ],
            "warnings": [],
        },
    )
    # Blocking evaluation owned by the second user, with one quality gate issue
    # and one warning.
    await _record_evaluation_event(
        db_session,
        user_id=second_user.id,
        story_id=storybook.id,
        output_mode="storybook",
        artifact="storybook_pages",
        status="failed",
        metadata={
            "overall_score": 0.0,
            "passed": False,
            "blocking": True,
            "scores": [
                {"dimension": "structure", "score": 0.0, "reason": "结构失败"},
                {"dimension": "safety", "score": 0.0, "reason": "安全失败"},
            ],
            "quality_gate": {
                "issues": [
                    {
                        "code": "unsafe_child_content",
                        "message": "风险词",
                        "failure_category": "safety_error",
                        "field": "pages",
                    }
                ]
            },
            "warnings": ["绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。"],
        },
    )

    admin_app = _build_admin_test_app(db_session)
    transport = ASGITransport(app=admin_app)
    async with AsyncClient(transport=transport, base_url="http://test") as client:
        response = await client.get("/admin/evaluations/analytics")

    assert response.status_code == 200
    data = response.json()
    assert data["scope"] == "admin_internal_evaluations"
    assert data["total_evaluations"] == 2
    assert data["passed_evaluations"] == 1
    assert data["blocked_evaluations"] == 1
    assert data["pass_rate"] == 0.5
    # Mean of the two seeded overall scores: (0.92 + 0.0) / 2.
    assert data["average_score"] == 0.46
    assert data["job_count"] == 2
    assert data["story_count"] == 2
    assert data["user_count"] == 2
    assert data["by_artifact"] == [
        {"artifact": "story_text", "count": 1},
        {"artifact": "storybook_pages", "count": 1},
    ]
    assert data["by_output_mode"] == [
        {"output_mode": "story", "count": 1},
        {"output_mode": "storybook", "count": 1},
    ]
    assert data["score_bands"] == [
        {"band": "blocked_quality_gate", "count": 1},
        {"band": "excellent", "count": 1},
    ]
    # "structure" appears in both events, so it averages (1.0 + 0.0) / 2.
    assert data["dimension_scores"] == [
        {"dimension": "structure", "average_score": 0.5, "count": 2},
        {"dimension": "readability", "average_score": 0.84, "count": 1},
        {"dimension": "safety", "average_score": 0.0, "count": 1},
    ]
    assert data["quality_gate_issues"] == [
        {"code": "unsafe_child_content", "count": 1},
    ]
    assert data["failure_categories"] == [
        {"category": "safety_error", "count": 1},
    ]
    assert data["warnings"] == [
        {
            "message": "绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。",
            "count": 1,
        },
    ]
    # The story title, the quality gate message and a score reason seeded above
    # must not appear anywhere in the serialized response.
    assert "评测故事" not in str(data)
    assert "风险词" not in str(data)
    assert "完整" not in str(data)
async def test_admin_evaluation_analytics_support_days_and_artifact_filters(
    db_session,
    test_user,
):
    """Admin evaluation analytics honour the ``days`` and ``artifact`` filters.

    One story evaluation is backdated by 10 days and one storybook evaluation
    keeps its default timestamp; the test then queries with ``days=7``, with
    ``artifact=story_text`` and with an artifact value expected to be rejected.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="旧评测")
    storybook = await _create_story(
        db_session,
        user_id=test_user.id,
        title="新评测",
        mode="storybook",
    )
    old_event = await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=story.id,
        output_mode="story",
        artifact="story_text",
        status="succeeded",
        metadata={
            "overall_score": 0.96,
            "passed": True,
            "blocking": False,
            "scores": [{"dimension": "structure", "score": 1.0, "reason": "完整"}],
            "warnings": [],
        },
    )
    # Backdate the first event so it falls outside a 7-day window.
    old_event.created_at = datetime.now(timezone.utc) - timedelta(days=10)
    await db_session.commit()

    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=storybook.id,
        output_mode="storybook",
        artifact="storybook_pages",
        status="failed",
        metadata={
            "overall_score": 0.72,
            "passed": False,
            "blocking": True,
            "scores": [{"dimension": "readability", "score": 0.62, "reason": "过短"}],
            "warnings": ["分页正文长度偏短"],
        },
    )

    admin_app = _build_admin_test_app(db_session)
    transport = ASGITransport(app=admin_app)
    async with AsyncClient(transport=transport, base_url="http://test") as client:
        # Window filter: only the non-backdated storybook event should remain.
        response = await client.get("/admin/evaluations/analytics?days=7")
        assert response.status_code == 200
        data = response.json()
        assert data["window_days"] == 7
        assert data["total_evaluations"] == 1
        assert data["artifact"] is None
        assert data["by_artifact"] == [{"artifact": "storybook_pages", "count": 1}]

        # Artifact filter without a window: only the story_text event is counted.
        response = await client.get(
            "/admin/evaluations/analytics?artifact=story_text"
        )
        assert response.status_code == 200
        data = response.json()
        assert data["artifact"] == "story_text"
        assert data["total_evaluations"] == 1
        assert data["average_score"] == 0.96

        # "image" is expected to be rejected as an artifact filter value.
        response = await client.get("/admin/evaluations/analytics?artifact=image")
        assert response.status_code == 422
async def test_admin_evaluation_analytics_requires_admin_auth(db_session):
    """Evaluation analytics answers 401 when the request carries no credentials."""
    unauthenticated_app = _build_admin_auth_required_test_app(db_session)
    async with AsyncClient(
        transport=ASGITransport(app=unauthenticated_app),
        base_url="http://test",
    ) as http:
        response = await http.get("/admin/evaluations/analytics")

    assert response.status_code == 401
async def test_admin_generation_job_trace_returns_internal_event_stream(
    db_session,
    test_user,
):
    """Admin job trace returns the full payload, event stream and coverage.

    Creates one job with internal-only request payload fields, records
    ``workflow_planned``, ``evaluation_completed`` and ``executor_completed``
    events on it, then asserts that the trace endpoint returns them unredacted
    together with a per-job ``executor_coverage`` summary.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="内部链路故事")
    job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="story",
        input_type="keywords",
        request_payload={
            "output_mode": "story",
            "type": "keywords",
            "data": "月亮森林",
            "internal_dispatch_token": "admin-visible-token",
            "provider_override": "internal-provider",
            "evaluation_policy": {"threshold": 0.9},
        },
        story_id=story.id,
    )
    await record_generation_event(
        db_session,
        job=job,
        story_id=story.id,
        event_type="workflow_planned",
        status="succeeded",
        metadata={
            "step": "request_acceptance",
            "artifact": "none",
            "plan": {
                "mode": "story",
                "tasks": [
                    {
                        "key": "generate_narrative",
                        "step": "text_generation",
                        "artifact": "story_text",
                        "required": True,
                        "recoverable": False,
                    }
                ],
            },
            "internal_threshold": 0.9,
        },
    )
    await record_generation_event(
        db_session,
        job=job,
        story_id=story.id,
        event_type="evaluation_completed",
        status="succeeded",
        metadata={
            "step": "evaluation",
            "artifact": "story_text",
            "overall_score": 0.94,
            "passed": True,
            "blocking": False,
            "scores": [{"dimension": "structure", "score": 1.0}],
        },
    )
    await record_generation_event(
        db_session,
        job=job,
        story_id=story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata={
            "plan_mode": "asset_generation",
            "planned_task_count": 3,
            "executed_task_count": 1,
            "ignored_task_count": 2,
            "executed_task_keys": ["complete_image_asset"],
            "ignored_task_keys": [
                "start_asset_generation",
                "complete_asset_generation",
            ],
            "result_assets": ["cover_image"],
        },
    )

    admin_app = _build_admin_test_app(db_session)
    transport = ASGITransport(app=admin_app)
    async with AsyncClient(transport=transport, base_url="http://test") as client:
        response = await client.get(f"/admin/generations/jobs/{job.id}/trace")

    assert response.status_code == 200
    data = response.json()
    assert data["id"] == job.id
    assert data["user_id"] == test_user.id
    # Request payload is expected back verbatim, internal fields included.
    assert data["request_payload"]["data"] == "月亮森林"
    assert data["request_payload"]["internal_dispatch_token"] == "admin-visible-token"
    assert data["request_payload"]["evaluation_policy"] == {"threshold": 0.9}

    # NOTE(review): "request_accepted" is never recorded explicitly in this
    # test; presumably create_generation_job emits it — confirm.
    event_types = [event["event_type"] for event in data["events"]]
    assert event_types == [
        "request_accepted",
        "workflow_planned",
        "evaluation_completed",
        "executor_completed",
    ]
    workflow_event = data["events"][1]
    assert workflow_event["event_metadata"]["plan"]["tasks"][0]["key"] == (
        "generate_narrative"
    )
    assert workflow_event["event_metadata"]["internal_threshold"] == 0.9
    evaluation_event = data["events"][2]
    assert evaluation_event["event_metadata"]["overall_score"] == 0.94
    assert evaluation_event["event_metadata"]["scores"] == [
        {"dimension": "structure", "score": 1.0}
    ]
    executor_event = data["events"][3]
    assert executor_event["event_metadata"]["executed_task_keys"] == [
        "complete_image_asset"
    ]
    assert executor_event["event_metadata"]["result_assets"] == ["cover_image"]

    # Per-job coverage summary derived from the single executor event above.
    executor_coverage = data["executor_coverage"]
    assert executor_coverage["scope"] == "admin_internal_job_executor_coverage"
    assert executor_coverage["total_runs"] == 1
    assert executor_coverage["total_planned_tasks"] == 3
    assert executor_coverage["total_executed_tasks"] == 1
    assert executor_coverage["total_ignored_tasks"] == 2
    # 1 executed task out of 3 planned, expected at four decimal places.
    assert executor_coverage["coverage_ratio"] == 0.3333
    assert executor_coverage["job_count"] == 1
    assert executor_coverage["story_count"] == 1
    assert executor_coverage["user_count"] == 1
    assert executor_coverage["by_plan_mode"] == [
        {"plan_mode": "asset_generation", "count": 1}
    ]
    assert executor_coverage["by_output_mode"] == [
        {"output_mode": "story", "count": 1}
    ]
    assert executor_coverage["executed_task_keys"] == [
        {"task_key": "complete_image_asset", "count": 1}
    ]
    # Expected order differs from the recorded order: with equal counts the
    # keys come back alphabetically.
    assert executor_coverage["ignored_task_keys"] == [
        {"task_key": "complete_asset_generation", "count": 1},
        {"task_key": "start_asset_generation", "count": 1},
    ]
    assert executor_coverage["result_assets"] == [
        {"asset": "cover_image", "count": 1}
    ]
async def test_admin_generation_job_trace_requires_admin_auth(db_session):
    """Job trace answers 401 without credentials, even for a job id that is not seeded."""
    unauthenticated_app = _build_admin_auth_required_test_app(db_session)
    async with AsyncClient(
        transport=ASGITransport(app=unauthenticated_app),
        base_url="http://test",
    ) as http:
        response = await http.get("/admin/generations/jobs/missing-job/trace")

    assert response.status_code == 401
async def test_admin_executor_coverage_aggregates_internal_events(
    db_session,
    test_user,
):
    """Admin executor coverage sums ``executor_completed`` events across jobs.

    Records one ``asset_generation`` run and one ``asset_retry`` run against
    the same story, then checks the totals, the bucket breakdowns, the
    ``plan_mode`` filter and the rejection of an unsupported plan mode.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="执行器覆盖故事")
    asset_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="asset_generation",
        input_type="audio,image",
        request_payload={"story_id": story.id, "assets": ["audio", "image"]},
        story_id=story.id,
    )
    await record_generation_event(
        db_session,
        job=asset_job,
        story_id=story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata={
            "plan_mode": "asset_generation",
            "planned_task_count": 4,
            "executed_task_count": 2,
            "ignored_task_count": 2,
            "executed_task_keys": ["complete_audio_asset", "complete_image_asset"],
            "ignored_task_keys": [
                "start_asset_generation",
                "complete_asset_generation",
            ],
            "result_assets": ["audio", "cover_image"],
        },
    )
    retry_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="asset_retry",
        input_type="image",
        request_payload={"story_id": story.id, "assets": ["image"]},
        story_id=story.id,
    )
    await record_generation_event(
        db_session,
        job=retry_job,
        story_id=story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata={
            "plan_mode": "asset_retry",
            "planned_task_count": 3,
            "executed_task_count": 1,
            "ignored_task_count": 2,
            "executed_task_keys": ["complete_image_asset"],
            "ignored_task_keys": ["start_asset_retry", "complete_asset_retry"],
            "result_assets": ["cover_image"],
        },
    )

    admin_app = _build_admin_test_app(db_session)
    transport = ASGITransport(app=admin_app)
    async with AsyncClient(transport=transport, base_url="http://test") as client:
        # Unfiltered: both runs are aggregated.
        response = await client.get("/admin/executors/coverage")
        assert response.status_code == 200
        data = response.json()
        assert data["scope"] == "admin_internal_executor_coverage"
        assert data["total_runs"] == 2
        assert data["total_planned_tasks"] == 7
        assert data["total_executed_tasks"] == 3
        assert data["total_ignored_tasks"] == 4
        # 3 executed tasks out of 7 planned, expected at four decimal places.
        assert data["coverage_ratio"] == 0.4286
        assert data["job_count"] == 2
        # Both jobs point at the same story and user.
        assert data["story_count"] == 1
        assert data["user_count"] == 1
        assert data["by_plan_mode"] == [
            {"plan_mode": "asset_generation", "count": 1},
            {"plan_mode": "asset_retry", "count": 1},
        ]
        # Buckets are expected with the higher count first.
        assert data["executed_task_keys"] == [
            {"task_key": "complete_image_asset", "count": 2},
            {"task_key": "complete_audio_asset", "count": 1},
        ]
        assert data["result_assets"] == [
            {"asset": "cover_image", "count": 2},
            {"asset": "audio", "count": 1},
        ]

        # Filtered: only the asset_retry run is counted.
        response = await client.get("/admin/executors/coverage?plan_mode=asset_retry")
        assert response.status_code == 200
        data = response.json()
        assert data["plan_mode"] == "asset_retry"
        assert data["total_runs"] == 1
        assert data["total_planned_tasks"] == 3
        assert data["total_executed_tasks"] == 1

        # "story" is expected to be rejected as a plan_mode filter value.
        response = await client.get("/admin/executors/coverage?plan_mode=story")
        assert response.status_code == 422
async def test_admin_executor_coverage_requires_admin_auth(db_session):
    """The executor coverage route answers 401 on the auth-required test app."""
    secured_app = _build_admin_auth_required_test_app(db_session)
    async with AsyncClient(
        transport=ASGITransport(app=secured_app),
        base_url="http://test",
    ) as http:
        coverage_reply = await http.get("/admin/executors/coverage")

    assert coverage_reply.status_code == 401
async def test_admin_harness_readiness_returns_ready_when_internal_gates_pass(
    db_session,
    test_user,
):
    """Readiness reports ``ready`` once a passing evaluation and an executor run exist.

    Also checks that the evaluation reason text and the story title do not
    appear anywhere in the response body.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="readiness 故事")

    passing_evaluation = {
        "overall_score": 0.92,
        "passed": True,
        "blocking": False,
        "scores": [
            {"dimension": "structure", "score": 1.0, "reason": "内部 reason"},
            {"dimension": "readability", "score": 0.84, "reason": "内部 reason"},
        ],
        "warnings": [],
    }
    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=story.id,
        output_mode="story",
        artifact="story_text",
        status="succeeded",
        metadata=passing_evaluation,
    )

    image_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="asset_generation",
        input_type="image",
        request_payload={"story_id": story.id, "assets": ["image"]},
        story_id=story.id,
    )
    executor_metadata = {
        "plan_mode": "asset_generation",
        "planned_task_count": 3,
        "executed_task_count": 1,
        "ignored_task_count": 2,
        "executed_task_keys": ["complete_image_asset"],
        "ignored_task_keys": [
            "start_asset_generation",
            "complete_asset_generation",
        ],
        "result_assets": ["cover_image"],
    }
    await record_generation_event(
        db_session,
        job=image_job,
        story_id=story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata=executor_metadata,
    )

    transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=transport, base_url="http://test") as client:
        readiness_reply = await client.get("/admin/harness/readiness")

    assert readiness_reply.status_code == 200
    report = readiness_reply.json()
    assert report["scope"] == "admin_internal_harness_readiness"
    assert report["status"] == "ready"
    assert report["thresholds"] == {
        "min_runtime_evaluations": 1,
        "min_executor_runs": 1,
        "min_evaluation_pass_rate": 0.7,
        "min_evaluation_average_score": 0.7,
        "min_executor_coverage_ratio": 0.2,
    }

    # Exactly these five checks are expected, all in the "ready" state.
    status_by_code = dict(
        (check["code"], check["status"]) for check in report["checks"]
    )
    assert status_by_code == dict.fromkeys(
        (
            "golden_replay",
            "runtime_evaluation_samples",
            "runtime_evaluation_quality",
            "executor_coverage_samples",
            "executor_coverage_ratio",
        ),
        "ready",
    )

    golden = report["golden_replay"]
    assert golden["passed"] is True
    assert golden["total_cases"] == 11
    analytics = report["evaluation_analytics"]
    assert analytics["total_evaluations"] == 1
    assert analytics["pass_rate"] == 1.0
    coverage = report["executor_coverage"]
    assert coverage["total_runs"] == 1
    assert coverage["coverage_ratio"] == 0.3333

    rendered = str(report)
    assert "内部 reason" not in rendered
    assert "readiness 故事" not in rendered
async def test_admin_harness_readiness_blocks_low_runtime_quality(
    db_session,
    test_user,
):
    """Readiness reports ``blocked`` when the only evaluation failed and is blocking.

    No executor run is seeded, so both executor checks are expected to be
    ``needs_attention``. The quality-gate message and the story title must not
    appear in the response body.
    """
    story = await _create_story(db_session, user_id=test_user.id, title="低质量 readiness")

    failing_evaluation = {
        "overall_score": 0.0,
        "passed": False,
        "blocking": True,
        "scores": [{"dimension": "structure", "score": 0.0, "reason": "缺失"}],
        "quality_gate": {
            "issues": [
                {
                    "code": "missing_story_text",
                    "message": "正文缺失",
                    "failure_category": "schema_error",
                    "field": "story_text",
                }
            ]
        },
        "warnings": [],
    }
    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=story.id,
        output_mode="story",
        artifact="story_text",
        status="failed",
        metadata=failing_evaluation,
    )

    transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=transport, base_url="http://test") as client:
        readiness_reply = await client.get("/admin/harness/readiness")

    assert readiness_reply.status_code == 200
    report = readiness_reply.json()
    assert report["status"] == "blocked"

    status_by_code = {check["code"]: check["status"] for check in report["checks"]}
    expected_statuses = {
        "golden_replay": "ready",
        "runtime_evaluation_samples": "ready",
        "runtime_evaluation_quality": "blocked",
        "executor_coverage_samples": "needs_attention",
        "executor_coverage_ratio": "needs_attention",
    }
    # Checked per code rather than by dict equality: extra checks are tolerated.
    for code, expected_status in expected_statuses.items():
        assert status_by_code[code] == expected_status

    assert report["evaluation_analytics"]["blocked_evaluations"] == 1
    assert report["executor_coverage"]["total_runs"] == 0

    rendered = str(report)
    assert "正文缺失" not in rendered
    assert "低质量 readiness" not in rendered
async def test_admin_harness_readiness_requires_admin_auth(db_session):
    """The harness readiness route answers 401 on the auth-required test app."""
    secured_app = _build_admin_auth_required_test_app(db_session)
    async with AsyncClient(
        transport=ASGITransport(app=secured_app),
        base_url="http://test",
    ) as http:
        readiness_reply = await http.get("/admin/harness/readiness")

    assert readiness_reply.status_code == 401
async def test_admin_provider_analytics_support_days_and_capability_filters(
db_session,
test_user,

View File

@@ -123,14 +123,19 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"evaluation_completed",
"narrative_generated",
"story_saved",
"generation_completed",
]
assert events[2].event_metadata["has_memory_context"] is False
assert events[3].event_metadata["title"] == "小兔子的冒险"
assert events[4].story_id == job.story_id
assert events[2].event_metadata["plan"]["mode"] == "story"
assert events[3].event_metadata["has_memory_context"] is False
assert events[4].event_metadata["passed"] is True
assert events[4].event_metadata["overall_score"] >= 0.7
assert events[5].event_metadata["title"] == "小兔子的冒险"
assert events[6].story_id == job.story_id
detail_response = await client.get(f"/api/generations/jobs/{job.id}")
assert detail_response.status_code == 200
@@ -143,11 +148,16 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event
assert [event["event_type"] for event in detail["events"]] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"narrative_generated",
"story_saved",
"generation_completed",
]
assert all(
event["event_type"] != "evaluation_completed"
for event in detail["events"]
)
story_response = await client.get(f"/api/generations/{job.story_id}")
assert story_response.status_code == 200
@@ -161,6 +171,13 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event
assert [item["id"] for item in job_list] == [job.id]
assert job_list[0]["progress_percent"] == 100
assert job_list[0]["is_terminal"] is True
trace_response = await client.get(
f"/api/generations/{job.story_id}/trace-summary"
)
assert trace_response.status_code == 200
trace = trace_response.json()
assert "evaluation" not in trace
finally:
app.dependency_overrides.clear()
@@ -220,13 +237,88 @@ async def test_generation_worker_records_quality_gate_failure_without_persisting
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"quality_gate_failed",
"evaluation_completed",
"generation_failed",
]
quality_event = events[3]
quality_event = events[4]
assert quality_event.event_metadata["step"] == "narrative_generation"
assert quality_event.event_metadata["issues"][0]["code"] == "missing_story_text"
evaluation_event = events[5]
assert evaluation_event.event_metadata["step"] == "evaluation"
assert evaluation_event.event_metadata["passed"] is False
assert evaluation_event.event_metadata["blocking"] is True
async def test_story_with_images_worker_records_plan_before_assets(
    db_session,
    test_user,
    mock_text_provider,
    mock_image_provider,
):
    """A story job with ``generate_images`` records its plan before any asset event.

    Runs the worker for a keyword story request and checks the final job state,
    the exact event sequence, the recorded plan snapshot, and that the text and
    image providers were each called once.
    """
    request_payload = {
        "output_mode": "story",
        "type": "keywords",
        "data": "小兔子, 森林",
        "generate_images": True,
    }
    job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="story",
        input_type="keywords",
        request_payload=request_payload,
    )

    await run_generation_job_service(job.id, db_session)

    job_rows = await db_session.execute(
        select(GenerationJob).where(GenerationJob.id == job.id)
    )
    refreshed_job = job_rows.scalar_one()
    assert refreshed_job.story_id is not None
    assert refreshed_job.status == "completed"
    assert refreshed_job.current_step == "generation_completed"
    assert refreshed_job.result_snapshot["image_status"] == "ready"

    event_rows = await db_session.execute(
        select(GenerationJobEvent)
        .where(GenerationJobEvent.job_id == job.id)
        .order_by(GenerationJobEvent.id)
    )
    events = event_rows.scalars().all()
    recorded_types = [event.event_type for event in events]
    assert recorded_types == [
        "request_accepted",
        "worker_started",
        "workflow_planned",
        "context_prepared",
        "evaluation_completed",
        "narrative_generated",
        "story_saved",
        "cover_image_started",
        "cover_image_succeeded",
        "generation_completed",
    ]

    # events[2] is the workflow_planned event in the sequence asserted above.
    plan = events[2].event_metadata["plan"]
    assert plan["mode"] == "story_with_assets"
    planned_keys = [task["key"] for task in plan["tasks"]]
    assert planned_keys == [
        "prepare_context",
        "generate_narrative",
        "evaluate_narrative",
        "persist_story",
        "generate_cover_image",
        "queue_postprocessing",
        "complete_generation",
    ]
    cover_task = next(
        filter(lambda task: task["key"] == "generate_cover_image", plan["tasks"])
    )
    assert cover_task["required"] is False
    assert cover_task["recoverable"] is True

    # events[4] is evaluation_completed, events[8] is cover_image_succeeded.
    assert events[4].event_metadata["passed"] is True
    assert events[8].event_metadata["asset"] == "cover_image"
    mock_text_provider.assert_called_once()
    mock_image_provider.assert_called_once()
async def test_asset_retry_records_job_events_and_updates_retryable_assets(
@@ -279,12 +371,30 @@ async def test_asset_retry_records_job_events_and_updates_retryable_assets(
).scalars().all()
assert [event.event_type for event in events] == [
"request_accepted",
"workflow_planned",
"asset_retry_started",
"cover_image_started",
"cover_image_succeeded",
"executor_completed",
"asset_retry_completed",
]
assert events[3].event_metadata["asset"] == "cover_image"
plan = events[1].event_metadata["plan"]
assert plan["mode"] == "asset_retry"
assert [task["key"] for task in plan["tasks"]] == [
"start_asset_retry",
"complete_image_asset",
"complete_asset_retry",
]
image_task = next(
task for task in plan["tasks"] if task["key"] == "complete_image_asset"
)
assert image_task["required"] is False
assert image_task["recoverable"] is True
assert events[4].event_metadata["asset"] == "cover_image"
assert events[5].event_metadata["plan_mode"] == "asset_retry"
assert events[5].event_metadata["executed_task_keys"] == [
"complete_image_asset"
]
finally:
app.dependency_overrides.clear()
@@ -365,10 +475,110 @@ async def test_asset_generation_job_worker_completes_cover_image(
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"cover_image_started",
"cover_image_succeeded",
"executor_completed",
"asset_generation_completed",
]
plan = events[2].event_metadata["plan"]
assert plan["mode"] == "asset_generation"
assert [task["key"] for task in plan["tasks"]] == [
"start_asset_generation",
"complete_image_asset",
"complete_asset_generation",
]
image_task = next(
task for task in plan["tasks"] if task["key"] == "complete_image_asset"
)
assert image_task["required"] is False
assert image_task["recoverable"] is True
executor_event = events[5]
assert executor_event.event_metadata["plan_mode"] == "asset_generation"
assert executor_event.event_metadata["executed_task_keys"] == [
"complete_image_asset"
]
assert executor_event.event_metadata["ignored_task_keys"] == [
"start_asset_generation",
"complete_asset_generation",
]
assert executor_event.event_metadata["result_assets"] == ["cover_image"]
async def test_asset_generation_job_worker_executes_assets_in_plan_order(
    db_session,
    test_story,
    mock_tts_provider,
):
    """An asset job for audio and image runs audio first, then the cover image.

    Image generation is patched at ``app.services.story_service.generate_image``.
    The test checks the job and story state, the event order, the plan snapshot
    and the ``executor_completed`` metadata.
    """
    job = await create_generation_job(
        db_session,
        user_id=test_story.user_id,
        output_mode="asset_generation",
        input_type="audio,image",
        request_payload={"story_id": test_story.id, "assets": ["audio", "image"]},
        story_id=test_story.id,
    )

    with patch(
        "app.services.story_service.generate_image",
        new_callable=AsyncMock,
    ) as mock_generate_image:
        mock_generate_image.return_value = "https://example.com/plan-cover.png"
        await run_generation_job_service(job.id, db_session)

    job_rows = await db_session.execute(
        select(GenerationJob).where(GenerationJob.id == job.id)
    )
    refreshed_job = job_rows.scalar_one()
    assert refreshed_job.status == "completed"
    assert refreshed_job.current_step == "asset_generation_completed"
    snapshot = refreshed_job.result_snapshot
    assert snapshot["image_status"] == "ready"
    assert snapshot["audio_status"] == "ready"

    story_rows = await db_session.execute(
        select(Story).where(Story.id == test_story.id)
    )
    story = story_rows.scalar_one()
    assert story.image_url == "https://example.com/plan-cover.png"
    assert story.audio_status == "ready"
    assert story.audio_path is not None

    event_rows = await db_session.execute(
        select(GenerationJobEvent)
        .where(GenerationJobEvent.job_id == job.id)
        .order_by(GenerationJobEvent.id)
    )
    events = event_rows.scalars().all()
    recorded_types = [event.event_type for event in events]
    assert recorded_types == [
        "request_accepted",
        "worker_started",
        "workflow_planned",
        "audio_started",
        "audio_succeeded",
        "cover_image_started",
        "cover_image_succeeded",
        "executor_completed",
        "asset_generation_completed",
    ]

    # events[2] is the workflow_planned event in the sequence asserted above.
    plan = events[2].event_metadata["plan"]
    assert plan["mode"] == "asset_generation"
    planned_keys = [task["key"] for task in plan["tasks"]]
    assert planned_keys == [
        "start_asset_generation",
        "complete_audio_asset",
        "complete_image_asset",
        "complete_asset_generation",
    ]

    # events[4] / events[6] are the two *_succeeded events; events[7] is
    # executor_completed.
    assert events[4].event_metadata["asset"] == "audio"
    assert events[6].event_metadata["asset"] == "cover_image"
    executor_metadata = events[7].event_metadata
    assert executor_metadata["executed_task_keys"] == [
        "complete_audio_asset",
        "complete_image_asset",
    ]
    assert executor_metadata["result_assets"] == ["audio", "cover_image"]
    mock_tts_provider.assert_awaited_once()
    mock_generate_image.assert_awaited_once()
async def test_cancel_queued_asset_generation_job_marks_it_canceled(
@@ -538,7 +748,9 @@ async def test_storybook_generation_is_queued_then_worker_records_page_image_eve
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"evaluation_completed",
"narrative_generated",
"storybook_images_started",
"storybook_cover_image_succeeded",
@@ -548,13 +760,45 @@ async def test_storybook_generation_is_queued_then_worker_records_page_image_eve
"story_saved",
"generation_completed",
]
plan = events[2].event_metadata["plan"]
assert plan["mode"] == "storybook"
assert [task["key"] for task in plan["tasks"]] == [
"prepare_context",
"generate_storybook_pages",
"evaluate_storybook_pages",
"generate_storybook_images",
"persist_storybook",
"queue_postprocessing",
"complete_generation",
]
image_task = next(
task
for task in plan["tasks"]
if task["key"] == "generate_storybook_images"
)
assert image_task["required"] is False
assert image_task["recoverable"] is True
assert events[4].event_metadata["passed"] is True
assert events[4].event_metadata["artifact"] == "storybook_pages"
page_events = [
event
for event in events
if event.event_type == "storybook_page_image_succeeded"
]
assert [event.event_metadata["page_number"] for event in page_events] == [1, 2]
assert events[8].event_metadata["completed_pages"] == [1, 2]
assert events[10].event_metadata["completed_pages"] == [1, 2]
async with AsyncClient(transport=transport, base_url="http://test") as client:
client.cookies.set("access_token", auth_token)
detail_response = await client.get(
f"/api/generations/jobs/{job.id}"
)
assert detail_response.status_code == 200
detail = detail_response.json()
assert "evaluation_completed" not in [
event["event_type"] for event in detail["events"]
]
finally:
app.dependency_overrides.clear()
@@ -716,6 +960,414 @@ async def test_story_provider_stats_aggregate_job_events(
app.dependency_overrides.clear()
async def test_story_trace_summary_aggregates_steps_artifacts_and_failure_categories(
    db_session,
    auth_token,
    degraded_story_with_text,
):
    """The story trace summary buckets events by step, artifact and failure category.

    Seeds an asset-retry job with two cover-image events, a quality-gate
    failure and an ``evaluation_completed`` event. The expected buckets contain
    no ``evaluation`` step, and the response must not expose ``overall_score``.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # All setup runs inside try/finally so that a failure while seeding the job
    # or its events cannot leave the get_db override installed on `app`.
    try:
        job = await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_retry",
            input_type="image",
            request_payload={"assets": ["image"]},
            story_id=degraded_story_with_text.id,
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="cover_image_started",
            status="running",
            metadata={
                "step": "image_generation",
                "artifact": "cover_image",
                "failure_category": None,
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="cover_image_failed",
            status="failed",
            metadata={
                "step": "image_generation",
                "artifact": "cover_image",
                "failure_category": "provider_error",
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="quality_gate_failed",
            status="failed",
            metadata={
                "step": "narrative_generation",
                "artifact": "story_text",
                "failure_category": "schema_error",
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="evaluation_completed",
            status="failed",
            metadata={
                "step": "evaluation",
                "artifact": "story_text",
                "failure_category": "schema_error",
                "overall_score": 0.0,
                "passed": False,
                "blocking": True,
                "scores": [
                    {
                        "dimension": "structure",
                        "score": 0.0,
                        "reason": "故事结构未通过质量门。",
                    },
                    {
                        "dimension": "safety",
                        "score": 0.0,
                        "reason": "内容未通过儿童安全或结构完整性检查。",
                    },
                ],
            },
        )

        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)
            response = await client.get(
                f"/api/generations/{degraded_story_with_text.id}/trace-summary"
            )

        assert response.status_code == 200
        data = response.json()
        assert data["story_id"] == degraded_story_with_text.id
        assert data["total_events"] == 4
        assert data["failed_events"] == 2
        assert data["by_step"] == [
            {"name": "image_generation", "count": 2},
            {"name": "narrative_generation", "count": 1},
        ]
        assert data["by_artifact"] == [
            {"name": "cover_image", "count": 2},
            {"name": "story_text", "count": 1},
        ]
        assert data["failure_categories"] == [
            {"name": "provider_error", "count": 1},
            {"name": "schema_error", "count": 1},
        ]
        # The evaluation event's details must not surface in the summary.
        assert "evaluation" not in data
        assert "overall_score" not in str(data)
    finally:
        app.dependency_overrides.clear()
async def test_user_generation_job_detail_hides_internal_evaluation_step(
    db_session,
    auth_token,
    test_user,
):
    """The user-facing job detail never exposes the ``evaluation_completed`` event.

    After an evaluation event is recorded on a story job, the detail response
    is expected to report ``narrative_generated`` as the current step, list
    only ``request_accepted`` in its events, and contain no evaluation data.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # All setup runs inside try/finally so that a failure while seeding the job
    # or its event cannot leave the get_db override installed on `app`.
    try:
        transport = ASGITransport(app=app)
        job = await create_generation_job(
            db_session,
            user_id=test_user.id,
            output_mode="story",
            input_type="keywords",
            request_payload={
                "output_mode": "story",
                "type": "keywords",
                "data": "小兔子",
                "generate_images": False,
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            event_type="evaluation_completed",
            status="succeeded",
            metadata={
                "step": "evaluation",
                "artifact": "story_text",
                "overall_score": 0.96,
                "passed": True,
                "blocking": False,
                "scores": [
                    {"dimension": "structure", "score": 1.0, "reason": "完整。"},
                ],
            },
        )

        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)
            response = await client.get(f"/api/generations/jobs/{job.id}")

        assert response.status_code == 200
        data = response.json()
        assert data["current_step"] == "narrative_generated"
        assert data["progress_label"] == "正文已生成"
        assert [event["event_type"] for event in data["events"]] == [
            "request_accepted"
        ]
        assert "evaluation_completed" not in str(data)
        assert "overall_score" not in str(data)
    finally:
        app.dependency_overrides.clear()
async def test_user_generation_job_detail_sanitizes_request_payload(
    db_session,
    auth_token,
    test_user,
):
    """The job detail returns only a reduced subset of the stored request payload.

    The stored payload carries the raw keywords, an education theme and several
    internal-looking keys. The response is expected to contain exactly the
    seven keys asserted below and none of the other values.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # All setup runs inside try/finally so that a failure while creating the
    # job cannot leave the get_db override installed on `app`.
    try:
        transport = ASGITransport(app=app)
        job = await create_generation_job(
            db_session,
            user_id=test_user.id,
            output_mode="story",
            input_type="keywords",
            request_payload={
                "output_mode": "story",
                "input_type": "keywords",
                "type": "keywords",
                "data": "不要回传原始关键词",
                "education_theme": "勇气",
                "generate_images": True,
                "page_count": 6,
                "child_profile_id": "child-public-id",
                "universe_id": "universe-public-id",
                "internal_dispatch_token": "secret-dispatch-token",
                "provider_override": "internal-provider",
                "evaluation_policy": {"threshold": 0.9},
            },
        )

        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)
            response = await client.get(f"/api/generations/jobs/{job.id}")

        assert response.status_code == 200
        data = response.json()
        assert data["request_payload"] == {
            "child_profile_id": "child-public-id",
            "generate_images": True,
            "input_type": "keywords",
            "output_mode": "story",
            "page_count": 6,
            "type": "keywords",
            "universe_id": "universe-public-id",
        }
        payload_dump = str(data["request_payload"])
        assert "不要回传原始关键词" not in payload_dump
        assert "education_theme" not in payload_dump
        assert "secret-dispatch-token" not in payload_dump
        assert "internal-provider" not in payload_dump
        assert "evaluation_policy" not in payload_dump
    finally:
        app.dependency_overrides.clear()
async def test_user_generation_job_detail_sanitizes_public_event_metadata(
    db_session,
    auth_token,
    degraded_story_with_text,
):
    """The job detail reduces event metadata to a public summary.

    Seeds a ``workflow_planned`` event with a full plan snapshot, a completion
    event carrying a result snapshot and error text, and an
    ``executor_completed`` event. The response is expected to expose only plan
    counts for the workflow event and only ``assets`` for the completion event,
    and to omit the executor event entirely.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # All setup runs inside try/finally so that a failure while seeding the job
    # or its events cannot leave the get_db override installed on `app`.
    try:
        transport = ASGITransport(app=app)
        job = await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_generation",
            input_type="image",
            request_payload={"story_id": degraded_story_with_text.id, "assets": ["image"]},
            story_id=degraded_story_with_text.id,
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="workflow_planned",
            status="succeeded",
            metadata={
                "step": "request_acceptance",
                "artifact": "none",
                "plan": {
                    "mode": "asset_generation",
                    "tasks": [
                        {
                            "key": "complete_image_asset",
                            "step": "image_generation",
                            "artifact": "image",
                            "required": False,
                            "recoverable": True,
                        }
                    ],
                },
                "internal_threshold": 0.72,
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="asset_generation_completed",
            status="completed",
            metadata={
                "assets": ["image"],
                "result_snapshot": {
                    "story_id": degraded_story_with_text.id,
                    "last_error": "internal provider detail",
                },
                "error": "internal provider detail",
            },
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="executor_completed",
            status="succeeded",
            metadata={
                "plan_mode": "asset_generation",
                "planned_task_count": 3,
                "executed_task_keys": ["complete_image_asset"],
                "ignored_task_keys": [
                    "start_asset_generation",
                    "complete_asset_generation",
                ],
                "result_assets": ["cover_image"],
            },
        )

        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)
            response = await client.get(f"/api/generations/jobs/{job.id}")

        assert response.status_code == 200
        data = response.json()
        workflow_event = next(
            event for event in data["events"] if event["event_type"] == "workflow_planned"
        )
        # The single seeded task is recoverable, hence both counts are 1.
        assert workflow_event["event_metadata"] == {
            "artifact": "none",
            "plan_mode": "asset_generation",
            "planned_task_count": 1,
            "recoverable_task_count": 1,
            "step": "request_acceptance",
        }
        completion_event = next(
            event
            for event in data["events"]
            if event["event_type"] == "asset_generation_completed"
        )
        assert completion_event["event_metadata"] == {"assets": ["image"]}
        assert "plan" not in workflow_event["event_metadata"]
        assert "tasks" not in str(data["events"])
        assert "internal_threshold" not in str(data["events"])
        assert "result_snapshot" not in str(data["events"])
        assert "internal provider detail" not in str(data["events"])
        assert "executor_completed" not in str(data["events"])
        assert "complete_image_asset" not in str(data["events"])
    finally:
        app.dependency_overrides.clear()
async def test_user_generation_job_summary_hides_internal_executor_step(
    db_session,
    auth_token,
    degraded_story_with_text,
):
    """User-facing job views do not reveal the ``executor_completed`` event.

    After an executor event is recorded, the job detail, the per-story job list
    and the trace summary are each expected to omit the executor event and its
    task keys. Detail and list should report ``workflow_planned`` as the
    current step.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # All setup runs inside try/finally so that a failure while seeding the job
    # or its event cannot leave the get_db override installed on `app`.
    try:
        transport = ASGITransport(app=app)
        job = await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_generation",
            input_type="image",
            request_payload={"story_id": degraded_story_with_text.id, "assets": ["image"]},
            story_id=degraded_story_with_text.id,
        )
        await record_generation_event(
            db_session,
            job=job,
            story_id=degraded_story_with_text.id,
            event_type="executor_completed",
            status="succeeded",
            metadata={
                "plan_mode": "asset_generation",
                "executed_task_keys": ["complete_image_asset"],
            },
        )

        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)
            detail_response = await client.get(f"/api/generations/jobs/{job.id}")
            list_response = await client.get(
                f"/api/generations/{degraded_story_with_text.id}/jobs"
            )
            trace_summary_response = await client.get(
                f"/api/generations/{degraded_story_with_text.id}/trace-summary"
            )

        assert detail_response.status_code == 200
        detail = detail_response.json()
        assert detail["current_step"] == "workflow_planned"
        assert detail["progress_label"] == "工作流已规划"
        assert "executor_completed" not in str(detail)
        assert "complete_image_asset" not in str(detail)

        assert list_response.status_code == 200
        listed_job = next(item for item in list_response.json() if item["id"] == job.id)
        assert listed_job["current_step"] == "workflow_planned"
        assert listed_job["progress_label"] == "工作流已规划"

        assert trace_summary_response.status_code == 200
        trace_summary = trace_summary_response.json()
        assert "executor_completed" not in str(trace_summary)
        assert "complete_image_asset" not in str(trace_summary)
        assert trace_summary["total_events"] == 1
    finally:
        app.dependency_overrides.clear()
async def test_user_provider_analytics_aggregate_across_stories(
db_session,
auth_token,

View File

@@ -1,5 +1,7 @@
"""Tests for generation harness runtime support."""
from pathlib import Path
import pytest
from sqlalchemy import select
@@ -7,8 +9,21 @@ from app.db.models import GenerationJob, GenerationJobEvent
from app.services.adapters.storybook.primary import Storybook, StorybookPage
from app.services.adapters.text.models import StoryOutput
from app.services.generation_jobs import create_generation_job, record_generation_event
from app.services.harness.artifacts import AssetCompletionResult
from app.services.harness.control import ExecutionControl, GenerationJobCanceledError
from app.services.harness.evaluation_replay import (
EvaluationReplayArtifact,
EvaluationReplayCase,
ExpectedEvaluation,
replay_evaluation_golden_cases,
run_evaluation_replay_cases,
)
from app.services.harness.evaluators import evaluate_story_output, evaluate_storybook_output
from app.services.harness.executor import run_asset_plan
from app.services.harness.plans import (
WorkflowMode,
WorkflowPlan,
WorkflowTask,
build_asset_plan,
build_story_plan,
build_storybook_plan,
@@ -27,12 +42,18 @@ from app.services.harness.types import (
normalize_trace_metadata,
step_for_event,
)
from app.services.story_status import StoryAssetStatus
FIXTURES_DIR = (
Path(__file__).parents[1] / "app" / "services" / "harness" / "fixtures"
)
def test_event_type_maps_to_standard_workflow_step():
assert step_for_event("request_accepted") == WorkflowStep.REQUEST_ACCEPTANCE
assert step_for_event("context_prepared") == WorkflowStep.CONTEXT_PREPARATION
assert step_for_event("narrative_generated") == WorkflowStep.NARRATIVE_GENERATION
assert step_for_event("evaluation_completed") == WorkflowStep.EVALUATION
assert step_for_event("story_saved") == WorkflowStep.STORY_PERSISTENCE
assert step_for_event("provider_call_succeeded") == WorkflowStep.PROVIDER_INVOCATION
assert step_for_event("quality_gate_failed") == WorkflowStep.NARRATIVE_GENERATION
@@ -46,6 +67,7 @@ def test_event_type_maps_to_standard_workflow_step():
def test_event_type_maps_to_standard_artifact():
assert artifact_for_event("narrative_generated") == ArtifactKind.STORY_TEXT
assert artifact_for_event("quality_gate_failed") == ArtifactKind.STORY_TEXT
assert artifact_for_event("evaluation_completed") == ArtifactKind.STORY_TEXT
assert artifact_for_event("cover_image_succeeded") == ArtifactKind.COVER_IMAGE
assert artifact_for_event("storybook_page_image_failed") == ArtifactKind.PAGE_IMAGE
assert artifact_for_event("audio_cache_hit") == ArtifactKind.AUDIO
@@ -108,6 +130,13 @@ def test_story_plan_without_assets_snapshot():
"required": True,
"recoverable": False,
},
{
"key": "evaluate_narrative",
"step": "evaluation",
"artifact": "story_text",
"required": True,
"recoverable": False,
},
{
"key": "persist_story",
"step": "story_persistence",
@@ -137,7 +166,7 @@ def test_story_plan_with_assets_marks_cover_recoverable():
plan = build_story_plan(generate_images=True).to_snapshot()
assert plan["mode"] == "story_with_assets"
assert plan["tasks"][3] == {
assert plan["tasks"][4] == {
"key": "generate_cover_image",
"step": "image_generation",
"artifact": "cover_image",
@@ -153,13 +182,14 @@ def test_storybook_plan_with_images_marks_storybook_images_recoverable():
assert [task["key"] for task in plan["tasks"]] == [
"prepare_context",
"generate_storybook_pages",
"evaluate_storybook_pages",
"generate_storybook_images",
"persist_storybook",
"queue_postprocessing",
"complete_generation",
]
assert plan["tasks"][2]["artifact"] == "image"
assert plan["tasks"][2]["recoverable"] is True
assert plan["tasks"][3]["artifact"] == "image"
assert plan["tasks"][3]["recoverable"] is True
def test_asset_retry_plan_deduplicates_assets():
@@ -200,6 +230,86 @@ def test_asset_retry_plan_deduplicates_assets():
}
@pytest.mark.asyncio
async def test_run_asset_plan_executes_asset_tasks_in_plan_order():
    """``run_asset_plan`` awaits the asset callbacks in plan order.

    The plan is built for ``["audio", "image"]``, so the audio callback is
    expected to run before the image callback, and the start/complete tasks are
    reported as ignored.
    """
    invocation_order: list[str] = []

    def make_callback(label: str, **result_fields):
        """Build a callback that logs ``label`` and returns a fresh result."""

        async def callback() -> AssetCompletionResult:
            invocation_order.append(label)
            return AssetCompletionResult(**result_fields)

        return callback

    complete_image = make_callback(
        "image",
        asset="cover_image",
        status=StoryAssetStatus.READY,
        value="https://example.com/cover.png",
    )
    complete_audio = make_callback(
        "audio",
        asset="audio",
        status=StoryAssetStatus.READY,
        value=b"audio",
    )

    plan = build_asset_plan(output_mode="asset_generation", assets=["audio", "image"])
    outcome = await run_asset_plan(
        plan,
        image_task=complete_image,
        audio_task=complete_audio,
    )

    assert invocation_order == ["audio", "image"]
    assert outcome.executed_task_keys == ("complete_audio_asset", "complete_image_asset")
    assert outcome.ignored_task_keys == (
        "start_asset_generation",
        "complete_asset_generation",
    )
    produced_assets = [item.asset for item in outcome.task_results]
    assert produced_assets == ["audio", "cover_image"]
@pytest.mark.asyncio
async def test_run_asset_plan_ignores_unknown_non_asset_tasks():
    """``run_asset_plan`` runs nothing for a plan with no recognised asset task.

    The hand-built plan holds the retry start/complete tasks plus an unknown
    ``complete_video_asset`` task. All three are expected to be ignored and the
    supplied image callback must never be awaited.
    """
    invocation_order: list[str] = []

    retry_start = WorkflowTask(
        key="start_asset_retry",
        step=WorkflowStep.ASSET_RETRY,
        artifact=ArtifactKind.NONE,
    )
    unknown_video = WorkflowTask(
        key="complete_video_asset",
        step=WorkflowStep.UNKNOWN,
        artifact=ArtifactKind.UNKNOWN,
        required=False,
        recoverable=True,
    )
    retry_end = WorkflowTask(
        key="complete_asset_retry",
        step=WorkflowStep.ASSET_RETRY,
        artifact=ArtifactKind.NONE,
    )
    plan = WorkflowPlan(
        mode=WorkflowMode.ASSET_RETRY,
        tasks=(retry_start, unknown_video, retry_end),
    )

    async def complete_image() -> AssetCompletionResult:
        invocation_order.append("image")
        return AssetCompletionResult(
            asset="cover_image",
            status=StoryAssetStatus.READY,
        )

    outcome = await run_asset_plan(plan, image_task=complete_image)

    assert invocation_order == []
    assert outcome.task_results == ()
    assert outcome.executed_task_keys == ()
    assert outcome.ignored_task_keys == (
        "start_asset_retry",
        "complete_video_asset",
        "complete_asset_retry",
    )
def test_story_quality_gate_accepts_complete_child_safe_story():
validate_story_output(
StoryOutput(
@@ -211,6 +321,166 @@ def test_story_quality_gate_accepts_complete_child_safe_story():
)
def test_story_evaluator_scores_complete_child_safe_story():
    """A complete, child-safe story passes evaluation with a high overall score."""
    story = StoryOutput(
        mode="generated",
        title="小兔子的月光花园",
        story_text="小兔子在花园里学会了和朋友轮流分享水壶,也学会了复盘今天的努力。",
        cover_prompt_suggestion="A gentle moonlit garden with a rabbit",
    )

    evaluation = evaluate_story_output(story, education_theme="复盘")

    assert evaluation.passed is True
    assert evaluation.blocking is False
    assert evaluation.overall_score >= 0.9

    # The first entry of the exported scores is expected to be "structure".
    first_score = evaluation.to_metadata()["scores"][0]
    assert first_score["dimension"] == "structure"
def test_story_evaluator_blocks_quality_gate_failure():
    """An empty story body fails the quality gate and yields a blocking result."""
    empty_story = StoryOutput(
        mode="generated",
        title="空白故事",
        story_text="",
        cover_prompt_suggestion="A cover",
    )

    evaluation = evaluate_story_output(empty_story)

    assert evaluation.passed is False
    assert evaluation.blocking is True
    assert evaluation.overall_score == 0.0
    assert evaluation.gate_error is not None

    # The first reported gate issue names the missing story text.
    gate_issues = evaluation.to_metadata()["quality_gate"]["issues"]
    assert gate_issues[0]["code"] == "missing_story_text"
def test_storybook_evaluator_scores_complete_child_safe_storybook():
    """A well-formed two-page storybook passes evaluation with a high score."""
    first_page = StorybookPage(
        page_number=1,
        text="露露在森林里发现一颗会提醒她复盘的小星星。",
        image_prompt="Lulu finds a star",
    )
    second_page = StorybookPage(
        page_number=2,
        text="她回想今天的努力,学会下次先和朋友商量。",
        image_prompt="Lulu thinking with friends",
    )
    storybook = Storybook(
        title="森林里的复盘星星",
        main_character="小兔子露露",
        art_style="温暖水彩",
        cover_prompt="A warm watercolor forest cover",
        pages=[first_page, second_page],
    )

    evaluation = evaluate_storybook_output(storybook, education_theme="复盘")

    assert evaluation.passed is True
    assert evaluation.blocking is False
    assert evaluation.overall_score >= 0.9
def test_storybook_evaluator_blocks_quality_gate_failure():
    """Duplicate page numbers fail the storybook quality gate and block the result."""
    # Both pages deliberately carry page_number=1.
    duplicated_pages = [
        StorybookPage(page_number=1, text="第一页。", image_prompt="page 1"),
        StorybookPage(page_number=1, text="第二页。", image_prompt="page 2"),
    ]
    storybook = Storybook(
        title="森林绘本",
        main_character="小兔子",
        art_style="水彩",
        cover_prompt="A forest cover",
        pages=duplicated_pages,
    )

    evaluation = evaluate_storybook_output(storybook)

    assert evaluation.passed is False
    assert evaluation.blocking is True
    assert evaluation.gate_error is not None

    first_issue = evaluation.to_metadata()["quality_gate"]["issues"][0]
    assert first_issue["code"] == "invalid_storybook_page_number"
def test_evaluation_golden_cases_replay_successfully():
    """The golden fixture replays cleanly and covers both artifact kinds."""
    golden_path = FIXTURES_DIR / "evaluation_golden_cases.json"
    replay = replay_evaluation_golden_cases(golden_path)

    # On failure, surface the replay's own report as the assertion message.
    assert replay.passed is True, replay.failure_report()
    assert replay.failed_case_ids == ()
    assert len(replay.cases) == 11

    covered_artifacts = set(case.artifact for case in replay.cases)
    assert covered_artifacts == {
        EvaluationReplayArtifact.STORY,
        EvaluationReplayArtifact.STORYBOOK,
    }
def test_evaluation_golden_cases_report_internal_coverage_summary():
    """The golden replay's coverage summary matches the pinned case distribution."""
    golden_path = FIXTURES_DIR / "evaluation_golden_cases.json"
    summary = replay_evaluation_golden_cases(golden_path).coverage_summary()

    # Each grouping below accounts for all 11 golden cases.
    expected_artifact = {"storybook": 5, "story": 6}
    expected_age_band = {"3-4": 4, "5-6": 4, "unknown": 2, "7-8": 1}
    expected_risk_area = {
        "schema_error": 4,
        "happy_path": 2,
        "readability_warning": 2,
        "safety_error": 2,
        "length_boundary": 1,
    }
    expected_outcome = {"blocked": 8, "passed": 3}

    assert summary["artifact"] == expected_artifact
    assert summary["age_band"] == expected_age_band
    assert summary["risk_area"] == expected_risk_area
    assert summary["outcome"] == expected_outcome

    # Only a subset of tags is pinned here, so they are checked individually.
    assert summary["tags"]["story"] == 6
    assert summary["tags"]["storybook"] == 5
    assert summary["tags"]["blocking"] == 6
    assert summary["tags"]["threshold_block"] == 2
def test_evaluation_replay_reports_expectation_mismatch():
    """A case whose expectation is not met is reported as a failed replay case."""
    story_payload = {
        "mode": "generated",
        "title": "小兔子的花园",
        "story_text": "小兔子学会了和朋友分享水壶。",
        "cover_prompt_suggestion": "A rabbit sharing a watering can",
    }
    # The 0.99 minimum score is the expectation this case is meant to miss.
    strict_expectation = ExpectedEvaluation(
        passed=True,
        blocking=False,
        min_overall_score=0.99,
    )
    mismatching_case = EvaluationReplayCase(
        case_id="expectation-mismatch",
        artifact=EvaluationReplayArtifact.STORY,
        input_payload={"keywords": "小兔子"},
        output_payload=story_payload,
        expected=strict_expectation,
    )

    replay = run_evaluation_replay_cases([mismatching_case])

    assert replay.passed is False
    assert replay.failed_case_ids == ("expectation-mismatch",)
    assert "expected overall_score >=" in replay.failure_report()
def test_story_quality_gate_rejects_missing_story_text():
output = StoryOutput(
mode="generated",

View File

@@ -0,0 +1,159 @@
# Harness Engineering 改造阶段 10 报告
**阶段**: 10 - 资产计划与 Public Metadata Sanitizer
**日期**: 2026-06-22
**状态**: 已完成当前切片
**范围**: 资产生成/重试 WorkflowPlan、用户侧 job event metadata 白名单脱敏、回归测试和商业机密边界复核
---
## 1. 本阶段目标
阶段 10 的目标是把资产任务也纳入 Harness Engineering 的显式计划模型,并把用户侧事件 metadata 从“过滤少数内部事件”升级为“白名单公开”。
本阶段重点:
- `asset_generation` 写入 `workflow_planned`
- `asset_retry` 写入 `workflow_planned`
- 旧封面/音频兼容接口创建的资产 job 也写入 plan。
- 用户侧 job detail 的 event metadata 使用 public sanitizer。
- 内部数据库事件继续保留完整 metadata,供测试、内部分析和 admin-only 能力使用。
## 2. 已完成工作
### 资产 WorkflowPlan
修改文件:
- `backend/app/services/story_service.py`
新增行为:
- 后台 `asset_generation` worker 在执行资源补全前记录 `asset_generation` plan。
- `/api/generations/{story_id}/retry-assets` 同步重试路径记录 `asset_retry` plan。
- 旧 `/api/image/generate/{story_id}` 和 `/api/audio/{story_id}` 兼容路径记录 `asset_generation` plan。
资产 plan 快照:
- `plan.mode=asset_generation` 或 `asset_retry`
- 图片任务使用 `complete_image_asset`
- 音频任务使用 `complete_audio_asset`
- 图片/音频任务均为 `required=false`、`recoverable=true`
### Public Metadata Sanitizer
修改文件:
- `backend/app/services/generation_jobs.py`
新增能力:
- `public_generation_event_metadata(...)`
- 用户侧 `public_generation_event_to_response(...)` 不再原样返回 event metadata。
- `evaluation_completed` 事件继续完全过滤。
- `workflow_planned` 只返回 coarse plan 摘要:
- `plan_mode`
- `planned_task_count`
- `recoverable_task_count`
用户侧允许保留:
- `step`
- `artifact`
- `failure_category`
- `asset` / `assets`
- `status`
- `mode`
- `output_mode`
- `input_type`
- `page_count`
- `page_number`
- `adapter`
- `capability`
- `strategy`
- `latency_ms`
- `estimated_cost_usd`
- 资源状态和少量可解释执行上下文
用户侧禁止返回:
- 原始 `plan`
- 原始 `plan.tasks`
- `result_snapshot`
- 内部阈值
- 内部错误原文
- `overall_score`
- 维度分数
- 评分 reason
- golden replay 信息
## 3. 测试覆盖
修改文件:
- `backend/tests/test_generation_jobs.py`
新增或更新覆盖:
- 更新 `asset_retry` 事件顺序,断言 `asset_retry` plan。
- 更新 `asset_generation` worker 事件顺序,断言 `asset_generation` plan。
- 新增 `test_user_generation_job_detail_sanitizes_public_event_metadata`,确认用户 API 不返回原始 plan、tasks、result snapshot、内部阈值和内部错误原文。
## 4. 验证结果
已执行:
```bash
cd backend
.venv/bin/python -m pytest tests/test_generation_jobs.py -q
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
结果:
- 定向生成任务测试:`22 passed`
- 后端全量测试:`152 passed`
- Ruff:`All checks passed!`
- 用户前端构建:通过
- 管理端构建:通过
构建提示:
- `frontend` 和 `admin-frontend` 构建均提示 Browserslist/caniuse-lite 数据较旧。
- `admin-frontend` 额外提示 `baseline-browser-mapping` 数据较旧。
- 以上均为依赖数据 freshness 提示,不影响当前构建结果。
## 5. 自审结论
本阶段继续保持“内部完整、外部最小”的边界:
- 内部 event metadata 没有丢失,admin-only 和测试仍可读取完整 plan 与评测数据。
- 用户侧 job event metadata 已从 denylist 走向 allowlist,未来新增内部字段默认不会公开。
- 用户侧仍可看到进度、资源、Provider 和失败分类等可操作信息。
- 原始 `plan.tasks`、内部阈值、内部错误原文和 result snapshot 不进入用户事件流。
## 6. Bug 与风险记录
已发现并即时修复的问题:
- 初次测试时 `asset_generation` 和 `asset_retry` 的旧事件顺序断言未包含 `workflow_planned`;已更新测试并增加 plan 快照断言。
- sanitizer 测试最初用字符串搜索禁止 `plan`,误伤公开字段 `plan_mode`;已改为断言原始 `plan` key 不存在。
当前风险:
- `request_payload` 仍作为 job detail 字段返回,当前包含用户发起请求本身。后续如请求 payload 增加内部调度参数,需要单独做 payload sanitizer。
- Provider 成本信息当前仍在用户侧展示,属于既有产品运营摘要。若商业策略变化,需要从 white list 中移除 `estimated_cost_usd` 并同步前端。
- admin-frontend 当前复用用户侧 `/api/generations/jobs/{job_id}`,因此看到的是脱敏事件。未来如果管理端需要完整内部 event metadata,应新增 admin-only trace endpoint。
## 7. 后续建议
下一阶段建议进入阶段 11:
1. 设计 admin-only generation trace detail,让管理端在权限保护下查看完整内部 plan/evaluation/provider metadata。
2. 为 `request_payload` 增加 public sanitizer,防止未来内部调度字段被用户端 job detail 透出。
3. 继续推进 executor 小步接管,把资产 plan 从“记录事实”升级为“驱动执行”的最小执行单元。

View File

@@ -0,0 +1,165 @@
# Harness Engineering 改造阶段 11 报告
**阶段**: 11 - Trace 访问分级与 Request Payload Sanitizer
**日期**: 2026-06-22
**状态**: 已完成当前切片
**范围**: 用户侧 request payload 白名单脱敏、admin-only 完整生成 trace、回归测试和商业机密边界复核
---
## 1. 本阶段目标
阶段 11 承接阶段 10 的风险记录:事件 metadata 已经白名单脱敏,但用户侧 job detail 仍会原样返回 `request_payload`。如果后续 executor 或调度层把内部字段写入 payload,就可能把内部策略、Provider override 或评测配置分发给用户端。
本阶段目标:
- 用户侧 `GET /api/generations/jobs/{job_id}` 只返回安全公开的 request payload 字段。
- 管理控制面新增完整 trace detail,用于内部审查、排障和评测驱动复盘。
- 完整内部评测数据、workflow plan、原始 request payload 只在 `admin_guard` 后可见。
## 2. 已完成工作
### 用户侧 Request Payload Sanitizer
修改文件:
- `backend/app/services/generation_jobs.py`
新增能力:
- `public_generation_request_payload(...)`
- 用户侧 `get_generation_job_detail(...)` 不再原样返回 `job.request_payload`
- request payload 使用白名单公开
当前用户侧允许字段:
- `assets`
- `child_profile_id`
- `generate_images`
- `input_type`
- `output_mode`
- `page_count`
- `story_id`
- `type`
- `universe_id`
当前用户侧禁止字段:
- 原始 `data`
- `education_theme`
- 内部调度 token
- Provider override
- evaluation policy
- 任意 dict 型内部配置
### Admin-Only 完整 Trace Detail
新增文件:
- `backend/app/services/admin_generation_trace.py`
修改文件:
- `backend/app/api/admin_providers.py`
新增接口:
```http
GET /admin/generations/jobs/{job_id}/trace
```
接口能力:
- 返回完整 `request_payload`
- 返回完整 event stream
- 不过滤 `evaluation_completed`
- 不脱敏 `workflow_planned.event_metadata.plan.tasks`
- 返回 `user_id` 供管理控制面审计
- 继承 admin router 的 `admin_guard` 保护
## 3. 测试覆盖
修改文件:
- `backend/tests/test_generation_jobs.py`
- `backend/tests/test_admin_providers.py`
- `backend/tests/harness-evaluation-test-cases.md`
新增覆盖:
- `test_user_generation_job_detail_sanitizes_request_payload`
- 断言用户 job detail 不返回原始 `data`
- 断言用户 job detail 不返回内部调度 token、Provider override 或 evaluation policy
- 断言用户 job detail 保留必要公开控制字段
- `test_admin_generation_job_trace_returns_internal_event_stream`
- 断言 admin trace 返回完整 request payload
- 断言 admin trace 返回 `workflow_planned` 原始 plan tasks
- 断言 admin trace 返回 `evaluation_completed` 和评分 metadata
- `test_admin_generation_job_trace_requires_admin_auth`
- 断言未通过 admin guard 时返回 `401`
## 4. 当前验证结果
已执行:
```bash
cd backend
.venv/bin/python -m pytest tests/test_generation_jobs.py tests/test_admin_providers.py -q
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
结果:
- 定向生成任务 + admin trace 测试:`31 passed`
- 后端全量测试:`155 passed`
- Ruff:`All checks passed!`
- 用户前端构建:通过
- 管理端构建:通过
补充敏感公开面扫描:
```bash
rg -n "evaluations/analytics|EvaluationAnalytics|admin_evaluation|overall_score|golden|replay|evaluation_policy|provider_override|internal_dispatch_token" frontend/src backend/app/schemas backend/app/api/stories.py backend/app/services/generation_jobs.py
```
结果:无命中。用户前端、公开 schema、用户 API 和用户 job service 未暴露评测 analytics、评分、golden/replay 或内部 request payload 字段。
构建提示:
- `frontend` 和 `admin-frontend` 构建均提示 Browserslist/caniuse-lite 数据较旧。
- `admin-frontend` 额外提示 `baseline-browser-mapping` 数据较旧。
- 以上均为依赖数据 freshness 提示,不影响当前构建结果。
## 5. 自审结论
本阶段把 trace 数据访问明确分成两层:
- 用户层:只看可用功能、进度、资源状态和少量安全控制字段。
- 管理层:在 admin guard 后查看完整内部链路,用于调试、审查和评测驱动改进。
这满足“用户前端不能展示评测数据”的要求,并且比阶段 10 更稳:即使后续内部调度把更多策略字段写入 request payload,用户接口也不会默认公开。
## 6. Bug 与风险记录
已发现并即时修复的问题:
- 无新增运行时 bug。
当前风险:
- admin-frontend 当前还没有专门调用 `/admin/generations/jobs/{job_id}/trace` 的页面;管理端如果继续复用用户接口,看到的仍是脱敏 trace。这是安全默认值但内部排障体验还可以继续增强。
- 用户 request payload 白名单当前保守,不返回 `data` 和 `education_theme`。如果未来用户端确实需要展示“我刚才输入了什么”,应设计单独的用户输入回显字段,并避免混入内部调度字段。
- admin trace 返回完整内部 metadata,必须继续保持在 admin-only router 下,不得被用户前端或公开 API 复用。
## 7. 后续建议
下一阶段建议进入阶段 12:
1. 推进 executor 小步接管,让 `WorkflowPlan` 从“记录计划”逐步变成“驱动最小任务执行”。
2. 先选择资产生成或 asset retry 作为低风险 executor 试点。
3. 管理端可后续增加 trace detail UI,但必须调用 admin-only endpoint,并明确标记为内部审查视图。

View File

@@ -0,0 +1,150 @@
# Harness Engineering 改造阶段 12 报告
**阶段**: 12 - Plan-Driven Asset Executor 试点
**日期**: 2026-06-22
**状态**: 已完成当前切片
**范围**: 资产任务 executor 最小接管、后台资产生成/资源重试/旧资源接口接入、回归测试和用户公开面边界复核
---
## 1. 本阶段目标
阶段 12 的目标是让 `WorkflowPlan` 不再只是 trace 快照,而是开始驱动一部分真实执行。为了控制风险,本阶段只接管资产任务,不迁移主文本生成、评测和故事持久化。
本阶段重点:
- 新增 plan-driven asset runner。
- 后台 `asset_generation` 按 plan task key 执行图片/音频任务。
- 同步 `asset_retry` 按 plan task key 执行图片/音频重试。
- 旧封面和音频兼容接口也通过同一个 runner 执行。
- 保留既有 asset workflow 对 provider、缓存、状态同步、取消检查和事件记录的职责。
## 2. 已完成工作
### Asset Executor Runner
修改文件:
- `backend/app/services/harness/executor.py`
新增能力:
- `AssetPlanRunResult`
- `run_asset_plan(...)`
执行规则:
- 只支持 `asset_generation` 和 `asset_retry` plan。
- `complete_image_asset` 调用 image handler。
- `complete_audio_asset` 调用 audio handler。
- `start_asset_*`、`complete_asset_*` 和未知非资产 task 记录为 ignored,不触发 provider handler。
- 返回 task results、executed task keys 和 ignored task keys,便于单测和后续观测扩展。
### Story Service 接入
修改文件:
- `backend/app/services/story_service.py`
已接入路径:
- 后台 `asset_generation` worker。
- 同步 `retry_story_assets`。
- 旧 `generate_story_cover`。
- 旧 `generate_story_audio`。
保持不变的职责:
- 图片/音频 provider 调用仍在 `asset_workflows`
- 音频缓存读写仍在 `asset_workflows`
- story 状态同步仍在 `asset_workflows`
- `cover_image_*`、`audio_*`、`storybook_*image*` 事件仍由 asset workflow 记录。
- job 完成/失败语义保持原有 `finish_generation_job` 路径。
## 3. 测试覆盖
修改文件:
- `backend/tests/test_harness_runtime.py`
- `backend/tests/test_generation_jobs.py`
- `backend/tests/harness-evaluation-test-cases.md`
新增覆盖:
- `test_run_asset_plan_executes_asset_tasks_in_plan_order`
- 验证 runner 按 plan task 顺序执行音频和图片。
- 验证非资产生产 task 被记录为 ignored。
- `test_run_asset_plan_ignores_unknown_non_asset_tasks`
- 验证未知非资产 task 不触发 handler。
- `test_asset_generation_job_worker_executes_assets_in_plan_order`
- 验证后台组合资产 job 按 plan 顺序先生成音频再生成图片。
- 验证 story 的 `audio_status` 和 `image_status` 均为 `ready`。
- 验证 event stream 与 plan tasks 对齐。
## 4. 当前验证结果
已执行:
```bash
cd backend
.venv/bin/python -m pytest tests/test_harness_runtime.py tests/test_generation_jobs.py -q
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
结果:
- Harness runtime + generation job 定向测试:`48 passed`
- 后端全量测试:`158 passed`
- Ruff:`All checks passed!`
- 用户前端构建:通过
- 管理端构建:通过
补充敏感公开面扫描:
```bash
rg -n "evaluations/analytics|EvaluationAnalytics|admin_evaluation|overall_score|golden|replay|evaluation_policy|provider_override|internal_dispatch_token" frontend/src backend/app/schemas backend/app/api/stories.py backend/app/services/generation_jobs.py
```
结果:无命中。用户前端、公开 schema、用户 API 和用户 job service 未暴露评测 analytics、评分、golden/replay 或内部 request payload 字段。
构建提示:
- `frontend` 和 `admin-frontend` 构建均提示 Browserslist/caniuse-lite 数据较旧。
- `admin-frontend` 额外提示 `baseline-browser-mapping` 数据较旧。
- 以上均为依赖数据 freshness 提示,不影响当前构建结果。
## 5. 自审结论
本阶段完成了 executor 接管的第一步,但没有扩大到主生成链路:
- `WorkflowPlan` 已能驱动资产 task 执行。
- asset workflow 仍保持单一职责,负责真实 provider 调用和状态转换。
- 事件流与用户可见行为保持兼容。
- 用户侧仍只看到 coarse plan metadata原始 `plan.tasks`、评测结果和内部调度数据不进入用户接口。
这个切片足够小,失败时也容易回滚:只需要把资产入口从 `run_asset_plan` 调回原来的顺序 `if "image"` / `if "audio"` 分支。
## 6. Bug 与风险记录
已发现并即时修复的问题:
- 接入 runner 后,原来的 `_retry_*` 私有薄封装不再被调用。已删除这些死代码,避免后续误读。
当前风险:
- `run_asset_plan` 当前只解释图片和音频 task,未知资产默认 ignored。未来如果新增视频、角色设定图等资产,需要显式增加 handler,而不是依赖 unknown task。
- 主文本生成、评测和持久化仍未由 executor 驱动;它们当前仍是 plan-aware trace,而不是 plan-driven execution。
- runner 当前不单独写入 task-level start/finish 事件,仍复用 asset workflow 的现有事件。若后续需要更细粒度 executor 审计,可以增加 admin-only 内部事件,但不能默认进入用户侧。
## 7. 后续建议
下一阶段建议进入阶段 13:
1. 将 `WorkflowPlan` 的 task result 纳入 admin-only trace 聚合,便于看 executor 执行覆盖率。
2. 选择主文本生成中的低风险 task,例如 `queue_postprocessing` 或 `complete_generation`,继续小步接管。
3. 若要接管 `evaluate_narrative`,必须先补更明确的评测数据隔离测试,避免任何评分字段进入用户前端。

View File

@@ -0,0 +1,182 @@
# Harness Engineering 改造阶段 13 报告
**阶段**: 13 - Admin-Only Executor Coverage
**日期**: 2026-06-23
**状态**: 已完成当前切片
**范围**: 内部 executor coverage 事件、admin-only coverage 聚合、用户侧 executor 数据隔离、回归测试
---
## 1. 本阶段目标
阶段 13 承接阶段 12 的 plan-driven asset executor:资产任务已经按 `WorkflowPlan` 执行,但内部还缺少跨 job 的覆盖率视角。本阶段把 executor 执行结果记录为内部事件,并新增管理控制面聚合,帮助我们审查计划任务是否真的被执行。
本阶段目标:
- 资产 executor 完成后写入内部 `executor_completed` 事件。
- 管理端可聚合 executor runs、planned/executed/ignored task counts、task keys 和 result assets。
- 用户端继续看不到 executor task keys、coverage metadata 或内部 executor step。
## 2. 已完成工作
### Executor Coverage Metadata
修改文件:
- `backend/app/services/harness/executor.py`
- `backend/app/services/story_service.py`
新增能力:
- `AssetPlanRunResult.result_assets`
- `AssetPlanRunResult.to_metadata(...)`
- `record_executor_result(...)`
内部 metadata 包含:
- `plan_mode`
- `planned_task_count`
- `executed_task_count`
- `ignored_task_count`
- `result_count`
- `executed_task_keys`
- `ignored_task_keys`
- `result_assets`
已接入路径:
- 后台 `asset_generation`
- 同步 `asset_retry`
- 旧 `generate_story_cover`
- 旧 `generate_story_audio`
### Admin-Only Coverage Analytics
新增文件:
- `backend/app/services/admin_executor_coverage.py`
修改文件:
- `backend/app/api/admin_providers.py`
新增接口:
```http
GET /admin/executors/coverage
```
支持过滤:
```http
GET /admin/executors/coverage?days=7
GET /admin/executors/coverage?plan_mode=asset_retry
```
返回聚合:
- total runs
- total planned/executed/ignored task counts
- coverage ratio
- job/story/user counts
- by plan mode
- by output mode
- executed task keys
- ignored task keys
- result assets
### 用户侧隔离
修改文件:
- `backend/app/services/generation_jobs.py`
隔离规则:
- 用户 job detail 过滤 `executor_completed` 事件。
- 用户 job summary 如果内部 `current_step=executor_completed`,对外映射为 `workflow_planned` 和“工作流已规划”。
- 用户公开 metadata 白名单不包含 executor task keys 或 coverage 字段。
## 3. 测试覆盖
修改文件:
- `backend/tests/test_generation_jobs.py`
- `backend/tests/test_admin_providers.py`
- `backend/tests/harness-evaluation-test-cases.md`
新增或更新覆盖:
- 资产生成/重试事件序列包含内部 `executor_completed`
- 用户 job detail 不返回 `executor_completed` 或 task keys。
- 用户 job summary 不暴露内部 executor step。
- admin trace 可读取完整 `executor_completed`
- admin coverage 聚合 total runs、task counts、coverage ratio、task keys 和 result assets。
- admin coverage 支持 `plan_mode` 过滤并拒绝非法 plan mode。
- admin coverage 未鉴权返回 `401`
## 4. 当前验证结果
已执行:
```bash
cd backend
.venv/bin/python -m pytest tests/test_generation_jobs.py tests/test_admin_providers.py tests/test_harness_runtime.py -q
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
结果:
- 定向 generation/admin/harness 测试:`59 passed`
- 后端全量测试:`161 passed`
- Ruff:`All checks passed!`
- 用户前端构建:通过
- 管理端构建:通过
补充敏感公开面扫描:
```bash
rg -n "executors/coverage|ExecutorCoverage|admin_executor|executor_completed|executed_task_keys|ignored_task_keys|coverage_ratio|overall_score|golden|replay|evaluation_policy|provider_override|internal_dispatch_token" frontend/src backend/app/schemas backend/app/api/stories.py backend/app/services/generation_jobs.py
```
结果:仅命中 `backend/app/services/generation_jobs.py` 中对 `executor_completed` 的过滤和 current step 映射逻辑。用户前端、公开 schema 和用户 API route 未暴露 executor coverage、task keys、评测分数、golden/replay 或内部 request payload 字段。
构建提示:
- `frontend` 和 `admin-frontend` 构建均提示 Browserslist/caniuse-lite 数据较旧。
- `admin-frontend` 额外提示 `baseline-browser-mapping` 数据较旧。
- 以上均为依赖数据 freshness 提示,不影响当前构建结果。
## 5. 自审结论
本阶段保留了“内部完整、用户最小”的边界:
- executor task keys 是内部执行证据,只进入 admin-only trace/coverage。
- 用户端仍只看到可用功能和进度,不看到 task keys、coverage ratio 或内部 executor step。
- admin coverage 聚合不返回故事正文、prompt 或评测评分 reason。
## 6. Bug 与风险记录
已发现并即时修复的问题:
- 初版 admin coverage bucket 使用通用模型,响应中出现无关字段 `null`。已拆成专用 bucket response model减少管理端响应噪声。
- `executor_completed` 会短暂写入 `job.current_step`。已在用户 summary 中映射为安全公开的 `workflow_planned`,并补测试防止泄露。
当前风险:
- `executor_completed` 当前只覆盖资产 executor。主文本、评测和持久化仍是 plan-aware不应被 coverage 误解为全链路 executor 覆盖。
- coverage ratio 使用 executed/planned 任务数,包含 start/complete 这类 ignored task,因此是执行器覆盖口径,不是产品成功率。
- admin coverage 返回 task keys,必须保持 admin-only,不允许用户前端调用。
## 7. 后续建议
下一阶段建议进入阶段 14:
1. 在 admin trace detail 中增加 executor coverage summary,减少管理端自行解析事件。
2. 选择 `queue_postprocessing` 或 `complete_generation` 这类低风险主链路 task 继续小步接管。
3. 若要接管评测 task,先补更严格的用户侧敏感扫描和 contract tests。

View File

@@ -0,0 +1,188 @@
# Harness Engineering 阶段 14 报告
**阶段**: Admin Trace Executor Coverage Summary
**日期**: 2026-06-23
**状态**: 已完成当前切片
## 1. 阶段目标
本阶段继续沿用原架构路径,不扩大 executor 对主文本生成、评测或持久化的接管范围,只增强管理控制面的审查能力。
目标:
- 让 admin-only 完整 generation trace 自带当前 job 的 executor coverage 摘要。
- 复用全局 executor coverage 聚合逻辑,避免全局 coverage 与单 job trace 统计口径漂移。
- 修正用户 trace summary 隔离规则,确保内部 `executor_completed` 不通过聚合数量、task key 或 result asset 泄露到用户侧。
## 2. 完成内容
### H14-1: 抽出 executor coverage 纯聚合函数
- 在 `app/services/admin_executor_coverage.py` 中新增 `summarize_executor_coverage_rows(...)`。
- `GET /admin/executors/coverage` 继续返回原有结构,但内部改为复用共享聚合函数。
- 聚合口径保持不变:runs、planned/executed/ignored task counts、coverage ratio、plan mode、output mode、task keys 和 result assets。
### H14-2: admin trace 返回 `executor_coverage`
- `app/services/admin_generation_trace.py` 在完整事件流之外,新增当前 job 的 `executor_coverage` 摘要。
- trace 内嵌 summary 的 `scope` 为 `admin_internal_job_executor_coverage`。
- `app/api/admin_providers.py` 的 `AdminGenerationJobTraceResponse` 增加 `executor_coverage` 字段。
### H14-3: 用户 trace summary 过滤 `executor_completed`
- `app/services/generation_jobs.py` 的 trace summary 聚合现在同时跳过 `evaluation_completed` 和 `executor_completed`。
- 用户侧仍然只看到产品可解释的 workflow 进度,不看到内部 executor coverage、task keys 或 result assets。
### H14-4: 测试覆盖
- `tests/test_admin_providers.py` 增加 admin trace 内嵌 executor coverage 断言。
- `tests/test_generation_jobs.py` 增加用户 trace summary 不包含 `executor_completed` 和 task key 的断言。
- `backend/tests/harness-evaluation-test-cases.md` 增加 TC-ADM-008,并更新 TC-ST-010。
### H14-5: 文档同步
- `docs/technical/harness-engineering-modernization.md` 更新至阶段 0-14。
- 新增 `Admin Trace Executor Coverage Summary` 设计章节。
- 增加 FR-015、NFR-011、阶段 14 计划、风险缓解和当前状态。
## 3. 审查结论
### 用户侧商业机密隔离
本阶段没有向用户端新增任何 evaluation 或 executor coverage 数据。
用户侧继续隐藏:
- `evaluation_completed`
- `executor_completed`
- `overall_score`
- 评分维度、阈值、golden replay
- `executed_task_keys`
- `ignored_task_keys`
- `executor_coverage`
额外修正:
- 用户 trace summary 的 `total_events` 不再统计内部 `executor_completed`,避免通过事件数量暴露内部执行器步骤。
### 管理端审查能力
管理端现在可以在单个 trace 响应里同时查看:
- 完整 request payload。
- 完整 event stream。
- 完整 evaluation metadata。
- 当前 job 的 executor coverage summary。
这让后续排查 plan-driven executor 迁移时,不必在完整 trace 和全局 coverage API 之间手动拼接数据。
### 架构边界
本阶段仍保持阶段 12 的保守边界:
- executor 只接管资产 task key。
- 主文本生成、绘本主结构、评测和持久化仍走原服务路径。
- admin-only 聚合能力不改变用户 API schema。
## 4. 验证记录
已通过:
```bash
cd backend
.venv/bin/python -m pytest tests/test_admin_providers.py tests/test_generation_jobs.py tests/test_harness_runtime.py -q
```
结果:
```text
59 passed
```
已通过:
```bash
cd backend
.venv/bin/python -m ruff check app tests
```
结果:
```text
All checks passed!
```
已通过:
```bash
cd backend
.venv/bin/python -m pytest
```
结果:
```text
161 passed
```
已通过:
```bash
cd frontend
npm run build
```
结果:
```text
vue-tsc && vite build
✓ built
```
备注:Browserslist 数据陈旧警告,不影响构建结果。
已通过:
```bash
cd admin-frontend
npm run build
```
结果:
```text
vue-tsc && vite build
✓ built
```
备注:Browserslist 与 baseline-browser-mapping 数据陈旧警告,不影响构建结果。
已通过用户侧敏感字段扫描:
```bash
rg -n "executors/coverage|ExecutorCoverage|admin_executor|executor_coverage|executor_completed|executed_task_keys|ignored_task_keys|coverage_ratio|overall_score|golden|replay|evaluation_policy|provider_override|internal_dispatch_token" frontend/src backend/app/schemas backend/app/api/stories.py backend/app/services/generation_jobs.py
```
扫描结果:
- 未在用户前端、用户 schema 或用户 story API 中发现 admin executor coverage、评测分数、golden replay、provider override 或内部 dispatch token。
- 命中项仅位于 `generation_jobs.py` 的内部事件过滤和安全进度映射逻辑。
已通过:
```bash
git diff --check
```
## 5. 风险与后续建议
| 风险 | 状态 | 建议 |
| --- | --- | --- |
| admin trace 与全局 coverage 口径漂移 | 已缓解 | 已抽共享聚合函数,后续新增字段必须先进该函数 |
| 用户 trace summary 暗含内部事件数量 | 已修正 | 保持内部事件 denylist并继续用测试覆盖 |
| executor 接管范围扩大过快 | 已控制 | 下一阶段仍应先围绕资产与 observability不急于接管主生成 |
| admin-only 数据误接用户前端 | 持续关注 | 每阶段继续运行敏感字段扫描 |
## 6. 阶段结论
阶段 14 完成了 admin trace 的审查能力增强,并补齐用户 trace summary 对 executor 内部事件的隔离。当前架构继续符合“评测驱动、admin-only 内部质量资产、用户侧只展示可用功能”的边界。

View File

@@ -0,0 +1,228 @@
# Harness Engineering 阶段 15 报告
**阶段**: Admin-Only Harness Readiness
**日期**: 2026-06-23
**状态**: 已完成当前切片
## 1. 阶段目标
本阶段继续沿用原设计路径:不扩大 executor 对主生成链路的接管范围,而是建立一个内部 readiness 审查摘要,让后续每次扩大 harness 接管范围前都能先看聚合质量门。
目标:
- 将内部 golden replay、evaluation analytics 和 executor coverage 串成一个 admin-only readiness audit。
- 保持 readiness 只返回聚合状态、阈值和覆盖摘要。
- 避免把评测数据、executor task key 或 readiness 结果分发到用户端。
- 修正运行环境风险:golden replay fixture 必须随 app 发布,而不是只存在于 tests 目录。
## 2. 完成内容
### H15-1: app 内部 golden replay fixture
- 将 `evaluation_golden_cases.json` 放入 `app/services/harness/fixtures/`。
- `tests/test_harness_runtime.py` 改为读取 app 内部 fixture。
- 这样 Docker 镜像 `COPY app ./app` 后,admin readiness 仍能读取 golden cases。
### H15-2: admin harness readiness 服务
- 新增 `app/services/admin_harness_readiness.py`
- 聚合输入:
- 内部 golden replay。
- `get_admin_evaluation_analytics(...)`
- `get_admin_executor_coverage(...)`
- 输出:
- `status`: `ready`、`needs_attention` 或 `blocked`。
- `thresholds`: 当前内部 readiness 阈值。
- `checks`: 每个质量门的状态与聚合细节。
- `golden_replay`、`evaluation_analytics`、`executor_coverage` 聚合摘要。
当前 checks:
| Check | 行为 |
| --- | --- |
| `golden_replay` | golden cases 未全部通过则 `blocked` |
| `runtime_evaluation_samples` | 当前窗口没有 evaluation 样本则 `needs_attention` |
| `runtime_evaluation_quality` | pass rate 或 average score 低于阈值则 `blocked` |
| `executor_coverage_samples` | 当前窗口没有 executor run 则 `needs_attention` |
| `executor_coverage_ratio` | coverage ratio 低于阈值则 `blocked` |
### H15-3: admin-only readiness API
- 新增 `GET /admin/harness/readiness`
- 复用 admin router 的 `admin_guard`
- 支持 `days` 查询参数,与 evaluation analytics 和 executor coverage 的窗口口径一致。
### H15-4: 测试覆盖
- `tests/test_admin_providers.py` 新增 readiness ready 路径测试。
- 新增 low runtime quality blocked 路径测试。
- 新增 admin auth required 测试。
- 测试断言 readiness 响应不包含 story title、score reason 或 quality gate message。
### H15-5: 文档同步
- `docs/technical/harness-engineering-modernization.md` 更新至阶段 0-15。
- `backend/tests/harness-evaluation-test-cases.md` 新增 TC-ADM-009、TC-ADM-010。
- 本报告记录安全边界、审查结论和验证结果。
## 3. 审查结论
### 用户侧商业机密隔离
本阶段没有新增用户端接口、用户前端类型或用户前端展示。
用户侧继续不可见:
- `GET /admin/harness/readiness`
- `golden_replay`
- `evaluation_analytics`
- `executor_coverage`
- `overall_score`
- 评分维度、评分 reason、阈值
- `executed_task_keys`
- `ignored_task_keys`
- quality gate message
### 管理端输出边界
readiness 是 admin-only 聚合摘要。它允许管理端看到:
- 当前窗口的运行期 evaluation 聚合。
- 当前窗口的 executor coverage 聚合。
- golden replay 是否通过及覆盖标签分布。
- readiness checks 和阈值。
它不返回:
- 故事正文。
- 绘本分页正文。
- 用户 prompt。
- cover prompt。
- score reason。
- quality gate message。
- 单条 evaluation event 或 executor event 明细。
### 架构边界
阶段 15 没有改变生成执行路径:
- 主文本生成仍走现有 service。
- 绘本主结构仍走现有 service。
- executor 仍只接管资产 task key。
- readiness 只读聚合数据,不写入 job 或 story 状态。
## 4. 验证记录
已通过:
```bash
cd backend
.venv/bin/python -m pytest tests/test_admin_providers.py -q
```
结果:
```text
13 passed
```
已通过:
```bash
cd backend
.venv/bin/python -m pytest tests/test_admin_providers.py tests/test_harness_runtime.py -q
```
结果:
```text
37 passed
```
已通过:
```bash
cd backend
.venv/bin/python -m ruff check app tests
```
结果:
```text
All checks passed!
```
已通过:
```bash
cd backend
.venv/bin/python -m pytest
```
结果:
```text
164 passed
```
已通过:
```bash
cd frontend
npm run build
```
结果:
```text
vue-tsc && vite build
✓ built
```
备注:Browserslist 数据陈旧警告,不影响构建结果。
已通过:
```bash
cd admin-frontend
npm run build
```
结果:
```text
vue-tsc && vite build
✓ built
```
备注:Browserslist 与 baseline-browser-mapping 数据陈旧警告,不影响构建结果。
已通过用户侧敏感字段扫描:
```bash
rg -n "harness/readiness|HarnessReadiness|admin_harness|golden_replay|evaluation_analytics|executor_coverage|executors/coverage|ExecutorCoverage|admin_executor|executor_completed|executed_task_keys|ignored_task_keys|coverage_ratio|overall_score|golden|replay|evaluation_policy|provider_override|internal_dispatch_token" frontend/src backend/app/schemas backend/app/api/stories.py backend/app/services/generation_jobs.py
```
扫描结果:
- 未在用户前端、用户 schema 或用户 story API 中发现 readiness、admin evaluation analytics、executor coverage、评分、golden replay、provider override 或内部 dispatch token。
- 命中项仅位于 `generation_jobs.py` 的内部事件过滤和安全进度映射逻辑。
已通过:
```bash
git diff --check
```
## 5. 风险与后续建议
| 风险 | 状态 | 建议 |
| --- | --- | --- |
| 生产镜像缺少 golden fixture | 已修正 | fixture 已放入 app 内部 harness fixtures |
| readiness 结果被误接用户前端 | 持续关注 | 保持 admin-only 路由,并继续运行敏感字段扫描 |
| 阈值过于简单 | 可接受 | 当前为阶段 15 最小门槛,后续可按真实样本调优 |
| readiness 输出过细 | 已控制 | 只返回聚合,不返回原文、prompt、reason 或单条事件 |
## 6. 阶段结论
阶段 15 建立了 admin-only harness readiness 审查能力,把评测驱动从“有测试、有 analytics”推进到“扩大接管范围前有聚合质量门”。用户端仍然只展示可用功能和进度,不接触评测数据、内部执行覆盖或 readiness 结果。

View File

@@ -0,0 +1,140 @@
# Harness Engineering 改造阶段 5 报告
**阶段**: 5 - Trace Analytics 与前端增量展示
**日期**: 2026-06-21
**状态**: 已完成
**范围**: 后端 trace summary 聚合、用户端与管理端生成轨迹展示、完整验证
---
## 1. 本阶段目标
阶段 5 的目标是让阶段 1-4 写入的标准 harness metadata 变成可见、可分析的产品能力。
本阶段明确区分两类统计:
- Provider stats只统计 Provider 调用成功率、延迟、成本和供应商失败。
- Trace summary统计 workflow step、artifact、failure category 等 harness 运行时语义。
这样质量门失败不会被误算为供应商失败,供应商看板和生成工作流看板各自保持语义清楚。
## 2. 已完成工作
### 后端
修改文件:
- `backend/app/schemas/story_schemas.py`
- `backend/app/services/generation_jobs.py`
- `backend/app/api/stories.py`
- `backend/tests/test_generation_jobs.py`
新增 API:
```http
GET /api/generations/{story_id}/trace-summary
```
响应字段:
- `story_id`
- `window_days`
- `total_events`
- `failed_events`
- `by_step`
- `by_artifact`
- `failure_categories`
新增聚合能力:
- workflow step 聚合,例如 `image_generation``narrative_generation`
- artifact 聚合,例如 `cover_image``story_text`
- failure category 聚合,例如 `provider_error``schema_error`
### 用户端
修改文件:
- `frontend/src/types/generation.ts`
- `frontend/src/components/GenerationTrace.vue`
新增展示:
- 流程事件总数
- 失败事件数
- 主要步骤
- 主要失败类型
- 单个事件下方展示标准 step、artifact、failure category
### 管理端
修改文件:
- `admin-frontend/src/components/GenerationTrace.vue`
新增展示与用户端保持一致:
- trace summary 卡片
- 事件级 step/artifact/failure category 标签
## 3. 验证结果
已执行:
```bash
cd backend
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
结果:
- 后端完整测试:`139 passed`
- 后端 ruff:`All checks passed!`
- 用户端生产构建:通过
- 管理端生产构建:通过
构建备注:
- Vite/Browserslist 输出了浏览器数据过期提示,不影响构建结果。
- 管理端构建输出了 `baseline-browser-mapping` 数据偏旧提示,不影响构建结果。
## 4. 自审结论
本阶段符合设计目标:
- 没有混淆 Provider stats 和 workflow trace stats。
- 前端只做增量展示,没有改变生成/重试主流程。
- 新 API 有后端测试覆盖。
- 用户端和管理端构建均通过。
- 质量门失败、Provider 失败和资产失败现在都有更清楚的可观测语义。
## 5. 当前新架构状态
Harness engineering 改造主线已完成阶段 0-5:
- 设计基线完成。
- Harness runtime 基础类型完成。
- TraceRecorder 和 ExecutionControl 完成。
- 资产工作流主要抽取完成。
- WorkflowPlan 建模完成。
- 确定性 Quality Gates 完成。
- Trace Analytics 和前端展示完成。
## 6. 后续建议
下一步建议进入 **阶段 6:新架构实测与执行器小步接管**。
建议切片:
1. 使用 Docker demo stack 跑 smoke,验证真实 API/worker/前端联动。
2. 在本地 demo provider 下创建故事和绘本,确认 trace summary 数据真实可见。
3. 回到阶段 3B,让普通故事无图片路径先由 `WorkflowPlan` 驱动执行。
4. 逐步迁移带图片故事、绘本和资产任务执行器。

View File

@@ -0,0 +1,222 @@
# Harness Engineering 改造阶段 6 报告
**阶段**: 6 - 新架构真实运行烟测
**日期**: 2026-06-21
**状态**: 已完成
**范围**: 本地新代码 API、Celery worker、Docker PostgreSQL/Redis、真实 HTTP 生成链路、trace/provider 聚合验证
---
## 1. 本阶段目标
阶段 6 的目标是验证阶段 0-5 的新架构不只在单元测试和构建层面通过,也能在真实运行时闭环中工作。
本阶段重点验证:
- FastAPI 可以使用新代码启动。
- Celery worker 可以消费新代码派发的 generation job。
- `TraceRecorder` 写入的标准 metadata 能被 `trace-summary` 正确聚合。
- 主内容生成和资源重试都能进入 harness 运行时视角。
- Provider stats 继续只统计 Provider 调用,不与 workflow trace 混淆。
## 2. 运行环境
复用 Docker demo stack 中已运行的基础设施:
- PostgreSQL: `localhost:52432`
- Redis: `localhost:52379`
本地新代码进程:
- API: `127.0.0.1:53000`
- Worker: `celery -A app.core.celery_app worker --concurrency=1`
启动 API 使用的关键环境变量:
```bash
DATABASE_URL='postgresql+asyncpg://dreamweaver:dreamweaver_password@localhost:52432/dreamweaver_db'
CELERY_BROKER_URL='redis://localhost:52379/0'
CELERY_RESULT_BACKEND='redis://localhost:52379/0'
REDIS_URL='redis://localhost:52379/0'
```
## 3. 已执行烟测
### 3.1 健康检查
请求:
```bash
curl -fsS http://127.0.0.1:53000/health
```
结果:
```json
{"status":"ok"}
```
### 3.2 dev 登录与会话验证
通过 `/auth/dev/signin` 创建真实 cookie 会话,再查询 `/auth/session`。
结果:
```text
login_status=302
user_id=github:dev_user_001
```
### 3.3 普通故事生成链路
请求:
```json
{
"output_mode": "story",
"type": "keywords",
"data": "星光书签, 小鹿, 学会复盘",
"education_theme": "复盘与成长",
"generate_images": false
}
```
结果:
```text
job_id=a606878c-98a7-4d05-af95-629d0cd2f194
poll=01 status=running step=request_accepted story_id=none
poll=02 status=completed step=generation_completed story_id=59
story_title=星光书签、小鹿、学会复盘的晚安冒险
```
说明:
- API 成功创建 generation job。
- Worker 成功 claim 并执行任务。
- 故事成功落库。
- job 以 `generation_completed` 收敛。
### 3.4 主生成 trace summary
结果:
```text
trace_total_events=8
trace_failed_events=0
trace_steps=[
{"name":"provider_invocation","count":2},
{"name":"context_preparation","count":1},
{"name":"narrative_generation","count":1},
{"name":"story_persistence","count":1}
]
trace_artifacts=[
{"name":"story_text","count":1}
]
```
说明:
- 标准 step 已可聚合。
- `story_text` artifact 已可聚合。
- 无失败事件。
### 3.5 图片资源重试链路
对 story `59` 执行:
```json
{"assets":["image"]}
```
结果:
```text
retry_image_status=ready
trace_before_total=8
trace_after_total=15
recent_jobs=[
{"status":"completed","output_mode":"asset_retry","current_step":"asset_retry_completed","story_id":59},
{"status":"completed","output_mode":"story","current_step":"generation_completed","story_id":59}
]
```
重试后 trace 聚合:
```text
trace_after_steps=[
{"name":"provider_invocation","count":4},
{"name":"image_generation","count":2},
{"name":"context_preparation","count":1},
{"name":"narrative_generation","count":1},
{"name":"story_persistence","count":1}
]
trace_after_artifacts=[
{"name":"cover_image","count":2},
{"name":"story_text","count":1}
]
```
Provider stats:
```json
{
"story_id": 59,
"total_calls": 2,
"successful_calls": 2,
"failed_calls": 0,
"by_provider": [
{"capability":"image","adapter":"demo","call_count":1,"success_count":1,"failure_count":0},
{"capability":"text","adapter":"demo","call_count":1,"success_count":1,"failure_count":0}
],
"failure_reasons": []
}
```
说明:
- 资源重试新建了 `asset_retry` job。
- 图片生成进入 `image_generation` step。
- 封面进入 `cover_image` artifact 聚合。
- Provider stats 正确统计 text/image provider 调用。
## 4. Docker build 说明
本阶段尝试执行:
```bash
docker compose up -d --build
```
遇到两个与代码无关的外部阻塞:
1. 根目录 `.env` 中镜像代理覆盖为 `docker.1ms.run/library/node:18-alpine`,该镜像拉取失败。
2. 改用官方镜像变量后,Docker Hub metadata 拉取出现网络 EOF。
因此本阶段没有把新镜像完整 build 成 Docker stack。为验证新代码运行时,本阶段改用本地 API/worker 进程连接现有 Docker PostgreSQL/Redis,覆盖了真实 HTTP、Celery、DB、Redis 和 demo provider 链路。
## 5. 自审结论
本阶段烟测通过,说明阶段 0-5 的 harness engineering 改造已经具备真实运行能力:
- 主内容生成链路可完成。
- 资产重试链路可完成。
- 标准 trace metadata 可以被后端聚合。
- Provider stats 和 workflow trace stats 语义保持分离。
- 前端新增的 trace summary 数据来源已经被真实 API 验证。
仍需注意:
- Docker 镜像重建受外部 registry/network 影响,后续在网络稳定或镜像源修复后应再跑一次完整 Docker build smoke。
- 阶段 3 的 `WorkflowPlan` 当前仍是建模基线,执行器接管尚未开始。
## 6. 后续建议
下一步建议进入 **阶段 7:执行器小步接管**。
建议切片:
1. 先让普通故事、`generate_images=false` 的最小路径由 `WorkflowPlan` 驱动。
2. 保持现有 `story_service` 作为外层编排入口,避免一次性迁移所有模式。
3. 给执行器增加一条最小集成测试,验证 step 事件顺序、质量门和持久化行为。
4. 再迁移带封面故事、绘本、资产生成和资产重试。

View File

@@ -0,0 +1,252 @@
# Harness Engineering 改造阶段 7 报告
**阶段**: 7 - 评测驱动与执行器最小接管
**日期**: 2026-06-22
**状态**: 已完成 7A/7B/7C/7D/7E 当前切片
**范围**: deterministic evaluator、evaluation trace、普通故事无图片路径的 WorkflowPlan 接入、内部 golden replay、覆盖摘要、测试与 QA 用例
---
## 1. 本阶段目标
阶段 7 的目标是响应“产品需要评测驱动”的长期要求:生成任务不能只用成功/失败判断质量,而要在主内容持久化前形成可追踪、可回归、可统计的 evaluation result。
本阶段只接管最小运行路径:
- `output_mode=story`
- `generate_images=false`
不在本阶段迁移绘本、带图片故事、资产生成或资产重试执行器,避免一次性扩大风险。
## 2. 已完成工作
### 后端 harness
新增文件:
- `backend/app/services/harness/evaluators.py`
- `backend/app/services/harness/executor.py`
- `backend/app/services/harness/evaluation_replay.py`
- `backend/tests/fixtures/evaluation_golden_cases.json`
新增能力:
- `EvaluationDimension`
- `EvaluationScore`
- `EvaluationResult`
- `evaluate_story_output`
- `EvaluationReplayCoverage`
- `EvaluationReplayCase`
- `EvaluationReplaySuiteResult.coverage_summary`
- `ExpectedEvaluation`
- `replay_evaluation_golden_cases`
- `run_evaluation_replay_cases`
- `record_workflow_plan`
- `record_evaluation_result`
当前确定性评分维度:
- `structure`
- `safety`
- `age_fit`
- `educational_value`
- `readability`
### 内部 golden replay
阶段 7D 已建立第一组内部 golden cases,用固定样本锁住 deterministic evaluator 的回归基线。
阶段 7E 已将 golden cases 扩充到 11 个样本,并为每条 case 增加内部覆盖标签:
- `age_band`
- `content_shape`
- `risk_area`
- `tags`
当前样本覆盖:
- 完整普通故事通过。
- 较长普通故事通过。
- 普通故事空正文被质量门阻断。
- 普通故事封面提示词缺失被质量门阻断。
- 普通故事安全风险词被质量门阻断。
- 普通故事结构完整但阅读体验偏短,在高阈值下被评测阻断。
- 完整绘本分页通过。
- 绘本重复页码被质量门阻断。
- 绘本没有分页内容被质量门阻断。
- 绘本分页安全风险词被质量门阻断。
- 绘本分页正文过短触发 warning,并在高阈值下被评测阻断。
当前覆盖摘要已由单测锁定:
- artifact: `story=6`、`storybook=5`
- age_band: `3-4=4`、`5-6=4`、`7-8=1`、`unknown=2`
- risk_area: `schema_error=4`、`happy_path=2`、`readability_warning=2`、`safety_error=2`、`length_boundary=1`
- outcome: `passed=3`、`blocked=8`
实现边界:
- replay fixture 只被后端测试和内部工具读取。
- 线上生成链路不会自动读取 golden cases。
- 不新增用户端 API。
- 不改变公开 schema。
- 不把 replay 结果、评分、维度或阈值分发到用户前端。
- 覆盖摘要只用于后端测试和内部评测基线审查,不进入用户端 API。
replay 会比较:
- `passed`
- `blocking`
- `overall_score` 区间
- 必需维度是否存在
- quality gate issue code
- warning 文案片段
- coverage summary
### 事件模型
新增标准 step:
- `evaluation`
新增事件:
- `workflow_planned`
- `evaluation_completed`
新增进度:
- `workflow_planned`: `8%`,工作流已规划
- `evaluation_completed`: `52%`,内容评测已完成
### story service
普通故事无图片路径现在会:
1. 构建 `WorkflowPlan`
2. 写入 `workflow_planned`
3. 准备上下文
4. 调用文本 provider
5. 执行 deterministic evaluator
6. 写入 `evaluation_completed`
7. 通过后写入 `narrative_generated`
8. 持久化故事
9. 收敛 job
质量门失败时会同时写入:
- `quality_gate_failed`
- `evaluation_completed`
这样 failed job 的阻断原因和评分事实都能被追踪。
阶段 7C 已将绘本主内容纳入内部 deterministic evaluator:
- 绘本 Provider 输出后、持久化前执行 `evaluate_storybook_output`。
- 绘本质量门失败会写入内部 `quality_gate_failed` 和 `evaluation_completed`。
- 绘本评测通过会写入内部 `evaluation_completed`,artifact 标记为 `storybook_pages`。
- 用户可访问的 job detail 仍会过滤 `evaluation_completed`。
### 前端与管理端
管理端生成轨迹已补充内部新事件/步骤中文标签:
- `workflow_planned`: 工作流规划
- `evaluation_completed`: 内容评测
- `evaluation`: 内容评测
安全边界修正:
- 用户端不展示评测分数、维度、通过率或阻断阈值。
- 用户可访问的 job detail 不返回 `evaluation_completed` 事件。
- 用户可访问的 `trace-summary` 不返回 `evaluation` 聚合对象。
- 用户端生成轨迹组件不保留 `evaluation_completed`、`evaluation` 展示标签。
- 评测 metadata 只保留在内部 job events 中,后续如需展示必须通过 admin-only 内部接口。
### Trace Summary
`GET /api/generations/{story_id}/trace-summary` 继续只返回用户可解释的工作流摘要:
- `total_events`
- `failed_events`
- `by_step`
- `by_artifact`
- `failure_categories`
该接口会跳过 `evaluation_completed`,且 `total_events` 也只统计公开事件,避免把评测分数、维度、阻断策略或内部评测步骤数量分发给普通用户。
## 3. 验证结果
已执行:
```bash
cd backend
.venv/bin/python -m pytest tests/test_harness_runtime.py tests/test_generation_jobs.py
.venv/bin/python -m ruff check app tests
.venv/bin/python -m pytest
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
最新结果:
- 定向测试:`42 passed`
- Harness runtime 定向测试:`22 passed`
- 后端完整测试:`146 passed`
- Ruff:`All checks passed!`
- 用户端构建:通过
- 管理端构建:通过
构建备注:
- Vite/Browserslist 输出浏览器数据过期提示,不影响构建结果。
- 管理端输出 `baseline-browser-mapping` 数据偏旧提示,不影响构建结果。
## 4. 自审结论
本阶段目前符合小步迁移原则:
- 没有引入外部评测服务和额外成本。
- 没有改变 API 响应结构。
- 公共 `trace-summary` 不分发 evaluation summary。
- 公共 `trace-summary` 的 `total_events` 不统计 `evaluation_completed`。
- 只接入普通故事无图片路径。
- 质量门阻断仍然发生在持久化前。
- evaluation metadata 已进入内部 job event但用户接口会脱敏。
- 用户端只展示可用功能和可解释状态,不展示评测数据。
- 文本故事和绘本主内容都已经在持久化前进入内部 deterministic evaluator。
- 内部 golden replay 已能在单测中检查评测基线漂移。
- 内部 replay 覆盖摘要已能检查年龄段、内容形态、风险区域、标签和 outcome 分布。
- replay 结果未接入任何用户端接口或前端展示。
## 5. Bug 与风险记录
当前没有必须立即阻断的已知 bug。
已发现并即时修复的问题:
- 首次插入 plan-aware 分支时storybook 返回块缩进被补丁碰歪;已在继续测试前修复。
- 后端新增 `workflow_planned`、`evaluation_completed` 后,用户端/管理端事件标签一开始没有同步;审查发现后已补中文标签并重新构建通过。
- 阶段 7B 曾短暂把 evaluation summary 接入用户端和用户可访问 API,经产品安全边界复核后已移除,并补充测试确保公共响应不包含 `evaluation`、用户 job detail 不包含 `evaluation_completed`。
- 阶段 7D 初次新增 replay 模块后 Ruff 发现 import 顺序问题;已用 Ruff 修复并重新跑定向测试。
后续风险:
- 当前 evaluator 是确定性启发式,适合做回归基线,但不能替代高质量模型评测或人工样本评审。
- 当前 golden cases 已扩展到 11 条,但仍偏工程回归样本;后续需要补充真实用户输入分布、Provider 输出变体、教育主题缺失/弱相关、不同绘本页数和更细年龄分层。
- 旧同步接口调用 `generate_and_save_story` 时也会执行 evaluator,但没有 job 时不会记录事件;这是兼容选择,后续可以考虑为同步接口生成 lightweight evaluation response。
- 后续如果要看 evaluation summary,必须新建 admin-only 内部接口,并确认不会被用户端调用。
## 6. 后续建议
下一步继续阶段 8:
1. 设计 admin-only evaluation analytics,明确权限边界和脱敏规则。
2. 逐步让带图片故事和绘本执行路径由 `WorkflowPlan` 接管。
3. 扩充 golden cases 到真实用户输入分布和 Provider 输出变体。
4. 在 Docker registry 网络恢复后重新跑完整 build smoke。

View File

@@ -0,0 +1,142 @@
# Harness Engineering 改造阶段 8 报告
**阶段**: 8 - Admin-Only Evaluation Analytics
**日期**: 2026-06-22
**状态**: 已完成当前切片
**范围**: admin-only 内部评测聚合、权限边界、过滤、测试和用户端隔离审查
---
## 1. 本阶段目标
阶段 8 的目标是在不泄露商业机密的前提下,让内部团队可以看到内容评测的聚合质量趋势。
本阶段只做管理控制面后端接口:
- 不做用户端接口。
- 不做用户端前端展示。
- 不做管理端可视化页面。
- 不返回原始故事内容、prompt、单条 evaluation event 或评分 reason。
## 2. 已完成工作
### 后端服务
新增文件:
- `backend/app/services/admin_evaluation_analytics.py`
新增能力:
- 聚合内部 `evaluation_completed` 事件。
- 支持 `days` 时间窗口过滤。
- 支持 `artifact=story_text|storybook_pages` 过滤。
- 汇总通过数、阻断数、通过率、平均分、artifact、output mode、score band、dimension score、quality gate issue、failure category 和 warning。
### Admin-only API
在既有 admin router 中新增:
```text
GET /admin/evaluations/analytics
```
该接口受现有 admin 控制面保护:
- `ENABLE_ADMIN_CONSOLE=true` 时才挂载 admin router。
- 路由继承 `Depends(admin_guard)`。
- Basic Auth 失败时返回 `401`。
查询参数:
- `days`: `1-365`
- `artifact`: `story_text` 或 `storybook_pages`
### 响应边界
该接口只返回聚合摘要:
- `total_evaluations`
- `passed_evaluations`
- `blocked_evaluations`
- `pass_rate`
- `average_score`
- `job_count`
- `story_count`
- `user_count`
- `by_artifact`
- `by_output_mode`
- `score_bands`
- `dimension_scores`
- `quality_gate_issues`
- `failure_categories`
- `warnings`
该接口不会返回:
- 故事正文
- 绘本分页正文
- 用户 prompt
- cover prompt
- 单条 job event
- 单条 evaluation event
- 评分 reason
- quality gate message
## 3. 验证结果
已执行:
```bash
cd backend
.venv/bin/python -m pytest tests/test_admin_providers.py tests/test_generation_jobs.py
.venv/bin/python -m ruff check app/services/admin_evaluation_analytics.py app/api/admin_providers.py tests/test_admin_providers.py
```
结果:
- Admin + 用户侧脱敏定向测试:`26 passed`
- Ruff`All checks passed!`
已做用户端隔离扫描:
```bash
rg -n "evaluations/analytics|EvaluationAnalytics|admin_evaluation|evaluation_completed|overall_score|golden|replay" frontend/src backend/app/schemas backend/app/api/stories.py backend/app/services/generation_jobs.py
```
扫描结论:
- 用户端前端没有 evaluation analytics 接口、类型或展示命中。
- 用户端公开 schema 没有新增 evaluation analytics 响应模型。
- 用户侧后端只保留 `evaluation_completed` 的过滤/脱敏逻辑。
## 4. 自审结论
本阶段符合评测数据内部分级原则:
- 评测 analytics 是 admin-only。
- 用户端 API 没有新增评测数据。
- 用户前端没有新增评测入口。
- 响应为聚合摘要,不返回原始内容或单条评测明细。
- 权限测试覆盖未授权访问。
- 用户端脱敏测试继续通过。
## 5. Bug 与风险记录
已发现并即时修复的问题:
- 初次测试时 `dimension_scores` 的排序预期与实现不一致;实现按覆盖次数优先排序,更适合运营视图,因此已修正测试预期。
当前风险:
- 当前接口返回 warning 文案聚合。warning 文案来自内部 evaluator,目前不包含原始内容,但后续新增 warning 时必须避免拼接用户正文或 prompt。
- 当前只做后端 admin API,尚未做管理端页面。后续做 UI 时仍需避免展示单条评测明细和原文内容。
- analytics 聚合目前使用 Python 读取 JSON metadata 聚合,适合当前数据量和 SQLite/PostgreSQL 兼容;后续数据量变大时可考虑离线物化或数据库 JSON 聚合。
## 6. 后续建议
下一步建议进入阶段 9:
1. 继续让带图片故事和绘本路径由 `WorkflowPlan` 更完整接管。
2. 或先做 admin-only evaluation analytics 的管理端只读页面,但必须保持聚合摘要边界。
3. 扩充真实用户输入分布的 golden cases,特别是教育主题弱相关和不同年龄段样本。

View File

@@ -0,0 +1,144 @@
# Harness Engineering 改造阶段 9 报告
**阶段**: 9 - WorkflowPlan 接管扩展
**日期**: 2026-06-22
**状态**: 已完成当前切片
**范围**: 普通故事带图片、绘本生成路径的计划快照接入、事件顺序测试、用户端评测隔离复核
---
## 1. 本阶段目标
阶段 9 的目标是把 `WorkflowPlan` 从普通故事无图片路径扩展到三条主生成路径:
- 普通故事无图片:已在阶段 7 接入,本阶段继续作为基线。
- 普通故事带图片:新增 `story_with_assets` plan。
- 绘本:新增 `storybook` plan。
本阶段不重写完整执行器,也不改变用户侧 API 响应结构。目标是先让计划快照成为稳定的运行时事实,为后续把执行分支逐步迁移到 executor 打基础。
## 2. 已完成工作
### 后端生成路径
修改文件:
- `backend/app/services/story_service.py`
新增行为:
- `output_mode=storybook` 时,在调用 `generate_storybook_service` 前记录 `workflow_planned`。
- `output_mode=story` 且 `generate_images=true` 时,在调用 `generate_full_story_service` 前记录 `workflow_planned`。
- `generate_images=false` 的普通故事路径继续复用已有 `_execute_story_without_assets_plan`。
### WorkflowPlan 快照
普通故事带图片路径:
- `plan.mode=story_with_assets`
- tasks 包含:
- `prepare_context`
- `generate_narrative`
- `evaluate_narrative`
- `persist_story`
- `generate_cover_image`
- `queue_postprocessing`
- `complete_generation`
- `generate_cover_image.required=false`
- `generate_cover_image.recoverable=true`
绘本路径:
- `plan.mode=storybook`
- tasks 包含:
- `prepare_context`
- `generate_storybook_pages`
- `evaluate_storybook_pages`
- `generate_storybook_images`
- `persist_storybook`
- `queue_postprocessing`
- `complete_generation`
- `generate_storybook_images.required=false`
- `generate_storybook_images.recoverable=true`
### 测试
修改文件:
- `backend/tests/test_generation_jobs.py`
新增或更新覆盖:
- 新增 `test_story_with_images_worker_records_plan_before_assets`
- 更新绘本 worker 测试,断言 `workflow_planned` 事件顺序和 `storybook` plan 快照。
- 继续确认用户 job detail 不返回 `evaluation_completed`
### 文档
修改文件:
- `docs/technical/harness-engineering-modernization.md`
- `backend/tests/harness-evaluation-test-cases.md`
新增内容:
- 设计文档新增 Workflow Plan Coverage。
- 阶段计划新增阶段 9。
- QA 用例新增带图片故事和绘本计划快照状态转换测试。
## 3. 验证结果
已执行:
```bash
cd backend
.venv/bin/python -m pytest tests/test_generation_jobs.py -q
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
结果:
- 定向生成任务测试:`21 passed`
- 后端全量测试:`151 passed`
- Ruff`All checks passed!`
- 用户前端构建:通过
- 管理端构建:通过
构建提示:
- `frontend` 和 `admin-frontend` 构建均提示 Browserslist/caniuse-lite 数据较旧。
- `admin-frontend` 额外提示 `baseline-browser-mapping` 数据较旧。
- 以上均为依赖数据 freshness 提示,不影响当前构建结果。
## 4. 自审结论
本阶段改动符合当前 Harness Engineering 路径:
- 改动面集中在生成入口,不重写 Provider、质量门或持久化逻辑。
- 三条主路径的计划事件顺序一致:`worker_started` 后、`context_prepared` 前记录 `workflow_planned`。
- 图片类任务在 plan 中明确为可恢复资产,不阻断主内容阅读。
- `evaluation_completed` 继续作为内部事件存在,用户端 detail 和 trace summary 不分发评分数据。
- 新增测试断言 plan 快照,而不是只断言事件名称,能更早发现后续执行器迁移时的计划漂移。
## 5. Bug 与风险记录
本阶段未发现需要统一后置处理的 bug。
当前风险:
- `_generate_generation_service_with_job` 仍保留分支式执行,只是补齐了 plan 记录。后续如果要真正由 executor 编排执行,需要继续拆分 story、storybook、asset workflow 的最小执行单元。
- `workflow_planned` 当前在用户侧可见。它不包含评测分数、阈值或 replay 信息,可以展示为“工作流规划”;后续如果 plan metadata 增加内部策略字段,必须先做 public sanitizer。
- 当前 plan 快照写入 job event metadata。数据量较小适合现在的 trace 需求;后续若引入更复杂 DAG 或重放执行状态,可考虑独立表或压缩摘要。
## 6. 后续建议
下一阶段建议进入阶段 10:
1. 将资产生成和重试路径也纳入 `WorkflowPlan` 记录,统一 `asset_generation` 和 `asset_retry` 的计划快照。
2. 为用户侧 job/event 输出增加公共 metadata sanitizer,明确允许字段白名单,避免未来 plan 或 trace 字段扩展时误泄露内部质量策略。
3. 继续扩展评测驱动 golden cases,优先覆盖教育主题弱相关、不同年龄段长度边界和绘本分页一致性。

View File

@@ -1,10 +1,10 @@
# Harness Engineering 架构改造技术设计
**项目**: DreamWeaver 梦语织机
**版本**: 0.1
**日期**: 2026-06-21
**状态**: 阶段 0 已建立设计基线
**作者**: Codex
**项目**: DreamWeaver 梦语织机
**版本**: 0.1
**日期**: 2026-06-23
**状态**: 阶段 0-15 当前切片已完成,主生成与资产任务均已写入 WorkflowPlan 快照,资产生成/重试已开始由 plan-driven executor 驱动,executor coverage 已进入 admin-only 聚合并嵌入 admin trace,admin-only harness readiness 审查已建立,用户侧 job event/request payload 已使用白名单脱敏,文本故事和绘本已纳入内部评测驱动,内部 golden replay 基线、覆盖摘要、admin-only evaluation analytics 和 admin-only 完整 trace 已建立
**作者**: Codex
---
@@ -36,6 +36,7 @@ DreamWeaver 当前已经完成统一生成工作流的第一轮落地:`POST /a
- 将 `story_service` 中的运行时控制职责抽到 harness 层。
- 让 workflow step、artifact、trace、failure category 成为一等概念。
- 让内容生成结果在持久化和发布前具备可追踪、可回归的评测结果。
- 保持 `/api/generations`、旧兼容接口、现有状态字段和主要测试行为不破坏。
- 优先做渐进式重构,不引入复杂工作流引擎,不进行大爆炸重写。
- 每个大阶段都产出阶段报告,包含实现、审查、验证和风险。
@@ -50,19 +51,25 @@ DreamWeaver 当前已经完成统一生成工作流的第一轮落地:`POST /a
## 3. 架构原则
1. **主内容优先可读**
1. **主内容优先可读**
文本故事或绘本结构是 blocking artifact封面、分页插图、音频是 recoverable artifact。
2. **API 稳定优先**
2. **API 稳定优先**
先重构内部边界再考虑扩展响应字段。现有前端、smoke、测试不应被第一阶段打断。
3. **事件结构稳定**
3. **事件结构稳定**
继续复用 `generation_job_events`,但逐步标准化 metadata避免每个调用点随手定义不同结构。
4. **Provider 不等于产品能力**
4. **Provider 不等于产品能力**
Provider 只是 tool invocation 的实现。产品能力应由 capability、workflow step、artifact 和 recovery policy 共同定义。
5. **小步可验证**
5. **评测驱动优先**
生成成功不等于产品成功。每条新执行路径必须先定义可追踪 evaluation 事件、评分维度、阻断阈值和回归测试,再扩大迁移范围。
6. **评测数据内部分级**
评测分数、维度、阈值和阻断细节属于内部质量资产与商业机密,不通过用户端接口或用户前端分发。用户端只展示可操作功能、可解释进度和可恢复状态。
7. **小步可验证**
每个最小任务都必须能通过单测、局部测试或文档审查验证。
## 4. 目标架构
@@ -78,6 +85,7 @@ flowchart TB
HARNESS --> TRACE["Trace Recorder<br/>job events / step metadata / provider trace"]
HARNESS --> ARTIFACT["Artifact Workflows<br/>story_text / storybook_pages / image / audio"]
HARNESS --> GUARD["Quality Gates<br/>schema / 儿童安全 / 内容完整性"]
HARNESS --> EVAL["Evaluators<br/>结构 / 安全 / 年龄适配 / 教育价值 / 可读性"]
ARTIFACT --> ROUTER["Provider Router<br/>策略 / failover / 熔断 / 成本"]
ROUTER --> ADAPTERS["Provider Adapters"]
@@ -112,10 +120,11 @@ flowchart TB
| Step | 当前事件 | 是否阻塞主内容 |
| --- | --- | --- |
| `request_acceptance` | `request_accepted`、`retry_queued` | 是 |
| `request_acceptance` | `request_accepted`、`retry_queued`、`workflow_planned` | 是 |
| `worker_start` | `worker_started` | 是 |
| `context_preparation` | `context_prepared` | 是 |
| `narrative_generation` | `narrative_generated` | 是 |
| `evaluation` | `evaluation_completed` | 是 |
| `story_persistence` | `story_saved` | 是 |
| `image_generation` | `cover_image_*`、`storybook_*image*` | 否 |
| `audio_generation` | `audio_*` | 否 |
@@ -172,11 +181,204 @@ flowchart TB
}
```
### 5.6 Evaluation Result
每次主内容生成必须逐步产出可追踪评测结果。第一阶段使用确定性启发式,后续可替换或叠加模型评测、人审样本集和离线 replay。
标准字段:
| 字段 | 说明 |
| --- | --- |
| `overall_score` | `0.0-1.0` 总分 |
| `passed` | 是否通过当前阈值 |
| `blocking` | 是否阻断持久化或发布 |
| `scores` | 维度评分列表 |
| `quality_gate` | 质量门失败详情,可为空 |
| `warnings` | 非阻断风险提示 |
当前维度:
- `structure`
- `safety`
- `age_fit`
- `educational_value`
- `readability`
标准事件:
- `workflow_planned`
- `evaluation_completed`
短期兼容要求:
- 不删除现有 metadata 字段。
- 新增字段必须向后兼容。
- 前端仍可使用当前 `event_type`、`status`、`message`、`event_metadata`。
- 用户端 API 和用户前端不得返回或展示 `overall_score`、维度分数、阈值、阻断策略或 golden replay 结果。
### 5.7 Workflow Plan Coverage
`WorkflowPlan` 是生成 harness 的显式执行骨架。当前主生成路径和资产路径都会写入 `workflow_planned` 事件:
| 模式 | plan mode | 关键任务 | 备注 |
| --- | --- | --- | --- |
| 普通故事无图片 | `story` | `prepare_context`、`generate_narrative`、`evaluate_narrative`、`persist_story`、`queue_postprocessing`、`complete_generation` | 当前最小 plan-aware 路径 |
| 普通故事带图片 | `story_with_assets` | 在普通故事任务基础上增加 `generate_cover_image` | 封面图为 `required=false`、`recoverable=true` |
| 绘本 | `storybook` | `prepare_context`、`generate_storybook_pages`、`evaluate_storybook_pages`、可选 `generate_storybook_images`、`persist_storybook`、`queue_postprocessing`、`complete_generation` | 绘本图片为可恢复资产 |
| 资产生成 | `asset_generation` | `start_asset_generation`、`complete_image_asset`、`complete_audio_asset`、`complete_asset_generation` | 图片/音频均为 `required=false`、`recoverable=true` |
| 资产重试 | `asset_retry` | `start_asset_retry`、`complete_image_asset`、`complete_audio_asset`、`complete_asset_retry` | 同步重试路径也记录 plan |
当前边界:
- `workflow_planned` 可进入用户侧进度,因为它只描述产品步骤,不包含评分、阈值或 golden replay 信息。
- 用户端只返回 coarse plan metadata:`plan_mode`、`planned_task_count`、`recoverable_task_count`。
- 用户端不返回原始 `plan.tasks`、任务 key、内部阈值或执行策略。
- `evaluation_completed` 只保留在内部事件、内部测试和 admin-only 聚合中。
- 用户端 job detail 会过滤 `evaluation_completed`。
- 用户端 trace summary 不统计 `evaluation_completed` 的事件数量、step、artifact 或失败分类。
- 用户端 trace summary 不统计 `executor_completed` 的事件数量、task key 或 result asset。
### 5.8 Public Event Metadata Sanitizer
用户侧 job detail 的 `events[*].event_metadata` 使用白名单输出。数据库中的内部 metadata 不被删除,内部分析、测试和 admin-only 能力仍可读取完整事件;普通用户 API 只返回产品可解释且可操作的字段。
允许公开的类别:
- 标准 step、artifact、failure_category。
- 资源状态和资产范围,如 `asset`、`assets`、`status`、`image_status`、`audio_status`。
- 用户可理解的执行上下文,如 `mode`、`output_mode`、`input_type`、`page_count`、`page_number`。
- Provider 运营摘要,如 `adapter`、`capability`、`strategy`、`latency_ms`、`estimated_cost_usd`。
- coarse plan 摘要:`plan_mode`、`planned_task_count`、`recoverable_task_count`。
禁止公开的类别:
- `evaluation_completed` 事件本身。
- `overall_score`、维度分数、评分 reason、阈值、质量门 issue 明细。
- 原始 `plan`、`plan.tasks`。
- `result_snapshot`、内部错误原文、内部阈值、replay/golden case 信息。
- 任意未来新增 metadata 字段,除非显式加入白名单。
### 5.9 Internal Evaluation Replay
内部 evaluation replay 用于把固定 golden cases 和当前 evaluator 输出做对比,帮助我们在调整质量门、评分维度或 Provider 输出结构时快速发现评测基线漂移。
当前边界:
- replay 输入和结果只用于后端测试、内部工具或未来 admin-only 能力。
- replay fixture 不被线上生成链路自动读取。
- replay 不新增公开 API不改变用户端 schema不进入用户前端 bundle。
- 用户端 trace summary 的 `total_events` 不统计内部 `evaluation_completed`
- replay 断言只检查内部质量事实:`passed`、`blocking`、`overall_score` 区间、维度存在性、质量门 issue code 和 warning。
- replay case 可以携带内部覆盖标签:年龄段、内容形态、风险区域和标签集合。
- replay suite 可以生成内部覆盖摘要:artifact、age_band、content_shape、risk_area、tags、outcome。
当前 golden case 覆盖:
- 完整普通故事通过。
- 较长普通故事通过。
- 普通故事空正文被质量门阻断。
- 普通故事封面提示词缺失被质量门阻断。
- 普通故事安全风险词被质量门阻断。
- 普通故事结构合格但在高阈值下因阅读体验偏短被评测阻断。
- 完整绘本分页通过。
- 绘本重复页码被质量门阻断。
- 绘本没有分页内容被质量门阻断。
- 绘本分页安全风险词被质量门阻断。
- 绘本分页正文过短触发内部 warning 并在高阈值下阻断。
### 5.10 Admin-Only Evaluation Analytics
内部评测 analytics 只允许在管理控制面读取,用于质量运营和评测策略复盘。该能力不得进入用户端 `/api/generations` 路由、用户前端类型或用户前端 bundle。
当前 admin-only 聚合字段:
| 字段 | 说明 |
| --- | --- |
| `total_evaluations` | 内部评测事件数量 |
| `passed_evaluations` | 通过数量 |
| `blocked_evaluations` | 阻断数量 |
| `pass_rate` | 通过率 |
| `average_score` | 总分平均值 |
| `by_artifact` | 按 `story_text` / `storybook_pages` 聚合 |
| `by_output_mode` | 按 story / storybook 聚合 |
| `score_bands` | 按分数段聚合 |
| `dimension_scores` | 各评分维度平均分 |
| `quality_gate_issues` | 质量门 issue code 聚合 |
| `failure_categories` | 质量门 failure category 聚合 |
| `warnings` | 内部 warning 文案聚合 |
安全边界:
- 只挂载在 admin router 下,受 `ENABLE_ADMIN_CONSOLE` 和 Basic Auth admin guard 保护。
- 不返回故事正文、绘本分页正文、用户 prompt、cover prompt、score reason、quality gate message、单条 evaluation event 或 job event 明细。
- 用户端 API 继续过滤 `evaluation_completed`。
- 用户端 trace summary 继续不统计内部 `evaluation_completed`。
- 用户端前端不包含该接口调用、类型定义或展示组件。
### 5.11 Admin-Only Executor Coverage
内部 executor coverage 用于审查 `WorkflowPlan` 是否真正驱动了资产执行,以及哪些 task key 仍只是计划占位或被当前 runner 忽略。该能力只属于管理控制面,不进入用户 API 或用户前端。
当前 admin-only 聚合字段:
| 字段 | 说明 |
| --- | --- |
| `total_runs` | executor 完成事件数量 |
| `total_planned_tasks` | 计划任务总数 |
| `total_executed_tasks` | 实际执行任务总数 |
| `total_ignored_tasks` | 被 runner 忽略的任务总数 |
| `coverage_ratio` | `executed / planned` |
| `by_plan_mode` | 按 `asset_generation` / `asset_retry` 等模式聚合 |
| `by_output_mode` | 按生成 job 的 output mode 聚合 |
| `executed_task_keys` | 已执行 task key 聚合 |
| `ignored_task_keys` | 已忽略 task key 聚合 |
| `result_assets` | 返回资产聚合 |
安全边界:
- 只挂载在 admin router 下,受 `ENABLE_ADMIN_CONSOLE` 和 Basic Auth admin guard 保护。
- `executor_completed` 事件、task key、ignored task key 和 result asset 明细不进入用户 job detail。
- 用户 job summary 如果短暂停留在内部 `executor_completed` step,会映射为安全公开的 `workflow_planned`。
- 用户 trace summary 不统计 `executor_completed`,避免通过事件数量或聚合维度泄露内部执行器结构。
- 用户前端不包含 `/admin/executors/coverage` 调用、类型定义或展示组件。
### 5.12 Admin Trace Executor Coverage Summary
管理端单个 generation trace 在完整事件流之外,额外返回 `executor_coverage` 摘要,用于一次请求内同时完成“看事件”和“看执行覆盖”的审查。
设计边界:
- `GET /admin/generations/jobs/{job_id}/trace` 复用全局 executor coverage 的聚合函数,避免两个 admin 视图统计口径不一致。
- trace 内嵌 summary 的 `scope` 为 `admin_internal_job_executor_coverage`,只统计当前 job 的 `executor_completed` 事件。
- trace 内嵌 summary 允许返回 task key、ignored task key 和 result asset,因为该接口已经是 admin-only 完整内部 trace。
- 用户侧 `/api/generations/jobs/{job_id}`、`/api/generations/{story_id}/jobs` 和 `/api/generations/{story_id}/trace-summary` 均不返回该字段。
### 5.13 Admin-Only Harness Readiness
内部 harness readiness 用于在扩大 plan-driven executor 或评测策略接管范围前,给管理控制面提供一个聚合级别的上线前审查摘要。
输入来源:
- 内部 golden replay fixture,随后端 app 一起发布,避免运行环境缺少测试目录。
- admin-only evaluation analytics 聚合。
- admin-only executor coverage 聚合。
当前 readiness checks:
| Check | 说明 | 默认门槛 |
| --- | --- | --- |
| `golden_replay` | 内部 golden cases 是否全部符合预期 | 必须全部通过 |
| `runtime_evaluation_samples` | 当前窗口是否有运行期 evaluation 样本 | 至少 1 条 |
| `runtime_evaluation_quality` | 运行期 evaluation 通过率和均分是否达标 | pass rate >= 0.7,average score >= 0.7 |
| `executor_coverage_samples` | 当前窗口是否有 executor coverage 样本 | 至少 1 次 run |
| `executor_coverage_ratio` | executor 实际执行任务占计划任务比例 | coverage ratio >= 0.2 |
安全边界:
- 只挂载在 admin router 下,受 `ENABLE_ADMIN_CONSOLE` 和 Basic Auth admin guard 保护。
- 只返回聚合结果、阈值、状态和 coverage summary。
- 不返回故事正文、绘本分页正文、用户 prompt、cover prompt、score reason、quality gate message 或单条事件明细。
- 用户端 API 和用户前端不包含该接口调用、类型定义或展示组件。
## 6. 模块设计
@@ -431,6 +633,358 @@ npm run build
- `docs/planning/harness-stage-5-report.md`
### 阶段 6: 新架构真实运行烟测
目标:
- 使用新代码启动本地 API 与 Celery worker。
- 复用 Docker demo stack 中的 PostgreSQL 与 Redis。
- 通过真实 HTTP API 覆盖登录、生成、worker 执行、故事落库、trace summary 和 provider stats。
- 覆盖主内容工作流与资源重试工作流。
最小任务:
| ID | 任务 | 验收 |
| --- | --- | --- |
| H6-1 | 启动本地新代码 API | `/health` 返回 `{"status":"ok"}` |
| H6-2 | 启动本地新代码 worker | 生成任务可被 worker claim 并执行 |
| H6-3 | 使用 dev 登录创建真实 cookie 会话 | `/auth/session` 返回开发用户 |
| H6-4 | 提交普通故事生成 | job 进入 completed/degraded 且 story 落库 |
| H6-5 | 查询 trace summary/provider stats | 返回 step、artifact、provider 聚合 |
| H6-6 | 执行图片资源重试 | trace summary 聚合出 `image_generation` 和 `cover_image` |
| H6-7 | 清理临时进程并恢复 Docker worker | `docker compose ps` 环境回到可用状态 |
验证命令:
```bash
cd backend
DATABASE_URL='postgresql+asyncpg://dreamweaver:dreamweaver_password@localhost:52432/dreamweaver_db' \
CELERY_BROKER_URL='redis://localhost:52379/0' \
CELERY_RESULT_BACKEND='redis://localhost:52379/0' \
REDIS_URL='redis://localhost:52379/0' \
.venv/bin/python -m uvicorn app.main:app --host 127.0.0.1 --port 53000
DATABASE_URL='postgresql+asyncpg://dreamweaver:dreamweaver_password@localhost:52432/dreamweaver_db' \
CELERY_BROKER_URL='redis://localhost:52379/0' \
CELERY_RESULT_BACKEND='redis://localhost:52379/0' \
REDIS_URL='redis://localhost:52379/0' \
.venv/bin/celery -A app.core.celery_app worker --loglevel=info --concurrency=1
```
阶段报告:
- `docs/planning/harness-stage-6-report.md`
### 阶段 7: 评测驱动与执行器最小接管
目标:
- 将“生成是否合格”从隐含质量门升级为结构化 evaluation result。
- 让普通故事、`generate_images=false` 的最小路径由 `WorkflowPlan` 参与执行。
- 在 job events 中记录 `workflow_planned` 和 `evaluation_completed`。
- 用测试锁住评分、阻断、事件顺序和 trace 聚合。
最小任务:
| ID | 任务 | 验收 |
| --- | --- | --- |
| H7-1 | 新增 deterministic evaluator | 通过/阻断用例有单测 |
| H7-2 | 新增 plan-aware executor helper | 任务写入 `workflow_planned` |
| H7-3 | 普通故事无图片路径接入 plan | worker 事件序列包含 plan/evaluation |
| H7-4 | 质量门失败也写入 evaluation | failed job 可解释阻断原因 |
| H7-5 | 增加评测驱动 QA 用例文档 | 覆盖功能、边界、错误和状态转换 |
| H7-6 | 阶段报告记录 bug/风险 | 大 bug 可后续统一处理 |
| H7-7 | 增加内部 golden replay 基线 | 固定样本可离线回放并被单测锁定 |
| H7-8 | 增加 replay 覆盖摘要 | 年龄段、内容形态、风险区域和 outcome 分布可被单测锁定 |
验证命令:
```bash
cd backend
.venv/bin/python -m pytest tests/test_harness_runtime.py tests/test_generation_jobs.py
.venv/bin/python -m ruff check app tests
```
阶段报告:
- `docs/planning/harness-stage-7-report.md`
### 阶段 8: Admin-Only Evaluation Analytics
目标:
- 提供管理控制面内部评测摘要,用于质量运营和评测策略复盘。
- 明确 admin-only 权限边界,避免将评测数据分发给普通用户。
- 只返回聚合摘要,不返回原始内容、prompt、单条评测明细或评分 reason。
最小任务:
| ID | 任务 | 验收 |
| --- | --- | --- |
| H8-1 | 新增 admin evaluation analytics 服务 | 可聚合 `evaluation_completed` |
| H8-2 | 新增 admin-only 路由 | `/admin/evaluations/analytics` 受 admin guard 保护 |
| H8-3 | 支持 days/artifact 过滤 | 过滤测试通过 |
| H8-4 | 锁定用户端隔离 | 用户端扫描无 evaluation analytics 入口 |
| H8-5 | 阶段报告记录安全边界 | 明确不返回原始内容和单条明细 |
验证命令:
```bash
cd backend
.venv/bin/python -m pytest tests/test_admin_providers.py tests/test_generation_jobs.py
.venv/bin/python -m ruff check app tests
```
阶段报告:
- `docs/planning/harness-stage-8-report.md`
### 阶段 9: WorkflowPlan 接管扩展
目标:
- 让普通故事无图片、普通故事带图片、绘本三条主生成路径都写入显式 `workflow_planned`。
- 将计划快照用于锁定事件顺序、可恢复资产任务和后续执行器迁移边界。
- 继续保持评测数据内部分级,用户端只看到可用进度和可恢复状态。
最小任务:
| ID | 任务 | 验收 |
| --- | --- | --- |
| H9-1 | 带图片故事路径记录 `story_with_assets` plan | 事件顺序中 `workflow_planned` 位于 `worker_started` 和 `context_prepared` 之间 |
| H9-2 | 绘本路径记录 `storybook` plan | plan 快照包含 `evaluate_storybook_pages` 和可恢复图片任务 |
| H9-3 | 补主路径事件顺序测试 | story、story_with_assets、storybook 三条路径均被测试覆盖 |
| H9-4 | 锁定用户端评测隔离 | 用户 API 不返回 `evaluation_completed`、评分、维度或 replay 数据 |
| H9-5 | 阶段报告记录执行偏差和验证结果 | 报告包含实现、审查、测试和风险 |
验证命令:
```bash
cd backend
.venv/bin/python -m pytest tests/test_generation_jobs.py
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
阶段报告:
- `docs/planning/harness-stage-9-report.md`
### 阶段 10: 资产计划与 Public Metadata Sanitizer
目标:
- 将 `asset_generation` 和 `asset_retry` 也纳入 `WorkflowPlan` 记录。
- 让用户侧 job event metadata 使用白名单脱敏,避免未来内部 metadata 扩展时误泄露质量策略。
- 保留用户前端需要的可解释字段:step、artifact、failure category、资源状态、Provider 运营摘要和 coarse plan 摘要。
最小任务:
| ID | 任务 | 验收 |
| --- | --- | --- |
| H10-1 | 后台资产生成记录 `asset_generation` plan | worker 事件顺序包含 `workflow_planned` |
| H10-2 | 资源重试记录 `asset_retry` plan | 同步 retry events 包含 plan 快照 |
| H10-3 | 旧封面/音频生成接口记录资产 plan | 兼容接口不破坏现有响应 |
| H10-4 | 用户 event metadata 白名单脱敏 | 用户 API 不返回原始 `plan.tasks``result_snapshot`、内部错误和阈值 |
| H10-5 | 补资产计划和 sanitizer 回归测试 | `tests/test_generation_jobs.py` 覆盖相关路径 |
验证命令:
```bash
cd backend
.venv/bin/python -m pytest tests/test_generation_jobs.py
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
阶段报告:
- `docs/planning/harness-stage-10-report.md`
### 阶段 11: Trace 访问分级与 Request Payload Sanitizer
目标:
- 用户侧 job detail 的 `request_payload` 改为白名单脱敏,避免内部调度参数、Provider override、评测策略或原始输入被接口原样回传。
- 新增 admin-only generation trace detail,在 `admin_guard` 保护下返回完整内部 request payload、workflow plan 和 evaluation metadata。
- 明确用户前端与管理控制面的 trace 数据边界,为后续 executor 接管保留完整取证能力。
最小任务:
| ID | 任务 | 验收 |
| --- | --- | --- |
| H11-1 | 用户侧 request payload 白名单脱敏 | 用户 job detail 只返回 output/input mode、资产、故事 ID、页数、图片请求开关等安全控制字段 |
| H11-2 | 新增 admin-only trace detail 服务 | 管理端可按 job id 读取完整内部 request payload 和完整 event metadata |
| H11-3 | 新增 admin trace 路由与响应模型 | `GET /admin/generations/jobs/{job_id}/trace` 受 `admin_guard` 保护 |
| H11-4 | 补用户脱敏和 admin 完整 trace 测试 | 用户接口不含内部字段;admin 接口包含 `evaluation_completed` 和完整 plan |
| H11-5 | 阶段报告记录商业机密边界 | 报告说明用户端不分发评测数据、admin-only 数据用途和剩余风险 |
验证命令:
```bash
cd backend
.venv/bin/python -m pytest tests/test_generation_jobs.py tests/test_admin_providers.py -q
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
阶段报告:
- `docs/planning/harness-stage-11-report.md`
### 阶段 12: Plan-Driven Asset Executor 试点
目标:
- 让 `WorkflowPlan` 从“记录计划”进入“驱动执行”的第一步。
- 先接管低风险资产任务:`asset_generation`、`asset_retry`、旧封面生成、旧音频生成。
- 保留现有 asset workflow 的 provider 调用、状态同步、取消检查和事件记录,不把细节复制进 executor。
- 保持用户侧公开面不新增评测数据或内部 task metadata。
最小任务:
| ID | 任务 | 验收 |
| --- | --- | --- |
| H12-1 | 新增 `run_asset_plan` | 按 `WorkflowTask.key` 顺序执行图片/音频任务,并返回执行结果 |
| H12-2 | 后台 `asset_generation` 接入 plan runner | 多资产 job 按 plan 顺序生成音频和图片,事件顺序稳定 |
| H12-3 | 同步 `asset_retry` 接入 plan runner | 图片/音频重试仍保持原有完成和失败语义 |
| H12-4 | 旧封面/音频接口接入 plan runner | `/api/image/generate/{id}` 和 `/api/audio/{id}` 行为兼容 |
| H12-5 | 补 executor 与资产路径回归测试 | harness 单测覆盖执行顺序;generation job 测试覆盖组合资产执行 |
验证命令:
```bash
cd backend
.venv/bin/python -m pytest tests/test_harness_runtime.py tests/test_generation_jobs.py -q
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
阶段报告:
- `docs/planning/harness-stage-12-report.md`
### 阶段 13: Admin-Only Executor Coverage
目标:
- 将资产 executor 的执行结果记录成内部 `executor_completed` 事件。
- 新增 admin-only executor coverage 聚合,用于审查 plan-driven execution 覆盖率。
- 用户侧 job detail、job list 和 trace summary 继续隐藏内部 executor task key 与 coverage metadata。
最小任务:
| ID | 任务 | 验收 |
| --- | --- | --- |
| H13-1 | executor result 生成 coverage metadata | metadata 包含 plan mode、planned/executed/ignored counts、task keys、result assets |
| H13-2 | 资产路径记录 `executor_completed` | asset generation/retry 和旧资源接口写入内部 executor 事件 |
| H13-3 | 新增 admin-only coverage API | `GET /admin/executors/coverage` 受 admin guard 保护 |
| H13-4 | 用户侧过滤 executor 事件和 step | 用户 API 不返回 `executor_completed` 或 task keys |
| H13-5 | 补 admin coverage 与用户隔离测试 | 聚合、过滤、鉴权和用户隔离均被测试覆盖 |
验证命令:
```bash
cd backend
.venv/bin/python -m pytest tests/test_generation_jobs.py tests/test_admin_providers.py tests/test_harness_runtime.py -q
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
阶段报告:
- `docs/planning/harness-stage-13-report.md`
### 阶段 14: Admin Trace Executor Coverage Summary
目标:
- 让 admin-only 完整 generation trace 自带 executor coverage 摘要。
- 复用全局 executor coverage 聚合逻辑,保持 `/admin/executors/coverage` 与单 job trace 的统计口径一致。
- 修正用户 trace summary 的隔离边界,确保内部 `executor_completed` 不通过聚合数量或 task key 泄露。
最小任务:
| ID | 任务 | 验收 |
| --- | --- | --- |
| H14-1 | 抽出 executor coverage 纯聚合函数 | 全局 coverage API 与单 job trace 复用同一函数 |
| H14-2 | admin trace 返回 `executor_coverage` | `GET /admin/generations/jobs/{job_id}/trace` 包含当前 job executor summary |
| H14-3 | 用户 trace summary 过滤 `executor_completed` | 用户 trace summary 不统计内部 executor 事件数量或 task key |
| H14-4 | 补 admin trace summary 与用户隔离测试 | admin 可见覆盖摘要;用户 detail/list/trace summary 不可见 |
| H14-5 | 阶段报告记录审查与验证 | 报告包含实现、风险、命令和结果 |
验证命令:
```bash
cd backend
.venv/bin/python -m pytest tests/test_generation_jobs.py tests/test_admin_providers.py tests/test_harness_runtime.py -q
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
阶段报告:
- `docs/planning/harness-stage-14-report.md`
### 阶段 15: Admin-Only Harness Readiness
目标:
- 建立一个 admin-only readiness audit,在扩大 harness 接管范围前给出聚合质量门。
- 复用 golden replay、evaluation analytics 和 executor coverage,避免新增独立统计口径。
- 保持用户侧完全不可见,不向用户端分发评测数据、executor task key 或 readiness 结果。
最小任务:
| ID | 任务 | 验收 |
| --- | --- | --- |
| H15-1 | 将 golden replay fixture 放入 app 内部路径 | Docker 运行环境可读取内部 golden cases |
| H15-2 | 新增 admin harness readiness 服务 | 聚合 golden replay、evaluation analytics 和 executor coverage |
| H15-3 | 新增 admin-only readiness API | `GET /admin/harness/readiness` 受 admin guard 保护 |
| H15-4 | 补 readiness ready/blocked/鉴权测试 | ready、blocked、needs_attention 和 401 均被覆盖 |
| H15-5 | 阶段报告记录安全边界和验证 | 报告说明不返回正文、prompt、score reason 或单条事件 |
验证命令:
```bash
cd backend
.venv/bin/python -m pytest tests/test_admin_providers.py tests/test_harness_runtime.py -q
.venv/bin/python -m pytest
.venv/bin/python -m ruff check app tests
cd ../frontend
npm run build
cd ../admin-frontend
npm run build
```
阶段报告:
- `docs/planning/harness-stage-15-report.md`
## 8. 需求与验收
### 功能需求
@@ -446,6 +1000,13 @@ npm run build
| FR-007 | SHOULD | 资产工作流应从主 service 拆出 | `story_service` 行数和职责减少 |
| FR-008 | SHOULD | 输出验证应在持久化前执行 | schema 缺失可被测试捕获 |
| FR-009 | COULD | 前端展示标准 step/failure category | 构建通过且无布局溢出 |
| FR-010 | MUST | 用户侧事件 metadata 必须白名单脱敏 | 用户 API 不返回评测分数、原始 plan、result snapshot 或内部错误原文 |
| FR-011 | MUST | 用户侧 request payload 必须白名单脱敏 | 用户 job detail 不返回原始输入、内部调度参数、provider override 或评测策略 |
| FR-012 | SHOULD | 管理控制面可读取完整内部 trace | admin-only trace endpoint 返回完整 request payload 和完整 event metadata |
| FR-013 | SHOULD | 资产任务应由 WorkflowPlan 驱动执行 | asset generation/retry 按 plan task key 执行图片和音频任务 |
| FR-014 | SHOULD | 管理控制面可审查 executor 覆盖率 | admin-only coverage endpoint 聚合 executor runs、task counts 和 result assets |
| FR-015 | SHOULD | 管理端单 job trace 可审查 executor 覆盖摘要 | admin-only trace endpoint 返回当前 job 的 executor coverage summary |
| FR-016 | SHOULD | 管理控制面可执行 harness readiness 审查 | admin-only readiness endpoint 聚合 golden replay、evaluation analytics 和 executor coverage |
### 非功能需求
@@ -457,6 +1018,12 @@ npm run build
| NFR-004 | SHOULD | 低耦合 | harness 类型模块不依赖 FastAPI 和 SQLAlchemy |
| NFR-005 | SHOULD | 性能稳定 | 不新增阻塞式外部调用 |
| NFR-006 | SHOULD | 中文一致性 | 文档、用户可见文案和新增注释使用简体中文 |
| NFR-007 | MUST | 默认不公开内部 metadata | 未加入白名单的新字段不会出现在用户侧 job event 响应中 |
| NFR-008 | MUST | Trace 数据访问分级 | 用户接口只返回安全公开字段;完整评测和内部调度数据仅在 admin guard 后提供 |
| NFR-009 | SHOULD | Executor 接管必须小步可回退 | 先接资产任务;主文本生成仍保持原有服务路径 |
| NFR-010 | MUST | Executor coverage 默认不公开 | `executor_completed`、task keys 和 coverage metadata 不进入用户端接口 |
| NFR-011 | MUST | Admin trace 统计口径一致 | 单 job trace 与全局 executor coverage 复用同一聚合逻辑 |
| NFR-012 | MUST | Readiness 数据默认不公开 | readiness endpoint 只在 admin guard 后提供,不返回正文、prompt、score reason 或单条事件 |
## 9. 风险与缓解
@@ -468,6 +1035,14 @@ npm run build
| Provider trace 与 job event 重复 | 低 | 保持 Provider 事件专注调用层,workflow 事件专注产品步骤 |
| 文档与实现偏离 | 中 | 每个阶段报告必须记录实现偏差 |
| 质量门误伤内容 | 中 | 第四阶段先做确定性低风险检查,模型评审延后 |
| 评测 analytics 泄露商业机密 | 高 | 仅 admin-only 聚合摘要;用户端 API/前端不接入;测试覆盖 admin guard 和用户端隔离 |
| 新增 trace metadata 误进用户 API | 高 | `public_generation_event_metadata` 使用 allowlist,新增字段默认不公开 |
| 请求 payload 混入内部字段 | 高 | `public_generation_request_payload` 使用 allowlist,完整 payload 仅 admin-only trace endpoint 可见 |
| Executor 抽象过早扩大范围 | 中 | 阶段 12 只接管资产 task key主文本、评测和持久化暂不迁移 |
| Executor coverage 泄露内部执行策略 | 中 | `executor_completed` 全量过滤用户侧响应,只在 admin-only coverage/trace 中提供 |
| Admin trace 与全局 coverage 口径漂移 | 中 | 抽出共享聚合函数,测试同时覆盖 admin trace 和全局 coverage API |
| Readiness 运行环境缺少 golden fixture | 中 | golden cases 放入 app 内部 harness fixtures,随 Docker `COPY app ./app` 发布 |
| Readiness 聚合泄露内部内容 | 高 | 只返回聚合状态和覆盖摘要;测试断言不包含 story title、score reason 或 quality gate message |
## 10. 审查清单
@@ -490,4 +1065,14 @@ npm run build
| 阶段 2 | 已完成主要资产补全抽取 | 封面、音频、持久化绘本缺失图片补全已迁入 harness asset workflows |
| 阶段 3 | 已完成计划建模基线 | 已定义 WorkflowPlan/WorkflowTask 和核心模式计划快照;执行器接管留待后续 |
| 阶段 4 | 已完成确定性质量门 | 已接入文本故事和绘本结构完整性/儿童安全基础检查 |
| 阶段 5 | 待执行 | Trace Analytics 与前端展示 |
| 阶段 5 | 已完成 trace analytics 与前端展示 | 已新增 trace summary API,并在用户端/管理端生成轨迹中展示 step、artifact、failure category |
| 阶段 6 | 已完成真实运行烟测 | 已通过本地新代码 API/worker + Docker PostgreSQL/Redis 覆盖主生成和图片资源重试链路 |
| 阶段 7 | 已完成 7A/7B/7C/7D/7E 当前切片 | 已接入 deterministic evaluator、`workflow_planned`、`evaluation_completed`、普通故事无图片 plan-aware 路径、绘本内部评测、内部 golden replay 和覆盖摘要;已修正并锁定用户侧不分发评测数据 |
| 阶段 8 | 已完成 admin-only evaluation analytics 当前切片 | 已新增 `/admin/evaluations/analytics` 聚合接口、admin guard 测试、days/artifact 过滤和用户端隔离扫描 |
| 阶段 9 | 已完成 WorkflowPlan 接管扩展当前切片 | 普通故事带图片和绘本路径已记录 plan 快照,三条主路径事件顺序与用户端评测隔离已由测试覆盖 |
| 阶段 10 | 已完成资产计划与 public metadata sanitizer 当前切片 | 资产生成/重试路径已记录 plan用户侧 event metadata 改为白名单并隐藏原始 plan、result snapshot 和内部字段 |
| 阶段 11 | 已完成 trace 访问分级当前切片 | 用户侧 request payload 改为白名单;新增 admin-only 完整 trace endpoint 并覆盖鉴权和内部事件测试 |
| 阶段 12 | 已完成 plan-driven asset executor 当前切片 | `run_asset_plan` 已按 plan task key 驱动图片/音频资产任务;后台资产生成、资源重试和旧封面/音频接口已接入 |
| 阶段 13 | 已完成 admin-only executor coverage 当前切片 | 资产 executor 已记录内部 `executor_completed`;新增 `/admin/executors/coverage`,用户侧继续过滤 executor 事件和 task keys |
| 阶段 14 | 已完成 admin trace executor coverage summary 当前切片 | admin trace 已内嵌单 job executor coverage 摘要;用户 trace summary 继续过滤内部 executor 事件 |
| 阶段 15 | 已完成 admin-only harness readiness 当前切片 | 新增 `/admin/harness/readiness` 聚合 golden replay、evaluation analytics 与 executor coverage,用户侧继续不可见 |

View File

@@ -7,6 +7,7 @@ import type {
GenerationJobEvent,
GenerationJobSummary,
GenerationProviderStats,
GenerationTraceSummary,
} from '../types/generation'
import LoadingSpinner from './ui/LoadingSpinner.vue'
@@ -27,6 +28,7 @@ const props = withDefaults(
const jobHistory = ref<GenerationJobSummary[]>([])
const activeJob = ref<GenerationJobDetail | null>(null)
const providerStats = ref<GenerationProviderStats | null>(null)
const traceSummary = ref<GenerationTraceSummary | null>(null)
const loading = ref(false)
const actionLoading = ref(false)
const error = ref('')
@@ -42,6 +44,8 @@ const providerSuccessRate = computed(() => {
if (!providerStats.value?.total_calls) return null
return Math.round((providerStats.value.successful_calls / providerStats.value.total_calls) * 100)
})
// First entry of the per-step breakdown; null when no summary is loaded or the list is empty.
// NOTE(review): presumably the API returns buckets ordered by count (largest first) — confirm against the backend.
const topTraceStep = computed(() => traceSummary.value?.by_step[0] ?? null)
// First entry of the failure-category breakdown; null when no summary is loaded or the list is empty.
const topFailureCategory = computed(() => traceSummary.value?.failure_categories[0] ?? null)
const containerClass = computed(() => (
isDark.value
@@ -100,6 +104,7 @@ function getJobStatusLabel(status?: string) {
function getEventLabel(eventType: string) {
const labels: Record<string, string> = {
request_accepted: '请求接收',
workflow_planned: '工作流规划',
worker_started: '后台任务开始',
retry_queued: '重新排队',
cancel_requested: '已请求取消',
@@ -122,6 +127,7 @@ function getEventLabel(eventType: string) {
provider_call_started: '供应商调用',
provider_call_succeeded: '供应商成功',
provider_call_failed: '供应商失败',
quality_gate_failed: '质量门失败',
asset_retry_started: '资源重试开始',
asset_retry_completed: '资源重试完成',
asset_retry_failed: '资源重试失败',
@@ -134,6 +140,72 @@ function getEventLabel(eventType: string) {
return labels[eventType] ?? eventType
}
/**
 * Maps a trace step identifier to its display label.
 *
 * @param step - Raw `step` value taken from event metadata; may be any type.
 * @returns The localized label for a known step, the raw string for an
 *   unknown step, or an empty string when `step` is not a string.
 */
function getStepLabel(step?: unknown) {
  const labels: Record<string, string> = {
    request_acceptance: '请求接收',
    worker_start: '后台启动',
    context_preparation: '上下文准备',
    narrative_generation: '主内容生成',
    story_persistence: '故事保存',
    provider_invocation: '供应商调用',
    image_generation: '图片生成',
    audio_generation: '音频生成',
    asset_retry: '资源重试',
    asset_generation: '资源生成',
    postprocessing: '后处理',
    completion: '任务完成',
    cancellation: '取消',
    stale_recovery: '超时收敛',
    unknown: '未知步骤',
  }
  if (typeof step !== 'string') return ''
  // Only consult own keys: a plain lookup would also resolve inherited
  // Object.prototype members (e.g. "constructor", "toString") and return a
  // function instead of a label.
  const isKnown = Object.prototype.hasOwnProperty.call(labels, step)
  return isKnown ? labels[step] : step
}
/**
 * Maps a trace artifact identifier to its display label.
 *
 * @param artifact - Raw `artifact` value taken from event metadata; may be any type.
 * @returns The localized label for a known artifact, the raw string for an
 *   unknown artifact, or an empty string when `artifact` is not a string.
 */
function getArtifactLabel(artifact?: unknown) {
  const labels: Record<string, string> = {
    story_text: '故事正文',
    storybook_pages: '绘本分页',
    cover_image: '封面图',
    page_image: '分页插图',
    image: '图片资源',
    audio: '音频',
    achievement_memory: '成长记忆',
    none: '无资源',
    unknown: '未知资源',
  }
  if (typeof artifact !== 'string') return ''
  // Only consult own keys so inherited Object.prototype members such as
  // "constructor" fall through to the raw-string fallback.
  const isKnown = Object.prototype.hasOwnProperty.call(labels, artifact)
  return isKnown ? labels[artifact] : artifact
}
/**
 * Maps a failure category identifier to its display label.
 *
 * @param category - Raw `failure_category` value; may be any type.
 * @returns The localized label for a known category, the raw string for an
 *   unknown category, or an empty string when `category` is not a string.
 */
function getFailureCategoryLabel(category?: unknown) {
  const labels: Record<string, string> = {
    provider_error: '供应商失败',
    schema_error: '结构不完整',
    safety_error: '儿童安全风险',
    timeout: '超时',
    canceled: '用户取消',
    stale_job: '任务卡住',
    storage_error: '存储失败',
    validation_error: '输入校验失败',
    unknown_error: '未知失败',
  }
  if (typeof category !== 'string') return ''
  // Only consult own keys so inherited Object.prototype members such as
  // "constructor" fall through to the raw-string fallback.
  const isKnown = Object.prototype.hasOwnProperty.call(labels, category)
  return isKnown ? labels[category] : category
}
/**
 * Builds the one-line trace annotation shown under an event: step label,
 * artifact label and failure-category label joined with " · ".
 *
 * Empty parts are omitted, and the artifact part is also omitted when it
 * resolves to the "无资源" placeholder label.
 *
 * @param event - The job event whose `event_metadata` is summarized.
 * @returns The joined text, or an empty string when no part applies.
 */
function getTraceMetaText(event: GenerationJobEvent) {
  const metadata = event.event_metadata
  const segments: string[] = []

  const stepText = getStepLabel(metadata.step)
  if (stepText) {
    segments.push(stepText)
  }

  const artifactText = getArtifactLabel(metadata.artifact)
  if (artifactText && artifactText !== '无资源') {
    segments.push(artifactText)
  }

  // The failure label is only resolved when the metadata carries a category.
  if (metadata.failure_category) {
    const failureText = getFailureCategoryLabel(metadata.failure_category)
    if (failureText) {
      segments.push(failureText)
    }
  }

  return segments.join(' · ')
}
function formatDateTime(value: string) {
return new Intl.DateTimeFormat('zh-CN', {
hour: '2-digit',
@@ -175,22 +247,25 @@ async function selectGenerationJob(jobId: string) {
async function refresh() {
if (props.storyId === null) {
jobHistory.value = []
activeJob.value = null
providerStats.value = null
return
jobHistory.value = []
activeJob.value = null
providerStats.value = null
traceSummary.value = null
return
}
error.value = ''
const selectedJobId = activeJob.value?.id ?? null
try {
const [jobs, stats] = await Promise.all([
const [jobs, stats, trace] = await Promise.all([
api.get<GenerationJobSummary[]>(`/api/generations/${props.storyId}/jobs`),
api.get<GenerationProviderStats>(`/api/generations/${props.storyId}/provider-stats`),
api.get<GenerationTraceSummary>(`/api/generations/${props.storyId}/trace-summary`),
])
jobHistory.value = jobs
providerStats.value = stats
traceSummary.value = trace
const nextJobId = (
selectedJobId
? jobHistory.value.find((job) => job.id === selectedJobId)?.id
@@ -205,6 +280,7 @@ async function refresh() {
jobHistory.value = []
activeJob.value = null
providerStats.value = null
traceSummary.value = null
error.value = e instanceof Error ? e.message : '生成轨迹加载失败'
}
}
@@ -318,6 +394,32 @@ defineExpose({ refresh })
</div>
</div>
<div
v-if="traceSummary?.total_events"
class="grid gap-3 md:grid-cols-4"
>
<div class="rounded-lg border p-3" :class="panelClass">
<div class="text-xs" :class="mutedTextClass">流程事件</div>
<div class="mt-1 text-xl font-semibold">{{ traceSummary.total_events }}</div>
</div>
<div class="rounded-lg border p-3" :class="panelClass">
<div class="text-xs" :class="mutedTextClass">失败事件</div>
<div class="mt-1 text-xl font-semibold">{{ traceSummary.failed_events }}</div>
</div>
<div class="rounded-lg border p-3" :class="panelClass">
<div class="text-xs" :class="mutedTextClass">主要步骤</div>
<div class="mt-1 text-base font-semibold">
{{ topTraceStep ? `${getStepLabel(topTraceStep.name)} · ${topTraceStep.count}` : '暂无' }}
</div>
</div>
<div class="rounded-lg border p-3" :class="panelClass">
<div class="text-xs" :class="mutedTextClass">主要失败</div>
<div class="mt-1 text-base font-semibold">
{{ topFailureCategory ? `${getFailureCategoryLabel(topFailureCategory.name)} · ${topFailureCategory.count}` : '暂无' }}
</div>
</div>
</div>
<div v-if="!jobHistory.length" class="rounded-lg border border-dashed border-gray-200 p-4 text-sm" :class="mutedTextClass">
暂无生成轨迹旧数据会在下一次资源补全后开始记录
</div>
@@ -432,6 +534,9 @@ defineExpose({ refresh })
<p v-else-if="event.message" class="mt-1 text-xs" :class="mutedTextClass">
{{ event.message }}
</p>
<p v-if="getTraceMetaText(event)" class="mt-1 text-xs" :class="mutedTextClass">
{{ getTraceMetaText(event) }}
</p>
</div>
</li>
</ol>

View File

@@ -58,6 +58,21 @@ export interface GenerationProviderStats {
}>
}
/**
 * A name/count pair. Used as the element type of the `by_step`,
 * `by_artifact` and `failure_categories` lists of `GenerationTraceSummary`.
 */
export interface GenerationTraceBucket {
// Bucket identifier (a step, artifact or failure-category key, depending on the list it appears in).
name: string
// NOTE(review): presumably the number of trace events that fell into this bucket — confirm against the backend.
count: number
}
/**
 * Aggregated trace summary for one story, broken down by step, artifact
 * and failure category.
 */
export interface GenerationTraceSummary {
story_id: number
// NOTE(review): presumably the aggregation window in days, with null meaning no window restriction — confirm against the backend.
window_days: number | null
total_events: number
failed_events: number
by_step: GenerationTraceBucket[]
by_artifact: GenerationTraceBucket[]
failure_categories: GenerationTraceBucket[]
}
export interface GenerationProviderAnalytics {
window_days: number | null
capability: string | null