From 1f34d800830f5f6014cffef1e1e173cbef8167b3 Mon Sep 17 00:00:00 2001 From: Yuyan Date: Wed, 24 Jun 2026 10:48:23 +0800 Subject: [PATCH] Expand generation harness observability --- .../src/components/GenerationTrace.vue | 131 +++- backend/app/api/admin_providers.py | 219 +++++- backend/app/api/stories.py | 21 + backend/app/schemas/story_schemas.py | 19 + .../services/admin_evaluation_analytics.py | 204 ++++++ .../app/services/admin_executor_coverage.py | 147 ++++ .../app/services/admin_generation_trace.py | 52 ++ .../app/services/admin_harness_readiness.py | 262 +++++++ backend/app/services/generation_jobs.py | 255 ++++++- .../app/services/harness/evaluation_replay.py | 322 +++++++++ backend/app/services/harness/evaluators.py | 267 +++++++ backend/app/services/harness/executor.py | 150 ++++ .../fixtures/evaluation_golden_cases.json | 400 +++++++++++ backend/app/services/harness/plans.py | 10 + backend/app/services/harness/types.py | 5 + backend/app/services/story_service.py | 337 +++++++-- .../fixtures/evaluation_golden_cases.json | 400 +++++++++++ .../tests/harness-evaluation-test-cases.md | 610 ++++++++++++++++ backend/tests/test_admin_providers.py | 653 +++++++++++++++++ backend/tests/test_generation_jobs.py | 664 +++++++++++++++++- backend/tests/test_harness_runtime.py | 276 +++++++- docs/planning/harness-stage-10-report.md | 159 +++++ docs/planning/harness-stage-11-report.md | 165 +++++ docs/planning/harness-stage-12-report.md | 150 ++++ docs/planning/harness-stage-13-report.md | 182 +++++ docs/planning/harness-stage-14-report.md | 188 +++++ docs/planning/harness-stage-15-report.md | 228 ++++++ docs/planning/harness-stage-5-report.md | 140 ++++ docs/planning/harness-stage-6-report.md | 222 ++++++ docs/planning/harness-stage-7-report.md | 252 +++++++ docs/planning/harness-stage-8-report.md | 142 ++++ docs/planning/harness-stage-9-report.md | 144 ++++ .../harness-engineering-modernization.md | 609 +++++++++++++++- frontend/src/components/GenerationTrace.vue | 
115 ++- frontend/src/types/generation.ts | 15 + 35 files changed, 8003 insertions(+), 112 deletions(-) create mode 100644 backend/app/services/admin_evaluation_analytics.py create mode 100644 backend/app/services/admin_executor_coverage.py create mode 100644 backend/app/services/admin_generation_trace.py create mode 100644 backend/app/services/admin_harness_readiness.py create mode 100644 backend/app/services/harness/evaluation_replay.py create mode 100644 backend/app/services/harness/evaluators.py create mode 100644 backend/app/services/harness/executor.py create mode 100644 backend/app/services/harness/fixtures/evaluation_golden_cases.json create mode 100644 backend/tests/fixtures/evaluation_golden_cases.json create mode 100644 backend/tests/harness-evaluation-test-cases.md create mode 100644 docs/planning/harness-stage-10-report.md create mode 100644 docs/planning/harness-stage-11-report.md create mode 100644 docs/planning/harness-stage-12-report.md create mode 100644 docs/planning/harness-stage-13-report.md create mode 100644 docs/planning/harness-stage-14-report.md create mode 100644 docs/planning/harness-stage-15-report.md create mode 100644 docs/planning/harness-stage-5-report.md create mode 100644 docs/planning/harness-stage-6-report.md create mode 100644 docs/planning/harness-stage-7-report.md create mode 100644 docs/planning/harness-stage-8-report.md create mode 100644 docs/planning/harness-stage-9-report.md diff --git a/admin-frontend/src/components/GenerationTrace.vue b/admin-frontend/src/components/GenerationTrace.vue index 519a094..a134448 100644 --- a/admin-frontend/src/components/GenerationTrace.vue +++ b/admin-frontend/src/components/GenerationTrace.vue @@ -47,6 +47,21 @@ interface GenerationProviderStats { estimated_cost_usd: number } +interface GenerationTraceBucket { + name: string + count: number +} + +interface GenerationTraceSummary { + story_id: number + window_days: number | null + total_events: number + failed_events: number + by_step: 
GenerationTraceBucket[] + by_artifact: GenerationTraceBucket[] + failure_categories: GenerationTraceBucket[] +} + const props = withDefaults( defineProps<{ storyId: number | null @@ -64,6 +79,7 @@ const props = withDefaults( const jobs = ref([]) const activeJob = ref(null) const providerStats = ref(null) +const traceSummary = ref(null) const loading = ref(false) const actionLoading = ref(false) const error = ref('') @@ -79,6 +95,8 @@ const providerSuccessRate = computed(() => { if (!providerStats.value?.total_calls) return null return Math.round((providerStats.value.successful_calls / providerStats.value.total_calls) * 100) }) +const topTraceStep = computed(() => traceSummary.value?.by_step[0] ?? null) +const topFailureCategory = computed(() => traceSummary.value?.failure_categories[0] ?? null) const mutedClass = computed(() => (isDark.value ? 'text-white/65' : 'text-gray-500')) const shellClass = computed(() => ( isDark.value ? 'border-white/10 bg-white/10 text-white backdrop-blur' : 'border-gray-100 bg-white/85 text-gray-900' @@ -117,15 +135,18 @@ function statusLabel(status?: string) { function eventLabel(eventType: string) { const labels: Record = { request_accepted: '请求接收', + workflow_planned: '工作流规划', worker_started: '后台任务开始', retry_queued: '重新排队', cancel_requested: '已请求取消', context_prepared: '上下文准备', + evaluation_completed: '内容评测', narrative_generated: '正文生成', story_saved: '故事保存', provider_call_started: '供应商调用', provider_call_succeeded: '供应商成功', provider_call_failed: '供应商失败', + quality_gate_failed: '质量门失败', cover_image_started: '封面开始', cover_image_succeeded: '封面就绪', cover_image_failed: '封面失败', @@ -147,6 +168,73 @@ function eventLabel(eventType: string) { return labels[eventType] ?? 
eventType } +function stepLabel(step?: unknown) { + const labels: Record = { + request_acceptance: '请求接收', + worker_start: '后台启动', + context_preparation: '上下文准备', + narrative_generation: '主内容生成', + evaluation: '内容评测', + story_persistence: '故事保存', + provider_invocation: '供应商调用', + image_generation: '图片生成', + audio_generation: '音频生成', + asset_retry: '资源重试', + asset_generation: '资源生成', + postprocessing: '后处理', + completion: '任务完成', + cancellation: '取消', + stale_recovery: '超时收敛', + unknown: '未知步骤', + } + const key = typeof step === 'string' ? step : '' + return labels[key] ?? key +} + +function artifactLabel(artifact?: unknown) { + const labels: Record = { + story_text: '故事正文', + storybook_pages: '绘本分页', + cover_image: '封面图', + page_image: '分页插图', + image: '图片资源', + audio: '音频', + achievement_memory: '成长记忆', + none: '无资源', + unknown: '未知资源', + } + const key = typeof artifact === 'string' ? artifact : '' + return labels[key] ?? key +} + +function failureCategoryLabel(category?: unknown) { + const labels: Record = { + provider_error: '供应商失败', + schema_error: '结构不完整', + safety_error: '儿童安全风险', + timeout: '超时', + canceled: '用户取消', + stale_job: '任务卡住', + storage_error: '存储失败', + validation_error: '输入校验失败', + unknown_error: '未知失败', + } + const key = typeof category === 'string' ? category : '' + return labels[key] ?? key +} + +function traceMetaText(event: GenerationJobEvent) { + const meta = event.event_metadata + const step = stepLabel(meta.step) + const artifact = artifactLabel(meta.artifact) + const failureCategory = meta.failure_category + ? failureCategoryLabel(meta.failure_category) + : '' + return [step, artifact && artifact !== '无资源' ? 
artifact : '', failureCategory] + .filter(Boolean) + .join(' · ') +} + function formatTime(value: string) { return new Intl.DateTimeFormat('zh-CN', { hour: '2-digit', @@ -188,22 +276,25 @@ async function selectJob(jobId: string) { async function refresh() { if (props.storyId === null) { - jobs.value = [] - activeJob.value = null - providerStats.value = null - return + jobs.value = [] + activeJob.value = null + providerStats.value = null + traceSummary.value = null + return } error.value = '' const selectedJobId = activeJob.value?.id ?? null try { - const [nextJobs, stats] = await Promise.all([ + const [nextJobs, stats, trace] = await Promise.all([ api.get(`/api/generations/${props.storyId}/jobs`), api.get(`/api/generations/${props.storyId}/provider-stats`), + api.get(`/api/generations/${props.storyId}/trace-summary`), ]) jobs.value = nextJobs providerStats.value = stats + traceSummary.value = trace const nextJobId = ( selectedJobId ? jobs.value.find((job) => job.id === selectedJobId)?.id @@ -218,6 +309,7 @@ async function refresh() { jobs.value = [] activeJob.value = null providerStats.value = null + traceSummary.value = null error.value = e instanceof Error ? e.message : '生成轨迹加载失败' } } @@ -331,6 +423,32 @@ defineExpose({ refresh }) +
+
+
流程事件
+
{{ traceSummary.total_events }}
+
+
+
失败事件
+
{{ traceSummary.failed_events }}
+
+
+
主要步骤
+
+ {{ topTraceStep ? `${stepLabel(topTraceStep.name)} · ${topTraceStep.count}` : '暂无' }} +
+
+
+
主要失败
+
+ {{ topFailureCategory ? `${failureCategoryLabel(topFailureCategory.name)} · ${topFailureCategory.count}` : '暂无' }} +
+
+
+
暂无生成轨迹。旧数据会在下一次资源补全后开始记录。
@@ -445,6 +563,9 @@ defineExpose({ refresh })

{{ event.message }}

+

+ {{ traceMetaText(event) }} +

diff --git a/backend/app/api/admin_providers.py b/backend/app/api/admin_providers.py index fa570cc..1e7db29 100644 --- a/backend/app/api/admin_providers.py +++ b/backend/app/api/admin_providers.py @@ -1,4 +1,5 @@ -from typing import Literal +from datetime import datetime +from typing import Any, Literal from fastapi import APIRouter, Depends, HTTPException, Query from pydantic import BaseModel, ConfigDict, Field @@ -9,6 +10,10 @@ from app.core.admin_auth import admin_guard from app.db.admin_models import Provider from app.db.database import get_db from app.services.adapters.registry import AdapterRegistry +from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics +from app.services.admin_executor_coverage import get_admin_executor_coverage +from app.services.admin_generation_trace import get_admin_generation_job_trace +from app.services.admin_harness_readiness import get_admin_harness_readiness from app.services.admin_provider_analytics import get_admin_provider_analytics from app.services.cost_tracker import cost_tracker from app.services.provider_policy import DEFAULT_PROVIDERS, list_capability_policies @@ -103,6 +108,169 @@ class ProviderAnalyticsResponse(BaseModel): by_user: list[ProviderAnalyticsUserBucket] failure_reasons: list[ProviderAnalyticsFailureReason] + +class EvaluationAnalyticsArtifactBucket(BaseModel): + artifact: str + count: int + + +class EvaluationAnalyticsOutputModeBucket(BaseModel): + output_mode: str + count: int + + +class EvaluationAnalyticsScoreBandBucket(BaseModel): + band: str + count: int + + +class EvaluationAnalyticsDimensionScore(BaseModel): + dimension: str + average_score: float + count: int + + +class EvaluationAnalyticsQualityGateIssue(BaseModel): + code: str + count: int + + +class EvaluationAnalyticsFailureCategory(BaseModel): + category: str + count: int + + +class EvaluationAnalyticsWarning(BaseModel): + message: str + count: int + + +class EvaluationAnalyticsResponse(BaseModel): + scope: str + 
window_days: int | None = None + artifact: str | None = None + total_evaluations: int + passed_evaluations: int + blocked_evaluations: int + pass_rate: float + average_score: float | None = None + job_count: int + story_count: int + user_count: int + by_artifact: list[EvaluationAnalyticsArtifactBucket] + by_output_mode: list[EvaluationAnalyticsOutputModeBucket] + score_bands: list[EvaluationAnalyticsScoreBandBucket] + dimension_scores: list[EvaluationAnalyticsDimensionScore] + quality_gate_issues: list[EvaluationAnalyticsQualityGateIssue] + failure_categories: list[EvaluationAnalyticsFailureCategory] + warnings: list[EvaluationAnalyticsWarning] + + +class ExecutorCoveragePlanModeBucket(BaseModel): + plan_mode: str + count: int + + +class ExecutorCoverageOutputModeBucket(BaseModel): + output_mode: str + count: int + + +class ExecutorCoverageTaskKeyBucket(BaseModel): + task_key: str + count: int + + +class ExecutorCoverageAssetBucket(BaseModel): + asset: str + count: int + + +class ExecutorCoverageResponse(BaseModel): + scope: str + window_days: int | None = None + plan_mode: str | None = None + total_runs: int + total_planned_tasks: int + total_executed_tasks: int + total_ignored_tasks: int + coverage_ratio: float + job_count: int + story_count: int + user_count: int + by_plan_mode: list[ExecutorCoveragePlanModeBucket] + by_output_mode: list[ExecutorCoverageOutputModeBucket] + executed_task_keys: list[ExecutorCoverageTaskKeyBucket] + ignored_task_keys: list[ExecutorCoverageTaskKeyBucket] + result_assets: list[ExecutorCoverageAssetBucket] + + +class AdminGenerationJobEventResponse(BaseModel): + id: int + job_id: str + story_id: int | None = None + event_type: str + status: str + message: str | None = None + event_metadata: dict[str, Any] = Field(default_factory=dict) + created_at: datetime + + +class AdminGenerationJobTraceResponse(BaseModel): + id: str + user_id: str + story_id: int | None = None + output_mode: str + input_type: str + status: str + current_step: str 
+ progress_percent: int + progress_label: str + is_terminal: bool + can_cancel: bool = False + can_retry: bool = False + result_snapshot: dict[str, Any] = Field(default_factory=dict) + error_message: str | None = None + request_payload: dict[str, Any] = Field(default_factory=dict) + executor_coverage: ExecutorCoverageResponse + events: list[AdminGenerationJobEventResponse] = Field(default_factory=list) + created_at: datetime + updated_at: datetime + + +class HarnessReadinessCheck(BaseModel): + code: str + status: Literal["ready", "needs_attention", "blocked"] + message: str + details: dict[str, Any] = Field(default_factory=dict) + + +class HarnessReadinessGoldenReplay(BaseModel): + passed: bool + total_cases: int + failed_case_ids: list[str] + coverage_summary: dict[str, dict[str, int]] = Field(default_factory=dict) + + +class HarnessReadinessThresholds(BaseModel): + min_runtime_evaluations: int + min_executor_runs: int + min_evaluation_pass_rate: float + min_evaluation_average_score: float + min_executor_coverage_ratio: float + + +class HarnessReadinessResponse(BaseModel): + scope: str + window_days: int | None = None + status: Literal["ready", "needs_attention", "blocked"] + thresholds: HarnessReadinessThresholds + checks: list[HarnessReadinessCheck] + golden_replay: HarnessReadinessGoldenReplay + evaluation_analytics: EvaluationAnalyticsResponse + executor_coverage: ExecutorCoverageResponse + + @router.get("/providers/adapters") async def list_available_adapters(): """获取所有可用的适配器类型 (定义的类)。""" @@ -137,6 +305,55 @@ async def get_provider_analytics( ) +@router.get("/evaluations/analytics", response_model=EvaluationAnalyticsResponse) +async def get_evaluation_analytics( + days: int | None = Query(default=None, ge=1, le=365), + artifact: Literal["story_text", "storybook_pages"] | None = Query(default=None), + db: AsyncSession = Depends(get_db), +): + """获取内部内容评测摘要,仅供管理控制面使用。""" + return await get_admin_evaluation_analytics( + db, + days=days, + artifact=artifact, + ) 
+ + +@router.get("/executors/coverage", response_model=ExecutorCoverageResponse) +async def get_executor_coverage( + days: int | None = Query(default=None, ge=1, le=365), + plan_mode: Literal["asset_generation", "asset_retry"] | None = Query(default=None), + db: AsyncSession = Depends(get_db), +): + """获取内部 executor 执行覆盖率,仅供管理控制面使用。""" + return await get_admin_executor_coverage( + db, + days=days, + plan_mode=plan_mode, + ) + + +@router.get("/harness/readiness", response_model=HarnessReadinessResponse) +async def get_harness_readiness( + days: int | None = Query(default=None, ge=1, le=365), + db: AsyncSession = Depends(get_db), +): + """获取内部 harness readiness 审查摘要,仅供管理控制面使用。""" + return await get_admin_harness_readiness(db, days=days) + + +@router.get( + "/generations/jobs/{job_id}/trace", + response_model=AdminGenerationJobTraceResponse, +) +async def get_generation_job_trace( + job_id: str, + db: AsyncSession = Depends(get_db), +): + """获取完整内部生成链路,仅供管理控制面排查与审查使用。""" + return await get_admin_generation_job_trace(db, job_id=job_id) + + @router.get("/providers", response_model=list[ProviderResponse]) async def list_providers(db: AsyncSession = Depends(get_db)): result = await db.execute(select(Provider)) diff --git a/backend/app/api/stories.py b/backend/app/api/stories.py index a24384e..89ce5e0 100644 --- a/backend/app/api/stories.py +++ b/backend/app/api/stories.py @@ -24,6 +24,7 @@ from app.schemas.story_schemas import ( GenerationProviderStatsResponse, GenerationRequest, GenerationResponse, + GenerationTraceSummaryResponse, StoryAssetRetryRequest, StoryAudioStatusResponse, StorybookRequest, @@ -37,6 +38,7 @@ from app.services import story_service from app.services.generation_jobs import ( get_generation_job_detail, get_story_provider_stats, + get_story_trace_summary, get_user_generation_ops_summary, get_user_provider_analytics, list_story_generation_jobs, @@ -181,6 +183,25 @@ async def get_generation_provider_stats( ) +@router.get( + 
"/generations/{story_id}/trace-summary", + response_model=GenerationTraceSummaryResponse, +) +async def get_generation_trace_summary( + story_id: int, + days: int | None = Query(default=None, ge=1, le=365), + user: User = Depends(require_user), + db: AsyncSession = Depends(get_db), +): + """Get workflow trace summary aggregated from generation job events.""" + return await get_story_trace_summary( + db, + story_id=story_id, + user_id=user.id, + days=days, + ) + + @router.get("/generations/{story_id}", response_model=StoryDetailResponse) async def get_generation( story_id: int, diff --git a/backend/app/schemas/story_schemas.py b/backend/app/schemas/story_schemas.py index 176a5d9..e4c3ad3 100644 --- a/backend/app/schemas/story_schemas.py +++ b/backend/app/schemas/story_schemas.py @@ -244,6 +244,25 @@ class GenerationProviderStatsResponse(BaseModel): failure_reasons: list[GenerationProviderFailureReasonResponse] = Field(default_factory=list) +class GenerationTraceBucketResponse(BaseModel): + """Aggregated generation trace bucket.""" + + name: str + count: int + + +class GenerationTraceSummaryResponse(BaseModel): + """Workflow trace summary aggregated from generation job events.""" + + story_id: int + window_days: int | None = None + total_events: int + failed_events: int + by_step: list[GenerationTraceBucketResponse] = Field(default_factory=list) + by_artifact: list[GenerationTraceBucketResponse] = Field(default_factory=list) + failure_categories: list[GenerationTraceBucketResponse] = Field(default_factory=list) + + class GenerationProviderAnalyticsResponse(BaseModel): """Provider call stats aggregated across one user's generation history.""" diff --git a/backend/app/services/admin_evaluation_analytics.py b/backend/app/services/admin_evaluation_analytics.py new file mode 100644 index 0000000..b5a703e --- /dev/null +++ b/backend/app/services/admin_evaluation_analytics.py @@ -0,0 +1,204 @@ +"""Admin-only analytics for internal generation evaluation events.""" + +from 
__future__ import annotations + +from datetime import datetime, timedelta, timezone +from typing import Any + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models import GenerationJob, GenerationJobEvent + + +def _as_float(value: Any) -> float | None: + if isinstance(value, int | float): + return float(value) + return None + + +def _sorted_count_buckets(counts: dict[str, int], *, key_name: str) -> list[dict[str, Any]]: + return [ + {key_name: name, "count": count} + for name, count in sorted( + counts.items(), + key=lambda item: (-item[1], item[0]), + ) + ] + + +def _average_bucket( + totals: dict[str, float], + counts: dict[str, int], + *, + key_name: str, +) -> list[dict[str, Any]]: + rows = [ + { + key_name: name, + "average_score": round(totals[name] / counts[name], 4), + "count": counts[name], + } + for name in totals + if counts.get(name) + ] + rows.sort(key=lambda item: (-int(item["count"]), str(item[key_name]))) + return rows + + +def _score_band(score: float) -> str: + if score >= 0.9: + return "excellent" + if score >= 0.8: + return "good" + if score >= 0.7: + return "pass" + if score > 0: + return "blocked_low_score" + return "blocked_quality_gate" + + +def _metadata_scores(metadata: dict[str, Any]) -> list[dict[str, Any]]: + raw_scores = metadata.get("scores") + if not isinstance(raw_scores, list): + return [] + return [score for score in raw_scores if isinstance(score, dict)] + + +def _quality_gate_issues(metadata: dict[str, Any]) -> list[dict[str, Any]]: + quality_gate = metadata.get("quality_gate") + if not isinstance(quality_gate, dict): + return [] + raw_issues = quality_gate.get("issues") + if not isinstance(raw_issues, list): + return [] + return [issue for issue in raw_issues if isinstance(issue, dict)] + + +async def get_admin_evaluation_analytics( + db: AsyncSession, + *, + days: int | None = None, + artifact: str | None = None, +) -> dict[str, Any]: + """Aggregate internal evaluation results 
for the admin control plane.""" + + cutoff = datetime.now(timezone.utc) - timedelta(days=days) if days is not None else None + + query = ( + select(GenerationJobEvent, GenerationJob) + .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id) + .where(GenerationJobEvent.event_type == "evaluation_completed") + .order_by(GenerationJobEvent.id) + ) + if cutoff is not None: + query = query.where(GenerationJobEvent.created_at >= cutoff) + + rows = (await db.execute(query)).all() + + total_evaluations = 0 + passed_evaluations = 0 + blocked_evaluations = 0 + score_total = 0.0 + score_count = 0 + job_ids: set[str] = set() + story_ids: set[int] = set() + user_ids: set[str] = set() + artifacts: dict[str, int] = {} + output_modes: dict[str, int] = {} + score_bands: dict[str, int] = {} + dimension_totals: dict[str, float] = {} + dimension_counts: dict[str, int] = {} + quality_gate_codes: dict[str, int] = {} + failure_categories: dict[str, int] = {} + warning_counts: dict[str, int] = {} + + for event, job in rows: + metadata = event.event_metadata or {} + event_artifact = str(metadata.get("artifact") or "unknown") + if artifact is not None and event_artifact != artifact: + continue + + total_evaluations += 1 + job_ids.add(job.id) + user_ids.add(job.user_id) + if event.story_id is not None: + story_ids.add(int(event.story_id)) + elif job.story_id is not None: + story_ids.add(int(job.story_id)) + + artifacts[event_artifact] = artifacts.get(event_artifact, 0) + 1 + output_modes[job.output_mode] = output_modes.get(job.output_mode, 0) + 1 + + passed = metadata.get("passed") is True + blocking = metadata.get("blocking") is True + if passed: + passed_evaluations += 1 + if blocking: + blocked_evaluations += 1 + + overall_score = _as_float(metadata.get("overall_score")) + if overall_score is not None: + score_total += overall_score + score_count += 1 + band = _score_band(overall_score) + score_bands[band] = score_bands.get(band, 0) + 1 + + for score in 
_metadata_scores(metadata): + dimension = score.get("dimension") + dimension_score = _as_float(score.get("score")) + if not isinstance(dimension, str) or dimension_score is None: + continue + dimension_totals[dimension] = dimension_totals.get(dimension, 0.0) + dimension_score + dimension_counts[dimension] = dimension_counts.get(dimension, 0) + 1 + + for issue in _quality_gate_issues(metadata): + code = issue.get("code") + if isinstance(code, str) and code: + quality_gate_codes[code] = quality_gate_codes.get(code, 0) + 1 + failure_category = issue.get("failure_category") + if isinstance(failure_category, str) and failure_category: + failure_categories[failure_category] = ( + failure_categories.get(failure_category, 0) + 1 + ) + + warnings = metadata.get("warnings") + if isinstance(warnings, list): + for warning in warnings: + if isinstance(warning, str) and warning: + warning_counts[warning] = warning_counts.get(warning, 0) + 1 + + return { + "scope": "admin_internal_evaluations", + "window_days": days, + "artifact": artifact, + "total_evaluations": total_evaluations, + "passed_evaluations": passed_evaluations, + "blocked_evaluations": blocked_evaluations, + "pass_rate": ( + round(passed_evaluations / total_evaluations, 4) + if total_evaluations + else 0.0 + ), + "average_score": round(score_total / score_count, 4) if score_count else None, + "job_count": len(job_ids), + "story_count": len(story_ids), + "user_count": len(user_ids), + "by_artifact": _sorted_count_buckets(artifacts, key_name="artifact"), + "by_output_mode": _sorted_count_buckets(output_modes, key_name="output_mode"), + "score_bands": _sorted_count_buckets(score_bands, key_name="band"), + "dimension_scores": _average_bucket( + dimension_totals, + dimension_counts, + key_name="dimension", + ), + "quality_gate_issues": _sorted_count_buckets( + quality_gate_codes, + key_name="code", + ), + "failure_categories": _sorted_count_buckets( + failure_categories, + key_name="category", + ), + "warnings": 
_sorted_count_buckets(warning_counts, key_name="message"), + } diff --git a/backend/app/services/admin_executor_coverage.py b/backend/app/services/admin_executor_coverage.py new file mode 100644 index 0000000..fead0b6 --- /dev/null +++ b/backend/app/services/admin_executor_coverage.py @@ -0,0 +1,147 @@ +"""Admin-only analytics for internal workflow executor coverage.""" + +from __future__ import annotations + +from collections.abc import Iterable +from datetime import datetime, timedelta, timezone +from typing import Any + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models import GenerationJob, GenerationJobEvent + + +def _as_int(value: Any) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, float): + return int(value) + return 0 + + +def _sorted_count_buckets(counts: dict[str, int], *, key_name: str) -> list[dict[str, Any]]: + return [ + {key_name: name, "count": count} + for name, count in sorted( + counts.items(), + key=lambda item: (-item[1], item[0]), + ) + ] + + +def _iter_strings(value: Any) -> Iterable[str]: + if not isinstance(value, list | tuple | set): + return + + for item in value: + if isinstance(item, str) and item: + yield item + + +def summarize_executor_coverage_rows( + rows: Iterable[tuple[GenerationJobEvent, GenerationJob]], + *, + days: int | None = None, + plan_mode: str | None = None, + scope: str = "admin_internal_executor_coverage", +) -> dict[str, Any]: + """Aggregate internal executor coverage rows into an admin-only summary.""" + + total_runs = 0 + total_planned_tasks = 0 + total_executed_tasks = 0 + total_ignored_tasks = 0 + job_ids: set[str] = set() + story_ids: set[int] = set() + user_ids: set[str] = set() + by_plan_mode: dict[str, int] = {} + by_output_mode: dict[str, int] = {} + executed_task_keys: dict[str, int] = {} + ignored_task_keys: dict[str, int] = {} + result_assets: dict[str, int] = {} + + for event, 
job in rows: + metadata = event.event_metadata or {} + event_plan_mode = str(metadata.get("plan_mode") or "unknown") + if plan_mode is not None and event_plan_mode != plan_mode: + continue + + total_runs += 1 + job_ids.add(job.id) + user_ids.add(job.user_id) + if event.story_id is not None: + story_ids.add(int(event.story_id)) + elif job.story_id is not None: + story_ids.add(int(job.story_id)) + + by_plan_mode[event_plan_mode] = by_plan_mode.get(event_plan_mode, 0) + 1 + by_output_mode[job.output_mode] = by_output_mode.get(job.output_mode, 0) + 1 + + total_planned_tasks += _as_int(metadata.get("planned_task_count")) + total_executed_tasks += _as_int(metadata.get("executed_task_count")) + total_ignored_tasks += _as_int(metadata.get("ignored_task_count")) + + for key in _iter_strings(metadata.get("executed_task_keys")): + executed_task_keys[key] = executed_task_keys.get(key, 0) + 1 + + for key in _iter_strings(metadata.get("ignored_task_keys")): + ignored_task_keys[key] = ignored_task_keys.get(key, 0) + 1 + + for asset in _iter_strings(metadata.get("result_assets")): + result_assets[asset] = result_assets.get(asset, 0) + 1 + + coverage_ratio = ( + round(total_executed_tasks / total_planned_tasks, 4) + if total_planned_tasks + else 0.0 + ) + + return { + "scope": scope, + "window_days": days, + "plan_mode": plan_mode, + "total_runs": total_runs, + "total_planned_tasks": total_planned_tasks, + "total_executed_tasks": total_executed_tasks, + "total_ignored_tasks": total_ignored_tasks, + "coverage_ratio": coverage_ratio, + "job_count": len(job_ids), + "story_count": len(story_ids), + "user_count": len(user_ids), + "by_plan_mode": _sorted_count_buckets(by_plan_mode, key_name="plan_mode"), + "by_output_mode": _sorted_count_buckets(by_output_mode, key_name="output_mode"), + "executed_task_keys": _sorted_count_buckets( + executed_task_keys, + key_name="task_key", + ), + "ignored_task_keys": _sorted_count_buckets( + ignored_task_keys, + key_name="task_key", + ), + 
"result_assets": _sorted_count_buckets(result_assets, key_name="asset"), + } + + +async def get_admin_executor_coverage( + db: AsyncSession, + *, + days: int | None = None, + plan_mode: str | None = None, +) -> dict[str, Any]: + """Aggregate internal executor coverage events for the admin control plane.""" + + cutoff = datetime.now(timezone.utc) - timedelta(days=days) if days is not None else None + query = ( + select(GenerationJobEvent, GenerationJob) + .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id) + .where(GenerationJobEvent.event_type == "executor_completed") + .order_by(GenerationJobEvent.id) + ) + if cutoff is not None: + query = query.where(GenerationJobEvent.created_at >= cutoff) + + rows = (await db.execute(query)).all() + return summarize_executor_coverage_rows(rows, days=days, plan_mode=plan_mode) diff --git a/backend/app/services/admin_generation_trace.py b/backend/app/services/admin_generation_trace.py new file mode 100644 index 0000000..6f55eba --- /dev/null +++ b/backend/app/services/admin_generation_trace.py @@ -0,0 +1,52 @@ +"""Admin-only generation trace detail service.""" + +from __future__ import annotations + +from typing import Any + +from fastapi import HTTPException +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models import GenerationJob, GenerationJobEvent +from app.services.admin_executor_coverage import summarize_executor_coverage_rows +from app.services.generation_jobs import ( + generation_event_to_response, + generation_job_to_summary, +) + + +async def get_admin_generation_job_trace( + db: AsyncSession, + *, + job_id: str, +) -> dict[str, Any]: + """Return a complete internal generation trace for the admin control plane.""" + + job = ( + await db.execute(select(GenerationJob).where(GenerationJob.id == job_id)) + ).scalar_one_or_none() + if job is None: + raise HTTPException(status_code=404, detail="Generation job not found") + + events = ( + await db.execute( + 
select(GenerationJobEvent) + .where(GenerationJobEvent.job_id == job.id) + .order_by(GenerationJobEvent.id) + ) + ).scalars().all() + executor_rows = [ + (event, job) for event in events if event.event_type == "executor_completed" + ] + + return { + **generation_job_to_summary(job), + "user_id": job.user_id, + "request_payload": job.request_payload or {}, + "executor_coverage": summarize_executor_coverage_rows( + executor_rows, + scope="admin_internal_job_executor_coverage", + ), + "events": [generation_event_to_response(event) for event in events], + } diff --git a/backend/app/services/admin_harness_readiness.py b/backend/app/services/admin_harness_readiness.py new file mode 100644 index 0000000..92b0f65 --- /dev/null +++ b/backend/app/services/admin_harness_readiness.py @@ -0,0 +1,262 @@ +"""Admin-only readiness audit for harness-driven generation.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics +from app.services.admin_executor_coverage import get_admin_executor_coverage +from app.services.harness.evaluation_replay import replay_evaluation_golden_cases + +_GOLDEN_CASES_PATH = ( + Path(__file__).resolve().parent + / "harness" + / "fixtures" + / "evaluation_golden_cases.json" +) + +_MIN_RUNTIME_EVALUATIONS = 1 +_MIN_EXECUTOR_RUNS = 1 +_MIN_EVALUATION_PASS_RATE = 0.7 +_MIN_EVALUATION_AVERAGE_SCORE = 0.7 +_MIN_EXECUTOR_COVERAGE_RATIO = 0.2 + + +def _check( + *, + code: str, + status: str, + message: str, + details: dict[str, Any] | None = None, +) -> dict[str, Any]: + return { + "code": code, + "status": status, + "message": message, + "details": details or {}, + } + + +def _overall_status(checks: list[dict[str, Any]]) -> str: + statuses = {check["status"] for check in checks} + if "blocked" in statuses: + return "blocked" + if "needs_attention" in statuses: + return "needs_attention" + 
return "ready" + + +def _run_golden_replay() -> dict[str, Any]: + if not _GOLDEN_CASES_PATH.exists(): + return { + "passed": False, + "total_cases": 0, + "failed_case_ids": ["fixture_missing"], + "coverage_summary": {}, + } + + result = replay_evaluation_golden_cases(_GOLDEN_CASES_PATH) + return { + "passed": result.passed, + "total_cases": len(result.cases), + "failed_case_ids": list(result.failed_case_ids), + "coverage_summary": result.coverage_summary(), + } + + +def _golden_replay_check(golden_replay: dict[str, Any]) -> dict[str, Any]: + if golden_replay["passed"] and golden_replay["total_cases"] > 0: + return _check( + code="golden_replay", + status="ready", + message="内部 golden replay 全部通过。", + details={ + "total_cases": golden_replay["total_cases"], + "failed_case_count": len(golden_replay["failed_case_ids"]), + }, + ) + + return _check( + code="golden_replay", + status="blocked", + message="内部 golden replay 未通过,暂停扩大 harness 接管范围。", + details={ + "total_cases": golden_replay["total_cases"], + "failed_case_count": len(golden_replay["failed_case_ids"]), + "failed_case_ids": golden_replay["failed_case_ids"], + }, + ) + + +def _evaluation_sample_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]: + total = int(evaluation_analytics["total_evaluations"]) + if total >= _MIN_RUNTIME_EVALUATIONS: + return _check( + code="runtime_evaluation_samples", + status="ready", + message="当前窗口已有内部 evaluation 运行样本。", + details={ + "total_evaluations": total, + "min_required": _MIN_RUNTIME_EVALUATIONS, + }, + ) + + return _check( + code="runtime_evaluation_samples", + status="needs_attention", + message="当前窗口缺少内部 evaluation 运行样本,建议先跑生成烟测。", + details={ + "total_evaluations": total, + "min_required": _MIN_RUNTIME_EVALUATIONS, + }, + ) + + +def _evaluation_quality_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]: + total = int(evaluation_analytics["total_evaluations"]) + pass_rate = float(evaluation_analytics["pass_rate"]) + average_score = 
evaluation_analytics["average_score"] + + if total == 0: + return _check( + code="runtime_evaluation_quality", + status="needs_attention", + message="暂无运行期 evaluation 质量样本。", + details={ + "total_evaluations": total, + "min_pass_rate": _MIN_EVALUATION_PASS_RATE, + "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE, + }, + ) + + if pass_rate < _MIN_EVALUATION_PASS_RATE or ( + average_score is not None + and float(average_score) < _MIN_EVALUATION_AVERAGE_SCORE + ): + return _check( + code="runtime_evaluation_quality", + status="blocked", + message="运行期 evaluation 质量未达到内部 readiness 门槛。", + details={ + "pass_rate": pass_rate, + "average_score": average_score, + "blocked_evaluations": evaluation_analytics["blocked_evaluations"], + "min_pass_rate": _MIN_EVALUATION_PASS_RATE, + "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE, + }, + ) + + return _check( + code="runtime_evaluation_quality", + status="ready", + message="运行期 evaluation 通过率和平均分达到内部 readiness 门槛。", + details={ + "pass_rate": pass_rate, + "average_score": average_score, + "blocked_evaluations": evaluation_analytics["blocked_evaluations"], + }, + ) + + +def _executor_sample_check(executor_coverage: dict[str, Any]) -> dict[str, Any]: + total_runs = int(executor_coverage["total_runs"]) + if total_runs >= _MIN_EXECUTOR_RUNS: + return _check( + code="executor_coverage_samples", + status="ready", + message="当前窗口已有 executor coverage 运行样本。", + details={ + "total_runs": total_runs, + "min_required": _MIN_EXECUTOR_RUNS, + }, + ) + + return _check( + code="executor_coverage_samples", + status="needs_attention", + message="当前窗口缺少 executor coverage 样本,建议先跑资产生成或重试烟测。", + details={ + "total_runs": total_runs, + "min_required": _MIN_EXECUTOR_RUNS, + }, + ) + + +def _executor_ratio_check(executor_coverage: dict[str, Any]) -> dict[str, Any]: + total_runs = int(executor_coverage["total_runs"]) + coverage_ratio = float(executor_coverage["coverage_ratio"]) + + if total_runs == 0: + return _check( + 
code="executor_coverage_ratio", + status="needs_attention", + message="暂无 executor coverage 运行样本。", + details={ + "total_runs": total_runs, + "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO, + }, + ) + + if coverage_ratio < _MIN_EXECUTOR_COVERAGE_RATIO: + return _check( + code="executor_coverage_ratio", + status="blocked", + message="executor coverage ratio 未达到内部 readiness 门槛。", + details={ + "coverage_ratio": coverage_ratio, + "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO, + "total_planned_tasks": executor_coverage["total_planned_tasks"], + "total_executed_tasks": executor_coverage["total_executed_tasks"], + }, + ) + + return _check( + code="executor_coverage_ratio", + status="ready", + message="executor coverage ratio 达到内部 readiness 门槛。", + details={ + "coverage_ratio": coverage_ratio, + "total_planned_tasks": executor_coverage["total_planned_tasks"], + "total_executed_tasks": executor_coverage["total_executed_tasks"], + }, + ) + + +async def get_admin_harness_readiness( + db: AsyncSession, + *, + days: int | None = None, +) -> dict[str, Any]: + """Return an admin-only readiness audit for harness release decisions.""" + + golden_replay = _run_golden_replay() + evaluation_analytics = await get_admin_evaluation_analytics(db, days=days) + executor_coverage = await get_admin_executor_coverage(db, days=days) + + checks = [ + _golden_replay_check(golden_replay), + _evaluation_sample_check(evaluation_analytics), + _evaluation_quality_check(evaluation_analytics), + _executor_sample_check(executor_coverage), + _executor_ratio_check(executor_coverage), + ] + + return { + "scope": "admin_internal_harness_readiness", + "window_days": days, + "status": _overall_status(checks), + "thresholds": { + "min_runtime_evaluations": _MIN_RUNTIME_EVALUATIONS, + "min_executor_runs": _MIN_EXECUTOR_RUNS, + "min_evaluation_pass_rate": _MIN_EVALUATION_PASS_RATE, + "min_evaluation_average_score": _MIN_EVALUATION_AVERAGE_SCORE, + "min_executor_coverage_ratio": 
_MIN_EXECUTOR_COVERAGE_RATIO, + }, + "checks": checks, + "golden_replay": golden_replay, + "evaluation_analytics": evaluation_analytics, + "executor_coverage": executor_coverage, + } diff --git a/backend/app/services/generation_jobs.py b/backend/app/services/generation_jobs.py index 109db17..7a184bd 100644 --- a/backend/app/services/generation_jobs.py +++ b/backend/app/services/generation_jobs.py @@ -90,11 +90,13 @@ def _job_progress(job: GenerationJob) -> dict[str, Any]: progress_map: dict[str, tuple[int, str]] = { "request_accepted": (5, "已接收请求"), + "workflow_planned": (8, "工作流已规划"), "retry_queued": (8, "重新排队中"), "worker_started": (12, "后台任务已开始"), "cancel_requested": (15, "已请求取消"), "context_prepared": (20, "上下文已准备"), "narrative_generated": (45, "正文已生成"), + "evaluation_completed": (52, "内容评测已完成"), "story_saved": (60, "主记录已保存"), "provider_call_started": (65, "Provider 调用中"), "provider_call_succeeded": (72, "Provider 调用成功"), @@ -307,6 +309,137 @@ def generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any]: } +_PUBLIC_EVENT_METADATA_KEYS = { + "adapter", + "artifact", + "asset", + "assets", + "attempted_cover", + "audio_status", + "blocks_main_result", + "capability", + "completed_pages", + "cover_prompt_present", + "estimated_cost_usd", + "failed_pages", + "failure_category", + "generation_status", + "has_memory_context", + "image_status", + "input_type", + "latency_ms", + "mode", + "output_mode", + "page_count", + "page_number", + "recoverable", + "requested_from_step", + "retryable", + "scope", + "stale_after_minutes", + "status", + "step", + "strategy", + "text_status", +} + +_PUBLIC_REQUEST_PAYLOAD_KEYS = { + "assets", + "child_profile_id", + "generate_images", + "input_type", + "output_mode", + "page_count", + "story_id", + "type", + "universe_id", +} + + +def _public_metadata_value(value: Any) -> Any: + """Return a JSON-safe public value or None when the value is internal.""" + + if isinstance(value, str | int | float | bool) or value is 
None: + return value + if isinstance(value, list): + public_items = [ + item + for item in value + if isinstance(item, str | int | float | bool) or item is None + ] + return public_items + return None + + +def public_generation_request_payload(job: GenerationJob) -> dict[str, Any]: + """Return request payload fields safe for user-facing job details.""" + + payload = job.request_payload or {} + public_payload: dict[str, Any] = {} + + for key in sorted(_PUBLIC_REQUEST_PAYLOAD_KEYS): + if key not in payload: + continue + value = _public_metadata_value(payload[key]) + if value is not None: + public_payload[key] = value + + return public_payload + + +def _public_plan_metadata(metadata: dict[str, Any]) -> dict[str, Any]: + """Expose only coarse workflow plan metadata to user-facing responses.""" + + plan = metadata.get("plan") + if not isinstance(plan, dict): + return {} + + public: dict[str, Any] = {} + mode = plan.get("mode") + if isinstance(mode, str): + public["plan_mode"] = mode + + tasks = plan.get("tasks") + if isinstance(tasks, list): + public["planned_task_count"] = len(tasks) + public["recoverable_task_count"] = sum( + 1 + for task in tasks + if isinstance(task, dict) and task.get("recoverable") is True + ) + + return public + + +def public_generation_event_metadata(event: GenerationJobEvent) -> dict[str, Any]: + """Return event metadata safe for user-facing job event streams.""" + + metadata = event.event_metadata or {} + public_metadata: dict[str, Any] = {} + + for key in sorted(_PUBLIC_EVENT_METADATA_KEYS): + if key not in metadata: + continue + value = _public_metadata_value(metadata[key]) + if value is not None: + public_metadata[key] = value + + if event.event_type == "workflow_planned": + public_metadata.update(_public_plan_metadata(metadata)) + + return public_metadata + + +def public_generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any] | None: + """Convert a generation event for user-facing APIs with internal data removed.""" + + 
if event.event_type in {"evaluation_completed", "executor_completed"}: + return None + response = generation_event_to_response(event) + response["event_metadata"] = public_generation_event_metadata(event) + return response + + def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]: """Convert a generation job ORM object to an API summary dict.""" @@ -328,6 +461,23 @@ def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]: } +def public_generation_job_to_summary(job: GenerationJob) -> dict[str, Any]: + """Convert a generation job for user-facing APIs with internal steps hidden.""" + + summary = generation_job_to_summary(job) + if summary["current_step"] == "evaluation_completed": + summary["current_step"] = "narrative_generated" + summary["progress_percent"] = 45 + summary["progress_label"] = "正文已生成" + summary["is_terminal"] = False + elif summary["current_step"] == "executor_completed": + summary["current_step"] = "workflow_planned" + summary["progress_percent"] = 8 + summary["progress_label"] = "工作流已规划" + summary["is_terminal"] = False + return summary + + async def get_generation_job_for_user( db: AsyncSession, *, @@ -362,13 +512,13 @@ async def request_generation_job_cancel( raise HTTPException(status_code=409, detail="当前任务不支持取消") if job.status == "canceled": - return generation_job_to_summary(job) + return public_generation_job_to_summary(job) if _is_terminal_status(job.status): raise HTTPException(status_code=409, detail="当前任务已终止,无法取消") if job.current_step == "cancel_requested": - return generation_job_to_summary(job) + return public_generation_job_to_summary(job) if job.current_step in {"request_accepted", "retry_queued"}: story = None @@ -391,7 +541,7 @@ async def request_generation_job_cancel( error_message="Generation canceled by user before worker execution started.", message="Generation job was canceled before worker execution started.", ) - return generation_job_to_summary(job) + return public_generation_job_to_summary(job) 
previous_step = job.current_step job.error_message = "Cancellation requested by user." @@ -407,7 +557,7 @@ async def request_generation_job_cancel( ) await db.commit() await db.refresh(job) - return generation_job_to_summary(job) + return public_generation_job_to_summary(job) async def get_generation_job_detail( @@ -437,9 +587,13 @@ async def get_generation_job_detail( ).scalars().all() return { - **generation_job_to_summary(job), - "request_payload": job.request_payload or {}, - "events": [generation_event_to_response(event) for event in events], + **public_generation_job_to_summary(job), + "request_payload": public_generation_request_payload(job), + "events": [ + response + for event in events + if (response := public_generation_event_to_response(event)) is not None + ], } @@ -461,7 +615,7 @@ async def list_story_generation_jobs( .order_by(desc(GenerationJob.created_at), desc(GenerationJob.id)) ) ).scalars().all() - return [generation_job_to_summary(job) for job in jobs] + return [public_generation_job_to_summary(job) for job in jobs] async def get_active_story_generation_job( @@ -513,6 +667,59 @@ def _as_float(value: Any) -> float | None: return None +def _sorted_buckets(counts: dict[str, int]) -> list[dict[str, Any]]: + return [ + {"name": name, "count": count} + for name, count in sorted( + counts.items(), + key=lambda item: (-item[1], item[0]), + ) + ] + + +def _aggregate_trace_events(events: list[GenerationJobEvent]) -> dict[str, Any]: + """Aggregate workflow trace metadata across job events.""" + + by_step: dict[str, int] = {} + by_artifact: dict[str, int] = {} + failure_categories: dict[str, int] = {} + failed_events = 0 + total_events = 0 + + for event in events: + if event.event_type in {"evaluation_completed", "executor_completed"}: + continue + + total_events += 1 + metadata = event.event_metadata or {} + step = metadata.get("step") + artifact = metadata.get("artifact") + failure_category = metadata.get("failure_category") + + if isinstance(step, str) 
and step: + by_step[step] = by_step.get(step, 0) + 1 + + if isinstance(artifact, str) and artifact and artifact != "none": + by_artifact[artifact] = by_artifact.get(artifact, 0) + 1 + + if event.status == "failed": + failed_events += 1 + category = ( + failure_category + if isinstance(failure_category, str) and failure_category + else "unknown_error" + ) + failure_categories[category] = failure_categories.get(category, 0) + 1 + + return { + "total_events": total_events, + "failed_events": failed_events, + "by_step": _sorted_buckets(by_step), + "by_artifact": _sorted_buckets(by_artifact), + "failure_categories": _sorted_buckets(failure_categories), + } + + def _aggregate_provider_events( events: list[GenerationJobEvent], *, @@ -679,6 +886,38 @@ async def get_story_provider_stats( } +async def get_story_trace_summary( + db: AsyncSession, + *, + story_id: int, + user_id: str, + days: int | None = None, +) -> dict[str, Any]: + """Aggregate workflow trace metadata from all user-owned jobs for one story.""" + + query = ( + select(GenerationJobEvent) + .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id) + .where( + GenerationJob.story_id == story_id, + GenerationJob.user_id == user_id, + ) + .order_by(GenerationJobEvent.id) + ) + + if days is not None: + cutoff = datetime.now(timezone.utc) - timedelta(days=days) + query = query.where(GenerationJobEvent.created_at >= cutoff) + + events = (await db.execute(query)).scalars().all() + + return { + "story_id": story_id, + "window_days": days, + **_aggregate_trace_events(events), + } + + async def get_user_provider_analytics( db: AsyncSession, *, diff --git a/backend/app/services/harness/evaluation_replay.py b/backend/app/services/harness/evaluation_replay.py new file mode 100644 index 0000000..d4f6c5e --- /dev/null +++ b/backend/app/services/harness/evaluation_replay.py @@ -0,0 +1,322 @@ +"""Internal golden-case replay support for harness evaluations. 
+ +The replay helpers are intentionally not wired to user-facing APIs. They exist +to make evaluation behavior reproducible in tests and internal tooling. +""" + +import json +from collections import Counter +from dataclasses import dataclass, field +from enum import StrEnum +from pathlib import Path +from typing import Any, Iterable + +from app.services.adapters.storybook.primary import Storybook, StorybookPage +from app.services.adapters.text.models import StoryOutput +from app.services.harness.evaluators import ( + EvaluationDimension, + EvaluationResult, + evaluate_story_output, + evaluate_storybook_output, +) + + +class EvaluationReplayArtifact(StrEnum): + """Artifacts supported by deterministic evaluation replay.""" + + STORY = "story" + STORYBOOK = "storybook" + + +@dataclass(frozen=True) +class ExpectedEvaluation: + """Expected evaluation outcome for one golden case.""" + + passed: bool + blocking: bool + min_overall_score: float | None = None + max_overall_score: float | None = None + required_dimensions: tuple[EvaluationDimension, ...] = field(default_factory=tuple) + quality_gate_codes: tuple[str, ...] = field(default_factory=tuple) + warning_substrings: tuple[str, ...] 
= field(default_factory=tuple) + + @classmethod + def from_payload(cls, payload: dict[str, Any]) -> "ExpectedEvaluation": + """Build expectations from a JSON-safe payload.""" + + return cls( + passed=bool(payload["passed"]), + blocking=bool(payload["blocking"]), + min_overall_score=payload.get("min_overall_score"), + max_overall_score=payload.get("max_overall_score"), + required_dimensions=tuple( + EvaluationDimension(dimension) + for dimension in payload.get("required_dimensions", []) + ), + quality_gate_codes=tuple(payload.get("quality_gate_codes", [])), + warning_substrings=tuple(payload.get("warning_substrings", [])), + ) + + +@dataclass(frozen=True) +class EvaluationReplayCoverage: + """Internal coverage labels for one golden replay case.""" + + age_band: str = "unknown" + content_shape: str = "unknown" + risk_area: str = "unknown" + tags: tuple[str, ...] = field(default_factory=tuple) + + @classmethod + def from_payload(cls, payload: dict[str, Any] | None) -> "EvaluationReplayCoverage": + """Build coverage labels from a JSON-safe payload.""" + + payload = payload or {} + return cls( + age_band=str(payload.get("age_band", "unknown")), + content_shape=str(payload.get("content_shape", "unknown")), + risk_area=str(payload.get("risk_area", "unknown")), + tags=tuple(str(tag) for tag in payload.get("tags", [])), + ) + + +@dataclass(frozen=True) +class EvaluationReplayCase: + """One internal golden evaluation case.""" + + case_id: str + artifact: EvaluationReplayArtifact + output_payload: dict[str, Any] + expected: ExpectedEvaluation + education_theme: str | None = None + minimum_score: float = 0.7 + description: str = "" + input_payload: dict[str, Any] = field(default_factory=dict) + coverage: EvaluationReplayCoverage = field(default_factory=EvaluationReplayCoverage) + + @classmethod + def from_payload(cls, payload: dict[str, Any]) -> "EvaluationReplayCase": + """Build a replay case from a JSON-safe payload.""" + + input_payload = dict(payload.get("input", {})) + 
minimum_score = input_payload.get("minimum_score", payload.get("minimum_score", 0.7)) + education_theme = input_payload.get("education_theme", payload.get("education_theme")) + + return cls( + case_id=str(payload["id"]), + artifact=EvaluationReplayArtifact(payload["artifact"]), + description=str(payload.get("description", "")), + input_payload=input_payload, + output_payload=dict(payload["output"]), + education_theme=education_theme, + minimum_score=float(minimum_score), + expected=ExpectedEvaluation.from_payload(payload["expected"]), + coverage=EvaluationReplayCoverage.from_payload(payload.get("coverage")), + ) + + def evaluate(self) -> EvaluationResult: + """Run the deterministic evaluator for this case.""" + + if self.artifact == EvaluationReplayArtifact.STORY: + return evaluate_story_output( + _story_output_from_payload(self.output_payload), + education_theme=self.education_theme, + minimum_score=self.minimum_score, + ) + + return evaluate_storybook_output( + _storybook_from_payload(self.output_payload), + education_theme=self.education_theme, + minimum_score=self.minimum_score, + ) + + def replay(self) -> "EvaluationReplayCaseResult": + """Evaluate the case and compare it with expected outcomes.""" + + evaluation = self.evaluate() + failures = tuple(_compare_evaluation(self, evaluation)) + return EvaluationReplayCaseResult( + case_id=self.case_id, + artifact=self.artifact, + coverage=self.coverage, + evaluation=evaluation, + failures=failures, + ) + + +@dataclass(frozen=True) +class EvaluationReplayCaseResult: + """Replay result for one golden case.""" + + case_id: str + artifact: EvaluationReplayArtifact + coverage: EvaluationReplayCoverage + evaluation: EvaluationResult + failures: tuple[str, ...] 
= field(default_factory=tuple) + + @property + def expectations_met(self) -> bool: + """Return whether the case matched all expectations.""" + + return not self.failures + + +@dataclass(frozen=True) +class EvaluationReplaySuiteResult: + """Replay result for a set of golden cases.""" + + cases: tuple[EvaluationReplayCaseResult, ...] + + @property + def passed(self) -> bool: + """Return whether every replay case matched expectations.""" + + return all(case.expectations_met for case in self.cases) + + @property + def failed_case_ids(self) -> tuple[str, ...]: + """Return case IDs with expectation mismatches.""" + + return tuple(case.case_id for case in self.cases if not case.expectations_met) + + def failure_report(self) -> str: + """Return a compact failure report for assertion messages.""" + + lines: list[str] = [] + for case in self.cases: + for failure in case.failures: + lines.append(f"{case.case_id}: {failure}") + return "\n".join(lines) + + def coverage_summary(self) -> dict[str, dict[str, int]]: + """Return internal coverage counts for golden replay review.""" + + return { + "artifact": _count_values(case.artifact.value for case in self.cases), + "age_band": _count_values(case.coverage.age_band for case in self.cases), + "content_shape": _count_values( + case.coverage.content_shape for case in self.cases + ), + "risk_area": _count_values(case.coverage.risk_area for case in self.cases), + "tags": _count_values( + tag for case in self.cases for tag in case.coverage.tags + ), + "outcome": _count_values( + "passed" if case.evaluation.passed else "blocked" + for case in self.cases + ), + } + + +def load_evaluation_replay_cases(path: str | Path) -> tuple[EvaluationReplayCase, ...]: + """Load internal golden replay cases from a JSON file.""" + + raw_cases = json.loads(Path(path).read_text(encoding="utf-8")) + if not isinstance(raw_cases, list): + raise ValueError("Evaluation replay fixture must be a JSON array.") + return tuple(EvaluationReplayCase.from_payload(item) 
for item in raw_cases) + + +def run_evaluation_replay_cases( + cases: Iterable[EvaluationReplayCase], +) -> EvaluationReplaySuiteResult: + """Run a set of internal golden evaluation replay cases.""" + + return EvaluationReplaySuiteResult(cases=tuple(case.replay() for case in cases)) + + +def replay_evaluation_golden_cases(path: str | Path) -> EvaluationReplaySuiteResult: + """Load and run internal golden evaluation replay cases.""" + + return run_evaluation_replay_cases(load_evaluation_replay_cases(path)) + + +def _story_output_from_payload(payload: dict[str, Any]) -> StoryOutput: + return StoryOutput( + mode=payload.get("mode", "generated"), + title=payload.get("title", ""), + story_text=payload.get("story_text", ""), + cover_prompt_suggestion=payload.get("cover_prompt_suggestion", ""), + ) + + +def _storybook_from_payload(payload: dict[str, Any]) -> Storybook: + pages = [ + StorybookPage( + page_number=page.get("page_number", index + 1), + text=page.get("text", ""), + image_prompt=page.get("image_prompt", ""), + image_url=page.get("image_url"), + ) + for index, page in enumerate(payload.get("pages", [])) + ] + + return Storybook( + title=payload.get("title", ""), + main_character=payload.get("main_character", ""), + art_style=payload.get("art_style", ""), + pages=pages, + cover_prompt=payload.get("cover_prompt", ""), + cover_url=payload.get("cover_url"), + ) + + +def _count_values(values: Iterable[str]) -> dict[str, int]: + counts = Counter(value for value in values if value) + return dict(sorted(counts.items(), key=lambda item: (-item[1], item[0]))) + + +def _compare_evaluation( + case: EvaluationReplayCase, + evaluation: EvaluationResult, +) -> list[str]: + expected = case.expected + failures: list[str] = [] + + if evaluation.passed != expected.passed: + failures.append(f"expected passed={expected.passed}, got {evaluation.passed}") + + if evaluation.blocking != expected.blocking: + failures.append(f"expected blocking={expected.blocking}, got 
{evaluation.blocking}") + + if ( + expected.min_overall_score is not None + and evaluation.overall_score < expected.min_overall_score + ): + failures.append( + "expected overall_score >= " + f"{expected.min_overall_score}, got {evaluation.overall_score}" + ) + + if ( + expected.max_overall_score is not None + and evaluation.overall_score > expected.max_overall_score + ): + failures.append( + "expected overall_score <= " + f"{expected.max_overall_score}, got {evaluation.overall_score}" + ) + + actual_dimensions = {score.dimension for score in evaluation.scores} + missing_dimensions = [ + dimension.value + for dimension in expected.required_dimensions + if dimension not in actual_dimensions + ] + if missing_dimensions: + failures.append(f"missing dimensions: {', '.join(missing_dimensions)}") + + actual_quality_gate_codes = tuple( + issue.code.value for issue in evaluation.gate_error.issues + ) if evaluation.gate_error is not None else () + if actual_quality_gate_codes != expected.quality_gate_codes: + failures.append( + "expected quality_gate_codes=" + f"{list(expected.quality_gate_codes)}, got {list(actual_quality_gate_codes)}" + ) + + for expected_warning in expected.warning_substrings: + if not any(expected_warning in warning for warning in evaluation.warnings): + failures.append(f"missing warning containing: {expected_warning}") + + return failures diff --git a/backend/app/services/harness/evaluators.py b/backend/app/services/harness/evaluators.py new file mode 100644 index 0000000..f7fbd53 --- /dev/null +++ b/backend/app/services/harness/evaluators.py @@ -0,0 +1,267 @@ +"""Deterministic evaluation helpers for generated child-facing content.""" + +from dataclasses import dataclass, field +from enum import StrEnum +from typing import Any + +from app.services.adapters.storybook.primary import Storybook +from app.services.adapters.text.models import StoryOutput +from app.services.harness.quality_gates import ( + QualityGateError, + validate_story_output, + 
validate_storybook_output, +) + + +class EvaluationDimension(StrEnum): + """Stable dimensions used by harness evaluations.""" + + STRUCTURE = "structure" + SAFETY = "safety" + AGE_FIT = "age_fit" + EDUCATIONAL_VALUE = "educational_value" + READABILITY = "readability" + + +@dataclass(frozen=True) +class EvaluationScore: + """One scored evaluation dimension.""" + + dimension: EvaluationDimension + score: float + reason: str + + def to_metadata(self) -> dict[str, Any]: + """Return a JSON-safe metadata payload.""" + + return { + "dimension": self.dimension.value, + "score": self.score, + "reason": self.reason, + } + + +@dataclass(frozen=True) +class EvaluationResult: + """Deterministic evaluation result for one generated artifact.""" + + overall_score: float + passed: bool + blocking: bool + scores: tuple[EvaluationScore, ...] + gate_error: QualityGateError | None = None + warnings: tuple[str, ...] = field(default_factory=tuple) + + def to_metadata(self) -> dict[str, Any]: + """Return a JSON-safe metadata payload.""" + + metadata: dict[str, Any] = { + "overall_score": self.overall_score, + "passed": self.passed, + "blocking": self.blocking, + "scores": [score.to_metadata() for score in self.scores], + "warnings": list(self.warnings), + } + if self.gate_error is not None: + metadata["quality_gate"] = self.gate_error.to_metadata() + return metadata + + +def _clamp_score(value: float) -> float: + return max(0.0, min(1.0, round(value, 2))) + + +def _story_text_readability_score(story_text: str) -> float: + """Score text length with a conservative 3-8 age readability heuristic.""" + + normalized_length = len(story_text.strip()) + if normalized_length < 30: + return 0.45 + if normalized_length > 2500: + return 0.72 + if normalized_length > 1800: + return 0.84 + return 0.96 + + +def _educational_value_score(story_text: str, education_theme: str | None) -> float: + if not education_theme: + return 0.82 + return 0.96 if education_theme.strip() in story_text else 0.88 + + +def 
_storybook_readability_score(page_texts: list[str]) -> float: + if not page_texts: + return 0.0 + + page_lengths = [len(text.strip()) for text in page_texts] + if any(length < 8 for length in page_lengths): + return 0.62 + if any(length > 320 for length in page_lengths): + return 0.78 + if any(length > 220 for length in page_lengths): + return 0.88 + return 0.96 + + +def _storybook_educational_value_score( + page_texts: list[str], + education_theme: str | None, +) -> float: + if not education_theme: + return 0.82 + combined_text = " ".join(page_texts) + return 0.96 if education_theme.strip() in combined_text else 0.88 + + +def evaluate_story_output( + output: StoryOutput, + *, + education_theme: str | None = None, + minimum_score: float = 0.7, +) -> EvaluationResult: + """Evaluate a generated text story before persistence.""" + + try: + validate_story_output(output) + except QualityGateError as exc: + scores = ( + EvaluationScore( + dimension=EvaluationDimension.STRUCTURE, + score=0.0, + reason="故事结构未通过质量门。", + ), + EvaluationScore( + dimension=EvaluationDimension.SAFETY, + score=0.0, + reason="内容未通过儿童安全或结构完整性检查。", + ), + ) + return EvaluationResult( + overall_score=0.0, + passed=False, + blocking=True, + scores=scores, + gate_error=exc, + ) + + readability_score = _story_text_readability_score(output.story_text) + educational_score = _educational_value_score(output.story_text, education_theme) + warnings: list[str] = [] + + if readability_score < 0.8: + warnings.append("故事正文长度可能不适合 3-8 岁儿童的完整阅读体验。") + + scores = ( + EvaluationScore( + dimension=EvaluationDimension.STRUCTURE, + score=1.0, + reason="标题、正文和封面提示词完整。", + ), + EvaluationScore( + dimension=EvaluationDimension.SAFETY, + score=1.0, + reason="未命中确定性儿童安全风险词。", + ), + EvaluationScore( + dimension=EvaluationDimension.AGE_FIT, + score=readability_score, + reason="根据正文长度估算低龄儿童阅读适配度。", + ), + EvaluationScore( + dimension=EvaluationDimension.EDUCATIONAL_VALUE, + score=educational_score, + 
reason="根据教育主题是否清晰融入正文估算。", + ), + EvaluationScore( + dimension=EvaluationDimension.READABILITY, + score=readability_score, + reason="根据正文长度估算朗读和亲子共读流畅度。", + ), + ) + overall_score = _clamp_score(sum(score.score for score in scores) / len(scores)) + + return EvaluationResult( + overall_score=overall_score, + passed=overall_score >= minimum_score, + blocking=overall_score < minimum_score, + scores=scores, + warnings=tuple(warnings), + ) + + +def evaluate_storybook_output( + output: Storybook, + *, + education_theme: str | None = None, + minimum_score: float = 0.7, +) -> EvaluationResult: + """Evaluate generated storybook structure before persistence.""" + + try: + validate_storybook_output(output) + except QualityGateError as exc: + scores = ( + EvaluationScore( + dimension=EvaluationDimension.STRUCTURE, + score=0.0, + reason="绘本结构未通过质量门。", + ), + EvaluationScore( + dimension=EvaluationDimension.SAFETY, + score=0.0, + reason="绘本内容未通过儿童安全或结构完整性检查。", + ), + ) + return EvaluationResult( + overall_score=0.0, + passed=False, + blocking=True, + scores=scores, + gate_error=exc, + ) + + page_texts = [page.text for page in output.pages] + readability_score = _storybook_readability_score(page_texts) + educational_score = _storybook_educational_value_score(page_texts, education_theme) + warnings: list[str] = [] + + if readability_score < 0.8: + warnings.append("绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。") + + scores = ( + EvaluationScore( + dimension=EvaluationDimension.STRUCTURE, + score=1.0, + reason="绘本标题、分页和页码结构完整。", + ), + EvaluationScore( + dimension=EvaluationDimension.SAFETY, + score=1.0, + reason="未命中确定性儿童安全风险词。", + ), + EvaluationScore( + dimension=EvaluationDimension.AGE_FIT, + score=readability_score, + reason="根据每页正文长度估算低龄儿童翻页阅读适配度。", + ), + EvaluationScore( + dimension=EvaluationDimension.EDUCATIONAL_VALUE, + score=educational_score, + reason="根据教育主题是否清晰融入分页正文估算。", + ), + EvaluationScore( + dimension=EvaluationDimension.READABILITY, + score=readability_score, + 
reason="根据分页正文长度估算亲子共读流畅度。", + ), + ) + overall_score = _clamp_score(sum(score.score for score in scores) / len(scores)) + + return EvaluationResult( + overall_score=overall_score, + passed=overall_score >= minimum_score, + blocking=overall_score < minimum_score, + scores=scores, + warnings=tuple(warnings), + ) diff --git a/backend/app/services/harness/executor.py b/backend/app/services/harness/executor.py new file mode 100644 index 0000000..5c1f3ff --- /dev/null +++ b/backend/app/services/harness/executor.py @@ -0,0 +1,150 @@ +"""Small-step workflow executor helpers for generation harness adoption.""" + +from collections.abc import Awaitable, Callable +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.services.harness.artifacts import AssetCompletionResult +from app.services.harness.plans import WorkflowPlan +from app.services.harness.trace import TraceRecorder +from app.services.harness.types import ArtifactKind, WorkflowStep + +if TYPE_CHECKING: + from app.db.models import GenerationJob + +AssetTask = Callable[[], Awaitable[AssetCompletionResult]] + + +@dataclass(frozen=True) +class AssetPlanRunResult: + """Result of executing asset-producing tasks from one workflow plan.""" + + task_results: tuple[AssetCompletionResult, ...] + executed_task_keys: tuple[str, ...] + ignored_task_keys: tuple[str, ...] 
+ + @property + def result_assets(self) -> tuple[str, ...]: + """Assets returned by executed task handlers.""" + + return tuple(result.asset for result in self.task_results) + + def to_metadata(self, plan: WorkflowPlan) -> dict[str, Any]: + """Return internal executor coverage metadata for admin-only analytics.""" + + return { + "plan_mode": plan.mode.value, + "planned_task_count": len(plan.tasks), + "executed_task_count": len(self.executed_task_keys), + "ignored_task_count": len(self.ignored_task_keys), + "result_count": len(self.task_results), + "executed_task_keys": list(self.executed_task_keys), + "ignored_task_keys": list(self.ignored_task_keys), + "result_assets": list(self.result_assets), + } + + +async def record_workflow_plan( + db: AsyncSession, + *, + job: "GenerationJob | None", + plan: WorkflowPlan, +) -> None: + """Persist a workflow plan snapshot for a tracked job.""" + + await TraceRecorder(db).record_step( + job=job, + event_type="workflow_planned", + status="succeeded", + message="Workflow plan selected for this generation request.", + metadata={"plan": plan.to_snapshot()}, + step=WorkflowStep.REQUEST_ACCEPTANCE, + artifact=ArtifactKind.NONE, + blocks_main_result=True, + ) + + +async def record_evaluation_result( + db: AsyncSession, + *, + job: "GenerationJob | None", + story_id: int | None = None, + metadata: dict[str, Any], + status: str, + artifact: ArtifactKind | str = ArtifactKind.STORY_TEXT, +) -> None: + """Persist a deterministic evaluation result for a tracked job.""" + + await TraceRecorder(db).record_step( + job=job, + story_id=story_id, + event_type="evaluation_completed", + status=status, + message="Generated content evaluation completed.", + metadata=metadata, + step=WorkflowStep.EVALUATION, + artifact=artifact, + blocks_main_result=status != "succeeded", + ) + + +async def record_executor_result( + db: AsyncSession, + *, + job: "GenerationJob | None", + plan: WorkflowPlan, + result: AssetPlanRunResult, +) -> None: + """Persist 
internal executor coverage metadata for a tracked job.""" + + await TraceRecorder(db).record_step( + job=job, + event_type="executor_completed", + status="succeeded", + message="Workflow executor completed planned asset tasks.", + metadata=result.to_metadata(plan), + step=WorkflowStep.UNKNOWN, + artifact=ArtifactKind.NONE, + blocks_main_result=False, + ) + + +async def run_asset_plan( + plan: WorkflowPlan, + *, + image_task: AssetTask | None = None, + audio_task: AssetTask | None = None, +) -> AssetPlanRunResult: + """Execute asset-producing tasks in the order declared by a workflow plan.""" + + if plan.mode.value not in {"asset_generation", "asset_retry"}: + raise ValueError("run_asset_plan only supports asset workflow plans") + + task_results: list[AssetCompletionResult] = [] + executed_task_keys: list[str] = [] + ignored_task_keys: list[str] = [] + + for task in plan.tasks: + if task.key == "complete_image_asset": + if image_task is None: + raise ValueError("Asset workflow plan requires an image task handler") + task_results.append(await image_task()) + executed_task_keys.append(task.key) + continue + + if task.key == "complete_audio_asset": + if audio_task is None: + raise ValueError("Asset workflow plan requires an audio task handler") + task_results.append(await audio_task()) + executed_task_keys.append(task.key) + continue + + ignored_task_keys.append(task.key) + + return AssetPlanRunResult( + task_results=tuple(task_results), + executed_task_keys=tuple(executed_task_keys), + ignored_task_keys=tuple(ignored_task_keys), + ) diff --git a/backend/app/services/harness/fixtures/evaluation_golden_cases.json b/backend/app/services/harness/fixtures/evaluation_golden_cases.json new file mode 100644 index 0000000..9096f51 --- /dev/null +++ b/backend/app/services/harness/fixtures/evaluation_golden_cases.json @@ -0,0 +1,400 @@ +[ + { + "id": "story-safe-theme-pass", + "artifact": "story", + "description": "完整、儿童安全且清晰包含教育主题的普通故事。", + "coverage": { + "age_band": "5-6", + 
"content_shape": "short_story", + "risk_area": "happy_path", + "tags": ["theme_present", "safe", "story"] + }, + "input": { + "keywords": "小兔子, 月光花园", + "education_theme": "复盘" + }, + "output": { + "mode": "generated", + "title": "小兔子的月光花园", + "story_text": "小兔子露露在月光花园里照顾一朵会发光的小花。她先给小花浇水,又邀请朋友一起观察花瓣的变化。晚上睡前,露露和朋友们坐在石凳上复盘今天的努力:下次要先分好小水壶,再轮流照顾花朵。大家都觉得,分享和复盘让花园变得更温暖。", + "cover_prompt_suggestion": "A gentle watercolor rabbit in a moonlit garden" + }, + "expected": { + "passed": true, + "blocking": false, + "min_overall_score": 0.9, + "required_dimensions": [ + "structure", + "safety", + "age_fit", + "educational_value", + "readability" + ], + "quality_gate_codes": [] + } + }, + { + "id": "story-long-safe-pass", + "artifact": "story", + "description": "较长但仍适合亲子共读的普通故事。", + "coverage": { + "age_band": "7-8", + "content_shape": "long_story", + "risk_area": "length_boundary", + "tags": ["theme_present", "long_text", "story"] + }, + "input": { + "keywords": "小海豚, 图书馆", + "education_theme": "合作" + }, + "output": { + "mode": "generated", + "title": "小海豚的蓝色图书馆", + "story_text": "小海豚多多住在一片安静的海湾里,那里有一座用贝壳和海草搭成的蓝色图书馆。每天傍晚,多多都会把漂来的故事贝壳整理好,放进不同的篮子。可是这一天,风浪把贝壳吹得到处都是,小章鱼、小海马和小螃蟹都赶来帮忙。大家先一起数贝壳,再按颜色排队,最后把每个故事放回合适的位置。多多发现,合作不是一个人做得最快,而是大家把自己的办法放在一起。夜晚来临时,蓝色图书馆重新亮起柔柔的光,小伙伴们围坐在门口,听多多讲今天学到的合作故事。", + "cover_prompt_suggestion": "A gentle dolphin organizing a blue underwater library" + }, + "expected": { + "passed": true, + "blocking": false, + "min_overall_score": 0.9, + "required_dimensions": [ + "structure", + "safety", + "age_fit", + "educational_value", + "readability" + ], + "quality_gate_codes": [] + } + }, + { + "id": "story-missing-text-blocks", + "artifact": "story", + "description": "故事正文缺失会被确定性质量门阻断。", + "coverage": { + "age_band": "unknown", + "content_shape": "empty_story", + "risk_area": "schema_error", + "tags": ["missing_text", "story", "blocking"] + }, + "input": { + "keywords": "小熊, 星星" + }, + "output": { + "mode": "generated", + "title": "小熊找星星", + "story_text": "", + 
"cover_prompt_suggestion": "A bear looking at friendly stars" + }, + "expected": { + "passed": false, + "blocking": true, + "max_overall_score": 0.0, + "quality_gate_codes": [ + "missing_story_text" + ] + } + }, + { + "id": "story-missing-cover-prompt-blocks", + "artifact": "story", + "description": "故事正文完整但封面提示词缺失会被结构质量门阻断。", + "coverage": { + "age_band": "5-6", + "content_shape": "short_story", + "risk_area": "schema_error", + "tags": ["missing_cover_prompt", "story", "blocking"] + }, + "input": { + "keywords": "小松鼠, 风筝", + "education_theme": "勇敢" + }, + "output": { + "mode": "generated", + "title": "小松鼠的风筝", + "story_text": "小松鼠第一次放风筝时有点紧张。朋友们陪它一起数一二三,它鼓起勇敢的心,终于让风筝飞上蓝天。", + "cover_prompt_suggestion": "" + }, + "expected": { + "passed": false, + "blocking": true, + "max_overall_score": 0.0, + "quality_gate_codes": [ + "missing_cover_prompt" + ] + } + }, + { + "id": "story-unsafe-term-blocks", + "artifact": "story", + "description": "明显不适合儿童的风险词会被安全质量门阻断。", + "coverage": { + "age_band": "3-4", + "content_shape": "short_story", + "risk_area": "safety_error", + "tags": ["unsafe_term", "story", "blocking"] + }, + "input": { + "keywords": "小猫, 城堡" + }, + "output": { + "mode": "generated", + "title": "小猫的城堡", + "story_text": "小猫在城堡里看到血腥场景,然后感到很害怕。", + "cover_prompt_suggestion": "A cat near a castle" + }, + "expected": { + "passed": false, + "blocking": true, + "max_overall_score": 0.0, + "quality_gate_codes": [ + "unsafe_child_content" + ] + } + }, + { + "id": "story-short-high-threshold-blocks", + "artifact": "story", + "description": "结构合格但阅读体验偏短的故事在高阈值下会被内部评测阻断。", + "coverage": { + "age_band": "3-4", + "content_shape": "very_short_story", + "risk_area": "readability_warning", + "tags": ["short_text", "threshold_block", "story"] + }, + "input": { + "keywords": "小鹿, 书签", + "education_theme": "耐心", + "minimum_score": 0.82 + }, + "output": { + "mode": "generated", + "title": "小鹿的书签", + "story_text": "小鹿学会了耐心等待。", + "cover_prompt_suggestion": "A deer with a golden 
bookmark" + }, + "expected": { + "passed": false, + "blocking": true, + "min_overall_score": 0.7, + "max_overall_score": 0.8, + "required_dimensions": [ + "structure", + "safety", + "readability" + ], + "quality_gate_codes": [], + "warning_substrings": [ + "正文长度" + ] + } + }, + { + "id": "storybook-safe-theme-pass", + "artifact": "storybook", + "description": "完整、儿童安全且包含教育主题的绘本分页输出。", + "coverage": { + "age_band": "5-6", + "content_shape": "storybook_3_pages", + "risk_area": "happy_path", + "tags": ["theme_present", "safe", "storybook"] + }, + "input": { + "keywords": "小狐狸, 彩虹桥", + "education_theme": "合作" + }, + "output": { + "title": "彩虹桥上的合作", + "main_character": "小狐狸米米", + "art_style": "温暖水彩", + "cover_prompt": "A warm watercolor fox near a rainbow bridge", + "pages": [ + { + "page_number": 1, + "text": "小狐狸米米在雨后的森林里发现一座亮晶晶的彩虹桥。", + "image_prompt": "A little fox finds a rainbow bridge" + }, + { + "page_number": 2, + "text": "桥边的小伙伴们一起商量办法,决定合作把落叶清理干净。", + "image_prompt": "Forest friends work together" + }, + { + "page_number": 3, + "text": "大家轮流搬叶子、扶篮子,还互相说谢谢,彩虹桥终于露出笑脸。", + "image_prompt": "Friends carrying leaves together" + } + ] + }, + "expected": { + "passed": true, + "blocking": false, + "min_overall_score": 0.9, + "required_dimensions": [ + "structure", + "safety", + "age_fit", + "educational_value", + "readability" + ], + "quality_gate_codes": [] + } + }, + { + "id": "storybook-duplicate-page-blocks", + "artifact": "storybook", + "description": "重复页码的绘本结构会被质量门阻断。", + "coverage": { + "age_band": "5-6", + "content_shape": "storybook_invalid_pages", + "risk_area": "schema_error", + "tags": ["duplicate_page", "storybook", "blocking"] + }, + "input": { + "keywords": "小熊, 森林" + }, + "output": { + "title": "森林里的小熊", + "main_character": "小熊布布", + "art_style": "水彩", + "cover_prompt": "A bear in a forest", + "pages": [ + { + "page_number": 1, + "text": "布布在森林里找到一颗松果。", + "image_prompt": "Bear finds a pinecone" + }, + { + "page_number": 1, + "text": 
"布布把松果带给朋友一起观察。", + "image_prompt": "Bear shares the pinecone" + } + ] + }, + "expected": { + "passed": false, + "blocking": true, + "max_overall_score": 0.0, + "quality_gate_codes": [ + "invalid_storybook_page_number" + ] + } + }, + { + "id": "storybook-missing-page-blocks", + "artifact": "storybook", + "description": "没有分页内容的绘本会被结构质量门阻断。", + "coverage": { + "age_band": "unknown", + "content_shape": "storybook_empty_pages", + "risk_area": "schema_error", + "tags": ["missing_page", "storybook", "blocking"] + }, + "input": { + "keywords": "小鸟, 云朵" + }, + "output": { + "title": "小鸟和云朵", + "main_character": "小鸟啾啾", + "art_style": "柔和水彩", + "cover_prompt": "A bird near soft clouds", + "pages": [] + }, + "expected": { + "passed": false, + "blocking": true, + "max_overall_score": 0.0, + "quality_gate_codes": [ + "missing_storybook_page" + ] + } + }, + { + "id": "storybook-unsafe-term-blocks", + "artifact": "storybook", + "description": "绘本分页文字包含明显不适龄风险词时会被安全质量门阻断。", + "coverage": { + "age_band": "3-4", + "content_shape": "storybook_2_pages", + "risk_area": "safety_error", + "tags": ["unsafe_term", "storybook", "blocking"] + }, + "input": { + "keywords": "小兔子, 山洞" + }, + "output": { + "title": "山洞里的声音", + "main_character": "小兔子米粒", + "art_style": "温暖水彩", + "cover_prompt": "A rabbit near a cave", + "pages": [ + { + "page_number": 1, + "text": "米粒走到山洞边,听见奇怪的声音。", + "image_prompt": "Rabbit near a cave" + }, + { + "page_number": 2, + "text": "洞里出现血腥画面,米粒吓得跑开。", + "image_prompt": "Rabbit running away" + } + ] + }, + "expected": { + "passed": false, + "blocking": true, + "max_overall_score": 0.0, + "quality_gate_codes": [ + "unsafe_child_content" + ] + } + }, + { + "id": "storybook-short-page-warning", + "artifact": "storybook", + "description": "分页正文过短时保留内部警告,用于评测回归。", + "coverage": { + "age_band": "3-4", + "content_shape": "storybook_2_pages", + "risk_area": "readability_warning", + "tags": ["short_page_text", "threshold_block", "storybook"] + }, + "input": { + "keywords": 
"小羊, 风铃", + "minimum_score": 0.85 + }, + "output": { + "title": "风铃响了", + "main_character": "小羊团团", + "art_style": "柔和蜡笔", + "cover_prompt": "A lamb listening to a wind chime", + "pages": [ + { + "page_number": 1, + "text": "风响。", + "image_prompt": "Wind chime rings" + }, + { + "page_number": 2, + "text": "团团笑。", + "image_prompt": "Lamb smiles" + } + ] + }, + "expected": { + "passed": false, + "blocking": true, + "min_overall_score": 0.8, + "max_overall_score": 0.82, + "required_dimensions": [ + "structure", + "safety", + "readability" + ], + "quality_gate_codes": [], + "warning_substrings": [ + "分页正文长度" + ] + } + } +] diff --git a/backend/app/services/harness/plans.py b/backend/app/services/harness/plans.py index 9163ca7..a9f9247 100644 --- a/backend/app/services/harness/plans.py +++ b/backend/app/services/harness/plans.py @@ -69,6 +69,11 @@ def build_story_plan(*, generate_images: bool) -> WorkflowPlan: step=WorkflowStep.NARRATIVE_GENERATION, artifact=ArtifactKind.STORY_TEXT, ), + WorkflowTask( + key="evaluate_narrative", + step=WorkflowStep.EVALUATION, + artifact=ArtifactKind.STORY_TEXT, + ), WorkflowTask( key="persist_story", step=WorkflowStep.STORY_PERSISTENCE, @@ -124,6 +129,11 @@ def build_storybook_plan(*, generate_images: bool) -> WorkflowPlan: step=WorkflowStep.NARRATIVE_GENERATION, artifact=ArtifactKind.STORYBOOK_PAGES, ), + WorkflowTask( + key="evaluate_storybook_pages", + step=WorkflowStep.EVALUATION, + artifact=ArtifactKind.STORYBOOK_PAGES, + ), ] if generate_images: diff --git a/backend/app/services/harness/types.py b/backend/app/services/harness/types.py index 174a44d..25f18ce 100644 --- a/backend/app/services/harness/types.py +++ b/backend/app/services/harness/types.py @@ -11,6 +11,7 @@ class WorkflowStep(StrEnum): WORKER_START = "worker_start" CONTEXT_PREPARATION = "context_preparation" NARRATIVE_GENERATION = "narrative_generation" + EVALUATION = "evaluation" STORY_PERSISTENCE = "story_persistence" PROVIDER_INVOCATION = "provider_invocation" 
IMAGE_GENERATION = "image_generation" @@ -64,6 +65,8 @@ class StepStatus(StrEnum): EVENT_STEP_MAP: dict[str, WorkflowStep] = { "request_accepted": WorkflowStep.REQUEST_ACCEPTANCE, + "workflow_planned": WorkflowStep.REQUEST_ACCEPTANCE, + "executor_completed": WorkflowStep.UNKNOWN, "retry_queued": WorkflowStep.REQUEST_ACCEPTANCE, "worker_started": WorkflowStep.WORKER_START, "context_prepared": WorkflowStep.CONTEXT_PREPARATION, @@ -73,6 +76,7 @@ EVENT_STEP_MAP: dict[str, WorkflowStep] = { "provider_call_succeeded": WorkflowStep.PROVIDER_INVOCATION, "provider_call_failed": WorkflowStep.PROVIDER_INVOCATION, "quality_gate_failed": WorkflowStep.NARRATIVE_GENERATION, + "evaluation_completed": WorkflowStep.EVALUATION, "cover_image_started": WorkflowStep.IMAGE_GENERATION, "cover_image_succeeded": WorkflowStep.IMAGE_GENERATION, "cover_image_failed": WorkflowStep.IMAGE_GENERATION, @@ -100,6 +104,7 @@ EVENT_STEP_MAP: dict[str, WorkflowStep] = { EVENT_ARTIFACT_MAP: dict[str, ArtifactKind] = { "narrative_generated": ArtifactKind.STORY_TEXT, "quality_gate_failed": ArtifactKind.STORY_TEXT, + "evaluation_completed": ArtifactKind.STORY_TEXT, "cover_image_started": ArtifactKind.COVER_IMAGE, "cover_image_succeeded": ArtifactKind.COVER_IMAGE, "cover_image_failed": ArtifactKind.COVER_IMAGE, diff --git a/backend/app/services/story_service.py b/backend/app/services/story_service.py index 6869b91..a8a42ef 100644 --- a/backend/app/services/story_service.py +++ b/backend/app/services/story_service.py @@ -36,8 +36,8 @@ from app.services.generation_jobs import ( ensure_no_active_story_generation_job, finish_generation_job, generation_job_can_retry, - generation_job_to_summary, get_generation_job_for_user, + public_generation_job_to_summary, record_generation_event, ) from app.services.harness.artifacts import ( @@ -57,12 +57,27 @@ from app.services.harness.control import ( ExecutionControl, GenerationJobCanceledError, ) +from app.services.harness.evaluators import ( + EvaluationResult, + 
evaluate_story_output, + evaluate_storybook_output, +) +from app.services.harness.executor import ( + record_evaluation_result, + record_executor_result, + record_workflow_plan, + run_asset_plan, +) +from app.services.harness.plans import ( + build_asset_plan, + build_story_plan, + build_storybook_plan, +) from app.services.harness.quality_gates import ( QualityGateError, - validate_story_output, - validate_storybook_output, ) from app.services.harness.trace import TraceRecorder +from app.services.harness.types import ArtifactKind from app.services.memory_service import build_enhanced_memory_context from app.services.provider_router import ( generate_image, @@ -129,6 +144,24 @@ async def _record_quality_gate_failure_if_present( ) +async def _record_evaluation_result_if_present( + db: AsyncSession, + *, + job, + evaluation: EvaluationResult, + artifact: ArtifactKind | str = ArtifactKind.STORY_TEXT, +) -> None: + """Append deterministic evaluation metadata for tracked worker jobs.""" + + await record_evaluation_result( + db, + job=job, + metadata=evaluation.to_metadata(), + status="succeeded" if evaluation.passed else "failed", + artifact=artifact, + ) + + def _asset_result_metadata(result: AssetCompletionResult) -> dict: """Build JSON-safe metadata for asset workflow events.""" @@ -643,18 +676,33 @@ async def generate_and_save_story( user_id=user_id, generation_job=job, ) - validate_story_output(result) - except QualityGateError as exc: - await _record_quality_gate_failure_if_present(db, job=job, error=exc) - raise HTTPException( - status_code=502, - detail="Story generation failed quality checks, please try again.", - ) from exc except Exception as exc: raise HTTPException( status_code=502, detail="Story generation failed, please try again.", ) from exc + + evaluation = evaluate_story_output( + result, + education_theme=request.education_theme, + ) + if evaluation.gate_error is not None: + await _record_quality_gate_failure_if_present( + db, + job=job, + 
error=evaluation.gate_error, + ) + await _record_evaluation_result_if_present( + db, + job=job, + evaluation=evaluation, + ) + if evaluation.blocking: + raise HTTPException( + status_code=502, + detail="Story generation failed quality checks, please try again.", + ) + await _record_job_event_if_present( db, job=job, @@ -758,13 +806,32 @@ async def generate_storybook_service( user_id=user_id, generation_job=job, ) - validate_storybook_output(storybook) - except QualityGateError as exc: - await _record_quality_gate_failure_if_present(db, job=job, error=exc) - raise HTTPException(status_code=500, detail=f"故事书质量检查失败: {exc}") from exc except Exception as e: logger.error("storybook_generation_failed", error=str(e)) raise HTTPException(status_code=500, detail=f"故事书生成失败: {e}") + + evaluation = evaluate_storybook_output( + storybook, + education_theme=request.education_theme, + ) + if evaluation.gate_error is not None: + await _record_quality_gate_failure_if_present( + db, + job=job, + error=evaluation.gate_error, + ) + await _record_evaluation_result_if_present( + db, + job=job, + evaluation=evaluation, + artifact=ArtifactKind.STORYBOOK_PAGES, + ) + if evaluation.blocking: + raise HTTPException( + status_code=500, + detail=f"故事书质量检查失败: {evaluation.gate_error or 'evaluation blocked'}", + ) + await _record_job_event_if_present( db, job=job, @@ -1025,28 +1092,50 @@ async def _generate_asset_generation_service_with_job( if not requested_assets: raise HTTPException(status_code=400, detail="资源任务缺少 assets。") + plan = build_asset_plan( + output_mode="asset_generation", + assets=requested_assets, + ) + await record_workflow_plan( + db, + job=job, + plan=plan, + ) + story = await get_story_detail(int(story_id), job.user_id, db) - if "image" in requested_assets: + async def complete_image() -> AssetCompletionResult: if story.mode == "storybook": - await _complete_storybook_image_assets(story, db, job=job) - else: - await _complete_cover_image_asset( - story, - db, - 
raise_on_failure=True, - log_event="cover_generation_failed", - job=job, - ) + return await _complete_storybook_image_assets(story, db, job=job) - if "audio" in requested_assets: - await _complete_audio_asset( + return await _complete_cover_image_asset( + story, + db, + raise_on_failure=True, + log_event="cover_generation_failed", + job=job, + ) + + async def complete_audio() -> AssetCompletionResult: + return await _complete_audio_asset( story, db, raise_on_failure=True, job=job, ) + asset_plan_result = await run_asset_plan( + plan, + image_task=complete_image if "image" in requested_assets else None, + audio_task=complete_audio if "audio" in requested_assets else None, + ) + await record_executor_result( + db, + job=job, + plan=plan, + result=asset_plan_result, + ) + story = await get_story_detail(story.id, job.user_id, db) await finish_generation_job( db, @@ -1096,7 +1185,7 @@ async def retry_generation_job_service( ) await _dispatch_generation_job(db, job=retry_job) await db.refresh(retry_job) - return generation_job_to_summary(retry_job) + return public_generation_job_to_summary(retry_job) async def _generate_generation_service_with_job( @@ -1109,6 +1198,11 @@ async def _generate_generation_service_with_job( """Run the unified generation workflow after the tracking job has been created.""" if request.output_mode == "storybook": + await record_workflow_plan( + db, + job=job, + plan=build_storybook_plan(generate_images=request.generate_images), + ) storybook = await generate_storybook_service( StorybookRequest( keywords=request.data, @@ -1155,6 +1249,9 @@ async def _generate_generation_service_with_job( retryable_assets=saved_story.retryable_assets, ) + if request.output_mode == "story" and not request.generate_images: + return await _execute_story_without_assets_plan(request, user_id, db, job=job) + generate_request = GenerateRequest( type=request.type, data=request.data, @@ -1164,6 +1261,11 @@ async def _generate_generation_service_with_job( ) if 
request.generate_images: + await record_workflow_plan( + db, + job=job, + plan=build_story_plan(generate_images=True), + ) story = await generate_full_story_service(generate_request, user_id, db, job=job) saved_story = await get_story_detail(story.id, user_id, db) await _record_postprocessing_event_if_needed(db, job=job, story=saved_story) @@ -1222,6 +1324,54 @@ async def _generate_generation_service_with_job( universe_id=story.universe_id, retryable_assets=story.retryable_assets, ) + + +async def _execute_story_without_assets_plan( + request: GenerationRequest, + user_id: str, + db: AsyncSession, + *, + job, +) -> GenerationResponse: + """Execute the minimal text-story workflow through an explicit plan.""" + + plan = build_story_plan(generate_images=False) + await record_workflow_plan(db, job=job, plan=plan) + + generate_request = GenerateRequest( + type=request.type, + data=request.data, + education_theme=request.education_theme, + child_profile_id=request.child_profile_id, + universe_id=request.universe_id, + ) + story = await generate_and_save_story(generate_request, user_id, db, job=job) + await _record_postprocessing_event_if_needed(db, job=job, story=story) + await finish_generation_job( + db, + job=job, + story=story, + current_step="generation_completed", + message="Story generation completed with a persisted readable narrative.", + ) + return GenerationResponse( + id=story.id, + generation_job_id=job.id, + title=story.title, + mode=story.mode, + story_text=story.story_text, + cover_prompt=story.cover_prompt, + image_url=story.image_url, + cover_url=story.image_url, + generation_status=story.generation_status, + text_status=story.text_status, + image_status=story.image_status, + audio_status=story.audio_status, + last_error=story.last_error, + child_profile_id=story.child_profile_id, + universe_id=story.universe_id, + retryable_assets=story.retryable_assets, + ) async def list_stories( @@ -1321,36 +1471,7 @@ async def queue_story_asset_generation( ) await 
_dispatch_generation_job(db, job=job) await db.refresh(job) - return generation_job_to_summary(job) - - -async def _retry_cover_image_asset(story: Story, db: AsyncSession, *, job=None) -> None: - """Retry cover generation for a text story.""" - - await _complete_cover_image_asset( - story, - db, - last_error_prefix="封面生成失败", - log_event="cover_asset_retry_failed", - job=job, - ) - - -async def _retry_storybook_image_assets( - story: Story, - db: AsyncSession, - *, - job=None, -) -> None: - """Retry missing storybook cover/page images.""" - - await _complete_storybook_image_assets(story, db, job=job) - - -async def _retry_audio_asset(story: Story, db: AsyncSession, *, job=None) -> None: - """Retry audio generation while preserving persisted status on provider failure.""" - - await _complete_audio_asset(story, db, raise_on_failure=False, job=job) + return public_generation_job_to_summary(job) async def retry_story_assets( @@ -1374,6 +1495,15 @@ async def retry_story_assets( try: story = await get_story_detail(story_id, user_id, db) + plan = build_asset_plan( + output_mode="asset_retry", + assets=requested_assets, + ) + await record_workflow_plan( + db, + job=job, + plan=plan, + ) await record_generation_event( db, job=job, @@ -1384,14 +1514,37 @@ async def retry_story_assets( metadata={"assets": requested_assets}, ) - if "image" in requested_assets: + async def retry_image() -> AssetCompletionResult: if story.mode == "storybook": - await _retry_storybook_image_assets(story, db, job=job) - else: - await _retry_cover_image_asset(story, db, job=job) + return await _complete_storybook_image_assets(story, db, job=job) - if "audio" in requested_assets: - await _retry_audio_asset(story, db, job=job) + return await _complete_cover_image_asset( + story, + db, + last_error_prefix="封面生成失败", + log_event="cover_asset_retry_failed", + job=job, + ) + + async def retry_audio() -> AssetCompletionResult: + return await _complete_audio_asset( + story, + db, + raise_on_failure=False, + 
job=job, + ) + + asset_plan_result = await run_asset_plan( + plan, + image_task=retry_image if "image" in requested_assets else None, + audio_task=retry_audio if "audio" in requested_assets else None, + ) + await record_executor_result( + db, + job=job, + plan=plan, + result=asset_plan_result, + ) story = await get_story_detail(story_id, user_id, db) await finish_generation_job( @@ -1448,13 +1601,29 @@ async def generate_story_cover( try: story = await get_story_detail(story_id, user_id, db) - image_result = await _complete_cover_image_asset( - story, + plan = build_asset_plan(output_mode="asset_generation", assets=["image"]) + await record_workflow_plan( db, - raise_on_failure=True, - log_event="cover_generation_failed", job=job, + plan=plan, ) + asset_result = await run_asset_plan( + plan, + image_task=lambda: _complete_cover_image_asset( + story, + db, + raise_on_failure=True, + log_event="cover_generation_failed", + job=job, + ), + ) + await record_executor_result( + db, + job=job, + plan=plan, + result=asset_result, + ) + image_result = asset_result.task_results[0] if asset_result.task_results else None story = await get_story_detail(story_id, user_id, db) await finish_generation_job( db, @@ -1464,7 +1633,11 @@ async def generate_story_cover( message="Cover image generation completed.", metadata={"assets": ["image"]}, ) - if image_result.succeeded and isinstance(image_result.value, str): + if ( + image_result is not None + and image_result.succeeded + and isinstance(image_result.value, str) + ): return image_result.value except HTTPException as exc: await finish_generation_job( @@ -1501,12 +1674,28 @@ async def generate_story_audio( try: story = await get_story_detail(story_id, user_id, db) - audio_result = await _complete_audio_asset( - story, + plan = build_asset_plan(output_mode="asset_generation", assets=["audio"]) + await record_workflow_plan( db, - raise_on_failure=True, job=job, + plan=plan, ) + asset_result = await run_asset_plan( + plan, + 
audio_task=lambda: _complete_audio_asset( + story, + db, + raise_on_failure=True, + job=job, + ), + ) + await record_executor_result( + db, + job=job, + plan=plan, + result=asset_result, + ) + audio_result = asset_result.task_results[0] if asset_result.task_results else None story = await get_story_detail(story_id, user_id, db) await finish_generation_job( db, @@ -1516,7 +1705,11 @@ async def generate_story_audio( message="Story audio generation completed.", metadata={"assets": ["audio"]}, ) - if audio_result.succeeded and isinstance(audio_result.value, bytes): + if ( + audio_result is not None + and audio_result.succeeded + and isinstance(audio_result.value, bytes) + ): return audio_result.value except HTTPException as exc: await finish_generation_job( diff --git a/backend/tests/fixtures/evaluation_golden_cases.json b/backend/tests/fixtures/evaluation_golden_cases.json new file mode 100644 index 0000000..9096f51 --- /dev/null +++ b/backend/tests/fixtures/evaluation_golden_cases.json @@ -0,0 +1,400 @@ +[ + { + "id": "story-safe-theme-pass", + "artifact": "story", + "description": "完整、儿童安全且清晰包含教育主题的普通故事。", + "coverage": { + "age_band": "5-6", + "content_shape": "short_story", + "risk_area": "happy_path", + "tags": ["theme_present", "safe", "story"] + }, + "input": { + "keywords": "小兔子, 月光花园", + "education_theme": "复盘" + }, + "output": { + "mode": "generated", + "title": "小兔子的月光花园", + "story_text": "小兔子露露在月光花园里照顾一朵会发光的小花。她先给小花浇水,又邀请朋友一起观察花瓣的变化。晚上睡前,露露和朋友们坐在石凳上复盘今天的努力:下次要先分好小水壶,再轮流照顾花朵。大家都觉得,分享和复盘让花园变得更温暖。", + "cover_prompt_suggestion": "A gentle watercolor rabbit in a moonlit garden" + }, + "expected": { + "passed": true, + "blocking": false, + "min_overall_score": 0.9, + "required_dimensions": [ + "structure", + "safety", + "age_fit", + "educational_value", + "readability" + ], + "quality_gate_codes": [] + } + }, + { + "id": "story-long-safe-pass", + "artifact": "story", + "description": "较长但仍适合亲子共读的普通故事。", + "coverage": { + "age_band": "7-8", + "content_shape": 
"long_story", + "risk_area": "length_boundary", + "tags": ["theme_present", "long_text", "story"] + }, + "input": { + "keywords": "小海豚, 图书馆", + "education_theme": "合作" + }, + "output": { + "mode": "generated", + "title": "小海豚的蓝色图书馆", + "story_text": "小海豚多多住在一片安静的海湾里,那里有一座用贝壳和海草搭成的蓝色图书馆。每天傍晚,多多都会把漂来的故事贝壳整理好,放进不同的篮子。可是这一天,风浪把贝壳吹得到处都是,小章鱼、小海马和小螃蟹都赶来帮忙。大家先一起数贝壳,再按颜色排队,最后把每个故事放回合适的位置。多多发现,合作不是一个人做得最快,而是大家把自己的办法放在一起。夜晚来临时,蓝色图书馆重新亮起柔柔的光,小伙伴们围坐在门口,听多多讲今天学到的合作故事。", + "cover_prompt_suggestion": "A gentle dolphin organizing a blue underwater library" + }, + "expected": { + "passed": true, + "blocking": false, + "min_overall_score": 0.9, + "required_dimensions": [ + "structure", + "safety", + "age_fit", + "educational_value", + "readability" + ], + "quality_gate_codes": [] + } + }, + { + "id": "story-missing-text-blocks", + "artifact": "story", + "description": "故事正文缺失会被确定性质量门阻断。", + "coverage": { + "age_band": "unknown", + "content_shape": "empty_story", + "risk_area": "schema_error", + "tags": ["missing_text", "story", "blocking"] + }, + "input": { + "keywords": "小熊, 星星" + }, + "output": { + "mode": "generated", + "title": "小熊找星星", + "story_text": "", + "cover_prompt_suggestion": "A bear looking at friendly stars" + }, + "expected": { + "passed": false, + "blocking": true, + "max_overall_score": 0.0, + "quality_gate_codes": [ + "missing_story_text" + ] + } + }, + { + "id": "story-missing-cover-prompt-blocks", + "artifact": "story", + "description": "故事正文完整但封面提示词缺失会被结构质量门阻断。", + "coverage": { + "age_band": "5-6", + "content_shape": "short_story", + "risk_area": "schema_error", + "tags": ["missing_cover_prompt", "story", "blocking"] + }, + "input": { + "keywords": "小松鼠, 风筝", + "education_theme": "勇敢" + }, + "output": { + "mode": "generated", + "title": "小松鼠的风筝", + "story_text": "小松鼠第一次放风筝时有点紧张。朋友们陪它一起数一二三,它鼓起勇敢的心,终于让风筝飞上蓝天。", + "cover_prompt_suggestion": "" + }, + "expected": { + "passed": false, + "blocking": true, + "max_overall_score": 0.0, + "quality_gate_codes": [ + 
"missing_cover_prompt" + ] + } + }, + { + "id": "story-unsafe-term-blocks", + "artifact": "story", + "description": "明显不适合儿童的风险词会被安全质量门阻断。", + "coverage": { + "age_band": "3-4", + "content_shape": "short_story", + "risk_area": "safety_error", + "tags": ["unsafe_term", "story", "blocking"] + }, + "input": { + "keywords": "小猫, 城堡" + }, + "output": { + "mode": "generated", + "title": "小猫的城堡", + "story_text": "小猫在城堡里看到血腥场景,然后感到很害怕。", + "cover_prompt_suggestion": "A cat near a castle" + }, + "expected": { + "passed": false, + "blocking": true, + "max_overall_score": 0.0, + "quality_gate_codes": [ + "unsafe_child_content" + ] + } + }, + { + "id": "story-short-high-threshold-blocks", + "artifact": "story", + "description": "结构合格但阅读体验偏短的故事在高阈值下会被内部评测阻断。", + "coverage": { + "age_band": "3-4", + "content_shape": "very_short_story", + "risk_area": "readability_warning", + "tags": ["short_text", "threshold_block", "story"] + }, + "input": { + "keywords": "小鹿, 书签", + "education_theme": "耐心", + "minimum_score": 0.82 + }, + "output": { + "mode": "generated", + "title": "小鹿的书签", + "story_text": "小鹿学会了耐心等待。", + "cover_prompt_suggestion": "A deer with a golden bookmark" + }, + "expected": { + "passed": false, + "blocking": true, + "min_overall_score": 0.7, + "max_overall_score": 0.8, + "required_dimensions": [ + "structure", + "safety", + "readability" + ], + "quality_gate_codes": [], + "warning_substrings": [ + "正文长度" + ] + } + }, + { + "id": "storybook-safe-theme-pass", + "artifact": "storybook", + "description": "完整、儿童安全且包含教育主题的绘本分页输出。", + "coverage": { + "age_band": "5-6", + "content_shape": "storybook_3_pages", + "risk_area": "happy_path", + "tags": ["theme_present", "safe", "storybook"] + }, + "input": { + "keywords": "小狐狸, 彩虹桥", + "education_theme": "合作" + }, + "output": { + "title": "彩虹桥上的合作", + "main_character": "小狐狸米米", + "art_style": "温暖水彩", + "cover_prompt": "A warm watercolor fox near a rainbow bridge", + "pages": [ + { + "page_number": 1, + "text": 
"小狐狸米米在雨后的森林里发现一座亮晶晶的彩虹桥。", + "image_prompt": "A little fox finds a rainbow bridge" + }, + { + "page_number": 2, + "text": "桥边的小伙伴们一起商量办法,决定合作把落叶清理干净。", + "image_prompt": "Forest friends work together" + }, + { + "page_number": 3, + "text": "大家轮流搬叶子、扶篮子,还互相说谢谢,彩虹桥终于露出笑脸。", + "image_prompt": "Friends carrying leaves together" + } + ] + }, + "expected": { + "passed": true, + "blocking": false, + "min_overall_score": 0.9, + "required_dimensions": [ + "structure", + "safety", + "age_fit", + "educational_value", + "readability" + ], + "quality_gate_codes": [] + } + }, + { + "id": "storybook-duplicate-page-blocks", + "artifact": "storybook", + "description": "重复页码的绘本结构会被质量门阻断。", + "coverage": { + "age_band": "5-6", + "content_shape": "storybook_invalid_pages", + "risk_area": "schema_error", + "tags": ["duplicate_page", "storybook", "blocking"] + }, + "input": { + "keywords": "小熊, 森林" + }, + "output": { + "title": "森林里的小熊", + "main_character": "小熊布布", + "art_style": "水彩", + "cover_prompt": "A bear in a forest", + "pages": [ + { + "page_number": 1, + "text": "布布在森林里找到一颗松果。", + "image_prompt": "Bear finds a pinecone" + }, + { + "page_number": 1, + "text": "布布把松果带给朋友一起观察。", + "image_prompt": "Bear shares the pinecone" + } + ] + }, + "expected": { + "passed": false, + "blocking": true, + "max_overall_score": 0.0, + "quality_gate_codes": [ + "invalid_storybook_page_number" + ] + } + }, + { + "id": "storybook-missing-page-blocks", + "artifact": "storybook", + "description": "没有分页内容的绘本会被结构质量门阻断。", + "coverage": { + "age_band": "unknown", + "content_shape": "storybook_empty_pages", + "risk_area": "schema_error", + "tags": ["missing_page", "storybook", "blocking"] + }, + "input": { + "keywords": "小鸟, 云朵" + }, + "output": { + "title": "小鸟和云朵", + "main_character": "小鸟啾啾", + "art_style": "柔和水彩", + "cover_prompt": "A bird near soft clouds", + "pages": [] + }, + "expected": { + "passed": false, + "blocking": true, + "max_overall_score": 0.0, + "quality_gate_codes": [ + 
"missing_storybook_page" + ] + } + }, + { + "id": "storybook-unsafe-term-blocks", + "artifact": "storybook", + "description": "绘本分页文字包含明显不适龄风险词时会被安全质量门阻断。", + "coverage": { + "age_band": "3-4", + "content_shape": "storybook_2_pages", + "risk_area": "safety_error", + "tags": ["unsafe_term", "storybook", "blocking"] + }, + "input": { + "keywords": "小兔子, 山洞" + }, + "output": { + "title": "山洞里的声音", + "main_character": "小兔子米粒", + "art_style": "温暖水彩", + "cover_prompt": "A rabbit near a cave", + "pages": [ + { + "page_number": 1, + "text": "米粒走到山洞边,听见奇怪的声音。", + "image_prompt": "Rabbit near a cave" + }, + { + "page_number": 2, + "text": "洞里出现血腥画面,米粒吓得跑开。", + "image_prompt": "Rabbit running away" + } + ] + }, + "expected": { + "passed": false, + "blocking": true, + "max_overall_score": 0.0, + "quality_gate_codes": [ + "unsafe_child_content" + ] + } + }, + { + "id": "storybook-short-page-warning", + "artifact": "storybook", + "description": "分页正文过短时保留内部警告,用于评测回归。", + "coverage": { + "age_band": "3-4", + "content_shape": "storybook_2_pages", + "risk_area": "readability_warning", + "tags": ["short_page_text", "threshold_block", "storybook"] + }, + "input": { + "keywords": "小羊, 风铃", + "minimum_score": 0.85 + }, + "output": { + "title": "风铃响了", + "main_character": "小羊团团", + "art_style": "柔和蜡笔", + "cover_prompt": "A lamb listening to a wind chime", + "pages": [ + { + "page_number": 1, + "text": "风响。", + "image_prompt": "Wind chime rings" + }, + { + "page_number": 2, + "text": "团团笑。", + "image_prompt": "Lamb smiles" + } + ] + }, + "expected": { + "passed": false, + "blocking": true, + "min_overall_score": 0.8, + "max_overall_score": 0.82, + "required_dimensions": [ + "structure", + "safety", + "readability" + ], + "quality_gate_codes": [], + "warning_substrings": [ + "分页正文长度" + ] + } + } +] diff --git a/backend/tests/harness-evaluation-test-cases.md b/backend/tests/harness-evaluation-test-cases.md new file mode 100644 index 0000000..c4b3d2f --- /dev/null +++ 
b/backend/tests/harness-evaluation-test-cases.md @@ -0,0 +1,610 @@ +# Test Cases: Harness Evaluation Driven Generation + +## Overview + +- **Feature**: Harness evaluation driven generation +- **Requirements Source**: `docs/technical/harness-engineering-modernization.md` +- **Test Coverage**: evaluation scoring, blocking quality failures, workflow plan events, trace aggregation, state transitions, internal golden replay, admin-only analytics, admin-only executor coverage summary, admin-only harness readiness +- **Last Updated**: 2026-06-23 + +## Test Case Categories + +### 1. Functional Tests + +#### TC-F-001: 普通故事无图片生成写入评测事件 + +- **Requirement**: H7-3, H7-4 +- **Priority**: High +- **Preconditions**: + - 用户已登录。 + - 文本 provider 返回完整、儿童安全的故事。 +- **Test Steps**: + 1. 调用 `POST /api/generations`,设置 `output_mode=story`、`generate_images=false`。 + 2. 执行 worker 任务。 + 3. 查询 job detail。 +- **Expected Results**: + - job 状态为 `completed`。 + - event 顺序包含 `workflow_planned`。 + - event 顺序包含 `evaluation_completed`。 + - `evaluation_completed.event_metadata.passed=true`。 + - `evaluation_completed.event_metadata.overall_score >= 0.7`。 +- **Postconditions**: 故事已持久化,`story_id` 写入 job。 + +#### TC-F-003: 用户 Trace summary 不返回评测摘要 + +- **Requirement**: H7-4, H7B-1 +- **Priority**: High +- **Preconditions**: + - 故事已有 `evaluation_completed` job event。 +- **Test Steps**: + 1. 调用 `GET /api/generations/{story_id}/trace-summary`。 + 2. 检查响应字段。 +- **Expected Results**: + - 响应不包含 `evaluation` 字段。 + - `by_step` 不包含 `evaluation`。 + - `by_artifact` 不因 `evaluation_completed` 增加 `story_text` 计数。 + - `failed_events` 不统计 `evaluation_completed`。 + - `total_events` 不统计 `evaluation_completed`,避免通过事件数量泄露内部评测步骤。 +- **Postconditions**: 无数据修改。 + +#### TC-F-004: 用户 Job detail 不返回评测事件 + +- **Requirement**: H7-4, H7B-2 +- **Priority**: High +- **Preconditions**: + - job 已记录 `evaluation_completed` 事件。 +- **Test Steps**: + 1. 调用 `GET /api/generations/jobs/{job_id}`。 + 2. 
检查 `events` 列表。 +- **Expected Results**: + - `events` 不包含 `evaluation_completed`。 + - 响应不包含评测分数、维度分数、通过率或阻断阈值。 +- **Postconditions**: 内部数据库事件不被删除。 + +#### TC-F-002: 完整故事输出获得通过评分 + +- **Requirement**: H7-1 +- **Priority**: High +- **Preconditions**: + - 构造完整 `StoryOutput`。 +- **Test Steps**: + 1. 调用 `evaluate_story_output`。 + 2. 读取 `EvaluationResult`。 +- **Expected Results**: + - `passed=true`。 + - `blocking=false`。 + - scores 包含 `structure`、`safety`、`age_fit`、`educational_value`、`readability`。 +- **Postconditions**: 无持久化副作用。 + +#### TC-F-005: 完整绘本输出获得通过评分 + +- **Requirement**: H7-1, H7C-1 +- **Priority**: High +- **Preconditions**: + - 构造完整 `Storybook`。 +- **Test Steps**: + 1. 调用 `evaluate_storybook_output`。 + 2. 读取 `EvaluationResult`。 +- **Expected Results**: + - `passed=true`。 + - `blocking=false`。 + - scores 包含 `structure`、`safety`、`age_fit`、`educational_value`、`readability`。 +- **Postconditions**: 无持久化副作用。 + +#### TC-F-006: 内部 golden cases 可回放且全部符合预期 + +- **Requirement**: H7-7, H7-8 +- **Priority**: High +- **Preconditions**: + - `backend/app/services/harness/fixtures/evaluation_golden_cases.json` 存在。 + - fixture 只由后端测试、内部工具或 admin-only readiness 读取。 +- **Test Steps**: + 1. 调用 `replay_evaluation_golden_cases`。 + 2. 读取 `EvaluationReplaySuiteResult`。 +- **Expected Results**: + - `passed=true`。 + - `failed_case_ids` 为空。 + - 普通故事和绘本样本都被覆盖。 + - 样本覆盖完整普通故事、较长普通故事、空正文、缺失封面提示词、安全风险词、短文本阈值阻断、绘本重复页码、绘本缺页、绘本安全风险和绘本短分页。 + - 结果不通过任何用户端 API 返回。 +- **Postconditions**: 无持久化副作用。 + +#### TC-F-007: 内部 golden replay 覆盖摘要稳定 + +- **Requirement**: H7-8 +- **Priority**: High +- **Preconditions**: + - golden replay suite 已执行。 +- **Test Steps**: + 1. 调用 `coverage_summary`。 + 2. 
检查 artifact、age_band、risk_area、tags 和 outcome 分布。 +- **Expected Results**: + - artifact 覆盖 `story=6`、`storybook=5`。 + - age_band 覆盖 `3-4`、`5-6`、`7-8` 和 `unknown`。 + - risk_area 覆盖 `happy_path`、`schema_error`、`safety_error`、`readability_warning`、`length_boundary`。 + - outcome 覆盖 `passed=3`、`blocked=8`。 + - 覆盖摘要不通过任何用户端 API 返回。 +- **Postconditions**: 无持久化副作用。 + +### 2. Edge Case Tests + +#### TC-E-001: 很短故事通过结构但产生低龄阅读体验警告 + +- **Requirement**: H7-1 +- **Priority**: Medium +- **Preconditions**: + - 构造标题、正文、封面提示词完整但正文很短的 `StoryOutput`。 +- **Test Steps**: + 1. 调用 `evaluate_story_output`。 + 2. 读取 warnings 和维度分数。 +- **Expected Results**: + - 不触发质量门异常。 + - `age_fit` 或 `readability` 分数低于完整故事。 + - warnings 包含阅读体验提示。 +- **Postconditions**: 无持久化副作用。 + +#### TC-E-002: 内部 golden replay 能报告预期不匹配 + +- **Requirement**: H7-7 +- **Priority**: Medium +- **Preconditions**: + - 构造一个实际得分低于期望阈值的 `EvaluationReplayCase`。 +- **Test Steps**: + 1. 调用 `run_evaluation_replay_cases`。 + 2. 读取 `failure_report`。 +- **Expected Results**: + - `passed=false`。 + - `failed_case_ids` 包含该 case id。 + - `failure_report` 包含 `overall_score` 差异。 +- **Postconditions**: 无持久化副作用。 + +### 3. Error Handling Tests + +#### TC-ERR-001: 空正文阻断持久化 + +- **Requirement**: H7-4 +- **Priority**: High +- **Preconditions**: + - 文本 provider 返回空 `story_text`。 +- **Test Steps**: + 1. 执行 worker 任务。 + 2. 查询 job 和 story 表。 + 3. 查询 job events。 +- **Expected Results**: + - job 状态为 `failed`。 + - 没有 story 被持久化。 + - events 包含 `quality_gate_failed`。 + - events 包含 `evaluation_completed`。 + - `evaluation_completed.event_metadata.blocking=true`。 +- **Postconditions**: 用户可重试该 job。 + +#### TC-ERR-002: 不适龄风险词阻断生成 + +- **Requirement**: H7-1 +- **Priority**: High +- **Preconditions**: + - 构造包含明显不适龄风险词的 `StoryOutput`。 +- **Test Steps**: + 1. 调用 `evaluate_story_output`。 + 2. 
读取 `quality_gate` metadata。 +- **Expected Results**: + - `passed=false`。 + - `blocking=true`。 + - `quality_gate.issues[0].failure_category=safety_error`。 +- **Postconditions**: 无持久化副作用。 + +#### TC-ERR-003: 绘本结构错误阻断生成 + +- **Requirement**: H7-1, H7C-1 +- **Priority**: High +- **Preconditions**: + - 构造页码重复或页面缺失的 `Storybook`。 +- **Test Steps**: + 1. 调用 `evaluate_storybook_output`。 + 2. 读取 `quality_gate` metadata。 +- **Expected Results**: + - `passed=false`。 + - `blocking=true`。 + - `quality_gate.issues[0].code=invalid_storybook_page_number` 或对应结构错误。 +- **Postconditions**: 无持久化副作用。 + +### 4. State Transition Tests + +#### TC-ST-001: 普通故事无图片路径事件顺序稳定 + +- **Requirement**: H7-3 +- **Priority**: High +- **Preconditions**: + - job 初始状态为 `running/request_accepted`。 +- **Test Steps**: + 1. 执行 worker 任务。 + 2. 按 id 查询 events。 +- **Expected Results**: + - event 顺序为 `request_accepted`、`worker_started`、`workflow_planned`、`context_prepared`、`evaluation_completed`、`narrative_generated`、`story_saved`、`generation_completed`。 +- **Postconditions**: job `current_step=generation_completed`。 + +#### TC-ST-002: 普通故事带图片路径记录可恢复资产计划 + +- **Requirement**: H9-1, H9-3 +- **Priority**: High +- **Preconditions**: + - job 初始状态为 `running/request_accepted`。 + - 请求设置 `output_mode=story`、`generate_images=true`。 + - 文本 provider 返回合格故事,图片 provider 返回封面 URL。 +- **Test Steps**: + 1. 执行 worker 任务。 + 2. 按 id 查询内部 events。 + 3. 
读取 `workflow_planned.event_metadata.plan`。 +- **Expected Results**: + - event 顺序为 `request_accepted`、`worker_started`、`workflow_planned`、`context_prepared`、`evaluation_completed`、`narrative_generated`、`story_saved`、`cover_image_started`、`cover_image_succeeded`、`generation_completed`。 + - `plan.mode=story_with_assets`。 + - plan tasks 包含 `evaluate_narrative`。 + - plan tasks 包含 `generate_cover_image`。 + - `generate_cover_image.required=false`。 + - `generate_cover_image.recoverable=true`。 +- **Postconditions**: job `current_step=generation_completed`,故事 `image_status=ready`。 + +#### TC-ST-003: 绘本路径记录绘本计划快照 + +- **Requirement**: H9-2, H9-3 +- **Priority**: High +- **Preconditions**: + - job 初始状态为 `running/request_accepted`。 + - 请求设置 `output_mode=storybook`。 +- **Test Steps**: + 1. 执行 worker 任务。 + 2. 按 id 查询内部 events。 + 3. 读取 `workflow_planned.event_metadata.plan`。 +- **Expected Results**: + - event 顺序包含 `workflow_planned`,且位于 `worker_started` 和 `context_prepared` 之间。 + - `plan.mode=storybook`。 + - plan tasks 包含 `generate_storybook_pages`。 + - plan tasks 包含 `evaluate_storybook_pages`。 + - 当 `generate_images=true` 时,plan tasks 包含 `generate_storybook_images`。 + - `generate_storybook_images.required=false`。 + - `generate_storybook_images.recoverable=true`。 +- **Postconditions**: job `current_step=generation_completed`。 + +#### TC-ST-004: 绘本生成内部记录评测但用户事件脱敏 + +- **Requirement**: H7C-1, H7B-2, H9-4 +- **Priority**: High +- **Preconditions**: + - 绘本生成 job 已执行完成。 +- **Test Steps**: + 1. 直接查询内部 `generation_job_events`。 + 2. 
调用 `GET /api/generations/jobs/{job_id}`。 +- **Expected Results**: + - 内部事件包含 `evaluation_completed`。 + - 内部 `evaluation_completed.event_metadata.artifact=storybook_pages`。 + - 用户 API events 不包含 `evaluation_completed`。 + - 用户 API 响应不包含 `overall_score`、维度分数、阈值或 golden replay 字段。 +- **Postconditions**: job 完成,绘本已持久化。 + +#### TC-ST-005: 资产生成和重试路径记录资产计划快照 + +- **Requirement**: H10-1, H10-2, H10-3 +- **Priority**: High +- **Preconditions**: + - 故事已有可生成或可重试的图片/音频资源。 +- **Test Steps**: + 1. 执行 `asset_generation` worker 任务。 + 2. 调用 `/api/generations/{story_id}/retry-assets`。 + 3. 按 id 查询内部 events。 +- **Expected Results**: + - `asset_generation` 事件顺序包含 `workflow_planned`。 + - `asset_generation` 的 `plan.mode=asset_generation`。 + - `asset_retry` 事件顺序包含 `workflow_planned`。 + - `asset_retry` 的 `plan.mode=asset_retry`。 + - 图片和音频任务在 plan 中为 `required=false`、`recoverable=true`。 +- **Postconditions**: 资源状态按原有语义更新。 + +#### TC-ST-006: 用户事件 metadata 使用白名单脱敏 + +- **Requirement**: H10-4, H10-5 +- **Priority**: High +- **Preconditions**: + - 内部 job events 包含原始 `plan.tasks`、`result_snapshot`、内部阈值或内部错误详情。 +- **Test Steps**: + 1. 调用 `GET /api/generations/jobs/{job_id}`。 + 2. 检查 `events[*].event_metadata`。 +- **Expected Results**: + - 用户响应保留 `step`、`artifact`、`asset`、`assets`、`failure_category` 等可解释字段。 + - `workflow_planned` 只返回 `plan_mode`、`planned_task_count`、`recoverable_task_count`。 + - 用户响应不包含原始 `plan`、`tasks`、`result_snapshot`、内部阈值、内部错误原文。 + - 用户响应仍不包含 `evaluation_completed`、`overall_score`、维度分数或 golden replay 字段。 +- **Postconditions**: 内部数据库事件不被修改。 + +#### TC-ST-007: 用户 request payload 使用白名单脱敏 + +- **Requirement**: H11-1, H11-4 +- **Priority**: High +- **Preconditions**: + - 生成 job 的 `request_payload` 同时包含用户输入、公开控制字段、内部调度 token、Provider override 和评测策略。 +- **Test Steps**: + 1. 调用 `GET /api/generations/jobs/{job_id}`。 + 2. 
检查响应中的 `request_payload`。 +- **Expected Results**: + - 用户响应只保留 `output_mode`、`input_type`、`type`、`story_id`、`assets`、`page_count`、`generate_images` 等安全控制字段。 + - 用户响应不包含原始 `data`、`education_theme`、内部调度 token、Provider override 或 evaluation policy。 + - 内部数据库中的完整 request payload 不被修改。 +- **Postconditions**: 用户端仍可根据公开字段展示任务进度和可用操作。 + +#### TC-ST-008: 资产 plan runner 按 WorkflowPlan 顺序执行任务 + +- **Requirement**: H12-1, H12-5 +- **Priority**: High +- **Preconditions**: + - 构造 `asset_generation` 或 `asset_retry` plan,包含图片和音频 task。 +- **Test Steps**: + 1. 调用 `run_asset_plan(...)`。 + 2. 记录 image/audio handler 的调用顺序。 + 3. 检查 runner 返回的 executed/ignored task keys。 +- **Expected Results**: + - 图片和音频 handler 按 plan 中 `WorkflowTask` 顺序执行。 + - `start_asset_*` 和 `complete_asset_*` 这类非资产生产 task 被记录为 ignored,不触发 provider handler。 + - 未知非资产 task 默认 ignored,不影响已知资产 task。 +- **Postconditions**: 无数据库修改。 + +#### TC-ST-009: 后台资产生成由 plan runner 执行组合资产 + +- **Requirement**: H12-2, H12-5 +- **Priority**: High +- **Preconditions**: + - 已持久化故事同时具备可生成图片和音频的输入。 + - 创建 `asset_generation` job,`assets=["audio", "image"]`。 +- **Test Steps**: + 1. 调用 worker 执行该 job。 + 2. 查询 job events 和 story 状态。 +- **Expected Results**: + - event stream 为 `workflow_planned` 后依次出现音频和图片生成事件。 + - plan tasks 顺序包含 `complete_audio_asset`、`complete_image_asset`。 + - story 的 `audio_status` 与 `image_status` 均为 `ready`。 + - 用户 API 仍只暴露 coarse plan metadata,不返回原始 `plan.tasks`。 +- **Postconditions**: job 完成,资源状态与原有语义一致。 + +#### TC-ST-010: 用户侧过滤 executor coverage 内部事件 + +- **Requirement**: H13-4, H13-5 +- **Priority**: High +- **Preconditions**: + - 生成 job 包含内部 `executor_completed` 事件。 + - `executor_completed.event_metadata` 包含 task keys 和 result assets。 +- **Test Steps**: + 1. 调用 `GET /api/generations/jobs/{job_id}`。 + 2. 调用 `GET /api/generations/{story_id}/jobs`。 + 3. 
调用 `GET /api/generations/{story_id}/trace-summary`。 +- **Expected Results**: + - 用户 job detail 不包含 `executor_completed`。 + - 用户 job detail 不包含 `executed_task_keys`、`ignored_task_keys` 或具体 task key。 + - 当 job 当前步骤短暂停留在 `executor_completed` 时,用户 summary 显示为安全公开的 `workflow_planned` 进度。 + - 用户 trace summary 不包含 `executor_completed` 或具体 task key。 + - 用户 trace summary 的 `total_events` 不统计内部 `executor_completed`。 +- **Postconditions**: 内部数据库事件不被修改。 + +### 5. Admin-Only Analytics Tests + +#### TC-ADM-001: 管理端评测 analytics 聚合内部评测事件 + +- **Requirement**: H8-1, H8-2 +- **Priority**: High +- **Preconditions**: + - 数据库存在多个用户的 `evaluation_completed` 事件。 + - 请求通过 admin guard。 +- **Test Steps**: + 1. 调用 `GET /admin/evaluations/analytics`。 + 2. 检查聚合结果。 +- **Expected Results**: + - 返回通过数、阻断数、通过率和平均分。 + - 返回 artifact、output mode、score band、dimension score、quality gate issue、failure category 和 warning 聚合。 + - 不返回故事正文、prompt、单条 evaluation event 或评分 reason。 +- **Postconditions**: 无数据修改。 + +#### TC-ADM-002: 管理端评测 analytics 支持过滤 + +- **Requirement**: H8-3 +- **Priority**: Medium +- **Preconditions**: + - 数据库存在新旧评测事件以及不同 artifact。 +- **Test Steps**: + 1. 调用 `GET /admin/evaluations/analytics?days=7`。 + 2. 调用 `GET /admin/evaluations/analytics?artifact=story_text`。 + 3. 调用非法 artifact。 +- **Expected Results**: + - `days` 过滤只统计窗口内事件。 + - `artifact` 过滤只统计对应 artifact。 + - 非法 artifact 返回 `422`。 +- **Postconditions**: 无数据修改。 + +#### TC-ADM-003: 管理端评测 analytics 需要 admin 鉴权 + +- **Requirement**: H8-2 +- **Priority**: High +- **Preconditions**: + - 未提供 admin Basic Auth。 +- **Test Steps**: + 1. 调用 `GET /admin/evaluations/analytics`。 +- **Expected Results**: + - 返回 `401`。 + - 不返回任何评测统计。 +- **Postconditions**: 无数据修改。 + +#### TC-ADM-004: 管理端完整生成 trace 返回内部事件流 + +- **Requirement**: H11-2, H11-3, H11-4 +- **Priority**: High +- **Preconditions**: + - 数据库存在包含 `workflow_planned` 与 `evaluation_completed` 的生成 job。 + - 请求通过 admin guard。 +- **Test Steps**: + 1. 调用 `GET /admin/generations/jobs/{job_id}/trace`。 + 2. 
检查 request payload 与 event stream。 +- **Expected Results**: + - 返回完整 request payload,包括原始用户输入和内部调度字段。 + - 返回完整 `workflow_planned.event_metadata.plan.tasks`。 + - 返回 `evaluation_completed` 事件及其内部评分 metadata。 + - 响应包含 `user_id`,便于管理控制面审计。 +- **Postconditions**: 无数据修改。 + +#### TC-ADM-005: 管理端完整生成 trace 需要 admin 鉴权 + +- **Requirement**: H11-3 +- **Priority**: High +- **Preconditions**: + - 未提供 admin Basic Auth。 +- **Test Steps**: + 1. 调用 `GET /admin/generations/jobs/{job_id}/trace`。 +- **Expected Results**: + - 返回 `401`。 + - 不返回 request payload 或内部 event metadata。 +- **Postconditions**: 无数据修改。 + +#### TC-ADM-006: 管理端 executor coverage 聚合内部执行事件 + +- **Requirement**: H13-1, H13-2, H13-3, H13-5 +- **Priority**: High +- **Preconditions**: + - 数据库存在多个 `executor_completed` 事件。 + - 请求通过 admin guard。 +- **Test Steps**: + 1. 调用 `GET /admin/executors/coverage`。 + 2. 调用 `GET /admin/executors/coverage?plan_mode=asset_retry`。 + 3. 调用非法 plan mode。 +- **Expected Results**: + - 返回 total runs、planned/executed/ignored task counts 和 coverage ratio。 + - 返回 plan mode、output mode、executed task keys、ignored task keys 和 result assets 聚合。 + - `plan_mode` 过滤只统计对应 executor run。 + - 非法 plan mode 返回 `422`。 +- **Postconditions**: 无数据修改。 + +#### TC-ADM-007: 管理端 executor coverage 需要 admin 鉴权 + +- **Requirement**: H13-3 +- **Priority**: High +- **Preconditions**: + - 未提供 admin Basic Auth。 +- **Test Steps**: + 1. 调用 `GET /admin/executors/coverage`。 +- **Expected Results**: + - 返回 `401`。 + - 不返回 executor task keys 或 coverage metadata。 +- **Postconditions**: 无数据修改。 + +#### TC-ADM-008: 管理端完整生成 trace 返回单 job executor coverage 摘要 + +- **Requirement**: H14-1, H14-2, H14-4 +- **Priority**: High +- **Preconditions**: + - 数据库存在包含 `executor_completed` 事件的生成 job。 + - 请求通过 admin guard。 +- **Test Steps**: + 1. 调用 `GET /admin/generations/jobs/{job_id}/trace`。 + 2. 
检查 `executor_coverage`。 +- **Expected Results**: + - 响应包含 `executor_coverage.scope=admin_internal_job_executor_coverage`。 + - `executor_coverage` 只统计当前 job 的 runs、planned/executed/ignored task counts 和 coverage ratio。 + - `executor_coverage.executed_task_keys`、`ignored_task_keys` 和 `result_assets` 与当前 job 的内部 executor event 一致。 + - 完整 event stream 仍保留 `executor_completed`,便于 admin 调试。 +- **Postconditions**: 无数据修改。 + +#### TC-ADM-009: 管理端 harness readiness 聚合内部质量门 + +- **Requirement**: H15-1, H15-2, H15-3, H15-4 +- **Priority**: High +- **Preconditions**: + - app 内部 harness fixture 存在 golden replay cases。 + - 数据库存在至少一条通过的 `evaluation_completed` 事件。 + - 数据库存在至少一条 `executor_completed` 事件。 + - 请求通过 admin guard。 +- **Test Steps**: + 1. 调用 `GET /admin/harness/readiness`。 + 2. 检查 readiness status、checks 和聚合摘要。 +- **Expected Results**: + - `status=ready`。 + - checks 包含 `golden_replay`、`runtime_evaluation_samples`、`runtime_evaluation_quality`、`executor_coverage_samples` 和 `executor_coverage_ratio`。 + - golden replay 显示全部通过。 + - evaluation analytics 与 executor coverage 只以聚合形式返回。 + - 响应不包含故事标题、正文、prompt、score reason 或 quality gate message。 +- **Postconditions**: 无数据修改。 + +#### TC-ADM-010: 管理端 harness readiness 阻断低质量运行样本并需要 admin 鉴权 + +- **Requirement**: H15-2, H15-3, H15-4, H15-5 +- **Priority**: High +- **Preconditions**: + - 数据库存在低质量或 blocking 的 `evaluation_completed` 事件。 + - executor coverage 运行样本缺失或不足。 +- **Test Steps**: + 1. 通过 admin guard 调用 `GET /admin/harness/readiness`。 + 2. 
未提供 admin Basic Auth 调用同一路径。 +- **Expected Results**: + - 有 admin 权限时返回 `status=blocked`。 + - `runtime_evaluation_quality.status=blocked`。 + - executor 样本缺失时对应 check 为 `needs_attention`。 + - 无 admin 权限时返回 `401`。 + - 响应不包含 quality gate message 或单条事件明细。 +- **Postconditions**: 无数据修改。 + +## Test Coverage Matrix + +| Requirement ID | Test Cases | Coverage Status | +| --- | --- | --- | +| H7-1 | TC-F-002, TC-F-005, TC-E-001, TC-ERR-002, TC-ERR-003 | Complete | +| H7-2 | TC-F-001, TC-ST-001 | Complete | +| H7-3 | TC-F-001, TC-ST-001 | Complete | +| H7-4 | TC-F-003, TC-ERR-001 | Complete | +| H7-5 | This document | Complete | +| H7-7 | TC-F-006, TC-E-002 | Complete | +| H7-8 | TC-F-006, TC-F-007 | Complete | +| H7B-1 | TC-F-003 | Complete | +| H7B-2 | TC-F-004 | Complete | +| H7C-1 | TC-F-005, TC-ERR-003, TC-ST-004 | Complete | +| H8-1 | TC-ADM-001 | Complete | +| H8-2 | TC-ADM-001, TC-ADM-003 | Complete | +| H8-3 | TC-ADM-002 | Complete | +| H8-4 | TC-F-003, TC-F-004, TC-ADM-001 | Complete | +| H9-1 | TC-ST-002 | Complete | +| H9-2 | TC-ST-003 | Complete | +| H9-3 | TC-ST-001, TC-ST-002, TC-ST-003 | Complete | +| H9-4 | TC-F-003, TC-F-004, TC-ST-004 | Complete | +| H10-1 | TC-ST-005 | Complete | +| H10-2 | TC-ST-005 | Complete | +| H10-3 | TC-ST-005 | Complete | +| H10-4 | TC-ST-006 | Complete | +| H10-5 | TC-ST-005, TC-ST-006 | Complete | +| H11-1 | TC-ST-007 | Complete | +| H11-2 | TC-ADM-004 | Complete | +| H11-3 | TC-ADM-004, TC-ADM-005 | Complete | +| H11-4 | TC-ST-007, TC-ADM-004, TC-ADM-005 | Complete | +| H11-5 | This document, `docs/planning/harness-stage-11-report.md` | Complete | +| H12-1 | TC-ST-008 | Complete | +| H12-2 | TC-ST-009 | Complete | +| H12-3 | TC-ST-005, TC-ST-008 | Complete | +| H12-4 | TC-ST-005, backend story endpoint regression tests | Complete | +| H12-5 | TC-ST-008, TC-ST-009 | Complete | +| H13-1 | TC-ADM-006 | Complete | +| H13-2 | TC-ST-009, TC-ADM-006 | Complete | +| H13-3 | TC-ADM-006, TC-ADM-007 | Complete | +| H13-4 | TC-ST-010 | 
Complete | +| H13-5 | TC-ST-010, TC-ADM-006, TC-ADM-007 | Complete | +| H14-1 | TC-ADM-006, TC-ADM-008 | Complete | +| H14-2 | TC-ADM-008 | Complete | +| H14-3 | TC-ST-010 | Complete | +| H14-4 | TC-ST-010, TC-ADM-008 | Complete | +| H14-5 | This document, `docs/planning/harness-stage-14-report.md` | Complete | +| H15-1 | TC-F-006, TC-ADM-009 | Complete | +| H15-2 | TC-ADM-009, TC-ADM-010 | Complete | +| H15-3 | TC-ADM-009, TC-ADM-010 | Complete | +| H15-4 | TC-ADM-009, TC-ADM-010 | Complete | +| H15-5 | This document, `docs/planning/harness-stage-15-report.md` | Complete | + +## Notes + +- 当前自动化已覆盖 TC-F-001、TC-F-002、TC-F-003、TC-F-004、TC-F-005、TC-F-006、TC-F-007、TC-E-002、TC-ERR-001、TC-ERR-002、TC-ERR-003、TC-ST-001、TC-ST-002、TC-ST-003、TC-ST-004、TC-ST-005、TC-ST-006、TC-ST-007、TC-ST-008、TC-ST-009、TC-ST-010、TC-ADM-001、TC-ADM-002、TC-ADM-003、TC-ADM-004、TC-ADM-005、TC-ADM-006、TC-ADM-007、TC-ADM-008、TC-ADM-009、TC-ADM-010。 +- TC-E-001 可在下一轮补成显式单测。 +- 所有 `evaluation_completed`、golden replay 和评分维度数据均按内部质量资产处理,不应进入用户端接口或用户前端。 +- `GET /admin/evaluations/analytics` 只允许 admin-only 聚合摘要,不应返回原始内容、prompt、单条事件或评分 reason。 +- `GET /admin/generations/jobs/{job_id}/trace` 是 admin-only 调试和审查接口,可返回完整内部链路,不应被用户前端调用。 +- `GET /admin/executors/coverage` 是 admin-only executor 覆盖率接口,可返回 task keys 和 result assets,不应被用户前端调用。 +- `GET /admin/generations/jobs/{job_id}/trace` 可返回当前 job 的 `executor_coverage` 摘要;该摘要与 task keys 一样属于内部执行资产。 +- `GET /admin/harness/readiness` 是 admin-only harness 上线前审查摘要,可返回聚合 readiness、thresholds、golden coverage、evaluation analytics 和 executor coverage,不应返回正文、prompt、score reason、quality gate message 或单条事件明细。 diff --git a/backend/tests/test_admin_providers.py b/backend/tests/test_admin_providers.py index f0aace7..af1e955 100644 --- a/backend/tests/test_admin_providers.py +++ b/backend/tests/test_admin_providers.py @@ -27,6 +27,17 @@ def _build_admin_test_app(db_session) -> FastAPI: return app +def _build_admin_auth_required_test_app(db_session) -> FastAPI: + app = FastAPI() + 
app.include_router(admin_providers.router, prefix="/admin") + + async def override_get_db(): + yield db_session + + app.dependency_overrides[get_db] = override_get_db + return app + + async def _create_story( db_session, *, @@ -51,6 +62,38 @@ async def _create_story( return story +async def _record_evaluation_event( + db_session, + *, + user_id: str, + story_id: int, + output_mode: str, + artifact: str, + status: str, + metadata: dict, +): + job = await create_generation_job( + db_session, + user_id=user_id, + output_mode=output_mode, + input_type="keywords", + request_payload={"data": "测试"}, + story_id=story_id, + ) + return await record_generation_event( + db_session, + job=job, + story_id=story_id, + event_type="evaluation_completed", + status=status, + metadata={ + "step": "evaluation", + "artifact": artifact, + **metadata, + }, + ) + + async def test_admin_provider_analytics_aggregate_across_users(db_session, test_user): second_user = User( id="github:67890", @@ -197,6 +240,616 @@ async def test_admin_provider_analytics_aggregate_across_users(db_session, test_ ] +async def test_admin_evaluation_analytics_aggregate_internal_events( + db_session, + test_user, +): + second_user = User( + id="google:evaluation-user", + name="Evaluation User", + avatar_url="https://example.com/eval.png", + provider="google", + ) + db_session.add(second_user) + await db_session.commit() + + story = await _create_story(db_session, user_id=test_user.id, title="评测故事") + storybook = await _create_story( + db_session, + user_id=second_user.id, + title="评测绘本", + mode="storybook", + ) + + await _record_evaluation_event( + db_session, + user_id=test_user.id, + story_id=story.id, + output_mode="story", + artifact="story_text", + status="succeeded", + metadata={ + "overall_score": 0.92, + "passed": True, + "blocking": False, + "scores": [ + {"dimension": "structure", "score": 1.0, "reason": "完整"}, + {"dimension": "readability", "score": 0.84, "reason": "可读"}, + ], + "warnings": [], + }, + ) + 
await _record_evaluation_event( + db_session, + user_id=second_user.id, + story_id=storybook.id, + output_mode="storybook", + artifact="storybook_pages", + status="failed", + metadata={ + "overall_score": 0.0, + "passed": False, + "blocking": True, + "scores": [ + {"dimension": "structure", "score": 0.0, "reason": "结构失败"}, + {"dimension": "safety", "score": 0.0, "reason": "安全失败"}, + ], + "quality_gate": { + "issues": [ + { + "code": "unsafe_child_content", + "message": "风险词", + "failure_category": "safety_error", + "field": "pages", + } + ] + }, + "warnings": ["绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。"], + }, + ) + + admin_app = _build_admin_test_app(db_session) + transport = ASGITransport(app=admin_app) + + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/admin/evaluations/analytics") + + assert response.status_code == 200 + data = response.json() + assert data["scope"] == "admin_internal_evaluations" + assert data["total_evaluations"] == 2 + assert data["passed_evaluations"] == 1 + assert data["blocked_evaluations"] == 1 + assert data["pass_rate"] == 0.5 + assert data["average_score"] == 0.46 + assert data["job_count"] == 2 + assert data["story_count"] == 2 + assert data["user_count"] == 2 + assert data["by_artifact"] == [ + {"artifact": "story_text", "count": 1}, + {"artifact": "storybook_pages", "count": 1}, + ] + assert data["by_output_mode"] == [ + {"output_mode": "story", "count": 1}, + {"output_mode": "storybook", "count": 1}, + ] + assert data["score_bands"] == [ + {"band": "blocked_quality_gate", "count": 1}, + {"band": "excellent", "count": 1}, + ] + assert data["dimension_scores"] == [ + {"dimension": "structure", "average_score": 0.5, "count": 2}, + {"dimension": "readability", "average_score": 0.84, "count": 1}, + {"dimension": "safety", "average_score": 0.0, "count": 1}, + ] + assert data["quality_gate_issues"] == [ + {"code": "unsafe_child_content", "count": 1}, + ] + assert 
data["failure_categories"] == [ + {"category": "safety_error", "count": 1}, + ] + assert data["warnings"] == [ + { + "message": "绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。", + "count": 1, + }, + ] + assert "评测故事" not in str(data) + assert "风险词" not in str(data) + assert "完整" not in str(data) + + +async def test_admin_evaluation_analytics_support_days_and_artifact_filters( + db_session, + test_user, +): + story = await _create_story(db_session, user_id=test_user.id, title="旧评测") + storybook = await _create_story( + db_session, + user_id=test_user.id, + title="新评测", + mode="storybook", + ) + + old_event = await _record_evaluation_event( + db_session, + user_id=test_user.id, + story_id=story.id, + output_mode="story", + artifact="story_text", + status="succeeded", + metadata={ + "overall_score": 0.96, + "passed": True, + "blocking": False, + "scores": [{"dimension": "structure", "score": 1.0, "reason": "完整"}], + "warnings": [], + }, + ) + old_event.created_at = datetime.now(timezone.utc) - timedelta(days=10) + await db_session.commit() + + await _record_evaluation_event( + db_session, + user_id=test_user.id, + story_id=storybook.id, + output_mode="storybook", + artifact="storybook_pages", + status="failed", + metadata={ + "overall_score": 0.72, + "passed": False, + "blocking": True, + "scores": [{"dimension": "readability", "score": 0.62, "reason": "过短"}], + "warnings": ["分页正文长度偏短"], + }, + ) + + admin_app = _build_admin_test_app(db_session) + transport = ASGITransport(app=admin_app) + + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/admin/evaluations/analytics?days=7") + assert response.status_code == 200 + data = response.json() + assert data["window_days"] == 7 + assert data["total_evaluations"] == 1 + assert data["artifact"] is None + assert data["by_artifact"] == [{"artifact": "storybook_pages", "count": 1}] + + response = await client.get( + "/admin/evaluations/analytics?artifact=story_text" + ) + assert 
response.status_code == 200 + data = response.json() + assert data["artifact"] == "story_text" + assert data["total_evaluations"] == 1 + assert data["average_score"] == 0.96 + + response = await client.get("/admin/evaluations/analytics?artifact=image") + assert response.status_code == 422 + + +async def test_admin_evaluation_analytics_requires_admin_auth(db_session): + admin_app = _build_admin_auth_required_test_app(db_session) + transport = ASGITransport(app=admin_app) + + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/admin/evaluations/analytics") + + assert response.status_code == 401 + + +async def test_admin_generation_job_trace_returns_internal_event_stream( + db_session, + test_user, +): + story = await _create_story(db_session, user_id=test_user.id, title="内部链路故事") + job = await create_generation_job( + db_session, + user_id=test_user.id, + output_mode="story", + input_type="keywords", + request_payload={ + "output_mode": "story", + "type": "keywords", + "data": "月亮森林", + "internal_dispatch_token": "admin-visible-token", + "provider_override": "internal-provider", + "evaluation_policy": {"threshold": 0.9}, + }, + story_id=story.id, + ) + await record_generation_event( + db_session, + job=job, + story_id=story.id, + event_type="workflow_planned", + status="succeeded", + metadata={ + "step": "request_acceptance", + "artifact": "none", + "plan": { + "mode": "story", + "tasks": [ + { + "key": "generate_narrative", + "step": "text_generation", + "artifact": "story_text", + "required": True, + "recoverable": False, + } + ], + }, + "internal_threshold": 0.9, + }, + ) + await record_generation_event( + db_session, + job=job, + story_id=story.id, + event_type="evaluation_completed", + status="succeeded", + metadata={ + "step": "evaluation", + "artifact": "story_text", + "overall_score": 0.94, + "passed": True, + "blocking": False, + "scores": [{"dimension": "structure", "score": 1.0}], + }, + ) + await 
record_generation_event( + db_session, + job=job, + story_id=story.id, + event_type="executor_completed", + status="succeeded", + metadata={ + "plan_mode": "asset_generation", + "planned_task_count": 3, + "executed_task_count": 1, + "ignored_task_count": 2, + "executed_task_keys": ["complete_image_asset"], + "ignored_task_keys": [ + "start_asset_generation", + "complete_asset_generation", + ], + "result_assets": ["cover_image"], + }, + ) + + admin_app = _build_admin_test_app(db_session) + transport = ASGITransport(app=admin_app) + + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get(f"/admin/generations/jobs/{job.id}/trace") + + assert response.status_code == 200 + data = response.json() + assert data["id"] == job.id + assert data["user_id"] == test_user.id + assert data["request_payload"]["data"] == "月亮森林" + assert data["request_payload"]["internal_dispatch_token"] == "admin-visible-token" + assert data["request_payload"]["evaluation_policy"] == {"threshold": 0.9} + + event_types = [event["event_type"] for event in data["events"]] + assert event_types == [ + "request_accepted", + "workflow_planned", + "evaluation_completed", + "executor_completed", + ] + workflow_event = data["events"][1] + assert workflow_event["event_metadata"]["plan"]["tasks"][0]["key"] == ( + "generate_narrative" + ) + assert workflow_event["event_metadata"]["internal_threshold"] == 0.9 + + evaluation_event = data["events"][2] + assert evaluation_event["event_metadata"]["overall_score"] == 0.94 + assert evaluation_event["event_metadata"]["scores"] == [ + {"dimension": "structure", "score": 1.0} + ] + executor_event = data["events"][3] + assert executor_event["event_metadata"]["executed_task_keys"] == [ + "complete_image_asset" + ] + assert executor_event["event_metadata"]["result_assets"] == ["cover_image"] + + executor_coverage = data["executor_coverage"] + assert executor_coverage["scope"] == "admin_internal_job_executor_coverage" + 
assert executor_coverage["total_runs"] == 1 + assert executor_coverage["total_planned_tasks"] == 3 + assert executor_coverage["total_executed_tasks"] == 1 + assert executor_coverage["total_ignored_tasks"] == 2 + assert executor_coverage["coverage_ratio"] == 0.3333 + assert executor_coverage["job_count"] == 1 + assert executor_coverage["story_count"] == 1 + assert executor_coverage["user_count"] == 1 + assert executor_coverage["by_plan_mode"] == [ + {"plan_mode": "asset_generation", "count": 1} + ] + assert executor_coverage["by_output_mode"] == [ + {"output_mode": "story", "count": 1} + ] + assert executor_coverage["executed_task_keys"] == [ + {"task_key": "complete_image_asset", "count": 1} + ] + assert executor_coverage["ignored_task_keys"] == [ + {"task_key": "complete_asset_generation", "count": 1}, + {"task_key": "start_asset_generation", "count": 1}, + ] + assert executor_coverage["result_assets"] == [ + {"asset": "cover_image", "count": 1} + ] + + +async def test_admin_generation_job_trace_requires_admin_auth(db_session): + admin_app = _build_admin_auth_required_test_app(db_session) + transport = ASGITransport(app=admin_app) + + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/admin/generations/jobs/missing-job/trace") + + assert response.status_code == 401 + + +async def test_admin_executor_coverage_aggregates_internal_events( + db_session, + test_user, +): + story = await _create_story(db_session, user_id=test_user.id, title="执行器覆盖故事") + asset_job = await create_generation_job( + db_session, + user_id=test_user.id, + output_mode="asset_generation", + input_type="audio,image", + request_payload={"story_id": story.id, "assets": ["audio", "image"]}, + story_id=story.id, + ) + await record_generation_event( + db_session, + job=asset_job, + story_id=story.id, + event_type="executor_completed", + status="succeeded", + metadata={ + "plan_mode": "asset_generation", + "planned_task_count": 4, + 
"executed_task_count": 2, + "ignored_task_count": 2, + "executed_task_keys": ["complete_audio_asset", "complete_image_asset"], + "ignored_task_keys": [ + "start_asset_generation", + "complete_asset_generation", + ], + "result_assets": ["audio", "cover_image"], + }, + ) + retry_job = await create_generation_job( + db_session, + user_id=test_user.id, + output_mode="asset_retry", + input_type="image", + request_payload={"story_id": story.id, "assets": ["image"]}, + story_id=story.id, + ) + await record_generation_event( + db_session, + job=retry_job, + story_id=story.id, + event_type="executor_completed", + status="succeeded", + metadata={ + "plan_mode": "asset_retry", + "planned_task_count": 3, + "executed_task_count": 1, + "ignored_task_count": 2, + "executed_task_keys": ["complete_image_asset"], + "ignored_task_keys": ["start_asset_retry", "complete_asset_retry"], + "result_assets": ["cover_image"], + }, + ) + + admin_app = _build_admin_test_app(db_session) + transport = ASGITransport(app=admin_app) + + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/admin/executors/coverage") + assert response.status_code == 200 + data = response.json() + assert data["scope"] == "admin_internal_executor_coverage" + assert data["total_runs"] == 2 + assert data["total_planned_tasks"] == 7 + assert data["total_executed_tasks"] == 3 + assert data["total_ignored_tasks"] == 4 + assert data["coverage_ratio"] == 0.4286 + assert data["job_count"] == 2 + assert data["story_count"] == 1 + assert data["user_count"] == 1 + assert data["by_plan_mode"] == [ + {"plan_mode": "asset_generation", "count": 1}, + {"plan_mode": "asset_retry", "count": 1}, + ] + assert data["executed_task_keys"] == [ + {"task_key": "complete_image_asset", "count": 2}, + {"task_key": "complete_audio_asset", "count": 1}, + ] + assert data["result_assets"] == [ + {"asset": "cover_image", "count": 2}, + {"asset": "audio", "count": 1}, + ] + + response = await 
client.get("/admin/executors/coverage?plan_mode=asset_retry") + assert response.status_code == 200 + data = response.json() + assert data["plan_mode"] == "asset_retry" + assert data["total_runs"] == 1 + assert data["total_planned_tasks"] == 3 + assert data["total_executed_tasks"] == 1 + + response = await client.get("/admin/executors/coverage?plan_mode=story") + assert response.status_code == 422 + + +async def test_admin_executor_coverage_requires_admin_auth(db_session): + admin_app = _build_admin_auth_required_test_app(db_session) + transport = ASGITransport(app=admin_app) + + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/admin/executors/coverage") + + assert response.status_code == 401 + + +async def test_admin_harness_readiness_returns_ready_when_internal_gates_pass( + db_session, + test_user, +): + story = await _create_story(db_session, user_id=test_user.id, title="readiness 故事") + await _record_evaluation_event( + db_session, + user_id=test_user.id, + story_id=story.id, + output_mode="story", + artifact="story_text", + status="succeeded", + metadata={ + "overall_score": 0.92, + "passed": True, + "blocking": False, + "scores": [ + {"dimension": "structure", "score": 1.0, "reason": "内部 reason"}, + {"dimension": "readability", "score": 0.84, "reason": "内部 reason"}, + ], + "warnings": [], + }, + ) + asset_job = await create_generation_job( + db_session, + user_id=test_user.id, + output_mode="asset_generation", + input_type="image", + request_payload={"story_id": story.id, "assets": ["image"]}, + story_id=story.id, + ) + await record_generation_event( + db_session, + job=asset_job, + story_id=story.id, + event_type="executor_completed", + status="succeeded", + metadata={ + "plan_mode": "asset_generation", + "planned_task_count": 3, + "executed_task_count": 1, + "ignored_task_count": 2, + "executed_task_keys": ["complete_image_asset"], + "ignored_task_keys": [ + "start_asset_generation", + 
"complete_asset_generation", + ], + "result_assets": ["cover_image"], + }, + ) + + admin_app = _build_admin_test_app(db_session) + transport = ASGITransport(app=admin_app) + + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/admin/harness/readiness") + + assert response.status_code == 200 + data = response.json() + assert data["scope"] == "admin_internal_harness_readiness" + assert data["status"] == "ready" + assert data["thresholds"] == { + "min_runtime_evaluations": 1, + "min_executor_runs": 1, + "min_evaluation_pass_rate": 0.7, + "min_evaluation_average_score": 0.7, + "min_executor_coverage_ratio": 0.2, + } + assert {check["code"]: check["status"] for check in data["checks"]} == { + "golden_replay": "ready", + "runtime_evaluation_samples": "ready", + "runtime_evaluation_quality": "ready", + "executor_coverage_samples": "ready", + "executor_coverage_ratio": "ready", + } + assert data["golden_replay"]["passed"] is True + assert data["golden_replay"]["total_cases"] == 11 + assert data["evaluation_analytics"]["total_evaluations"] == 1 + assert data["evaluation_analytics"]["pass_rate"] == 1.0 + assert data["executor_coverage"]["total_runs"] == 1 + assert data["executor_coverage"]["coverage_ratio"] == 0.3333 + assert "内部 reason" not in str(data) + assert "readiness 故事" not in str(data) + + +async def test_admin_harness_readiness_blocks_low_runtime_quality( + db_session, + test_user, +): + story = await _create_story(db_session, user_id=test_user.id, title="低质量 readiness") + await _record_evaluation_event( + db_session, + user_id=test_user.id, + story_id=story.id, + output_mode="story", + artifact="story_text", + status="failed", + metadata={ + "overall_score": 0.0, + "passed": False, + "blocking": True, + "scores": [{"dimension": "structure", "score": 0.0, "reason": "缺失"}], + "quality_gate": { + "issues": [ + { + "code": "missing_story_text", + "message": "正文缺失", + "failure_category": "schema_error", + 
"field": "story_text", + } + ] + }, + "warnings": [], + }, + ) + + admin_app = _build_admin_test_app(db_session) + transport = ASGITransport(app=admin_app) + + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/admin/harness/readiness") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "blocked" + checks = {check["code"]: check for check in data["checks"]} + assert checks["golden_replay"]["status"] == "ready" + assert checks["runtime_evaluation_samples"]["status"] == "ready" + assert checks["runtime_evaluation_quality"]["status"] == "blocked" + assert checks["executor_coverage_samples"]["status"] == "needs_attention" + assert checks["executor_coverage_ratio"]["status"] == "needs_attention" + assert data["evaluation_analytics"]["blocked_evaluations"] == 1 + assert data["executor_coverage"]["total_runs"] == 0 + assert "正文缺失" not in str(data) + assert "低质量 readiness" not in str(data) + + +async def test_admin_harness_readiness_requires_admin_auth(db_session): + admin_app = _build_admin_auth_required_test_app(db_session) + transport = ASGITransport(app=admin_app) + + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/admin/harness/readiness") + + assert response.status_code == 401 + + async def test_admin_provider_analytics_support_days_and_capability_filters( db_session, test_user, diff --git a/backend/tests/test_generation_jobs.py b/backend/tests/test_generation_jobs.py index 8b4c311..8af1b5e 100644 --- a/backend/tests/test_generation_jobs.py +++ b/backend/tests/test_generation_jobs.py @@ -123,14 +123,19 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event assert [event.event_type for event in events] == [ "request_accepted", "worker_started", + "workflow_planned", "context_prepared", + "evaluation_completed", "narrative_generated", "story_saved", "generation_completed", ] - 
assert events[2].event_metadata["has_memory_context"] is False - assert events[3].event_metadata["title"] == "小兔子的冒险" - assert events[4].story_id == job.story_id + assert events[2].event_metadata["plan"]["mode"] == "story" + assert events[3].event_metadata["has_memory_context"] is False + assert events[4].event_metadata["passed"] is True + assert events[4].event_metadata["overall_score"] >= 0.7 + assert events[5].event_metadata["title"] == "小兔子的冒险" + assert events[6].story_id == job.story_id detail_response = await client.get(f"/api/generations/jobs/{job.id}") assert detail_response.status_code == 200 @@ -143,11 +148,16 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event assert [event["event_type"] for event in detail["events"]] == [ "request_accepted", "worker_started", + "workflow_planned", "context_prepared", "narrative_generated", "story_saved", "generation_completed", ] + assert all( + event["event_type"] != "evaluation_completed" + for event in detail["events"] + ) story_response = await client.get(f"/api/generations/{job.story_id}") assert story_response.status_code == 200 @@ -161,6 +171,13 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event assert [item["id"] for item in job_list] == [job.id] assert job_list[0]["progress_percent"] == 100 assert job_list[0]["is_terminal"] is True + + trace_response = await client.get( + f"/api/generations/{job.story_id}/trace-summary" + ) + assert trace_response.status_code == 200 + trace = trace_response.json() + assert "evaluation" not in trace finally: app.dependency_overrides.clear() @@ -220,13 +237,88 @@ async def test_generation_worker_records_quality_gate_failure_without_persisting assert [event.event_type for event in events] == [ "request_accepted", "worker_started", + "workflow_planned", "context_prepared", "quality_gate_failed", + "evaluation_completed", "generation_failed", ] - quality_event = events[3] + quality_event = events[4] assert 
quality_event.event_metadata["step"] == "narrative_generation" assert quality_event.event_metadata["issues"][0]["code"] == "missing_story_text" + evaluation_event = events[5] + assert evaluation_event.event_metadata["step"] == "evaluation" + assert evaluation_event.event_metadata["passed"] is False + assert evaluation_event.event_metadata["blocking"] is True + + +async def test_story_with_images_worker_records_plan_before_assets( + db_session, + test_user, + mock_text_provider, + mock_image_provider, +): + job = await create_generation_job( + db_session, + user_id=test_user.id, + output_mode="story", + input_type="keywords", + request_payload={ + "output_mode": "story", + "type": "keywords", + "data": "小兔子, 森林", + "generate_images": True, + }, + ) + + await run_generation_job_service(job.id, db_session) + + refreshed_job = ( + await db_session.execute(select(GenerationJob).where(GenerationJob.id == job.id)) + ).scalar_one() + assert refreshed_job.story_id is not None + assert refreshed_job.status == "completed" + assert refreshed_job.current_step == "generation_completed" + assert refreshed_job.result_snapshot["image_status"] == "ready" + + events = ( + await db_session.execute( + select(GenerationJobEvent) + .where(GenerationJobEvent.job_id == job.id) + .order_by(GenerationJobEvent.id) + ) + ).scalars().all() + assert [event.event_type for event in events] == [ + "request_accepted", + "worker_started", + "workflow_planned", + "context_prepared", + "evaluation_completed", + "narrative_generated", + "story_saved", + "cover_image_started", + "cover_image_succeeded", + "generation_completed", + ] + + plan = events[2].event_metadata["plan"] + assert plan["mode"] == "story_with_assets" + assert [task["key"] for task in plan["tasks"]] == [ + "prepare_context", + "generate_narrative", + "evaluate_narrative", + "persist_story", + "generate_cover_image", + "queue_postprocessing", + "complete_generation", + ] + cover_task = next(task for task in plan["tasks"] if task["key"] 
== "generate_cover_image") + assert cover_task["required"] is False + assert cover_task["recoverable"] is True + assert events[4].event_metadata["passed"] is True + assert events[8].event_metadata["asset"] == "cover_image" + mock_text_provider.assert_called_once() + mock_image_provider.assert_called_once() async def test_asset_retry_records_job_events_and_updates_retryable_assets( @@ -279,12 +371,30 @@ async def test_asset_retry_records_job_events_and_updates_retryable_assets( ).scalars().all() assert [event.event_type for event in events] == [ "request_accepted", + "workflow_planned", "asset_retry_started", "cover_image_started", "cover_image_succeeded", + "executor_completed", "asset_retry_completed", ] - assert events[3].event_metadata["asset"] == "cover_image" + plan = events[1].event_metadata["plan"] + assert plan["mode"] == "asset_retry" + assert [task["key"] for task in plan["tasks"]] == [ + "start_asset_retry", + "complete_image_asset", + "complete_asset_retry", + ] + image_task = next( + task for task in plan["tasks"] if task["key"] == "complete_image_asset" + ) + assert image_task["required"] is False + assert image_task["recoverable"] is True + assert events[4].event_metadata["asset"] == "cover_image" + assert events[5].event_metadata["plan_mode"] == "asset_retry" + assert events[5].event_metadata["executed_task_keys"] == [ + "complete_image_asset" + ] finally: app.dependency_overrides.clear() @@ -365,10 +475,110 @@ async def test_asset_generation_job_worker_completes_cover_image( assert [event.event_type for event in events] == [ "request_accepted", "worker_started", + "workflow_planned", "cover_image_started", "cover_image_succeeded", + "executor_completed", "asset_generation_completed", ] + plan = events[2].event_metadata["plan"] + assert plan["mode"] == "asset_generation" + assert [task["key"] for task in plan["tasks"]] == [ + "start_asset_generation", + "complete_image_asset", + "complete_asset_generation", + ] + image_task = next( + task for task 
in plan["tasks"] if task["key"] == "complete_image_asset" + ) + assert image_task["required"] is False + assert image_task["recoverable"] is True + executor_event = events[5] + assert executor_event.event_metadata["plan_mode"] == "asset_generation" + assert executor_event.event_metadata["executed_task_keys"] == [ + "complete_image_asset" + ] + assert executor_event.event_metadata["ignored_task_keys"] == [ + "start_asset_generation", + "complete_asset_generation", + ] + assert executor_event.event_metadata["result_assets"] == ["cover_image"] + + +async def test_asset_generation_job_worker_executes_assets_in_plan_order( + db_session, + test_story, + mock_tts_provider, +): + job = await create_generation_job( + db_session, + user_id=test_story.user_id, + output_mode="asset_generation", + input_type="audio,image", + request_payload={"story_id": test_story.id, "assets": ["audio", "image"]}, + story_id=test_story.id, + ) + + with patch( + "app.services.story_service.generate_image", + new_callable=AsyncMock, + ) as mock_generate_image: + mock_generate_image.return_value = "https://example.com/plan-cover.png" + + await run_generation_job_service(job.id, db_session) + + refreshed_job = ( + await db_session.execute(select(GenerationJob).where(GenerationJob.id == job.id)) + ).scalar_one() + assert refreshed_job.status == "completed" + assert refreshed_job.current_step == "asset_generation_completed" + assert refreshed_job.result_snapshot["image_status"] == "ready" + assert refreshed_job.result_snapshot["audio_status"] == "ready" + + story = ( + await db_session.execute( + select(Story).where(Story.id == test_story.id) + ) + ).scalar_one() + assert story.image_url == "https://example.com/plan-cover.png" + assert story.audio_status == "ready" + assert story.audio_path is not None + + events = ( + await db_session.execute( + select(GenerationJobEvent) + .where(GenerationJobEvent.job_id == job.id) + .order_by(GenerationJobEvent.id) + ) + ).scalars().all() + assert 
[event.event_type for event in events] == [ + "request_accepted", + "worker_started", + "workflow_planned", + "audio_started", + "audio_succeeded", + "cover_image_started", + "cover_image_succeeded", + "executor_completed", + "asset_generation_completed", + ] + plan = events[2].event_metadata["plan"] + assert plan["mode"] == "asset_generation" + assert [task["key"] for task in plan["tasks"]] == [ + "start_asset_generation", + "complete_audio_asset", + "complete_image_asset", + "complete_asset_generation", + ] + assert events[4].event_metadata["asset"] == "audio" + assert events[6].event_metadata["asset"] == "cover_image" + assert events[7].event_metadata["executed_task_keys"] == [ + "complete_audio_asset", + "complete_image_asset", + ] + assert events[7].event_metadata["result_assets"] == ["audio", "cover_image"] + mock_tts_provider.assert_awaited_once() + mock_generate_image.assert_awaited_once() async def test_cancel_queued_asset_generation_job_marks_it_canceled( @@ -538,7 +748,9 @@ async def test_storybook_generation_is_queued_then_worker_records_page_image_eve assert [event.event_type for event in events] == [ "request_accepted", "worker_started", + "workflow_planned", "context_prepared", + "evaluation_completed", "narrative_generated", "storybook_images_started", "storybook_cover_image_succeeded", @@ -548,13 +760,45 @@ async def test_storybook_generation_is_queued_then_worker_records_page_image_eve "story_saved", "generation_completed", ] + plan = events[2].event_metadata["plan"] + assert plan["mode"] == "storybook" + assert [task["key"] for task in plan["tasks"]] == [ + "prepare_context", + "generate_storybook_pages", + "evaluate_storybook_pages", + "generate_storybook_images", + "persist_storybook", + "queue_postprocessing", + "complete_generation", + ] + image_task = next( + task + for task in plan["tasks"] + if task["key"] == "generate_storybook_images" + ) + assert image_task["required"] is False + assert image_task["recoverable"] is True + assert 
events[4].event_metadata["passed"] is True + assert events[4].event_metadata["artifact"] == "storybook_pages" page_events = [ event for event in events if event.event_type == "storybook_page_image_succeeded" ] assert [event.event_metadata["page_number"] for event in page_events] == [1, 2] - assert events[8].event_metadata["completed_pages"] == [1, 2] + assert events[10].event_metadata["completed_pages"] == [1, 2] + + async with AsyncClient(transport=transport, base_url="http://test") as client: + client.cookies.set("access_token", auth_token) + detail_response = await client.get( + f"/api/generations/jobs/{job.id}" + ) + + assert detail_response.status_code == 200 + detail = detail_response.json() + assert "evaluation_completed" not in [ + event["event_type"] for event in detail["events"] + ] finally: app.dependency_overrides.clear() @@ -716,6 +960,414 @@ async def test_story_provider_stats_aggregate_job_events( app.dependency_overrides.clear() +async def test_story_trace_summary_aggregates_steps_artifacts_and_failure_categories( + db_session, + auth_token, + degraded_story_with_text, +): + async def override_get_db(): + yield db_session + + app.dependency_overrides[get_db] = override_get_db + + job = await create_generation_job( + db_session, + user_id=degraded_story_with_text.user_id, + output_mode="asset_retry", + input_type="image", + request_payload={"assets": ["image"]}, + story_id=degraded_story_with_text.id, + ) + await record_generation_event( + db_session, + job=job, + story_id=degraded_story_with_text.id, + event_type="cover_image_started", + status="running", + metadata={ + "step": "image_generation", + "artifact": "cover_image", + "failure_category": None, + }, + ) + await record_generation_event( + db_session, + job=job, + story_id=degraded_story_with_text.id, + event_type="cover_image_failed", + status="failed", + metadata={ + "step": "image_generation", + "artifact": "cover_image", + "failure_category": "provider_error", + }, + ) + await 
record_generation_event( + db_session, + job=job, + story_id=degraded_story_with_text.id, + event_type="quality_gate_failed", + status="failed", + metadata={ + "step": "narrative_generation", + "artifact": "story_text", + "failure_category": "schema_error", + }, + ) + await record_generation_event( + db_session, + job=job, + story_id=degraded_story_with_text.id, + event_type="evaluation_completed", + status="failed", + metadata={ + "step": "evaluation", + "artifact": "story_text", + "failure_category": "schema_error", + "overall_score": 0.0, + "passed": False, + "blocking": True, + "scores": [ + { + "dimension": "structure", + "score": 0.0, + "reason": "故事结构未通过质量门。", + }, + { + "dimension": "safety", + "score": 0.0, + "reason": "内容未通过儿童安全或结构完整性检查。", + }, + ], + }, + ) + + transport = ASGITransport(app=app) + try: + async with AsyncClient(transport=transport, base_url="http://test") as client: + client.cookies.set("access_token", auth_token) + + response = await client.get( + f"/api/generations/{degraded_story_with_text.id}/trace-summary" + ) + + assert response.status_code == 200 + data = response.json() + assert data["story_id"] == degraded_story_with_text.id + assert data["total_events"] == 4 + assert data["failed_events"] == 2 + assert data["by_step"] == [ + {"name": "image_generation", "count": 2}, + {"name": "narrative_generation", "count": 1}, + ] + assert data["by_artifact"] == [ + {"name": "cover_image", "count": 2}, + {"name": "story_text", "count": 1}, + ] + assert data["failure_categories"] == [ + {"name": "provider_error", "count": 1}, + {"name": "schema_error", "count": 1}, + ] + assert "evaluation" not in data + assert "overall_score" not in str(data) + finally: + app.dependency_overrides.clear() + + +async def test_user_generation_job_detail_hides_internal_evaluation_step( + db_session, + auth_token, + test_user, +): + async def override_get_db(): + yield db_session + + app.dependency_overrides[get_db] = override_get_db + transport = 
ASGITransport(app=app) + + job = await create_generation_job( + db_session, + user_id=test_user.id, + output_mode="story", + input_type="keywords", + request_payload={ + "output_mode": "story", + "type": "keywords", + "data": "小兔子", + "generate_images": False, + }, + ) + await record_generation_event( + db_session, + job=job, + event_type="evaluation_completed", + status="succeeded", + metadata={ + "step": "evaluation", + "artifact": "story_text", + "overall_score": 0.96, + "passed": True, + "blocking": False, + "scores": [ + {"dimension": "structure", "score": 1.0, "reason": "完整。"}, + ], + }, + ) + + try: + async with AsyncClient(transport=transport, base_url="http://test") as client: + client.cookies.set("access_token", auth_token) + + response = await client.get(f"/api/generations/jobs/{job.id}") + + assert response.status_code == 200 + data = response.json() + assert data["current_step"] == "narrative_generated" + assert data["progress_label"] == "正文已生成" + assert [event["event_type"] for event in data["events"]] == [ + "request_accepted" + ] + assert "evaluation_completed" not in str(data) + assert "overall_score" not in str(data) + finally: + app.dependency_overrides.clear() + + +async def test_user_generation_job_detail_sanitizes_request_payload( + db_session, + auth_token, + test_user, +): + async def override_get_db(): + yield db_session + + app.dependency_overrides[get_db] = override_get_db + transport = ASGITransport(app=app) + + job = await create_generation_job( + db_session, + user_id=test_user.id, + output_mode="story", + input_type="keywords", + request_payload={ + "output_mode": "story", + "input_type": "keywords", + "type": "keywords", + "data": "不要回传原始关键词", + "education_theme": "勇气", + "generate_images": True, + "page_count": 6, + "child_profile_id": "child-public-id", + "universe_id": "universe-public-id", + "internal_dispatch_token": "secret-dispatch-token", + "provider_override": "internal-provider", + "evaluation_policy": {"threshold": 0.9}, + 
}, + ) + + try: + async with AsyncClient(transport=transport, base_url="http://test") as client: + client.cookies.set("access_token", auth_token) + + response = await client.get(f"/api/generations/jobs/{job.id}") + + assert response.status_code == 200 + data = response.json() + assert data["request_payload"] == { + "child_profile_id": "child-public-id", + "generate_images": True, + "input_type": "keywords", + "output_mode": "story", + "page_count": 6, + "type": "keywords", + "universe_id": "universe-public-id", + } + payload_dump = str(data["request_payload"]) + assert "不要回传原始关键词" not in payload_dump + assert "education_theme" not in payload_dump + assert "secret-dispatch-token" not in payload_dump + assert "internal-provider" not in payload_dump + assert "evaluation_policy" not in payload_dump + finally: + app.dependency_overrides.clear() + + +async def test_user_generation_job_detail_sanitizes_public_event_metadata( + db_session, + auth_token, + degraded_story_with_text, +): + async def override_get_db(): + yield db_session + + app.dependency_overrides[get_db] = override_get_db + transport = ASGITransport(app=app) + + job = await create_generation_job( + db_session, + user_id=degraded_story_with_text.user_id, + output_mode="asset_generation", + input_type="image", + request_payload={"story_id": degraded_story_with_text.id, "assets": ["image"]}, + story_id=degraded_story_with_text.id, + ) + await record_generation_event( + db_session, + job=job, + story_id=degraded_story_with_text.id, + event_type="workflow_planned", + status="succeeded", + metadata={ + "step": "request_acceptance", + "artifact": "none", + "plan": { + "mode": "asset_generation", + "tasks": [ + { + "key": "complete_image_asset", + "step": "image_generation", + "artifact": "image", + "required": False, + "recoverable": True, + } + ], + }, + "internal_threshold": 0.72, + }, + ) + await record_generation_event( + db_session, + job=job, + story_id=degraded_story_with_text.id, + 
event_type="asset_generation_completed", + status="completed", + metadata={ + "assets": ["image"], + "result_snapshot": { + "story_id": degraded_story_with_text.id, + "last_error": "internal provider detail", + }, + "error": "internal provider detail", + }, + ) + await record_generation_event( + db_session, + job=job, + story_id=degraded_story_with_text.id, + event_type="executor_completed", + status="succeeded", + metadata={ + "plan_mode": "asset_generation", + "planned_task_count": 3, + "executed_task_keys": ["complete_image_asset"], + "ignored_task_keys": [ + "start_asset_generation", + "complete_asset_generation", + ], + "result_assets": ["cover_image"], + }, + ) + + try: + async with AsyncClient(transport=transport, base_url="http://test") as client: + client.cookies.set("access_token", auth_token) + + response = await client.get(f"/api/generations/jobs/{job.id}") + + assert response.status_code == 200 + data = response.json() + workflow_event = next( + event for event in data["events"] if event["event_type"] == "workflow_planned" + ) + assert workflow_event["event_metadata"] == { + "artifact": "none", + "plan_mode": "asset_generation", + "planned_task_count": 1, + "recoverable_task_count": 1, + "step": "request_acceptance", + } + + completion_event = next( + event + for event in data["events"] + if event["event_type"] == "asset_generation_completed" + ) + assert completion_event["event_metadata"] == {"assets": ["image"]} + assert "plan" not in workflow_event["event_metadata"] + assert "tasks" not in str(data["events"]) + assert "internal_threshold" not in str(data["events"]) + assert "result_snapshot" not in str(data["events"]) + assert "internal provider detail" not in str(data["events"]) + assert "executor_completed" not in str(data["events"]) + assert "complete_image_asset" not in str(data["events"]) + finally: + app.dependency_overrides.clear() + + +async def test_user_generation_job_summary_hides_internal_executor_step( + db_session, + auth_token, + 
degraded_story_with_text, +): + async def override_get_db(): + yield db_session + + app.dependency_overrides[get_db] = override_get_db + transport = ASGITransport(app=app) + + job = await create_generation_job( + db_session, + user_id=degraded_story_with_text.user_id, + output_mode="asset_generation", + input_type="image", + request_payload={"story_id": degraded_story_with_text.id, "assets": ["image"]}, + story_id=degraded_story_with_text.id, + ) + await record_generation_event( + db_session, + job=job, + story_id=degraded_story_with_text.id, + event_type="executor_completed", + status="succeeded", + metadata={ + "plan_mode": "asset_generation", + "executed_task_keys": ["complete_image_asset"], + }, + ) + + try: + async with AsyncClient(transport=transport, base_url="http://test") as client: + client.cookies.set("access_token", auth_token) + + detail_response = await client.get(f"/api/generations/jobs/{job.id}") + list_response = await client.get( + f"/api/generations/{degraded_story_with_text.id}/jobs" + ) + trace_summary_response = await client.get( + f"/api/generations/{degraded_story_with_text.id}/trace-summary" + ) + + assert detail_response.status_code == 200 + detail = detail_response.json() + assert detail["current_step"] == "workflow_planned" + assert detail["progress_label"] == "工作流已规划" + assert "executor_completed" not in str(detail) + assert "complete_image_asset" not in str(detail) + + assert list_response.status_code == 200 + listed_job = next(item for item in list_response.json() if item["id"] == job.id) + assert listed_job["current_step"] == "workflow_planned" + assert listed_job["progress_label"] == "工作流已规划" + + assert trace_summary_response.status_code == 200 + trace_summary = trace_summary_response.json() + assert "executor_completed" not in str(trace_summary) + assert "complete_image_asset" not in str(trace_summary) + assert trace_summary["total_events"] == 1 + finally: + app.dependency_overrides.clear() + + async def 
test_user_provider_analytics_aggregate_across_stories( db_session, auth_token, diff --git a/backend/tests/test_harness_runtime.py b/backend/tests/test_harness_runtime.py index a58ddaa..f5f1e40 100644 --- a/backend/tests/test_harness_runtime.py +++ b/backend/tests/test_harness_runtime.py @@ -1,5 +1,7 @@ """Tests for generation harness runtime support.""" +from pathlib import Path + import pytest from sqlalchemy import select @@ -7,8 +9,21 @@ from app.db.models import GenerationJob, GenerationJobEvent from app.services.adapters.storybook.primary import Storybook, StorybookPage from app.services.adapters.text.models import StoryOutput from app.services.generation_jobs import create_generation_job, record_generation_event +from app.services.harness.artifacts import AssetCompletionResult from app.services.harness.control import ExecutionControl, GenerationJobCanceledError +from app.services.harness.evaluation_replay import ( + EvaluationReplayArtifact, + EvaluationReplayCase, + ExpectedEvaluation, + replay_evaluation_golden_cases, + run_evaluation_replay_cases, +) +from app.services.harness.evaluators import evaluate_story_output, evaluate_storybook_output +from app.services.harness.executor import run_asset_plan from app.services.harness.plans import ( + WorkflowMode, + WorkflowPlan, + WorkflowTask, build_asset_plan, build_story_plan, build_storybook_plan, @@ -27,12 +42,18 @@ from app.services.harness.types import ( normalize_trace_metadata, step_for_event, ) +from app.services.story_status import StoryAssetStatus + +FIXTURES_DIR = ( + Path(__file__).parents[1] / "app" / "services" / "harness" / "fixtures" +) def test_event_type_maps_to_standard_workflow_step(): assert step_for_event("request_accepted") == WorkflowStep.REQUEST_ACCEPTANCE assert step_for_event("context_prepared") == WorkflowStep.CONTEXT_PREPARATION assert step_for_event("narrative_generated") == WorkflowStep.NARRATIVE_GENERATION + assert step_for_event("evaluation_completed") == WorkflowStep.EVALUATION 
assert step_for_event("story_saved") == WorkflowStep.STORY_PERSISTENCE assert step_for_event("provider_call_succeeded") == WorkflowStep.PROVIDER_INVOCATION assert step_for_event("quality_gate_failed") == WorkflowStep.NARRATIVE_GENERATION @@ -46,6 +67,7 @@ def test_event_type_maps_to_standard_workflow_step(): def test_event_type_maps_to_standard_artifact(): assert artifact_for_event("narrative_generated") == ArtifactKind.STORY_TEXT assert artifact_for_event("quality_gate_failed") == ArtifactKind.STORY_TEXT + assert artifact_for_event("evaluation_completed") == ArtifactKind.STORY_TEXT assert artifact_for_event("cover_image_succeeded") == ArtifactKind.COVER_IMAGE assert artifact_for_event("storybook_page_image_failed") == ArtifactKind.PAGE_IMAGE assert artifact_for_event("audio_cache_hit") == ArtifactKind.AUDIO @@ -108,6 +130,13 @@ def test_story_plan_without_assets_snapshot(): "required": True, "recoverable": False, }, + { + "key": "evaluate_narrative", + "step": "evaluation", + "artifact": "story_text", + "required": True, + "recoverable": False, + }, { "key": "persist_story", "step": "story_persistence", @@ -137,7 +166,7 @@ def test_story_plan_with_assets_marks_cover_recoverable(): plan = build_story_plan(generate_images=True).to_snapshot() assert plan["mode"] == "story_with_assets" - assert plan["tasks"][3] == { + assert plan["tasks"][4] == { "key": "generate_cover_image", "step": "image_generation", "artifact": "cover_image", @@ -153,13 +182,14 @@ def test_storybook_plan_with_images_marks_storybook_images_recoverable(): assert [task["key"] for task in plan["tasks"]] == [ "prepare_context", "generate_storybook_pages", + "evaluate_storybook_pages", "generate_storybook_images", "persist_storybook", "queue_postprocessing", "complete_generation", ] - assert plan["tasks"][2]["artifact"] == "image" - assert plan["tasks"][2]["recoverable"] is True + assert plan["tasks"][3]["artifact"] == "image" + assert plan["tasks"][3]["recoverable"] is True def 
test_asset_retry_plan_deduplicates_assets(): @@ -200,6 +230,86 @@ def test_asset_retry_plan_deduplicates_assets(): } +@pytest.mark.asyncio +async def test_run_asset_plan_executes_asset_tasks_in_plan_order(): + calls: list[str] = [] + + async def image_task() -> AssetCompletionResult: + calls.append("image") + return AssetCompletionResult( + asset="cover_image", + status=StoryAssetStatus.READY, + value="https://example.com/cover.png", + ) + + async def audio_task() -> AssetCompletionResult: + calls.append("audio") + return AssetCompletionResult( + asset="audio", + status=StoryAssetStatus.READY, + value=b"audio", + ) + + result = await run_asset_plan( + build_asset_plan(output_mode="asset_generation", assets=["audio", "image"]), + image_task=image_task, + audio_task=audio_task, + ) + + assert calls == ["audio", "image"] + assert result.executed_task_keys == ("complete_audio_asset", "complete_image_asset") + assert result.ignored_task_keys == ( + "start_asset_generation", + "complete_asset_generation", + ) + assert [item.asset for item in result.task_results] == ["audio", "cover_image"] + + +@pytest.mark.asyncio +async def test_run_asset_plan_ignores_unknown_non_asset_tasks(): + calls: list[str] = [] + plan = WorkflowPlan( + mode=WorkflowMode.ASSET_RETRY, + tasks=( + WorkflowTask( + key="start_asset_retry", + step=WorkflowStep.ASSET_RETRY, + artifact=ArtifactKind.NONE, + ), + WorkflowTask( + key="complete_video_asset", + step=WorkflowStep.UNKNOWN, + artifact=ArtifactKind.UNKNOWN, + required=False, + recoverable=True, + ), + WorkflowTask( + key="complete_asset_retry", + step=WorkflowStep.ASSET_RETRY, + artifact=ArtifactKind.NONE, + ), + ), + ) + + async def image_task() -> AssetCompletionResult: + calls.append("image") + return AssetCompletionResult( + asset="cover_image", + status=StoryAssetStatus.READY, + ) + + result = await run_asset_plan(plan, image_task=image_task) + + assert calls == [] + assert result.task_results == () + assert result.executed_task_keys == () 
+ assert result.ignored_task_keys == ( + "start_asset_retry", + "complete_video_asset", + "complete_asset_retry", + ) + + def test_story_quality_gate_accepts_complete_child_safe_story(): validate_story_output( StoryOutput( @@ -211,6 +321,166 @@ def test_story_quality_gate_accepts_complete_child_safe_story(): ) +def test_story_evaluator_scores_complete_child_safe_story(): + result = evaluate_story_output( + StoryOutput( + mode="generated", + title="小兔子的月光花园", + story_text="小兔子在花园里学会了和朋友轮流分享水壶,也学会了复盘今天的努力。", + cover_prompt_suggestion="A gentle moonlit garden with a rabbit", + ), + education_theme="复盘", + ) + + assert result.passed is True + assert result.blocking is False + assert result.overall_score >= 0.9 + assert result.to_metadata()["scores"][0]["dimension"] == "structure" + + +def test_story_evaluator_blocks_quality_gate_failure(): + result = evaluate_story_output( + StoryOutput( + mode="generated", + title="空白故事", + story_text="", + cover_prompt_suggestion="A cover", + ) + ) + + assert result.passed is False + assert result.blocking is True + assert result.overall_score == 0.0 + assert result.gate_error is not None + assert result.to_metadata()["quality_gate"]["issues"][0]["code"] == "missing_story_text" + + +def test_storybook_evaluator_scores_complete_child_safe_storybook(): + result = evaluate_storybook_output( + Storybook( + title="森林里的复盘星星", + main_character="小兔子露露", + art_style="温暖水彩", + cover_prompt="A warm watercolor forest cover", + pages=[ + StorybookPage( + page_number=1, + text="露露在森林里发现一颗会提醒她复盘的小星星。", + image_prompt="Lulu finds a star", + ), + StorybookPage( + page_number=2, + text="她回想今天的努力,学会下次先和朋友商量。", + image_prompt="Lulu thinking with friends", + ), + ], + ), + education_theme="复盘", + ) + + assert result.passed is True + assert result.blocking is False + assert result.overall_score >= 0.9 + + +def test_storybook_evaluator_blocks_quality_gate_failure(): + result = evaluate_storybook_output( + Storybook( + title="森林绘本", + main_character="小兔子", 
+ art_style="水彩", + cover_prompt="A forest cover", + pages=[ + StorybookPage(page_number=1, text="第一页。", image_prompt="page 1"), + StorybookPage(page_number=1, text="第二页。", image_prompt="page 2"), + ], + ) + ) + + assert result.passed is False + assert result.blocking is True + assert result.gate_error is not None + assert result.to_metadata()["quality_gate"]["issues"][0]["code"] == ( + "invalid_storybook_page_number" + ) + + +def test_evaluation_golden_cases_replay_successfully(): + result = replay_evaluation_golden_cases( + FIXTURES_DIR / "evaluation_golden_cases.json" + ) + + assert result.passed is True, result.failure_report() + assert result.failed_case_ids == () + assert len(result.cases) == 11 + assert { + case.artifact + for case in result.cases + } == { + EvaluationReplayArtifact.STORY, + EvaluationReplayArtifact.STORYBOOK, + } + + +def test_evaluation_golden_cases_report_internal_coverage_summary(): + result = replay_evaluation_golden_cases( + FIXTURES_DIR / "evaluation_golden_cases.json" + ) + + summary = result.coverage_summary() + + assert summary["artifact"] == { + "storybook": 5, + "story": 6, + } + assert summary["age_band"] == { + "3-4": 4, + "5-6": 4, + "unknown": 2, + "7-8": 1, + } + assert summary["risk_area"] == { + "schema_error": 4, + "happy_path": 2, + "readability_warning": 2, + "safety_error": 2, + "length_boundary": 1, + } + assert summary["outcome"] == { + "blocked": 8, + "passed": 3, + } + assert summary["tags"]["story"] == 6 + assert summary["tags"]["storybook"] == 5 + assert summary["tags"]["blocking"] == 6 + assert summary["tags"]["threshold_block"] == 2 + + +def test_evaluation_replay_reports_expectation_mismatch(): + case = EvaluationReplayCase( + case_id="expectation-mismatch", + artifact=EvaluationReplayArtifact.STORY, + input_payload={"keywords": "小兔子"}, + output_payload={ + "mode": "generated", + "title": "小兔子的花园", + "story_text": "小兔子学会了和朋友分享水壶。", + "cover_prompt_suggestion": "A rabbit sharing a watering can", + }, + 
expected=ExpectedEvaluation( + passed=True, + blocking=False, + min_overall_score=0.99, + ), + ) + + result = run_evaluation_replay_cases([case]) + + assert result.passed is False + assert result.failed_case_ids == ("expectation-mismatch",) + assert "expected overall_score >=" in result.failure_report() + + def test_story_quality_gate_rejects_missing_story_text(): output = StoryOutput( mode="generated", diff --git a/docs/planning/harness-stage-10-report.md b/docs/planning/harness-stage-10-report.md new file mode 100644 index 0000000..e1fdc88 --- /dev/null +++ b/docs/planning/harness-stage-10-report.md @@ -0,0 +1,159 @@ +# Harness Engineering 改造阶段 10 报告 + +**阶段**: 10 - 资产计划与 Public Metadata Sanitizer +**日期**: 2026-06-22 +**状态**: 已完成当前切片 +**范围**: 资产生成/重试 WorkflowPlan、用户侧 job event metadata 白名单脱敏、回归测试和商业机密边界复核 + +--- + +## 1. 本阶段目标 + +阶段 10 的目标是把资产任务也纳入 Harness Engineering 的显式计划模型,并把用户侧事件 metadata 从“过滤少数内部事件”升级为“白名单公开”。 + +本阶段重点: + +- `asset_generation` 写入 `workflow_planned`。 +- `asset_retry` 写入 `workflow_planned`。 +- 旧封面/音频兼容接口创建的资产 job 也写入 plan。 +- 用户侧 job detail 的 event metadata 使用 public sanitizer。 +- 内部数据库事件继续保留完整 metadata,供测试、内部分析和 admin-only 能力使用。 + +## 2. 
已完成工作 + +### 资产 WorkflowPlan + +修改文件: + +- `backend/app/services/story_service.py` + +新增行为: + +- 后台 `asset_generation` worker 在执行资源补全前记录 `asset_generation` plan。 +- `/api/generations/{story_id}/retry-assets` 同步重试路径记录 `asset_retry` plan。 +- 旧 `/api/image/generate/{story_id}` 和 `/api/audio/{story_id}` 兼容路径记录 `asset_generation` plan。 + +资产 plan 快照: + +- `plan.mode=asset_generation` 或 `asset_retry` +- 图片任务使用 `complete_image_asset` +- 音频任务使用 `complete_audio_asset` +- 图片/音频任务均为 `required=false`、`recoverable=true` + +### Public Metadata Sanitizer + +修改文件: + +- `backend/app/services/generation_jobs.py` + +新增能力: + +- `public_generation_event_metadata(...)`。 +- 用户侧 `public_generation_event_to_response(...)` 不再原样返回 event metadata。 +- `evaluation_completed` 事件继续完全过滤。 +- `workflow_planned` 只返回 coarse plan 摘要: + - `plan_mode` + - `planned_task_count` + - `recoverable_task_count` + +用户侧允许保留: + +- `step` +- `artifact` +- `failure_category` +- `asset` / `assets` +- `status` +- `mode` +- `output_mode` +- `input_type` +- `page_count` +- `page_number` +- `adapter` +- `capability` +- `strategy` +- `latency_ms` +- `estimated_cost_usd` +- 资源状态和少量可解释执行上下文 + +用户侧禁止返回: + +- 原始 `plan` +- 原始 `plan.tasks` +- `result_snapshot` +- 内部阈值 +- 内部错误原文 +- `overall_score` +- 维度分数 +- 评分 reason +- golden replay 信息 + +## 3. 测试覆盖 + +修改文件: + +- `backend/tests/test_generation_jobs.py` + +新增或更新覆盖: + +- 更新 `asset_retry` 事件顺序,断言 `asset_retry` plan。 +- 更新 `asset_generation` worker 事件顺序,断言 `asset_generation` plan。 +- 新增 `test_user_generation_job_detail_sanitizes_public_event_metadata`,确认用户 API 不返回原始 plan、tasks、result snapshot、内部阈值和内部错误原文。 + +## 4. 
验证结果 + +已执行: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_generation_jobs.py -q +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests +cd ../frontend +npm run build +cd ../admin-frontend +npm run build +``` + +结果: + +- 定向生成任务测试:`22 passed` +- 后端全量测试:`152 passed` +- Ruff:`All checks passed!` +- 用户前端构建:通过 +- 管理端构建:通过 + +构建提示: + +- `frontend` 和 `admin-frontend` 构建均提示 Browserslist/caniuse-lite 数据较旧。 +- `admin-frontend` 额外提示 `baseline-browser-mapping` 数据较旧。 +- 以上均为依赖数据 freshness 提示,不影响当前构建结果。 + +## 5. 自审结论 + +本阶段继续保持“内部完整、外部最小”的边界: + +- 内部 event metadata 没有丢失,admin-only 和测试仍可读取完整 plan 与评测数据。 +- 用户侧 job event metadata 已从 denylist 走向 allowlist,未来新增内部字段默认不会公开。 +- 用户侧仍可看到进度、资源、Provider 和失败分类等可操作信息。 +- 原始 `plan.tasks`、内部阈值、内部错误原文和 result snapshot 不进入用户事件流。 + +## 6. Bug 与风险记录 + +已发现并即时修复的问题: + +- 初次测试时 `asset_generation` 和 `asset_retry` 的旧事件顺序断言未包含 `workflow_planned`;已更新测试并增加 plan 快照断言。 +- sanitizer 测试最初用字符串搜索禁止 `plan`,误伤公开字段 `plan_mode`;已改为断言原始 `plan` key 不存在。 + +当前风险: + +- `request_payload` 仍作为 job detail 字段返回,当前包含用户发起请求本身。后续如请求 payload 增加内部调度参数,需要单独做 payload sanitizer。 +- Provider 成本信息当前仍在用户侧展示,属于既有产品运营摘要。若商业策略变化,需要从 white list 中移除 `estimated_cost_usd` 并同步前端。 +- admin-frontend 当前复用用户侧 `/api/generations/jobs/{job_id}`,因此看到的是脱敏事件。未来如果管理端需要完整内部 event metadata,应新增 admin-only trace endpoint。 + +## 7. 后续建议 + +下一阶段建议进入阶段 11: + +1. 设计 admin-only generation trace detail,让管理端在权限保护下查看完整内部 plan/evaluation/provider metadata。 +2. 为 `request_payload` 增加 public sanitizer,防止未来内部调度字段被用户端 job detail 透出。 +3. 
继续推进 executor 小步接管,把资产 plan 从“记录事实”升级为“驱动执行”的最小执行单元。 diff --git a/docs/planning/harness-stage-11-report.md b/docs/planning/harness-stage-11-report.md new file mode 100644 index 0000000..1bc0652 --- /dev/null +++ b/docs/planning/harness-stage-11-report.md @@ -0,0 +1,165 @@ +# Harness Engineering 改造阶段 11 报告 + +**阶段**: 11 - Trace 访问分级与 Request Payload Sanitizer +**日期**: 2026-06-22 +**状态**: 已完成当前切片 +**范围**: 用户侧 request payload 白名单脱敏、admin-only 完整生成 trace、回归测试和商业机密边界复核 + +--- + +## 1. 本阶段目标 + +阶段 11 承接阶段 10 的风险记录:事件 metadata 已经白名单脱敏,但用户侧 job detail 仍会原样返回 `request_payload`。如果后续 executor 或调度层把内部字段写入 payload,就可能把内部策略、Provider override 或评测配置分发给用户端。 + +本阶段目标: + +- 用户侧 `GET /api/generations/jobs/{job_id}` 只返回安全公开的 request payload 字段。 +- 管理控制面新增完整 trace detail,用于内部审查、排障和评测驱动复盘。 +- 完整内部评测数据、workflow plan、原始 request payload 只在 `admin_guard` 后可见。 + +## 2. 已完成工作 + +### 用户侧 Request Payload Sanitizer + +修改文件: + +- `backend/app/services/generation_jobs.py` + +新增能力: + +- `public_generation_request_payload(...)` +- 用户侧 `get_generation_job_detail(...)` 不再原样返回 `job.request_payload` +- request payload 使用白名单公开 + +当前用户侧允许字段: + +- `assets` +- `child_profile_id` +- `generate_images` +- `input_type` +- `output_mode` +- `page_count` +- `story_id` +- `type` +- `universe_id` + +当前用户侧禁止字段: + +- 原始 `data` +- `education_theme` +- 内部调度 token +- Provider override +- evaluation policy +- 任意 dict 型内部配置 + +### Admin-Only 完整 Trace Detail + +新增文件: + +- `backend/app/services/admin_generation_trace.py` + +修改文件: + +- `backend/app/api/admin_providers.py` + +新增接口: + +```http +GET /admin/generations/jobs/{job_id}/trace +``` + +接口能力: + +- 返回完整 `request_payload` +- 返回完整 event stream +- 不过滤 `evaluation_completed` +- 不脱敏 `workflow_planned.event_metadata.plan.tasks` +- 返回 `user_id` 供管理控制面审计 +- 继承 admin router 的 `admin_guard` 保护 + +## 3. 
测试覆盖 + +修改文件: + +- `backend/tests/test_generation_jobs.py` +- `backend/tests/test_admin_providers.py` +- `backend/tests/harness-evaluation-test-cases.md` + +新增覆盖: + +- `test_user_generation_job_detail_sanitizes_request_payload` + - 断言用户 job detail 不返回原始 `data` + - 断言用户 job detail 不返回内部调度 token、Provider override 或 evaluation policy + - 断言用户 job detail 保留必要公开控制字段 +- `test_admin_generation_job_trace_returns_internal_event_stream` + - 断言 admin trace 返回完整 request payload + - 断言 admin trace 返回 `workflow_planned` 原始 plan tasks + - 断言 admin trace 返回 `evaluation_completed` 和评分 metadata +- `test_admin_generation_job_trace_requires_admin_auth` + - 断言未通过 admin guard 时返回 `401` + +## 4. 当前验证结果 + +已执行: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_generation_jobs.py tests/test_admin_providers.py -q +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests +cd ../frontend +npm run build +cd ../admin-frontend +npm run build +``` + +结果: + +- 定向生成任务 + admin trace 测试:`31 passed` +- 后端全量测试:`155 passed` +- Ruff:`All checks passed!` +- 用户前端构建:通过 +- 管理端构建:通过 + +补充敏感公开面扫描: + +```bash +rg -n "evaluations/analytics|EvaluationAnalytics|admin_evaluation|overall_score|golden|replay|evaluation_policy|provider_override|internal_dispatch_token" frontend/src backend/app/schemas backend/app/api/stories.py backend/app/services/generation_jobs.py +``` + +结果:无命中。用户前端、公开 schema、用户 API 和用户 job service 未暴露评测 analytics、评分、golden/replay 或内部 request payload 字段。 + +构建提示: + +- `frontend` 和 `admin-frontend` 构建均提示 Browserslist/caniuse-lite 数据较旧。 +- `admin-frontend` 额外提示 `baseline-browser-mapping` 数据较旧。 +- 以上均为依赖数据 freshness 提示,不影响当前构建结果。 + +## 5. 自审结论 + +本阶段把 trace 数据访问明确分成两层: + +- 用户层:只看可用功能、进度、资源状态和少量安全控制字段。 +- 管理层:在 admin guard 后查看完整内部链路,用于调试、审查和评测驱动改进。 + +这满足“用户前端不能展示评测数据”的要求,并且比阶段 10 更稳:即使后续内部调度把更多策略字段写入 request payload,用户接口也不会默认公开。 + +## 6. 
Bug 与风险记录 + +已发现并即时修复的问题: + +- 无新增运行时 bug。 + +当前风险: + +- admin-frontend 当前还没有专门调用 `/admin/generations/jobs/{job_id}/trace` 的页面;管理端如果继续复用用户接口,看到的仍是脱敏 trace。这是安全默认值,但内部排障体验还可以继续增强。 +- 用户 request payload 白名单当前保守,不返回 `data` 和 `education_theme`。如果未来用户端确实需要展示“我刚才输入了什么”,应设计单独的用户输入回显字段,并避免混入内部调度字段。 +- admin trace 返回完整内部 metadata,必须继续保持在 admin-only router 下,不得被用户前端或公开 API 复用。 + +## 7. 后续建议 + +下一阶段建议进入阶段 12: + +1. 推进 executor 小步接管,让 `WorkflowPlan` 从“记录计划”逐步变成“驱动最小任务执行”。 +2. 先选择资产生成或 asset retry 作为低风险 executor 试点。 +3. 管理端可后续增加 trace detail UI,但必须调用 admin-only endpoint,并明确标记为内部审查视图。 diff --git a/docs/planning/harness-stage-12-report.md b/docs/planning/harness-stage-12-report.md new file mode 100644 index 0000000..c950aee --- /dev/null +++ b/docs/planning/harness-stage-12-report.md @@ -0,0 +1,150 @@ +# Harness Engineering 改造阶段 12 报告 + +**阶段**: 12 - Plan-Driven Asset Executor 试点 +**日期**: 2026-06-22 +**状态**: 已完成当前切片 +**范围**: 资产任务 executor 最小接管、后台资产生成/资源重试/旧资源接口接入、回归测试和用户公开面边界复核 + +--- + +## 1. 本阶段目标 + +阶段 12 的目标是让 `WorkflowPlan` 不再只是 trace 快照,而是开始驱动一部分真实执行。为了控制风险,本阶段只接管资产任务,不迁移主文本生成、评测和故事持久化。 + +本阶段重点: + +- 新增 plan-driven asset runner。 +- 后台 `asset_generation` 按 plan task key 执行图片/音频任务。 +- 同步 `asset_retry` 按 plan task key 执行图片/音频重试。 +- 旧封面和音频兼容接口也通过同一个 runner 执行。 +- 保留既有 asset workflow 对 provider、缓存、状态同步、取消检查和事件记录的职责。 + +## 2. 
已完成工作 + +### Asset Executor Runner + +修改文件: + +- `backend/app/services/harness/executor.py` + +新增能力: + +- `AssetPlanRunResult` +- `run_asset_plan(...)` + +执行规则: + +- 只支持 `asset_generation` 和 `asset_retry` plan。 +- `complete_image_asset` 调用 image handler。 +- `complete_audio_asset` 调用 audio handler。 +- `start_asset_*`、`complete_asset_*` 和未知非资产 task 记录为 ignored,不触发 provider handler。 +- 返回 task results、executed task keys 和 ignored task keys,便于单测和后续观测扩展。 + +### Story Service 接入 + +修改文件: + +- `backend/app/services/story_service.py` + +已接入路径: + +- 后台 `asset_generation` worker。 +- 同步 `retry_story_assets`。 +- 旧 `generate_story_cover`。 +- 旧 `generate_story_audio`。 + +保持不变的职责: + +- 图片/音频 provider 调用仍在 `asset_workflows`。 +- 音频缓存读写仍在 `asset_workflows`。 +- story 状态同步仍在 `asset_workflows`。 +- `cover_image_*`、`audio_*`、`storybook_*image*` 事件仍由 asset workflow 记录。 +- job 完成/失败语义保持原有 `finish_generation_job` 路径。 + +## 3. 测试覆盖 + +修改文件: + +- `backend/tests/test_harness_runtime.py` +- `backend/tests/test_generation_jobs.py` +- `backend/tests/harness-evaluation-test-cases.md` + +新增覆盖: + +- `test_run_asset_plan_executes_asset_tasks_in_plan_order` + - 验证 runner 按 plan task 顺序执行音频和图片。 + - 验证非资产生产 task 被记录为 ignored。 +- `test_run_asset_plan_ignores_unknown_non_asset_tasks` + - 验证未知非资产 task 不触发 handler。 +- `test_asset_generation_job_worker_executes_assets_in_plan_order` + - 验证后台组合资产 job 按 plan 顺序先生成音频再生成图片。 + - 验证 story 的 `audio_status` 和 `image_status` 均为 `ready`。 + - 验证 event stream 与 plan tasks 对齐。 + +## 4. 
当前验证结果 + +已执行: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_harness_runtime.py tests/test_generation_jobs.py -q +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests +cd ../frontend +npm run build +cd ../admin-frontend +npm run build +``` + +结果: + +- Harness runtime + generation job 定向测试:`48 passed` +- 后端全量测试:`158 passed` +- Ruff:`All checks passed!` +- 用户前端构建:通过 +- 管理端构建:通过 + +补充敏感公开面扫描: + +```bash +rg -n "evaluations/analytics|EvaluationAnalytics|admin_evaluation|overall_score|golden|replay|evaluation_policy|provider_override|internal_dispatch_token" frontend/src backend/app/schemas backend/app/api/stories.py backend/app/services/generation_jobs.py +``` + +结果:无命中。用户前端、公开 schema、用户 API 和用户 job service 未暴露评测 analytics、评分、golden/replay 或内部 request payload 字段。 + +构建提示: + +- `frontend` 和 `admin-frontend` 构建均提示 Browserslist/caniuse-lite 数据较旧。 +- `admin-frontend` 额外提示 `baseline-browser-mapping` 数据较旧。 +- 以上均为依赖数据 freshness 提示,不影响当前构建结果。 + +## 5. 自审结论 + +本阶段完成了 executor 接管的第一步,但没有扩大到主生成链路: + +- `WorkflowPlan` 已能驱动资产 task 执行。 +- asset workflow 仍保持单一职责,负责真实 provider 调用和状态转换。 +- 事件流与用户可见行为保持兼容。 +- 用户侧仍只看到 coarse plan metadata;原始 `plan.tasks`、评测结果和内部调度数据不进入用户接口。 + +这个切片足够小,失败时也容易回滚:只需要把资产入口从 `run_asset_plan` 调回原来的顺序 `if "image"` / `if "audio"` 分支。 + +## 6. Bug 与风险记录 + +已发现并即时修复的问题: + +- 接入 runner 后,原来的 `_retry_*` 私有薄封装不再被调用。已删除这些死代码,避免后续误读。 + +当前风险: + +- `run_asset_plan` 当前只解释图片和音频 task,未知资产默认 ignored。未来如果新增视频、角色设定图等资产,需要显式增加 handler,而不是依赖 unknown task。 +- 主文本生成、评测和持久化仍未由 executor 驱动;它们当前仍是 plan-aware trace,而不是 plan-driven execution。 +- runner 当前不单独写入 task-level start/finish 事件,仍复用 asset workflow 的现有事件。若后续需要更细粒度 executor 审计,可以增加 admin-only 内部事件,但不能默认进入用户侧。 + +## 7. 后续建议 + +下一阶段建议进入阶段 13: + +1. 将 `WorkflowPlan` 的 task result 纳入 admin-only trace 聚合,便于看 executor 执行覆盖率。 +2. 选择主文本生成中的低风险 task,例如 `queue_postprocessing` 或 `complete_generation`,继续小步接管。 +3. 
若要接管 `evaluate_narrative`,必须先补更明确的评测数据隔离测试,避免任何评分字段进入用户前端。 diff --git a/docs/planning/harness-stage-13-report.md b/docs/planning/harness-stage-13-report.md new file mode 100644 index 0000000..d34ee24 --- /dev/null +++ b/docs/planning/harness-stage-13-report.md @@ -0,0 +1,182 @@ +# Harness Engineering 改造阶段 13 报告 + +**阶段**: 13 - Admin-Only Executor Coverage +**日期**: 2026-06-23 +**状态**: 已完成当前切片 +**范围**: 内部 executor coverage 事件、admin-only coverage 聚合、用户侧 executor 数据隔离、回归测试 + +--- + +## 1. 本阶段目标 + +阶段 13 承接阶段 12 的 plan-driven asset executor:资产任务已经按 `WorkflowPlan` 执行,但内部还缺少跨 job 的覆盖率视角。本阶段把 executor 执行结果记录为内部事件,并新增管理控制面聚合,帮助我们审查计划任务是否真的被执行。 + +本阶段目标: + +- 资产 executor 完成后写入内部 `executor_completed` 事件。 +- 管理端可聚合 executor runs、planned/executed/ignored task counts、task keys 和 result assets。 +- 用户端继续看不到 executor task keys、coverage metadata 或内部 executor step。 + +## 2. 已完成工作 + +### Executor Coverage Metadata + +修改文件: + +- `backend/app/services/harness/executor.py` +- `backend/app/services/story_service.py` + +新增能力: + +- `AssetPlanRunResult.result_assets` +- `AssetPlanRunResult.to_metadata(...)` +- `record_executor_result(...)` + +内部 metadata 包含: + +- `plan_mode` +- `planned_task_count` +- `executed_task_count` +- `ignored_task_count` +- `result_count` +- `executed_task_keys` +- `ignored_task_keys` +- `result_assets` + +已接入路径: + +- 后台 `asset_generation` +- 同步 `asset_retry` +- 旧 `generate_story_cover` +- 旧 `generate_story_audio` + +### Admin-Only Coverage Analytics + +新增文件: + +- `backend/app/services/admin_executor_coverage.py` + +修改文件: + +- `backend/app/api/admin_providers.py` + +新增接口: + +```http +GET /admin/executors/coverage +``` + +支持过滤: + +```http +GET /admin/executors/coverage?days=7 +GET /admin/executors/coverage?plan_mode=asset_retry +``` + +返回聚合: + +- total runs +- total planned/executed/ignored task counts +- coverage ratio +- job/story/user counts +- by plan mode +- by output mode +- executed task keys +- ignored task keys +- result assets + +### 用户侧隔离 + +修改文件: + +- 
`backend/app/services/generation_jobs.py` + +隔离规则: + +- 用户 job detail 过滤 `executor_completed` 事件。 +- 用户 job summary 如果内部 `current_step=executor_completed`,对外映射为 `workflow_planned` 和“工作流已规划”。 +- 用户公开 metadata 白名单不包含 executor task keys 或 coverage 字段。 + +## 3. 测试覆盖 + +修改文件: + +- `backend/tests/test_generation_jobs.py` +- `backend/tests/test_admin_providers.py` +- `backend/tests/harness-evaluation-test-cases.md` + +新增或更新覆盖: + +- 资产生成/重试事件序列包含内部 `executor_completed`。 +- 用户 job detail 不返回 `executor_completed` 或 task keys。 +- 用户 job summary 不暴露内部 executor step。 +- admin trace 可读取完整 `executor_completed`。 +- admin coverage 聚合 total runs、task counts、coverage ratio、task keys 和 result assets。 +- admin coverage 支持 `plan_mode` 过滤并拒绝非法 plan mode。 +- admin coverage 未鉴权返回 `401`。 + +## 4. 当前验证结果 + +已执行: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_generation_jobs.py tests/test_admin_providers.py tests/test_harness_runtime.py -q +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests +cd ../frontend +npm run build +cd ../admin-frontend +npm run build +``` + +结果: + +- 定向 generation/admin/harness 测试:`59 passed` +- 后端全量测试:`161 passed` +- Ruff:`All checks passed!` +- 用户前端构建:通过 +- 管理端构建:通过 + +补充敏感公开面扫描: + +```bash +rg -n "executors/coverage|ExecutorCoverage|admin_executor|executor_completed|executed_task_keys|ignored_task_keys|coverage_ratio|overall_score|golden|replay|evaluation_policy|provider_override|internal_dispatch_token" frontend/src backend/app/schemas backend/app/api/stories.py backend/app/services/generation_jobs.py +``` + +结果:仅命中 `backend/app/services/generation_jobs.py` 中对 `executor_completed` 的过滤和 current step 映射逻辑。用户前端、公开 schema 和用户 API route 未暴露 executor coverage、task keys、评测分数、golden/replay 或内部 request payload 字段。 + +构建提示: + +- `frontend` 和 `admin-frontend` 构建均提示 Browserslist/caniuse-lite 数据较旧。 +- `admin-frontend` 额外提示 `baseline-browser-mapping` 数据较旧。 +- 以上均为依赖数据 freshness 提示,不影响当前构建结果。 + +## 5. 
自审结论 + +本阶段保留了“内部完整、用户最小”的边界: + +- executor task keys 是内部执行证据,只进入 admin-only trace/coverage。 +- 用户端仍只看到可用功能和进度,不看到 task keys、coverage ratio 或内部 executor step。 +- admin coverage 聚合不返回故事正文、prompt 或评测评分 reason。 + +## 6. Bug 与风险记录 + +已发现并即时修复的问题: + +- 初版 admin coverage bucket 使用通用模型,响应中出现无关字段 `null`。已拆成专用 bucket response model,减少管理端响应噪声。 +- `executor_completed` 会短暂写入 `job.current_step`。已在用户 summary 中映射为安全公开的 `workflow_planned`,并补测试防止泄露。 + +当前风险: + +- `executor_completed` 当前只覆盖资产 executor。主文本、评测和持久化仍是 plan-aware,不应被 coverage 误解为全链路 executor 覆盖。 +- coverage ratio 使用 executed/planned 任务数,包含 start/complete 这类 ignored task,因此是执行器覆盖口径,不是产品成功率。 +- admin coverage 返回 task keys,必须保持 admin-only,不允许用户前端调用。 + +## 7. 后续建议 + +下一阶段建议进入阶段 14: + +1. 在 admin trace detail 中增加 executor coverage summary,减少管理端自行解析事件。 +2. 选择 `queue_postprocessing` 或 `complete_generation` 这类低风险主链路 task 继续小步接管。 +3. 若要接管评测 task,先补更严格的用户侧敏感扫描和 contract tests。 diff --git a/docs/planning/harness-stage-14-report.md b/docs/planning/harness-stage-14-report.md new file mode 100644 index 0000000..e0515c2 --- /dev/null +++ b/docs/planning/harness-stage-14-report.md @@ -0,0 +1,188 @@ +# Harness Engineering 阶段 14 报告 + +**阶段**: Admin Trace Executor Coverage Summary +**日期**: 2026-06-23 +**状态**: 已完成当前切片 + +## 1. 阶段目标 + +本阶段继续沿用原架构路径,不扩大 executor 对主文本生成、评测或持久化的接管范围,只增强管理控制面的审查能力。 + +目标: + +- 让 admin-only 完整 generation trace 自带当前 job 的 executor coverage 摘要。 +- 复用全局 executor coverage 聚合逻辑,避免全局 coverage 与单 job trace 统计口径漂移。 +- 修正用户 trace summary 隔离规则,确保内部 `executor_completed` 不通过聚合数量、task key 或 result asset 泄露到用户侧。 + +## 2. 
完成内容 + +### H14-1: 抽出 executor coverage 纯聚合函数 + +- 在 `app/services/admin_executor_coverage.py` 中新增 `summarize_executor_coverage_rows(...)`。 +- `GET /admin/executors/coverage` 继续返回原有结构,但内部改为复用共享聚合函数。 +- 聚合口径保持不变:runs、planned/executed/ignored task counts、coverage ratio、plan mode、output mode、task keys 和 result assets。 + +### H14-2: admin trace 返回 `executor_coverage` + +- `app/services/admin_generation_trace.py` 在完整事件流之外,新增当前 job 的 `executor_coverage` 摘要。 +- trace 内嵌 summary 的 `scope` 为 `admin_internal_job_executor_coverage`。 +- `app/api/admin_providers.py` 的 `AdminGenerationJobTraceResponse` 增加 `executor_coverage` 字段。 + +### H14-3: 用户 trace summary 过滤 `executor_completed` + +- `app/services/generation_jobs.py` 的 trace summary 聚合现在同时跳过 `evaluation_completed` 和 `executor_completed`。 +- 用户侧仍然只看到产品可解释的 workflow 进度,不看到内部 executor coverage、task keys 或 result assets。 + +### H14-4: 测试覆盖 + +- `tests/test_admin_providers.py` 增加 admin trace 内嵌 executor coverage 断言。 +- `tests/test_generation_jobs.py` 增加用户 trace summary 不包含 `executor_completed` 和 task key 的断言。 +- `backend/tests/harness-evaluation-test-cases.md` 增加 TC-ADM-008,并更新 TC-ST-010。 + +### H14-5: 文档同步 + +- `docs/technical/harness-engineering-modernization.md` 更新至阶段 0-14。 +- 新增 `Admin Trace Executor Coverage Summary` 设计章节。 +- 增加 FR-015、NFR-011、阶段 14 计划、风险缓解和当前状态。 + +## 3. 
审查结论 + +### 用户侧商业机密隔离 + +本阶段没有向用户端新增任何 evaluation 或 executor coverage 数据。 + +用户侧继续隐藏: + +- `evaluation_completed` +- `executor_completed` +- `overall_score` +- 评分维度、阈值、golden replay +- `executed_task_keys` +- `ignored_task_keys` +- `executor_coverage` + +额外修正: + +- 用户 trace summary 的 `total_events` 不再统计内部 `executor_completed`,避免通过事件数量暴露内部执行器步骤。 + +### 管理端审查能力 + +管理端现在可以在单个 trace 响应里同时查看: + +- 完整 request payload。 +- 完整 event stream。 +- 完整 evaluation metadata。 +- 当前 job 的 executor coverage summary。 + +这让后续排查 plan-driven executor 迁移时,不必在完整 trace 和全局 coverage API 之间手动拼接数据。 + +### 架构边界 + +本阶段仍保持阶段 12 的保守边界: + +- executor 只接管资产 task key。 +- 主文本生成、绘本主结构、评测和持久化仍走原服务路径。 +- admin-only 聚合能力不改变用户 API schema。 + +## 4. 验证记录 + +已通过: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_admin_providers.py tests/test_generation_jobs.py tests/test_harness_runtime.py -q +``` + +结果: + +```text +59 passed +``` + +已通过: + +```bash +cd backend +.venv/bin/python -m ruff check app tests +``` + +结果: + +```text +All checks passed! 
+``` + +已通过: + +```bash +cd backend +.venv/bin/python -m pytest +``` + +结果: + +```text +161 passed +``` + +已通过: + +```bash +cd frontend +npm run build +``` + +结果: + +```text +vue-tsc && vite build +✓ built +``` + +备注:Browserslist 数据陈旧警告,不影响构建结果。 + +已通过: + +```bash +cd admin-frontend +npm run build +``` + +结果: + +```text +vue-tsc && vite build +✓ built +``` + +备注:Browserslist 与 baseline-browser-mapping 数据陈旧警告,不影响构建结果。 + +已通过用户侧敏感字段扫描: + +```bash +rg -n "executors/coverage|ExecutorCoverage|admin_executor|executor_coverage|executor_completed|executed_task_keys|ignored_task_keys|coverage_ratio|overall_score|golden|replay|evaluation_policy|provider_override|internal_dispatch_token" frontend/src backend/app/schemas backend/app/api/stories.py backend/app/services/generation_jobs.py +``` + +扫描结果: + +- 未在用户前端、用户 schema 或用户 story API 中发现 admin executor coverage、评测分数、golden replay、provider override 或内部 dispatch token。 +- 命中项仅位于 `generation_jobs.py` 的内部事件过滤和安全进度映射逻辑。 + +已通过: + +```bash +git diff --check +``` + +## 5. 风险与后续建议 + +| 风险 | 状态 | 建议 | +| --- | --- | --- | +| admin trace 与全局 coverage 口径漂移 | 已缓解 | 已抽共享聚合函数,后续新增字段必须先加入该函数 | +| 用户 trace summary 暗含内部事件数量 | 已修正 | 保持内部事件 denylist,并继续用测试覆盖 | +| executor 接管范围扩大过快 | 已控制 | 下一阶段仍应先围绕资产与 observability,不急于接管主生成 | +| admin-only 数据误接用户前端 | 持续关注 | 每阶段继续运行敏感字段扫描 | + +## 6. 阶段结论 + +阶段 14 完成了 admin trace 的审查能力增强,并补齐用户 trace summary 对 executor 内部事件的隔离。当前架构继续符合“评测驱动、admin-only 内部质量资产、用户侧只展示可用功能”的边界。 diff --git a/docs/planning/harness-stage-15-report.md b/docs/planning/harness-stage-15-report.md new file mode 100644 index 0000000..dcaee3e --- /dev/null +++ b/docs/planning/harness-stage-15-report.md @@ -0,0 +1,228 @@ +# Harness Engineering 阶段 15 报告 + +**阶段**: Admin-Only Harness Readiness +**日期**: 2026-06-23 +**状态**: 已完成当前切片 + +## 1. 
阶段目标 + +本阶段继续沿用原设计路径:不扩大 executor 对主生成链路的接管范围,而是建立一个内部 readiness 审查摘要,让后续每次扩大 harness 接管范围前都能先看聚合质量门。 + +目标: + +- 将内部 golden replay、evaluation analytics 和 executor coverage 串成一个 admin-only readiness audit。 +- 保持 readiness 只返回聚合状态、阈值和覆盖摘要。 +- 避免把评测数据、executor task key 或 readiness 结果分发到用户端。 +- 修正运行环境风险:golden replay fixture 必须随 app 发布,而不是只存在于 tests 目录。 + +## 2. 完成内容 + +### H15-1: app 内部 golden replay fixture + +- 将 `evaluation_golden_cases.json` 放入 `app/services/harness/fixtures/`。 +- `tests/test_harness_runtime.py` 改为读取 app 内部 fixture。 +- 这样 Docker 镜像 `COPY app ./app` 后,admin readiness 仍能读取 golden cases。 + +### H15-2: admin harness readiness 服务 + +- 新增 `app/services/admin_harness_readiness.py`。 +- 聚合输入: + - 内部 golden replay。 + - `get_admin_evaluation_analytics(...)`。 + - `get_admin_executor_coverage(...)`。 +- 输出: + - `status`: `ready`、`needs_attention` 或 `blocked`。 + - `thresholds`: 当前内部 readiness 阈值。 + - `checks`: 每个质量门的状态与聚合细节。 + - `golden_replay`、`evaluation_analytics`、`executor_coverage` 聚合摘要。 + +当前 checks: + +| Check | 行为 | +| --- | --- | +| `golden_replay` | golden cases 未全部通过则 `blocked` | +| `runtime_evaluation_samples` | 当前窗口没有 evaluation 样本则 `needs_attention` | +| `runtime_evaluation_quality` | pass rate 或 average score 低于阈值则 `blocked` | +| `executor_coverage_samples` | 当前窗口没有 executor run 则 `needs_attention` | +| `executor_coverage_ratio` | coverage ratio 低于阈值则 `blocked` | + +### H15-3: admin-only readiness API + +- 新增 `GET /admin/harness/readiness`。 +- 复用 admin router 的 `admin_guard`。 +- 支持 `days` 查询参数,与 evaluation analytics 和 executor coverage 的窗口口径一致。 + +### H15-4: 测试覆盖 + +- `tests/test_admin_providers.py` 新增 readiness ready 路径测试。 +- 新增 low runtime quality blocked 路径测试。 +- 新增 admin auth required 测试。 +- 测试断言 readiness 响应不包含 story title、score reason 或 quality gate message。 + +### H15-5: 文档同步 + +- `docs/technical/harness-engineering-modernization.md` 更新至阶段 0-15。 +- `backend/tests/harness-evaluation-test-cases.md` 新增 TC-ADM-009、TC-ADM-010。 +- 
本报告记录安全边界、审查结论和验证结果。 + +## 3. 审查结论 + +### 用户侧商业机密隔离 + +本阶段没有新增用户端接口、用户前端类型或用户前端展示。 + +用户侧继续不可见: + +- `GET /admin/harness/readiness` +- `golden_replay` +- `evaluation_analytics` +- `executor_coverage` +- `overall_score` +- 评分维度、评分 reason、阈值 +- `executed_task_keys` +- `ignored_task_keys` +- quality gate message + +### 管理端输出边界 + +readiness 是 admin-only 聚合摘要。它允许管理端看到: + +- 当前窗口的运行期 evaluation 聚合。 +- 当前窗口的 executor coverage 聚合。 +- golden replay 是否通过及覆盖标签分布。 +- readiness checks 和阈值。 + +它不返回: + +- 故事正文。 +- 绘本分页正文。 +- 用户 prompt。 +- cover prompt。 +- score reason。 +- quality gate message。 +- 单条 evaluation event 或 executor event 明细。 + +### 架构边界 + +阶段 15 没有改变生成执行路径: + +- 主文本生成仍走现有 service。 +- 绘本主结构仍走现有 service。 +- executor 仍只接管资产 task key。 +- readiness 只读聚合数据,不写入 job 或 story 状态。 + +## 4. 验证记录 + +已通过: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_admin_providers.py -q +``` + +结果: + +```text +13 passed +``` + +已通过: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_admin_providers.py tests/test_harness_runtime.py -q +``` + +结果: + +```text +37 passed +``` + +已通过: + +```bash +cd backend +.venv/bin/python -m ruff check app tests +``` + +结果: + +```text +All checks passed! 
+``` + +已通过: + +```bash +cd backend +.venv/bin/python -m pytest +``` + +结果: + +```text +164 passed +``` + +已通过: + +```bash +cd frontend +npm run build +``` + +结果: + +```text +vue-tsc && vite build +✓ built +``` + +备注:Browserslist 数据陈旧警告,不影响构建结果。 + +已通过: + +```bash +cd admin-frontend +npm run build +``` + +结果: + +```text +vue-tsc && vite build +✓ built +``` + +备注:Browserslist 与 baseline-browser-mapping 数据陈旧警告,不影响构建结果。 + +已通过用户侧敏感字段扫描: + +```bash +rg -n "harness/readiness|HarnessReadiness|admin_harness|golden_replay|evaluation_analytics|executor_coverage|executors/coverage|ExecutorCoverage|admin_executor|executor_completed|executed_task_keys|ignored_task_keys|coverage_ratio|overall_score|golden|replay|evaluation_policy|provider_override|internal_dispatch_token" frontend/src backend/app/schemas backend/app/api/stories.py backend/app/services/generation_jobs.py +``` + +扫描结果: + +- 未在用户前端、用户 schema 或用户 story API 中发现 readiness、admin evaluation analytics、executor coverage、评分、golden replay、provider override 或内部 dispatch token。 +- 命中项仅位于 `generation_jobs.py` 的内部事件过滤和安全进度映射逻辑。 + +已通过: + +```bash +git diff --check +``` + +## 5. 风险与后续建议 + +| 风险 | 状态 | 建议 | +| --- | --- | --- | +| 生产镜像缺少 golden fixture | 已修正 | fixture 已放入 app 内部 harness fixtures | +| readiness 结果被误接用户前端 | 持续关注 | 保持 admin-only 路由,并继续运行敏感字段扫描 | +| 阈值过于简单 | 可接受 | 当前为阶段 15 最小门槛,后续可按真实样本调优 | +| readiness 输出过细 | 已控制 | 只返回聚合,不返回原文、prompt、reason 或单条事件 | + +## 6. 阶段结论 + +阶段 15 建立了 admin-only harness readiness 审查能力,把评测驱动从“有测试、有 analytics”推进到“扩大接管范围前有聚合质量门”。用户端仍然只展示可用功能和进度,不接触评测数据、内部执行覆盖或 readiness 结果。 diff --git a/docs/planning/harness-stage-5-report.md b/docs/planning/harness-stage-5-report.md new file mode 100644 index 0000000..806488a --- /dev/null +++ b/docs/planning/harness-stage-5-report.md @@ -0,0 +1,140 @@ +# Harness Engineering 改造阶段 5 报告 + +**阶段**: 5 - Trace Analytics 与前端增量展示 +**日期**: 2026-06-21 +**状态**: 已完成 +**范围**: 后端 trace summary 聚合、用户端与管理端生成轨迹展示、完整验证 + +--- + +## 1. 
本阶段目标 + +阶段 5 的目标是让阶段 1-4 写入的标准 harness metadata 变成可见、可分析的产品能力。 + +本阶段明确区分两类统计: + +- Provider stats:只统计 Provider 调用成功率、延迟、成本和供应商失败。 +- Trace summary:统计 workflow step、artifact、failure category 等 harness 运行时语义。 + +这样质量门失败不会被误算为供应商失败,供应商看板和生成工作流看板各自保持语义清楚。 + +## 2. 已完成工作 + +### 后端 + +修改文件: + +- `backend/app/schemas/story_schemas.py` +- `backend/app/services/generation_jobs.py` +- `backend/app/api/stories.py` +- `backend/tests/test_generation_jobs.py` + +新增 API: + +```http +GET /api/generations/{story_id}/trace-summary +``` + +响应字段: + +- `story_id` +- `window_days` +- `total_events` +- `failed_events` +- `by_step` +- `by_artifact` +- `failure_categories` + +新增聚合能力: + +- workflow step 聚合,例如 `image_generation`、`narrative_generation` +- artifact 聚合,例如 `cover_image`、`story_text` +- failure category 聚合,例如 `provider_error`、`schema_error` + +### 用户端 + +修改文件: + +- `frontend/src/types/generation.ts` +- `frontend/src/components/GenerationTrace.vue` + +新增展示: + +- 流程事件总数 +- 失败事件数 +- 主要步骤 +- 主要失败类型 +- 单个事件下方展示标准 step、artifact、failure category + +### 管理端 + +修改文件: + +- `admin-frontend/src/components/GenerationTrace.vue` + +新增展示与用户端保持一致: + +- trace summary 卡片 +- 事件级 step/artifact/failure category 标签 + +## 3. 验证结果 + +已执行: + +```bash +cd backend +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests + +cd ../frontend +npm run build + +cd ../admin-frontend +npm run build +``` + +结果: + +- 后端完整测试:`139 passed` +- 后端 ruff:`All checks passed!` +- 用户端生产构建:通过 +- 管理端生产构建:通过 + +构建备注: + +- Vite/Browserslist 输出了浏览器数据过期提示,不影响构建结果。 +- 管理端构建输出了 `baseline-browser-mapping` 数据偏旧提示,不影响构建结果。 + +## 4. 自审结论 + +本阶段符合设计目标: + +- 没有混淆 Provider stats 和 workflow trace stats。 +- 前端只做增量展示,没有改变生成/重试主流程。 +- 新 API 有后端测试覆盖。 +- 用户端和管理端构建均通过。 +- 质量门失败、Provider 失败和资产失败现在都有更清楚的可观测语义。 + +## 5. 
当前新架构状态 + +Harness engineering 改造主线已完成阶段 0-5: + +- 设计基线完成。 +- Harness runtime 基础类型完成。 +- TraceRecorder 和 ExecutionControl 完成。 +- 资产工作流主要抽取完成。 +- WorkflowPlan 建模完成。 +- 确定性 Quality Gates 完成。 +- Trace Analytics 和前端展示完成。 + +## 6. 后续建议 + +下一步建议进入 **阶段 6:新架构实测与执行器小步接管**。 + +建议切片: + +1. 使用 Docker demo stack 跑 smoke,验证真实 API/worker/前端联动。 +2. 在本地 demo provider 下创建故事和绘本,确认 trace summary 数据真实可见。 +3. 回到阶段 3B,让普通故事无图片路径先由 `WorkflowPlan` 驱动执行。 +4. 逐步迁移带图片故事、绘本和资产任务执行器。 + diff --git a/docs/planning/harness-stage-6-report.md b/docs/planning/harness-stage-6-report.md new file mode 100644 index 0000000..a4f3d5e --- /dev/null +++ b/docs/planning/harness-stage-6-report.md @@ -0,0 +1,222 @@ +# Harness Engineering 改造阶段 6 报告 + +**阶段**: 6 - 新架构真实运行烟测 +**日期**: 2026-06-21 +**状态**: 已完成 +**范围**: 本地新代码 API、Celery worker、Docker PostgreSQL/Redis、真实 HTTP 生成链路、trace/provider 聚合验证 + +--- + +## 1. 本阶段目标 + +阶段 6 的目标是验证阶段 0-5 的新架构不只在单元测试和构建层面通过,也能在真实运行时闭环中工作。 + +本阶段重点验证: + +- FastAPI 可以使用新代码启动。 +- Celery worker 可以消费新代码派发的 generation job。 +- `TraceRecorder` 写入的标准 metadata 能被 `trace-summary` 正确聚合。 +- 主内容生成和资源重试都能进入 harness 运行时视角。 +- Provider stats 继续只统计 Provider 调用,不与 workflow trace 混淆。 + +## 2. 运行环境 + +复用 Docker demo stack 中已运行的基础设施: + +- PostgreSQL: `localhost:52432` +- Redis: `localhost:52379` + +本地新代码进程: + +- API: `127.0.0.1:53000` +- Worker: `celery -A app.core.celery_app worker --concurrency=1` + +启动 API 使用的关键环境变量: + +```bash +DATABASE_URL='postgresql+asyncpg://dreamweaver:dreamweaver_password@localhost:52432/dreamweaver_db' +CELERY_BROKER_URL='redis://localhost:52379/0' +CELERY_RESULT_BACKEND='redis://localhost:52379/0' +REDIS_URL='redis://localhost:52379/0' +``` + +## 3. 
已执行烟测 + +### 3.1 健康检查 + +请求: + +```bash +curl -fsS http://127.0.0.1:53000/health +``` + +结果: + +```json +{"status":"ok"} +``` + +### 3.2 dev 登录与会话验证 + +通过 `/auth/dev/signin` 创建真实 cookie 会话,再查询 `/auth/session`。 + +结果: + +```text +login_status=302 +user_id=github:dev_user_001 +``` + +### 3.3 普通故事生成链路 + +请求: + +```json +{ + "output_mode": "story", + "type": "keywords", + "data": "星光书签, 小鹿, 学会复盘", + "education_theme": "复盘与成长", + "generate_images": false +} +``` + +结果: + +```text +job_id=a606878c-98a7-4d05-af95-629d0cd2f194 +poll=01 status=running step=request_accepted story_id=none +poll=02 status=completed step=generation_completed story_id=59 +story_title=星光书签、小鹿、学会复盘的晚安冒险 +``` + +说明: + +- API 成功创建 generation job。 +- Worker 成功 claim 并执行任务。 +- 故事成功落库。 +- job 以 `generation_completed` 收敛。 + +### 3.4 主生成 trace summary + +结果: + +```text +trace_total_events=8 +trace_failed_events=0 +trace_steps=[ + {"name":"provider_invocation","count":2}, + {"name":"context_preparation","count":1}, + {"name":"narrative_generation","count":1}, + {"name":"story_persistence","count":1} +] +trace_artifacts=[ + {"name":"story_text","count":1} +] +``` + +说明: + +- 标准 step 已可聚合。 +- `story_text` artifact 已可聚合。 +- 无失败事件。 + +### 3.5 图片资源重试链路 + +对 story `59` 执行: + +```json +{"assets":["image"]} +``` + +结果: + +```text +retry_image_status=ready +trace_before_total=8 +trace_after_total=15 +recent_jobs=[ + {"status":"completed","output_mode":"asset_retry","current_step":"asset_retry_completed","story_id":59}, + {"status":"completed","output_mode":"story","current_step":"generation_completed","story_id":59} +] +``` + +重试后 trace 聚合: + +```text +trace_after_steps=[ + {"name":"provider_invocation","count":4}, + {"name":"image_generation","count":2}, + {"name":"context_preparation","count":1}, + {"name":"narrative_generation","count":1}, + {"name":"story_persistence","count":1} +] +trace_after_artifacts=[ + {"name":"cover_image","count":2}, + {"name":"story_text","count":1} +] +``` + +Provider stats: + 
+```json +{ + "story_id": 59, + "total_calls": 2, + "successful_calls": 2, + "failed_calls": 0, + "by_provider": [ + {"capability":"image","adapter":"demo","call_count":1,"success_count":1,"failure_count":0}, + {"capability":"text","adapter":"demo","call_count":1,"success_count":1,"failure_count":0} + ], + "failure_reasons": [] +} +``` + +说明: + +- 资源重试新建了 `asset_retry` job。 +- 图片生成进入 `image_generation` step。 +- 封面进入 `cover_image` artifact 聚合。 +- Provider stats 正确统计 text/image provider 调用。 + +## 4. Docker build 说明 + +本阶段尝试执行: + +```bash +docker compose up -d --build +``` + +遇到两个与代码无关的外部阻塞: + +1. 根目录 `.env` 中镜像代理覆盖为 `docker.1ms.run/library/node:18-alpine`,该镜像拉取失败。 +2. 改用官方镜像变量后,Docker Hub metadata 拉取出现网络 EOF。 + +因此本阶段没有把新镜像完整 build 成 Docker stack。为验证新代码运行时,本阶段改用本地 API/worker 进程连接现有 Docker PostgreSQL/Redis,覆盖了真实 HTTP、Celery、DB、Redis 和 demo provider 链路。 + +## 5. 自审结论 + +本阶段烟测通过,说明阶段 0-5 的 harness engineering 改造已经具备真实运行能力: + +- 主内容生成链路可完成。 +- 资产重试链路可完成。 +- 标准 trace metadata 可以被后端聚合。 +- Provider stats 和 workflow trace stats 语义保持分离。 +- 前端新增的 trace summary 数据来源已经被真实 API 验证。 + +仍需注意: + +- Docker 镜像重建受外部 registry/network 影响,后续在网络稳定或镜像源修复后应再跑一次完整 Docker build smoke。 +- 阶段 3 的 `WorkflowPlan` 当前仍是建模基线,执行器接管尚未开始。 + +## 6. 后续建议 + +下一步建议进入 **阶段 7:执行器小步接管**。 + +建议切片: + +1. 先让普通故事、`generate_images=false` 的最小路径由 `WorkflowPlan` 驱动。 +2. 保持现有 `story_service` 作为外层编排入口,避免一次性迁移所有模式。 +3. 给执行器增加一条最小集成测试,验证 step 事件顺序、质量门和持久化行为。 +4. 再迁移带封面故事、绘本、资产生成和资产重试。 diff --git a/docs/planning/harness-stage-7-report.md b/docs/planning/harness-stage-7-report.md new file mode 100644 index 0000000..d37fad0 --- /dev/null +++ b/docs/planning/harness-stage-7-report.md @@ -0,0 +1,252 @@ +# Harness Engineering 改造阶段 7 报告 + +**阶段**: 7 - 评测驱动与执行器最小接管 +**日期**: 2026-06-22 +**状态**: 已完成 7A/7B/7C/7D/7E 当前切片 +**范围**: deterministic evaluator、evaluation trace、普通故事无图片路径的 WorkflowPlan 接入、内部 golden replay、覆盖摘要、测试与 QA 用例 + +--- + +## 1. 
本阶段目标 + +阶段 7 的目标是响应“产品需要评测驱动”的长期要求:生成任务不能只用成功/失败判断质量,而要在主内容持久化前形成可追踪、可回归、可统计的 evaluation result。 + +本阶段只接管最小运行路径: + +- `output_mode=story` +- `generate_images=false` + +不在本阶段迁移绘本、带图片故事、资产生成或资产重试执行器,避免一次性扩大风险。 + +## 2. 已完成工作 + +### 后端 harness + +新增文件: + +- `backend/app/services/harness/evaluators.py` +- `backend/app/services/harness/executor.py` +- `backend/app/services/harness/evaluation_replay.py` +- `backend/tests/fixtures/evaluation_golden_cases.json` + +新增能力: + +- `EvaluationDimension` +- `EvaluationScore` +- `EvaluationResult` +- `evaluate_story_output` +- `EvaluationReplayCoverage` +- `EvaluationReplayCase` +- `EvaluationReplaySuiteResult.coverage_summary` +- `ExpectedEvaluation` +- `replay_evaluation_golden_cases` +- `run_evaluation_replay_cases` +- `record_workflow_plan` +- `record_evaluation_result` + +当前确定性评分维度: + +- `structure` +- `safety` +- `age_fit` +- `educational_value` +- `readability` + +### 内部 golden replay + +阶段 7D 已建立第一组内部 golden cases,用固定样本锁住 deterministic evaluator 的回归基线。 + +阶段 7E 已将 golden cases 扩充到 11 个样本,并为每条 case 增加内部覆盖标签: + +- `age_band` +- `content_shape` +- `risk_area` +- `tags` + +当前样本覆盖: + +- 完整普通故事通过。 +- 较长普通故事通过。 +- 普通故事空正文被质量门阻断。 +- 普通故事封面提示词缺失被质量门阻断。 +- 普通故事安全风险词被质量门阻断。 +- 普通故事结构完整但阅读体验偏短,在高阈值下被评测阻断。 +- 完整绘本分页通过。 +- 绘本重复页码被质量门阻断。 +- 绘本没有分页内容被质量门阻断。 +- 绘本分页安全风险词被质量门阻断。 +- 绘本分页正文过短触发 warning,并在高阈值下被评测阻断。 + +当前覆盖摘要已由单测锁定: + +- artifact: `story=6`、`storybook=5` +- age_band: `3-4=4`、`5-6=4`、`7-8=1`、`unknown=2` +- risk_area: `schema_error=4`、`happy_path=2`、`readability_warning=2`、`safety_error=2`、`length_boundary=1` +- outcome: `passed=3`、`blocked=8` + +实现边界: + +- replay fixture 只被后端测试和内部工具读取。 +- 线上生成链路不会自动读取 golden cases。 +- 不新增用户端 API。 +- 不改变公开 schema。 +- 不把 replay 结果、评分、维度或阈值分发到用户前端。 +- 覆盖摘要只用于后端测试和内部评测基线审查,不进入用户端 API。 + +replay 会比较: + +- `passed` +- `blocking` +- `overall_score` 区间 +- 必需维度是否存在 +- quality gate issue code +- warning 文案片段 +- coverage summary + +### 事件模型 + +新增标准 step: + +- `evaluation` + +新增事件: + +- `workflow_planned` 
+- `evaluation_completed` + +新增进度: + +- `workflow_planned`: `8%`,工作流已规划 +- `evaluation_completed`: `52%`,内容评测已完成 + +### story service + +普通故事无图片路径现在会: + +1. 构建 `WorkflowPlan` +2. 写入 `workflow_planned` +3. 准备上下文 +4. 调用文本 provider +5. 执行 deterministic evaluator +6. 写入 `evaluation_completed` +7. 通过后写入 `narrative_generated` +8. 持久化故事 +9. 收敛 job + +质量门失败时会同时写入: + +- `quality_gate_failed` +- `evaluation_completed` + +这样 failed job 的阻断原因和评分事实都能被追踪。 + +阶段 7C 已将绘本主内容纳入内部 deterministic evaluator: + +- 绘本 Provider 输出后、持久化前执行 `evaluate_storybook_output`。 +- 绘本质量门失败会写入内部 `quality_gate_failed` 和 `evaluation_completed`。 +- 绘本评测通过会写入内部 `evaluation_completed`,artifact 标记为 `storybook_pages`。 +- 用户可访问的 job detail 仍会过滤 `evaluation_completed`。 + +### 前端与管理端 + +管理端生成轨迹已补充内部新事件/步骤中文标签: + +- `workflow_planned`: 工作流规划 +- `evaluation_completed`: 内容评测 +- `evaluation`: 内容评测 + +安全边界修正: + +- 用户端不展示评测分数、维度、通过率或阻断阈值。 +- 用户可访问的 job detail 不返回 `evaluation_completed` 事件。 +- 用户可访问的 `trace-summary` 不返回 `evaluation` 聚合对象。 +- 用户端生成轨迹组件不保留 `evaluation_completed` 和 `evaluation` 展示标签。 +- 评测 metadata 只保留在内部 job events 中,后续如需展示必须通过 admin-only 内部接口。 + +### Trace Summary + +`GET /api/generations/{story_id}/trace-summary` 继续只返回用户可解释的工作流摘要: + +- `total_events` +- `failed_events` +- `by_step` +- `by_artifact` +- `failure_categories` + +该接口会跳过 `evaluation_completed`,且 `total_events` 也只统计公开事件,避免把评测分数、维度、阻断策略或内部评测步骤数量分发给普通用户。 + +## 3. 验证结果 + +已执行: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_harness_runtime.py tests/test_generation_jobs.py +.venv/bin/python -m ruff check app tests + +.venv/bin/python -m pytest + +cd ../frontend +npm run build + +cd ../admin-frontend +npm run build +``` + +最新结果: + +- 定向测试:`42 passed` +- Harness runtime 定向测试:`22 passed` +- 后端完整测试:`146 passed` +- Ruff:`All checks passed!` +- 用户端构建:通过 +- 管理端构建:通过 + +构建备注: + +- Vite/Browserslist 输出浏览器数据过期提示,不影响构建结果。 +- 管理端输出 `baseline-browser-mapping` 数据偏旧提示,不影响构建结果。 + +## 4. 
自审结论 + +本阶段目前符合小步迁移原则: + +- 没有引入外部评测服务和额外成本。 +- 没有改变 API 响应结构。 +- 公共 `trace-summary` 不分发 evaluation summary。 +- 公共 `trace-summary` 的 `total_events` 不统计 `evaluation_completed`。 +- 只接入普通故事无图片路径。 +- 质量门阻断仍然发生在持久化前。 +- evaluation metadata 已进入内部 job event,但用户接口会脱敏。 +- 用户端只展示可用功能和可解释状态,不展示评测数据。 +- 文本故事和绘本主内容都已经在持久化前进入内部 deterministic evaluator。 +- 内部 golden replay 已能在单测中检查评测基线漂移。 +- 内部 replay 覆盖摘要已能检查年龄段、内容形态、风险区域、标签和 outcome 分布。 +- replay 结果未接入任何用户端接口或前端展示。 + +## 5. Bug 与风险记录 + +当前没有必须立即阻断的已知 bug。 + +已发现并即时修复的问题: + +- 首次插入 plan-aware 分支时,storybook 返回块缩进被补丁碰歪;已在继续测试前修复。 +- 后端新增 `workflow_planned` 和 `evaluation_completed` 后,用户端/管理端事件标签一开始没有同步;审查发现后已补中文标签并重新构建通过。 +- 阶段 7B 曾短暂把 evaluation summary 接入用户端和用户可访问 API;经产品安全边界复核后已移除,并补充测试确保公共响应不包含 `evaluation`、用户 job detail 不包含 `evaluation_completed`。 +- 阶段 7D 初次新增 replay 模块后 Ruff 发现 import 顺序问题;已用 Ruff 修复并重新跑定向测试。 + +后续风险: + +- 当前 evaluator 是确定性启发式,适合做回归基线,但不能替代高质量模型评测或人工样本评审。 +- 当前 golden cases 已扩展到 11 条,但仍偏工程回归样本;后续需要补充真实用户输入分布、Provider 输出变体、教育主题缺失/弱相关、不同绘本页数和更细年龄分层。 +- 旧同步接口调用 `generate_and_save_story` 时也会执行 evaluator,但没有 job 时不会记录事件;这是兼容选择,后续可以考虑为同步接口生成 lightweight evaluation response。 +- 后续如果要看 evaluation summary,必须新建 admin-only 内部接口,并确认不会被用户端调用。 + +## 6. 后续建议 + +下一步继续阶段 8: + +1. 设计 admin-only evaluation analytics,明确权限边界和脱敏规则。 +2. 逐步让带图片故事和绘本执行路径由 `WorkflowPlan` 接管。 +3. 扩充 golden cases 到真实用户输入分布和 Provider 输出变体。 +4. 在 Docker registry 网络恢复后重新跑完整 build smoke。 diff --git a/docs/planning/harness-stage-8-report.md b/docs/planning/harness-stage-8-report.md new file mode 100644 index 0000000..220a82c --- /dev/null +++ b/docs/planning/harness-stage-8-report.md @@ -0,0 +1,142 @@ +# Harness Engineering 改造阶段 8 报告 + +**阶段**: 8 - Admin-Only Evaluation Analytics +**日期**: 2026-06-22 +**状态**: 已完成当前切片 +**范围**: admin-only 内部评测聚合、权限边界、过滤、测试和用户端隔离审查 + +--- + +## 1. 本阶段目标 + +阶段 8 的目标是在不泄露商业机密的前提下,让内部团队可以看到内容评测的聚合质量趋势。 + +本阶段只做管理控制面后端接口: + +- 不做用户端接口。 +- 不做用户端前端展示。 +- 不做管理端可视化页面。 +- 不返回原始故事内容、prompt、单条 evaluation event 或评分 reason。 + +## 2. 
已完成工作 + +### 后端服务 + +新增文件: + +- `backend/app/services/admin_evaluation_analytics.py` + +新增能力: + +- 聚合内部 `evaluation_completed` 事件。 +- 支持 `days` 时间窗口过滤。 +- 支持 `artifact=story_text|storybook_pages` 过滤。 +- 汇总通过数、阻断数、通过率、平均分、artifact、output mode、score band、dimension score、quality gate issue、failure category 和 warning。 + +### Admin-only API + +在既有 admin router 中新增: + +```text +GET /admin/evaluations/analytics +``` + +该接口受现有 admin 控制面保护: + +- `ENABLE_ADMIN_CONSOLE=true` 时才挂载 admin router。 +- 路由继承 `Depends(admin_guard)`。 +- Basic Auth 失败时返回 `401`。 + +查询参数: + +- `days`: `1-365` +- `artifact`: `story_text` 或 `storybook_pages` + +### 响应边界 + +该接口只返回聚合摘要: + +- `total_evaluations` +- `passed_evaluations` +- `blocked_evaluations` +- `pass_rate` +- `average_score` +- `job_count` +- `story_count` +- `user_count` +- `by_artifact` +- `by_output_mode` +- `score_bands` +- `dimension_scores` +- `quality_gate_issues` +- `failure_categories` +- `warnings` + +该接口不会返回: + +- 故事正文 +- 绘本分页正文 +- 用户 prompt +- cover prompt +- 单条 job event +- 单条 evaluation event +- 评分 reason +- quality gate message + +## 3. 验证结果 + +已执行: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_admin_providers.py tests/test_generation_jobs.py +.venv/bin/python -m ruff check app/services/admin_evaluation_analytics.py app/api/admin_providers.py tests/test_admin_providers.py +``` + +结果: + +- Admin + 用户侧脱敏定向测试:`26 passed` +- Ruff:`All checks passed!` + +已做用户端隔离扫描: + +```bash +rg -n "evaluations/analytics|EvaluationAnalytics|admin_evaluation|evaluation_completed|overall_score|golden|replay" frontend/src backend/app/schemas backend/app/api/stories.py backend/app/services/generation_jobs.py +``` + +扫描结论: + +- 用户端前端没有 evaluation analytics 接口、类型或展示命中。 +- 用户端公开 schema 没有新增 evaluation analytics 响应模型。 +- 用户侧后端只保留 `evaluation_completed` 的过滤/脱敏逻辑。 + +## 4. 自审结论 + +本阶段符合评测数据内部分级原则: + +- 评测 analytics 是 admin-only。 +- 用户端 API 没有新增评测数据。 +- 用户前端没有新增评测入口。 +- 响应为聚合摘要,不返回原始内容或单条评测明细。 +- 权限测试覆盖未授权访问。 +- 用户端脱敏测试继续通过。 + +## 5. 
Bug 与风险记录 + +已发现并即时修复的问题: + +- 初次测试时 `dimension_scores` 的排序预期与实现不一致;实现按覆盖次数优先排序,更适合运营视图,因此已修正测试预期。 + +当前风险: + +- 当前接口返回 warning 文案聚合。warning 文案来自内部 evaluator,目前不包含原始内容,但后续新增 warning 时必须避免拼接用户正文或 prompt。 +- 当前只做后端 admin API,尚未做管理端页面。后续做 UI 时仍需避免展示单条评测明细和原文内容。 +- analytics 目前使用 Python 读取 JSON metadata 进行聚合,适合当前数据量,并兼容 SQLite/PostgreSQL;后续数据量变大时可考虑离线物化或数据库 JSON 聚合。 + +## 6. 后续建议 + +下一步建议进入阶段 9: + +1. 继续让带图片故事和绘本路径由 `WorkflowPlan` 更完整接管。 +2. 或先做 admin-only evaluation analytics 的管理端只读页面,但必须保持聚合摘要边界。 +3. 扩充真实用户输入分布的 golden cases,特别是教育主题弱相关和不同年龄段样本。 diff --git a/docs/planning/harness-stage-9-report.md b/docs/planning/harness-stage-9-report.md new file mode 100644 index 0000000..f5873c1 --- /dev/null +++ b/docs/planning/harness-stage-9-report.md @@ -0,0 +1,144 @@ +# Harness Engineering 改造阶段 9 报告 + +**阶段**: 9 - WorkflowPlan 接管扩展 +**日期**: 2026-06-22 +**状态**: 已完成当前切片 +**范围**: 普通故事带图片、绘本生成路径的计划快照接入、事件顺序测试、用户端评测隔离复核 + +--- + +## 1. 本阶段目标 + +阶段 9 的目标是把 `WorkflowPlan` 从普通故事无图片路径扩展到三条主生成路径: + +- 普通故事无图片:已在阶段 7 接入,本阶段继续作为基线。 +- 普通故事带图片:新增 `story_with_assets` plan。 +- 绘本:新增 `storybook` plan。 + +本阶段不重写完整执行器,也不改变用户侧 API 响应结构。目标是先让计划快照成为稳定的运行时事实,为后续把执行分支逐步迁移到 executor 打基础。 + +## 2. 
已完成工作 + +### 后端生成路径 + +修改文件: + +- `backend/app/services/story_service.py` + +新增行为: + +- `output_mode=storybook` 时,在调用 `generate_storybook_service` 前记录 `workflow_planned`。 +- `output_mode=story` 且 `generate_images=true` 时,在调用 `generate_full_story_service` 前记录 `workflow_planned`。 +- `generate_images=false` 的普通故事路径继续复用已有 `_execute_story_without_assets_plan`。 + +### WorkflowPlan 快照 + +普通故事带图片路径: + +- `plan.mode=story_with_assets` +- tasks 包含: + - `prepare_context` + - `generate_narrative` + - `evaluate_narrative` + - `persist_story` + - `generate_cover_image` + - `queue_postprocessing` + - `complete_generation` +- `generate_cover_image.required=false` +- `generate_cover_image.recoverable=true` + +绘本路径: + +- `plan.mode=storybook` +- tasks 包含: + - `prepare_context` + - `generate_storybook_pages` + - `evaluate_storybook_pages` + - `generate_storybook_images` + - `persist_storybook` + - `queue_postprocessing` + - `complete_generation` +- `generate_storybook_images.required=false` +- `generate_storybook_images.recoverable=true` + +### 测试 + +修改文件: + +- `backend/tests/test_generation_jobs.py` + +新增或更新覆盖: + +- 新增 `test_story_with_images_worker_records_plan_before_assets`。 +- 更新绘本 worker 测试,断言 `workflow_planned` 事件顺序和 `storybook` plan 快照。 +- 继续确认用户 job detail 不返回 `evaluation_completed`。 + +### 文档 + +修改文件: + +- `docs/technical/harness-engineering-modernization.md` +- `backend/tests/harness-evaluation-test-cases.md` + +新增内容: + +- 设计文档新增 Workflow Plan Coverage。 +- 阶段计划新增阶段 9。 +- QA 用例新增带图片故事和绘本计划快照状态转换测试。 + +## 3. 
验证结果 + +已执行: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_generation_jobs.py -q +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests +cd ../frontend +npm run build +cd ../admin-frontend +npm run build +``` + +结果: + +- 定向生成任务测试:`21 passed` +- 后端全量测试:`151 passed` +- Ruff:`All checks passed!` +- 用户前端构建:通过 +- 管理端构建:通过 + +构建提示: + +- `frontend` 和 `admin-frontend` 构建均提示 Browserslist/caniuse-lite 数据较旧。 +- `admin-frontend` 额外提示 `baseline-browser-mapping` 数据较旧。 +- 以上均为依赖数据 freshness 提示,不影响当前构建结果。 + +## 4. 自审结论 + +本阶段改动符合当前 Harness Engineering 路径: + +- 改动面集中在生成入口,不重写 Provider、质量门或持久化逻辑。 +- 三条主路径的计划事件顺序一致:`worker_started` 后、`context_prepared` 前记录 `workflow_planned`。 +- 图片类任务在 plan 中明确为可恢复资产,不阻断主内容阅读。 +- `evaluation_completed` 继续作为内部事件存在,用户端 detail 和 trace summary 不分发评分数据。 +- 新增测试断言 plan 快照,而不是只断言事件名称,能更早发现后续执行器迁移时的计划漂移。 + +## 5. Bug 与风险记录 + +本阶段未发现需要统一后置处理的 bug。 + +当前风险: + +- `_generate_generation_service_with_job` 仍保留分支式执行,只是补齐了 plan 记录。后续如果要真正由 executor 编排执行,需要继续拆分 story、storybook、asset workflow 的最小执行单元。 +- `workflow_planned` 当前在用户侧可见。它不包含评测分数、阈值或 replay 信息,可以展示为“工作流规划”;后续如果 plan metadata 增加内部策略字段,必须先做 public sanitizer。 +- 当前 plan 快照写入 job event metadata。数据量较小,适合现在的 trace 需求;后续若引入更复杂 DAG 或重放执行状态,可考虑独立表或压缩摘要。 + +## 6. 后续建议 + +下一阶段建议进入阶段 10: + +1. 将资产生成和重试路径也纳入 `WorkflowPlan` 记录,统一 `asset_generation` 与 `asset_retry` 的计划快照。 +2. 为用户侧 job/event 输出增加公共 metadata sanitizer,明确允许字段白名单,避免未来 plan 或 trace 字段扩展时误泄露内部质量策略。 +3. 
继续扩展评测驱动 golden cases,优先覆盖教育主题弱相关、不同年龄段长度边界和绘本分页一致性。 diff --git a/docs/technical/harness-engineering-modernization.md b/docs/technical/harness-engineering-modernization.md index f0e170c..5e56821 100644 --- a/docs/technical/harness-engineering-modernization.md +++ b/docs/technical/harness-engineering-modernization.md @@ -1,10 +1,10 @@ # Harness Engineering 架构改造技术设计 -**项目**: DreamWeaver 梦语织机 -**版本**: 0.1 -**日期**: 2026-06-21 -**状态**: 阶段 0 已建立设计基线 -**作者**: Codex +**项目**: DreamWeaver 梦语织机 +**版本**: 0.1 +**日期**: 2026-06-23 +**状态**: 阶段 0-15 当前切片已完成,主生成与资产任务均已写入 WorkflowPlan 快照,资产生成/重试已开始由 plan-driven executor 驱动,executor coverage 已进入 admin-only 聚合并嵌入 admin trace,admin-only harness readiness 审查已建立,用户侧 job event/request payload 已使用白名单脱敏,文本故事和绘本已纳入内部评测驱动,内部 golden replay 基线、覆盖摘要、admin-only evaluation analytics 和 admin-only 完整 trace 已建立 +**作者**: Codex --- @@ -36,6 +36,7 @@ DreamWeaver 当前已经完成统一生成工作流的第一轮落地:`POST /a - 把 `story_service` 中的运行时控制职责抽到 harness 层。 - 让 workflow step、artifact、trace、failure category 成为一等概念。 +- 让内容生成结果在持久化和发布前具备可追踪、可回归的评测结果。 - 保持 `/api/generations`、旧兼容接口、现有状态字段和主要测试行为不破坏。 - 优先做渐进式重构,不引入复杂工作流引擎,不进行大爆炸重写。 - 每个大阶段都产出阶段报告,包含实现、审查、验证和风险。 @@ -50,19 +51,25 @@ DreamWeaver 当前已经完成统一生成工作流的第一轮落地:`POST /a ## 3. 架构原则 -1. **主内容优先可读** +1. **主内容优先可读** 文本故事或绘本结构是 blocking artifact;封面、分页插图、音频是 recoverable artifact。 -2. **API 稳定优先** +2. **API 稳定优先** 先重构内部边界,再考虑扩展响应字段。现有前端、smoke、测试不应被第一阶段打断。 -3. **事件结构稳定** +3. **事件结构稳定** 继续复用 `generation_job_events`,但逐步标准化 metadata,避免每个调用点随手定义不同结构。 -4. **Provider 不等于产品能力** +4. **Provider 不等于产品能力** Provider 只是 tool invocation 的实现。产品能力应由 capability、workflow step、artifact 和 recovery policy 共同定义。 -5. **小步可验证** +5. **评测驱动优先** + 生成成功不等于产品成功。每条新执行路径必须先定义可追踪 evaluation 事件、评分维度、阻断阈值和回归测试,再扩大迁移范围。 + +6. **评测数据内部分级** + 评测分数、维度、阈值和阻断细节属于内部质量资产与商业机密,不通过用户端接口或用户前端分发。用户端只展示可操作功能、可解释进度和可恢复状态。 + +7. **小步可验证** 每个最小任务都必须能通过单测、局部测试或文档审查验证。 ## 4. 目标架构 @@ -78,6 +85,7 @@ flowchart TB HARNESS --> TRACE["Trace Recorder
job events / step metadata / provider trace"] HARNESS --> ARTIFACT["Artifact Workflows
story_text / storybook_pages / image / audio"] HARNESS --> GUARD["Quality Gates
schema / 儿童安全 / 内容完整性"] + HARNESS --> EVAL["Evaluators
结构 / 安全 / 年龄适配 / 教育价值 / 可读性"] ARTIFACT --> ROUTER["Provider Router
策略 / failover / 熔断 / 成本"] ROUTER --> ADAPTERS["Provider Adapters"] @@ -112,10 +120,11 @@ flowchart TB | Step | 当前事件 | 是否阻塞主内容 | | --- | --- | --- | -| `request_acceptance` | `request_accepted`、`retry_queued` | 是 | +| `request_acceptance` | `request_accepted`、`retry_queued`、`workflow_planned` | 是 | | `worker_start` | `worker_started` | 是 | | `context_preparation` | `context_prepared` | 是 | | `narrative_generation` | `narrative_generated` | 是 | +| `evaluation` | `evaluation_completed` | 是 | | `story_persistence` | `story_saved` | 是 | | `image_generation` | `cover_image_*`、`storybook_*image*` | 否 | | `audio_generation` | `audio_*` | 否 | @@ -172,11 +181,204 @@ flowchart TB } ``` +### 5.6 Evaluation Result + +每次主内容生成必须逐步产出可追踪评测结果。第一阶段使用确定性启发式,后续可替换或叠加模型评测、人审样本集和离线 replay。 + +标准字段: + +| 字段 | 说明 | +| --- | --- | +| `overall_score` | `0.0-1.0` 总分 | +| `passed` | 是否通过当前阈值 | +| `blocking` | 是否阻断持久化或发布 | +| `scores` | 维度评分列表 | +| `quality_gate` | 质量门失败详情,可为空 | +| `warnings` | 非阻断风险提示 | + +当前维度: + +- `structure` +- `safety` +- `age_fit` +- `educational_value` +- `readability` + +标准事件: + +- `workflow_planned` +- `evaluation_completed` + 短期兼容要求: - 不删除现有 metadata 字段。 - 新增字段必须向后兼容。 - 前端仍可使用当前 `event_type`、`status`、`message`、`event_metadata`。 +- 用户端 API 和用户前端不得返回或展示 `overall_score`、维度分数、阈值、阻断策略或 golden replay 结果。 + +### 5.7 Workflow Plan Coverage + +`WorkflowPlan` 是生成 harness 的显式执行骨架。当前主生成路径和资产路径都会写入 `workflow_planned` 事件: + +| 模式 | plan mode | 关键任务 | 备注 | +| --- | --- | --- | --- | +| 普通故事无图片 | `story` | `prepare_context`、`generate_narrative`、`evaluate_narrative`、`persist_story`、`queue_postprocessing`、`complete_generation` | 当前最小 plan-aware 路径 | +| 普通故事带图片 | `story_with_assets` | 在普通故事任务基础上增加 `generate_cover_image` | 封面图为 `required=false`、`recoverable=true` | +| 绘本 | `storybook` | `prepare_context`、`generate_storybook_pages`、`evaluate_storybook_pages`、可选 `generate_storybook_images`、`persist_storybook`、`queue_postprocessing`、`complete_generation` | 绘本图片为可恢复资产 | +| 资产生成 | 
`asset_generation` | `start_asset_generation`、`complete_image_asset` 或 `complete_audio_asset`、`complete_asset_generation` | 图片/音频均为 `required=false`、`recoverable=true` | +| 资产重试 | `asset_retry` | `start_asset_retry`、`complete_image_asset` 或 `complete_audio_asset`、`complete_asset_retry` | 同步重试路径也记录 plan | + +当前边界: + +- `workflow_planned` 可进入用户侧进度,因为它只描述产品步骤,不包含评分、阈值或 golden replay 信息。 +- 用户端只返回 coarse plan metadata:`plan_mode`、`planned_task_count`、`recoverable_task_count`。 +- 用户端不返回原始 `plan.tasks`、任务 key、内部阈值或执行策略。 +- `evaluation_completed` 只保留在内部事件、内部测试和 admin-only 聚合中。 +- 用户端 job detail 会过滤 `evaluation_completed`。 +- 用户端 trace summary 不统计 `evaluation_completed` 的事件数量、step、artifact 或失败分类。 +- 用户端 trace summary 不统计 `executor_completed` 的事件数量、task key 或 result asset。 + +### 5.8 Public Event Metadata Sanitizer + +用户侧 job detail 的 `events[*].event_metadata` 使用白名单输出。数据库中的内部 metadata 不被删除,内部分析、测试和 admin-only 能力仍可读取完整事件;普通用户 API 只返回产品可解释且可操作的字段。 + +允许公开的类别: + +- 标准 step、artifact、failure_category。 +- 资源状态和资产范围,如 `asset`、`assets`、`status`、`image_status`、`audio_status`。 +- 用户可理解的执行上下文,如 `mode`、`output_mode`、`input_type`、`page_count`、`page_number`。 +- Provider 运营摘要,如 `adapter`、`capability`、`strategy`、`latency_ms`、`estimated_cost_usd`。 +- coarse plan 摘要:`plan_mode`、`planned_task_count`、`recoverable_task_count`。 + +禁止公开的类别: + +- `evaluation_completed` 事件本身。 +- `overall_score`、维度分数、评分 reason、阈值、质量门 issue 明细。 +- 原始 `plan` 和 `plan.tasks`。 +- `result_snapshot`、内部错误原文、内部阈值、replay/golden case 信息。 +- 任意未来新增 metadata 字段,除非显式加入白名单。 + +### 5.9 Internal Evaluation Replay + +内部 evaluation replay 用于把固定 golden cases 和当前 evaluator 输出做对比,帮助我们在调整质量门、评分维度或 Provider 输出结构时快速发现评测基线漂移。 + +当前边界: + +- replay 输入和结果只用于后端测试、内部工具或未来 admin-only 能力。 +- replay fixture 不被线上生成链路自动读取。 +- replay 不新增公开 API,不改变用户端 schema,不进入用户前端 bundle。 +- 用户端 trace summary 的 `total_events` 不统计内部 `evaluation_completed`。 +- replay 断言只检查内部质量事实:`passed`、`blocking`、`overall_score` 区间、维度存在性、质量门 issue code 和 warning。 +- replay case 
可以携带内部覆盖标签:年龄段、内容形态、风险区域和标签集合。 +- replay suite 可以生成内部覆盖摘要:artifact、age_band、content_shape、risk_area、tags、outcome。 + +当前 golden case 覆盖: + +- 完整普通故事通过。 +- 较长普通故事通过。 +- 普通故事空正文被质量门阻断。 +- 普通故事封面提示词缺失被质量门阻断。 +- 普通故事安全风险词被质量门阻断。 +- 普通故事结构合格但在高阈值下因阅读体验偏短被评测阻断。 +- 完整绘本分页通过。 +- 绘本重复页码被质量门阻断。 +- 绘本没有分页内容被质量门阻断。 +- 绘本分页安全风险词被质量门阻断。 +- 绘本分页正文过短触发内部 warning 并在高阈值下阻断。 + +### 5.10 Admin-Only Evaluation Analytics + +内部评测 analytics 只允许在管理控制面读取,用于质量运营和评测策略复盘。该能力不得进入用户端 `/api/generations` 路由、用户前端类型或用户前端 bundle。 + +当前 admin-only 聚合字段: + +| 字段 | 说明 | +| --- | --- | +| `total_evaluations` | 内部评测事件数量 | +| `passed_evaluations` | 通过数量 | +| `blocked_evaluations` | 阻断数量 | +| `pass_rate` | 通过率 | +| `average_score` | 总分平均值 | +| `by_artifact` | 按 `story_text` / `storybook_pages` 聚合 | +| `by_output_mode` | 按 story / storybook 聚合 | +| `score_bands` | 按分数段聚合 | +| `dimension_scores` | 各评分维度平均分 | +| `quality_gate_issues` | 质量门 issue code 聚合 | +| `failure_categories` | 质量门 failure category 聚合 | +| `warnings` | 内部 warning 文案聚合 | + +安全边界: + +- 只挂载在 admin router 下,受 `ENABLE_ADMIN_CONSOLE` 和 Basic Auth admin guard 保护。 +- 不返回故事正文、绘本分页正文、用户 prompt、cover prompt、score reason、quality gate message、单条 evaluation event 或 job event 明细。 +- 用户端 API 继续过滤 `evaluation_completed`。 +- 用户端 trace summary 继续不统计内部 `evaluation_completed`。 +- 用户端前端不包含该接口调用、类型定义或展示组件。 + +### 5.11 Admin-Only Executor Coverage + +内部 executor coverage 用于审查 `WorkflowPlan` 是否真正驱动了资产执行,以及哪些 task key 仍只是计划占位或被当前 runner 忽略。该能力只属于管理控制面,不进入用户 API 或用户前端。 + +当前 admin-only 聚合字段: + +| 字段 | 说明 | +| --- | --- | +| `total_runs` | executor 完成事件数量 | +| `total_planned_tasks` | 计划任务总数 | +| `total_executed_tasks` | 实际执行任务总数 | +| `total_ignored_tasks` | 被 runner 忽略的任务总数 | +| `coverage_ratio` | `executed / planned` | +| `by_plan_mode` | 按 `asset_generation` / `asset_retry` 等模式聚合 | +| `by_output_mode` | 按生成 job 的 output mode 聚合 | +| `executed_task_keys` | 已执行 task key 聚合 | +| `ignored_task_keys` | 已忽略 task key 聚合 | +| `result_assets` | 返回资产聚合 | + +安全边界: + +- 只挂载在 
admin router 下,受 `ENABLE_ADMIN_CONSOLE` 和 Basic Auth admin guard 保护。 +- `executor_completed` 事件、task key、ignored task key 和 result asset 明细不进入用户 job detail。 +- 用户 job summary 如果短暂停留在内部 `executor_completed` step,会映射为安全公开的 `workflow_planned`。 +- 用户 trace summary 不统计 `executor_completed`,避免通过事件数量或聚合维度泄露内部执行器结构。 +- 用户前端不包含 `/admin/executors/coverage` 调用、类型定义或展示组件。 + +### 5.12 Admin Trace Executor Coverage Summary + +管理端单个 generation trace 在完整事件流之外,额外返回 `executor_coverage` 摘要,用于一次请求内同时完成“看事件”和“看执行覆盖”的审查。 + +设计边界: + +- `GET /admin/generations/jobs/{job_id}/trace` 复用全局 executor coverage 的聚合函数,避免两个 admin 视图统计口径不一致。 +- trace 内嵌 summary 的 `scope` 为 `admin_internal_job_executor_coverage`,只统计当前 job 的 `executor_completed` 事件。 +- trace 内嵌 summary 允许返回 task key、ignored task key 和 result asset,因为该接口已经是 admin-only 完整内部 trace。 +- 用户侧 `/api/generations/jobs/{job_id}`、`/api/generations/{story_id}/jobs` 和 `/api/generations/{story_id}/trace-summary` 均不返回该字段。 + +### 5.13 Admin-Only Harness Readiness + +内部 harness readiness 用于在扩大 plan-driven executor 或评测策略接管范围前,给管理控制面提供一个聚合级别的上线前审查摘要。 + +输入来源: + +- 内部 golden replay fixture,随后端 app 一起发布,避免运行环境缺少测试目录。 +- admin-only evaluation analytics 聚合。 +- admin-only executor coverage 聚合。 + +当前 readiness checks: + +| Check | 说明 | 默认门槛 | +| --- | --- | --- | +| `golden_replay` | 内部 golden cases 是否全部符合预期 | 必须全部通过 | +| `runtime_evaluation_samples` | 当前窗口是否有运行期 evaluation 样本 | 至少 1 条 | +| `runtime_evaluation_quality` | 运行期 evaluation 通过率和均分是否达标 | pass rate >= 0.7,average score >= 0.7 | +| `executor_coverage_samples` | 当前窗口是否有 executor coverage 样本 | 至少 1 次 run | +| `executor_coverage_ratio` | executor 实际执行任务占计划任务比例 | coverage ratio >= 0.2 | + +安全边界: + +- 只挂载在 admin router 下,受 `ENABLE_ADMIN_CONSOLE` 和 Basic Auth admin guard 保护。 +- 只返回聚合结果、阈值、状态和 coverage summary。 +- 不返回故事正文、绘本分页正文、用户 prompt、cover prompt、score reason、quality gate message 或单条事件明细。 +- 用户端 API 和用户前端不包含该接口调用、类型定义或展示组件。 ## 6. 
模块设计 @@ -431,6 +633,358 @@ npm run build - `docs/planning/harness-stage-5-report.md` +### 阶段 6: 新架构真实运行烟测 + +目标: + +- 使用新代码启动本地 API 与 Celery worker。 +- 复用 Docker demo stack 中的 PostgreSQL 与 Redis。 +- 通过真实 HTTP API 覆盖登录、生成、worker 执行、故事落库、trace summary 和 provider stats。 +- 覆盖主内容工作流与资源重试工作流。 + +最小任务: + +| ID | 任务 | 验收 | +| --- | --- | --- | +| H6-1 | 启动本地新代码 API | `/health` 返回 `{"status":"ok"}` | +| H6-2 | 启动本地新代码 worker | 生成任务可被 worker claim 并执行 | +| H6-3 | 使用 dev 登录创建真实 cookie 会话 | `/auth/session` 返回开发用户 | +| H6-4 | 提交普通故事生成 | job 进入 completed/degraded 且 story 落库 | +| H6-5 | 查询 trace summary/provider stats | 返回 step、artifact、provider 聚合 | +| H6-6 | 执行图片资源重试 | trace summary 聚合出 `image_generation` 与 `cover_image` | +| H6-7 | 清理临时进程并恢复 Docker worker | `docker compose ps` 环境回到可用状态 | + +验证命令: + +```bash +cd backend +DATABASE_URL='postgresql+asyncpg://dreamweaver:dreamweaver_password@localhost:52432/dreamweaver_db' \ +CELERY_BROKER_URL='redis://localhost:52379/0' \ +CELERY_RESULT_BACKEND='redis://localhost:52379/0' \ +REDIS_URL='redis://localhost:52379/0' \ +.venv/bin/python -m uvicorn app.main:app --host 127.0.0.1 --port 53000 + +DATABASE_URL='postgresql+asyncpg://dreamweaver:dreamweaver_password@localhost:52432/dreamweaver_db' \ +CELERY_BROKER_URL='redis://localhost:52379/0' \ +CELERY_RESULT_BACKEND='redis://localhost:52379/0' \ +REDIS_URL='redis://localhost:52379/0' \ +.venv/bin/celery -A app.core.celery_app worker --loglevel=info --concurrency=1 +``` + +阶段报告: + +- `docs/planning/harness-stage-6-report.md` + +### 阶段 7: 评测驱动与执行器最小接管 + +目标: + +- 将“生成是否合格”从隐含质量门升级为结构化 evaluation result。 +- 让普通故事、`generate_images=false` 的最小路径由 `WorkflowPlan` 参与执行。 +- 在 job events 中记录 `workflow_planned` 和 `evaluation_completed`。 +- 用测试锁住评分、阻断、事件顺序和 trace 聚合。 + +最小任务: + +| ID | 任务 | 验收 | +| --- | --- | --- | +| H7-1 | 新增 deterministic evaluator | 通过/阻断用例有单测 | +| H7-2 | 新增 plan-aware executor helper | 任务写入 `workflow_planned` | +| H7-3 | 普通故事无图片路径接入 plan | worker 事件序列包含 plan/evaluation | +| 
H7-4 | 质量门失败也写入 evaluation | failed job 可解释阻断原因 | +| H7-5 | 增加评测驱动 QA 用例文档 | 覆盖功能、边界、错误和状态转换 | +| H7-6 | 阶段报告记录 bug/风险 | 大 bug 可后续统一处理 | +| H7-7 | 增加内部 golden replay 基线 | 固定样本可离线回放并被单测锁定 | +| H7-8 | 增加 replay 覆盖摘要 | 年龄段、内容形态、风险区域和 outcome 分布可被单测锁定 | + +验证命令: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_harness_runtime.py tests/test_generation_jobs.py +.venv/bin/python -m ruff check app tests +``` + +阶段报告: + +- `docs/planning/harness-stage-7-report.md` + +### 阶段 8: Admin-Only Evaluation Analytics + +目标: + +- 提供管理控制面内部评测摘要,用于质量运营和评测策略复盘。 +- 明确 admin-only 权限边界,避免将评测数据分发给普通用户。 +- 只返回聚合摘要,不返回原始内容、prompt、单条评测明细或评分 reason。 + +最小任务: + +| ID | 任务 | 验收 | +| --- | --- | --- | +| H8-1 | 新增 admin evaluation analytics 服务 | 可聚合 `evaluation_completed` | +| H8-2 | 新增 admin-only 路由 | `/admin/evaluations/analytics` 受 admin guard 保护 | +| H8-3 | 支持 days/artifact 过滤 | 过滤测试通过 | +| H8-4 | 锁定用户端隔离 | 用户端扫描无 evaluation analytics 入口 | +| H8-5 | 阶段报告记录安全边界 | 明确不返回原始内容和单条明细 | + +验证命令: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_admin_providers.py tests/test_generation_jobs.py +.venv/bin/python -m ruff check app tests +``` + +阶段报告: + +- `docs/planning/harness-stage-8-report.md` + +### 阶段 9: WorkflowPlan 接管扩展 + +目标: + +- 让普通故事无图片、普通故事带图片、绘本三条主生成路径都写入显式 `workflow_planned`。 +- 将计划快照用于锁定事件顺序、可恢复资产任务和后续执行器迁移边界。 +- 继续保持评测数据内部分级,用户端只看到可用进度和可恢复状态。 + +最小任务: + +| ID | 任务 | 验收 | +| --- | --- | --- | +| H9-1 | 带图片故事路径记录 `story_with_assets` plan | 事件顺序中 `workflow_planned` 位于 `worker_started` 与 `context_prepared` 之间 | +| H9-2 | 绘本路径记录 `storybook` plan | plan 快照包含 `evaluate_storybook_pages` 和可恢复图片任务 | +| H9-3 | 补主路径事件顺序测试 | story、story_with_assets、storybook 三条路径均被测试覆盖 | +| H9-4 | 锁定用户端评测隔离 | 用户 API 不返回 `evaluation_completed`、评分、维度或 replay 数据 | +| H9-5 | 阶段报告记录执行偏差和验证结果 | 报告包含实现、审查、测试和风险 | + +验证命令: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_generation_jobs.py +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests +cd ../frontend +npm run build 
+cd ../admin-frontend +npm run build +``` + +阶段报告: + +- `docs/planning/harness-stage-9-report.md` + +### 阶段 10: 资产计划与 Public Metadata Sanitizer + +目标: + +- 将 `asset_generation` 和 `asset_retry` 也纳入 `WorkflowPlan` 记录。 +- 让用户侧 job event metadata 使用白名单脱敏,避免未来内部 metadata 扩展时误泄露质量策略。 +- 保留用户前端需要的可解释字段:step、artifact、failure category、资源状态、Provider 运营摘要和 coarse plan 摘要。 + +最小任务: + +| ID | 任务 | 验收 | +| --- | --- | --- | +| H10-1 | 后台资产生成记录 `asset_generation` plan | worker 事件顺序包含 `workflow_planned` | +| H10-2 | 资源重试记录 `asset_retry` plan | 同步 retry events 包含 plan 快照 | +| H10-3 | 旧封面/音频生成接口记录资产 plan | 兼容接口不破坏现有响应 | +| H10-4 | 用户 event metadata 白名单脱敏 | 用户 API 不返回原始 `plan.tasks`、`result_snapshot`、内部错误和阈值 | +| H10-5 | 补资产计划和 sanitizer 回归测试 | `tests/test_generation_jobs.py` 覆盖相关路径 | + +验证命令: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_generation_jobs.py +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests +cd ../frontend +npm run build +cd ../admin-frontend +npm run build +``` + +阶段报告: + +- `docs/planning/harness-stage-10-report.md` + +### 阶段 11: Trace 访问分级与 Request Payload Sanitizer + +目标: + +- 用户侧 job detail 的 `request_payload` 改为白名单脱敏,避免内部调度参数、Provider override、评测策略或原始输入被接口原样回传。 +- 新增 admin-only generation trace detail,在 `admin_guard` 保护下返回完整内部 request payload、workflow plan 和 evaluation metadata。 +- 明确用户前端与管理控制面的 trace 数据边界,为后续 executor 接管保留完整取证能力。 + +最小任务: + +| ID | 任务 | 验收 | +| --- | --- | --- | +| H11-1 | 用户侧 request payload 白名单脱敏 | 用户 job detail 只返回 output/input mode、资产、故事 ID、页数、图片请求开关等安全控制字段 | +| H11-2 | 新增 admin-only trace detail 服务 | 管理端可按 job id 读取完整内部 request payload 和完整 event metadata | +| H11-3 | 新增 admin trace 路由与响应模型 | `GET /admin/generations/jobs/{job_id}/trace` 受 `admin_guard` 保护 | +| H11-4 | 补用户脱敏和 admin 完整 trace 测试 | 用户接口不含内部字段;admin 接口包含 `evaluation_completed` 和完整 plan | +| H11-5 | 阶段报告记录商业机密边界 | 报告说明用户端不分发评测数据,admin-only 数据用途和剩余风险 | + +验证命令: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_generation_jobs.py 
tests/test_admin_providers.py -q +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests +cd ../frontend +npm run build +cd ../admin-frontend +npm run build +``` + +阶段报告: + +- `docs/planning/harness-stage-11-report.md` + +### 阶段 12: Plan-Driven Asset Executor 试点 + +目标: + +- 让 `WorkflowPlan` 从“记录计划”进入“驱动执行”的第一步。 +- 先接管低风险资产任务:`asset_generation`、`asset_retry`、旧封面生成、旧音频生成。 +- 保留现有 asset workflow 的 provider 调用、状态同步、取消检查和事件记录,不把细节复制进 executor。 +- 保持用户侧公开面不新增评测数据或内部 task metadata。 + +最小任务: + +| ID | 任务 | 验收 | +| --- | --- | --- | +| H12-1 | 新增 `run_asset_plan` | 按 `WorkflowTask.key` 顺序执行图片/音频任务,并返回执行结果 | +| H12-2 | 后台 `asset_generation` 接入 plan runner | 多资产 job 按 plan 顺序生成音频和图片,事件顺序稳定 | +| H12-3 | 同步 `asset_retry` 接入 plan runner | 图片/音频重试仍保持原有完成和失败语义 | +| H12-4 | 旧封面/音频接口接入 plan runner | `/api/image/generate/{id}` 和 `/api/audio/{id}` 行为兼容 | +| H12-5 | 补 executor 与资产路径回归测试 | harness 单测覆盖执行顺序;generation job 测试覆盖组合资产执行 | + +验证命令: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_harness_runtime.py tests/test_generation_jobs.py -q +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests +cd ../frontend +npm run build +cd ../admin-frontend +npm run build +``` + +阶段报告: + +- `docs/planning/harness-stage-12-report.md` + +### 阶段 13: Admin-Only Executor Coverage + +目标: + +- 将资产 executor 的执行结果记录成内部 `executor_completed` 事件。 +- 新增 admin-only executor coverage 聚合,用于审查 plan-driven execution 覆盖率。 +- 用户侧 job detail、job list 和 trace summary 继续隐藏内部 executor task key 与 coverage metadata。 + +最小任务: + +| ID | 任务 | 验收 | +| --- | --- | --- | +| H13-1 | executor result 生成 coverage metadata | metadata 包含 plan mode、planned/executed/ignored counts、task keys、result assets | +| H13-2 | 资产路径记录 `executor_completed` | asset generation/retry 和旧资源接口写入内部 executor 事件 | +| H13-3 | 新增 admin-only coverage API | `GET /admin/executors/coverage` 受 admin guard 保护 | +| H13-4 | 用户侧过滤 executor 事件和 step | 用户 API 不返回 `executor_completed` 或 task keys | +| H13-5 | 补 admin coverage 
与用户隔离测试 | 聚合、过滤、鉴权和用户隔离均被测试覆盖 | + +验证命令: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_generation_jobs.py tests/test_admin_providers.py tests/test_harness_runtime.py -q +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests +cd ../frontend +npm run build +cd ../admin-frontend +npm run build +``` + +阶段报告: + +- `docs/planning/harness-stage-13-report.md` + +### 阶段 14: Admin Trace Executor Coverage Summary + +目标: + +- 让 admin-only 完整 generation trace 自带 executor coverage 摘要。 +- 复用全局 executor coverage 聚合逻辑,保持 `/admin/executors/coverage` 与单 job trace 的统计口径一致。 +- 修正用户 trace summary 的隔离边界,确保内部 `executor_completed` 不通过聚合数量或 task key 泄露。 + +最小任务: + +| ID | 任务 | 验收 | +| --- | --- | --- | +| H14-1 | 抽出 executor coverage 纯聚合函数 | 全局 coverage API 与单 job trace 复用同一函数 | +| H14-2 | admin trace 返回 `executor_coverage` | `GET /admin/generations/jobs/{job_id}/trace` 包含当前 job executor summary | +| H14-3 | 用户 trace summary 过滤 `executor_completed` | 用户 trace summary 不统计内部 executor 事件数量或 task key | +| H14-4 | 补 admin trace summary 与用户隔离测试 | admin 可见覆盖摘要;用户 detail/list/trace summary 不可见 | +| H14-5 | 阶段报告记录审查与验证 | 报告包含实现、风险、命令和结果 | + +验证命令: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_generation_jobs.py tests/test_admin_providers.py tests/test_harness_runtime.py -q +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests +cd ../frontend +npm run build +cd ../admin-frontend +npm run build +``` + +阶段报告: + +- `docs/planning/harness-stage-14-report.md` + +### 阶段 15: Admin-Only Harness Readiness + +目标: + +- 建立一个 admin-only readiness audit,在扩大 harness 接管范围前给出聚合质量门。 +- 复用 golden replay、evaluation analytics 和 executor coverage,避免新增独立统计口径。 +- 保持用户侧完全不可见,不向用户端分发评测数据、executor task key 或 readiness 结果。 + +最小任务: + +| ID | 任务 | 验收 | +| --- | --- | --- | +| H15-1 | 将 golden replay fixture 放入 app 内部路径 | Docker 运行环境可读取内部 golden cases | +| H15-2 | 新增 admin harness readiness 服务 | 聚合 golden replay、evaluation analytics 和 executor coverage | +| H15-3 | 新增 
admin-only readiness API | `GET /admin/harness/readiness` 受 admin guard 保护 | +| H15-4 | 补 readiness ready/blocked/鉴权测试 | ready、blocked、needs_attention 和 401 均被覆盖 | +| H15-5 | 阶段报告记录安全边界和验证 | 报告说明不返回正文、prompt、score reason 或单条事件 | + +验证命令: + +```bash +cd backend +.venv/bin/python -m pytest tests/test_admin_providers.py tests/test_harness_runtime.py -q +.venv/bin/python -m pytest +.venv/bin/python -m ruff check app tests +cd ../frontend +npm run build +cd ../admin-frontend +npm run build +``` + +阶段报告: + +- `docs/planning/harness-stage-15-report.md` + ## 8. 需求与验收 ### 功能需求 @@ -446,6 +1000,13 @@ npm run build | FR-007 | SHOULD | 资产工作流应从主 service 拆出 | `story_service` 行数和职责减少 | | FR-008 | SHOULD | 输出验证应在持久化前执行 | schema 缺失可被测试捕获 | | FR-009 | COULD | 前端展示标准 step/failure category | 构建通过且无布局溢出 | +| FR-010 | MUST | 用户侧事件 metadata 必须白名单脱敏 | 用户 API 不返回评测分数、原始 plan、result snapshot 或内部错误原文 | +| FR-011 | MUST | 用户侧 request payload 必须白名单脱敏 | 用户 job detail 不返回原始输入、内部调度参数、provider override 或评测策略 | +| FR-012 | SHOULD | 管理控制面可读取完整内部 trace | admin-only trace endpoint 返回完整 request payload 和完整 event metadata | +| FR-013 | SHOULD | 资产任务应由 WorkflowPlan 驱动执行 | asset generation/retry 按 plan task key 执行图片和音频任务 | +| FR-014 | SHOULD | 管理控制面可审查 executor 覆盖率 | admin-only coverage endpoint 聚合 executor runs、task counts 和 result assets | +| FR-015 | SHOULD | 管理端单 job trace 可审查 executor 覆盖摘要 | admin-only trace endpoint 返回当前 job 的 executor coverage summary | +| FR-016 | SHOULD | 管理控制面可执行 harness readiness 审查 | admin-only readiness endpoint 聚合 golden replay、evaluation analytics 和 executor coverage | ### 非功能需求 @@ -457,6 +1018,12 @@ npm run build | NFR-004 | SHOULD | 低耦合 | harness 类型模块不依赖 FastAPI 和 SQLAlchemy | | NFR-005 | SHOULD | 性能稳定 | 不新增阻塞式外部调用 | | NFR-006 | SHOULD | 中文一致性 | 文档、用户可见文案和新增注释使用简体中文 | +| NFR-007 | MUST | 默认不公开内部 metadata | 未加入白名单的新字段不会出现在用户侧 job event 响应中 | +| NFR-008 | MUST | Trace 数据访问分级 | 用户接口只返回安全公开字段;完整评测和内部调度数据仅在 admin guard 后提供 | +| NFR-009 | SHOULD | Executor 接管必须小步可回退 | 
先接资产任务;主文本生成仍保持原有服务路径 | +| NFR-010 | MUST | Executor coverage 默认不公开 | `executor_completed`、task keys 和 coverage metadata 不进入用户端接口 | +| NFR-011 | MUST | Admin trace 统计口径一致 | 单 job trace 与全局 executor coverage 复用同一聚合逻辑 | +| NFR-012 | MUST | Readiness 数据默认不公开 | readiness endpoint 只在 admin guard 后提供,不返回正文、prompt、score reason 或单条事件 | ## 9. 风险与缓解 @@ -468,6 +1035,14 @@ npm run build | Provider trace 与 job event 重复 | 低 | 保持 Provider 事件专注调用层,workflow 事件专注产品步骤 | | 文档与实现偏离 | 中 | 每个阶段报告必须记录实现偏差 | | 质量门误伤内容 | 中 | 第四阶段先做确定性低风险检查,模型评审延后 | +| 评测 analytics 泄露商业机密 | 高 | 仅 admin-only 聚合摘要;用户端 API/前端不接入;测试覆盖 admin guard 和用户端隔离 | +| 新增 trace metadata 误进用户 API | 高 | `public_generation_event_metadata` 使用 allowlist,新增字段默认不公开 | +| 请求 payload 混入内部字段 | 高 | `public_generation_request_payload` 使用 allowlist,完整 payload 仅 admin-only trace endpoint 可见 | +| Executor 抽象过早扩大范围 | 中 | 阶段 12 只接管资产 task key;主文本、评测和持久化暂不迁移 | +| Executor coverage 泄露内部执行策略 | 中 | `executor_completed` 全量过滤用户侧响应,只在 admin-only coverage/trace 中提供 | +| Admin trace 与全局 coverage 口径漂移 | 中 | 抽出共享聚合函数,测试同时覆盖 admin trace 和全局 coverage API | +| Readiness 运行环境缺少 golden fixture | 中 | golden cases 放入 app 内部 harness fixtures,随 Docker `COPY app ./app` 发布 | +| Readiness 聚合泄露内部内容 | 高 | 只返回聚合状态和覆盖摘要;测试断言不包含 story title、score reason 或 quality gate message | ## 10. 
审查清单 @@ -490,4 +1065,14 @@ npm run build | 阶段 2 | 已完成主要资产补全抽取 | 封面、音频、持久化绘本缺失图片补全已迁入 harness asset workflows | | 阶段 3 | 已完成计划建模基线 | 已定义 WorkflowPlan/WorkflowTask 和核心模式计划快照;执行器接管留待后续 | | 阶段 4 | 已完成确定性质量门 | 已接入文本故事和绘本结构完整性/儿童安全基础检查 | -| 阶段 5 | 待执行 | Trace Analytics 与前端展示 | +| 阶段 5 | 已完成 trace analytics 与前端展示 | 已新增 trace summary API,并在用户端/管理端生成轨迹中展示 step、artifact、failure category | +| 阶段 6 | 已完成真实运行烟测 | 已通过本地新代码 API/worker + Docker PostgreSQL/Redis 覆盖主生成和图片资源重试链路 | +| 阶段 7 | 已完成 7A/7B/7C/7D/7E 当前切片 | 已接入 deterministic evaluator、`workflow_planned`、`evaluation_completed`、普通故事无图片 plan-aware 路径、绘本内部评测、内部 golden replay 和覆盖摘要;已修正并锁定用户侧不分发评测数据 | +| 阶段 8 | 已完成 admin-only evaluation analytics 当前切片 | 已新增 `/admin/evaluations/analytics` 聚合接口、admin guard 测试、days/artifact 过滤和用户端隔离扫描 | +| 阶段 9 | 已完成 WorkflowPlan 接管扩展当前切片 | 普通故事带图片和绘本路径已记录 plan 快照,三条主路径事件顺序与用户端评测隔离已由测试覆盖 | +| 阶段 10 | 已完成资产计划与 public metadata sanitizer 当前切片 | 资产生成/重试路径已记录 plan;用户侧 event metadata 改为白名单并隐藏原始 plan、result snapshot 和内部字段 | +| 阶段 11 | 已完成 trace 访问分级当前切片 | 用户侧 request payload 改为白名单;新增 admin-only 完整 trace endpoint 并覆盖鉴权和内部事件测试 | +| 阶段 12 | 已完成 plan-driven asset executor 当前切片 | `run_asset_plan` 已按 plan task key 驱动图片/音频资产任务;后台资产生成、资源重试和旧封面/音频接口已接入 | +| 阶段 13 | 已完成 admin-only executor coverage 当前切片 | 资产 executor 已记录内部 `executor_completed`;新增 `/admin/executors/coverage`,用户侧继续过滤 executor 事件和 task keys | +| 阶段 14 | 已完成 admin trace executor coverage summary 当前切片 | admin trace 已内嵌单 job executor coverage 摘要;用户 trace summary 继续过滤内部 executor 事件 | +| 阶段 15 | 已完成 admin-only harness readiness 当前切片 | 新增 `/admin/harness/readiness` 聚合 golden replay、evaluation analytics 与 executor coverage;用户侧继续不可见 | diff --git a/frontend/src/components/GenerationTrace.vue b/frontend/src/components/GenerationTrace.vue index 9a96a40..ea79ce4 100644 --- a/frontend/src/components/GenerationTrace.vue +++ b/frontend/src/components/GenerationTrace.vue @@ -7,6 +7,7 @@ import type { GenerationJobEvent, GenerationJobSummary, GenerationProviderStats, + 
GenerationTraceSummary, } from '../types/generation' import LoadingSpinner from './ui/LoadingSpinner.vue' @@ -27,6 +28,7 @@ const props = withDefaults( const jobHistory = ref([]) const activeJob = ref(null) const providerStats = ref(null) +const traceSummary = ref(null) const loading = ref(false) const actionLoading = ref(false) const error = ref('') @@ -42,6 +44,8 @@ const providerSuccessRate = computed(() => { if (!providerStats.value?.total_calls) return null return Math.round((providerStats.value.successful_calls / providerStats.value.total_calls) * 100) }) +const topTraceStep = computed(() => traceSummary.value?.by_step[0] ?? null) +const topFailureCategory = computed(() => traceSummary.value?.failure_categories[0] ?? null) const containerClass = computed(() => ( isDark.value @@ -100,6 +104,7 @@ function getJobStatusLabel(status?: string) { function getEventLabel(eventType: string) { const labels: Record = { request_accepted: '请求接收', + workflow_planned: '工作流规划', worker_started: '后台任务开始', retry_queued: '重新排队', cancel_requested: '已请求取消', @@ -122,6 +127,7 @@ function getEventLabel(eventType: string) { provider_call_started: '供应商调用', provider_call_succeeded: '供应商成功', provider_call_failed: '供应商失败', + quality_gate_failed: '质量门失败', asset_retry_started: '资源重试开始', asset_retry_completed: '资源重试完成', asset_retry_failed: '资源重试失败', @@ -134,6 +140,72 @@ function getEventLabel(eventType: string) { return labels[eventType] ?? eventType } +function getStepLabel(step?: unknown) { + const labels: Record = { + request_acceptance: '请求接收', + worker_start: '后台启动', + context_preparation: '上下文准备', + narrative_generation: '主内容生成', + story_persistence: '故事保存', + provider_invocation: '供应商调用', + image_generation: '图片生成', + audio_generation: '音频生成', + asset_retry: '资源重试', + asset_generation: '资源生成', + postprocessing: '后处理', + completion: '任务完成', + cancellation: '取消', + stale_recovery: '超时收敛', + unknown: '未知步骤', + } + const key = typeof step === 'string' ? step : '' + return labels[key] ?? 
key +} + +function getArtifactLabel(artifact?: unknown) { + const labels: Record = { + story_text: '故事正文', + storybook_pages: '绘本分页', + cover_image: '封面图', + page_image: '分页插图', + image: '图片资源', + audio: '音频', + achievement_memory: '成长记忆', + none: '无资源', + unknown: '未知资源', + } + const key = typeof artifact === 'string' ? artifact : '' + return labels[key] ?? key +} + +function getFailureCategoryLabel(category?: unknown) { + const labels: Record = { + provider_error: '供应商失败', + schema_error: '结构不完整', + safety_error: '儿童安全风险', + timeout: '超时', + canceled: '用户取消', + stale_job: '任务卡住', + storage_error: '存储失败', + validation_error: '输入校验失败', + unknown_error: '未知失败', + } + const key = typeof category === 'string' ? category : '' + return labels[key] ?? key +} + +function getTraceMetaText(event: GenerationJobEvent) { + const meta = event.event_metadata + const step = getStepLabel(meta.step) + const artifact = getArtifactLabel(meta.artifact) + const failureCategory = meta.failure_category + ? getFailureCategoryLabel(meta.failure_category) + : '' + return [step, artifact && artifact !== '无资源' ? artifact : '', failureCategory] + .filter(Boolean) + .join(' · ') +} + function formatDateTime(value: string) { return new Intl.DateTimeFormat('zh-CN', { hour: '2-digit', @@ -175,22 +247,25 @@ async function selectGenerationJob(jobId: string) { async function refresh() { if (props.storyId === null) { - jobHistory.value = [] - activeJob.value = null - providerStats.value = null - return + jobHistory.value = [] + activeJob.value = null + providerStats.value = null + traceSummary.value = null + return } error.value = '' const selectedJobId = activeJob.value?.id ?? 
null try { - const [jobs, stats] = await Promise.all([ + const [jobs, stats, trace] = await Promise.all([ api.get(`/api/generations/${props.storyId}/jobs`), api.get(`/api/generations/${props.storyId}/provider-stats`), + api.get(`/api/generations/${props.storyId}/trace-summary`), ]) jobHistory.value = jobs providerStats.value = stats + traceSummary.value = trace const nextJobId = ( selectedJobId ? jobHistory.value.find((job) => job.id === selectedJobId)?.id @@ -205,6 +280,7 @@ async function refresh() { jobHistory.value = [] activeJob.value = null providerStats.value = null + traceSummary.value = null error.value = e instanceof Error ? e.message : '生成轨迹加载失败' } } @@ -318,6 +394,32 @@ defineExpose({ refresh }) +
+
+
流程事件
+
{{ traceSummary.total_events }}
+
+
+
失败事件
+
{{ traceSummary.failed_events }}
+
+
+
主要步骤
+
+ {{ topTraceStep ? `${getStepLabel(topTraceStep.name)} · ${topTraceStep.count}` : '暂无' }} +
+
+
+
主要失败
+
+ {{ topFailureCategory ? `${getFailureCategoryLabel(topFailureCategory.name)} · ${topFailureCategory.count}` : '暂无' }} +
+
+
+
暂无生成轨迹。旧数据会在下一次资源补全后开始记录。
@@ -432,6 +534,9 @@ defineExpose({ refresh })

{{ event.message }}

+

+ {{ getTraceMetaText(event) }} +

diff --git a/frontend/src/types/generation.ts b/frontend/src/types/generation.ts index 688ae92..c188cf7 100644 --- a/frontend/src/types/generation.ts +++ b/frontend/src/types/generation.ts @@ -58,6 +58,21 @@ export interface GenerationProviderStats { }> } +export interface GenerationTraceBucket { + name: string + count: number +} + +export interface GenerationTraceSummary { + story_id: number + window_days: number | null + total_events: number + failed_events: number + by_step: GenerationTraceBucket[] + by_artifact: GenerationTraceBucket[] + failure_categories: GenerationTraceBucket[] +} + export interface GenerationProviderAnalytics { window_days: number | null capability: string | null