Expand generation harness observability

2026-06-24 10:48:23 +08:00
parent 459ca9edef
commit 1f34d80083
35 changed files with 8003 additions and 112 deletions
--- a/backend/app/services/admin_evaluation_analytics.py
+++ b/backend/app/services/admin_evaluation_analytics.py
@@ -0,0 +1,204 @@
+"""Admin-only analytics for internal generation evaluation events."""
+
+from __future__ import annotations
+
+from datetime import datetime, timedelta, timezone
+from typing import Any
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.models import GenerationJob, GenerationJobEvent
+
+
+def _as_float(value: Any) -> float | None:
+    """Coerce a JSON number to ``float``; return ``None`` when it is unusable.
+
+    Non-numeric values yield ``None``.  NaN, infinities and integers too large
+    to be represented as a float also yield ``None`` so that a single corrupt
+    metadata value cannot poison the running totals or produce an average
+    that cannot be serialized as strict JSON.
+    """
+    import math  # local import keeps this helper self-contained
+
+    if not isinstance(value, int | float):
+        return None
+    try:
+        number = float(value)
+    except OverflowError:
+        # Arbitrarily large ints cannot be converted to a float.
+        return None
+    return number if math.isfinite(number) else None
+
+
+def _sorted_count_buckets(counts: dict[str, int], *, key_name: str) -> list[dict[str, Any]]:
+    """Turn a ``name -> count`` mapping into a list of bucket dicts.
+
+    Buckets are ordered by descending count, ties broken by ascending name.
+    Each bucket stores the name under ``key_name`` and the tally under
+    ``"count"``.
+    """
+
+    def _rank(entry: tuple[str, int]) -> tuple[int, str]:
+        label, tally = entry
+        return (-tally, label)
+
+    buckets: list[dict[str, Any]] = []
+    for label, tally in sorted(counts.items(), key=_rank):
+        buckets.append({key_name: label, "count": tally})
+    return buckets
+
+
+def _average_bucket(
+    totals: dict[str, float],
+    counts: dict[str, int],
+    *,
+    key_name: str,
+) -> list[dict[str, Any]]:
+    """Build per-name average-score rows from parallel total/count mappings.
+
+    Names with a missing or zero count are skipped.  Averages are rounded to
+    four decimals and rows are ordered by descending count, then by name.
+    """
+    averaged: list[dict[str, Any]] = []
+    for label, total in totals.items():
+        samples = counts.get(label)
+        if not samples:
+            continue
+        averaged.append(
+            {
+                key_name: label,
+                "average_score": round(total / samples, 4),
+                "count": samples,
+            }
+        )
+    return sorted(
+        averaged,
+        key=lambda row: (-int(row["count"]), str(row[key_name])),
+    )
+
+
+def _score_band(score: float) -> str:
+    """Map an overall score onto its named band.
+
+    Scores of at least 0.9 / 0.8 / 0.7 are "excellent" / "good" / "pass";
+    lower positive scores are "blocked_low_score" and everything else is
+    "blocked_quality_gate".
+    """
+    for floor, label in ((0.9, "excellent"), (0.8, "good"), (0.7, "pass")):
+        if score >= floor:
+            return label
+    return "blocked_low_score" if score > 0 else "blocked_quality_gate"
+
+
+def _metadata_scores(metadata: dict[str, Any]) -> list[dict[str, Any]]:
+    """Return the dict entries of ``metadata["scores"]``, or ``[]`` if it is not a list."""
+    candidates = metadata.get("scores")
+    if isinstance(candidates, list):
+        return [entry for entry in candidates if isinstance(entry, dict)]
+    return []
+
+
+def _quality_gate_issues(metadata: dict[str, Any]) -> list[dict[str, Any]]:
+    """Return the dict entries of ``metadata["quality_gate"]["issues"]``.
+
+    An empty list is returned when the gate is not a dict or its issues are
+    not a list.
+    """
+    gate = metadata.get("quality_gate")
+    issues = gate.get("issues") if isinstance(gate, dict) else None
+    if not isinstance(issues, list):
+        return []
+    return [entry for entry in issues if isinstance(entry, dict)]
+
+
+async def get_admin_evaluation_analytics(
+    db: AsyncSession,
+    *,
+    days: int | None = None,
+    artifact: str | None = None,
+) -> dict[str, Any]:
+    """Summarize ``evaluation_completed`` events for the admin control plane.
+
+    Args:
+        db: Async session used to load the events joined with their jobs.
+        days: When given, only events created within the last ``days`` days
+            (relative to the current UTC time) are queried.
+        artifact: When given, only events whose metadata ``artifact`` equals
+            this value are counted; the filter is applied in Python.
+
+    Returns:
+        A dict with overall totals, pass rate, average score, distinct
+        job/story/user counts and count buckets per artifact, output mode,
+        score band, dimension, quality-gate code, failure category and warning.
+    """
+
+    def _bump(bucket: dict[str, int], label: str) -> None:
+        bucket[label] = bucket.get(label, 0) + 1
+
+    statement = (
+        select(GenerationJobEvent, GenerationJob)
+        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
+        .where(GenerationJobEvent.event_type == "evaluation_completed")
+        .order_by(GenerationJobEvent.id)
+    )
+    if days is not None:
+        window_start = datetime.now(timezone.utc) - timedelta(days=days)
+        statement = statement.where(GenerationJobEvent.created_at >= window_start)
+
+    result_rows = (await db.execute(statement)).all()
+
+    evaluated = 0
+    passed_total = 0
+    blocked_total = 0
+    score_sum = 0.0
+    scored = 0
+    seen_jobs: set[str] = set()
+    seen_stories: set[int] = set()
+    seen_users: set[str] = set()
+    per_artifact: dict[str, int] = {}
+    per_output_mode: dict[str, int] = {}
+    per_band: dict[str, int] = {}
+    dim_totals: dict[str, float] = {}
+    dim_counts: dict[str, int] = {}
+    gate_codes: dict[str, int] = {}
+    per_failure_category: dict[str, int] = {}
+    per_warning: dict[str, int] = {}
+
+    for event, job in result_rows:
+        meta = event.event_metadata or {}
+        artifact_name = str(meta.get("artifact") or "unknown")
+        if artifact is not None and artifact_name != artifact:
+            continue
+
+        evaluated += 1
+        seen_jobs.add(job.id)
+        seen_users.add(job.user_id)
+        # Prefer the story recorded on the event, falling back to the job's.
+        story_ref = event.story_id if event.story_id is not None else job.story_id
+        if story_ref is not None:
+            seen_stories.add(int(story_ref))
+
+        _bump(per_artifact, artifact_name)
+        _bump(per_output_mode, job.output_mode)
+
+        # Only a literal True counts; truthy non-bool values are ignored.
+        if meta.get("passed") is True:
+            passed_total += 1
+        if meta.get("blocking") is True:
+            blocked_total += 1
+
+        overall = _as_float(meta.get("overall_score"))
+        if overall is not None:
+            score_sum += overall
+            scored += 1
+            _bump(per_band, _score_band(overall))
+
+        for entry in _metadata_scores(meta):
+            dimension = entry.get("dimension")
+            value = _as_float(entry.get("score"))
+            if isinstance(dimension, str) and value is not None:
+                dim_totals[dimension] = dim_totals.get(dimension, 0.0) + value
+                _bump(dim_counts, dimension)
+
+        for issue in _quality_gate_issues(meta):
+            code = issue.get("code")
+            if isinstance(code, str) and code:
+                _bump(gate_codes, code)
+            category = issue.get("failure_category")
+            if isinstance(category, str) and category:
+                _bump(per_failure_category, category)
+
+        warnings = meta.get("warnings")
+        if isinstance(warnings, list):
+            for text in warnings:
+                if isinstance(text, str) and text:
+                    _bump(per_warning, text)
+
+    pass_rate = round(passed_total / evaluated, 4) if evaluated else 0.0
+    average_score = round(score_sum / scored, 4) if scored else None
+
+    return {
+        "scope": "admin_internal_evaluations",
+        "window_days": days,
+        "artifact": artifact,
+        "total_evaluations": evaluated,
+        "passed_evaluations": passed_total,
+        "blocked_evaluations": blocked_total,
+        "pass_rate": pass_rate,
+        "average_score": average_score,
+        "job_count": len(seen_jobs),
+        "story_count": len(seen_stories),
+        "user_count": len(seen_users),
+        "by_artifact": _sorted_count_buckets(per_artifact, key_name="artifact"),
+        "by_output_mode": _sorted_count_buckets(per_output_mode, key_name="output_mode"),
+        "score_bands": _sorted_count_buckets(per_band, key_name="band"),
+        "dimension_scores": _average_bucket(
+            dim_totals,
+            dim_counts,
+            key_name="dimension",
+        ),
+        "quality_gate_issues": _sorted_count_buckets(gate_codes, key_name="code"),
+        "failure_categories": _sorted_count_buckets(
+            per_failure_category,
+            key_name="category",
+        ),
+        "warnings": _sorted_count_buckets(per_warning, key_name="message"),
+    }
--- a/backend/app/services/admin_executor_coverage.py
+++ b/backend/app/services/admin_executor_coverage.py
@@ -0,0 +1,147 @@
+"""Admin-only analytics for internal workflow executor coverage."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from datetime import datetime, timedelta, timezone
+from typing import Any
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.models import GenerationJob, GenerationJobEvent
+
+
+def _as_int(value: Any) -> int:
+    """Coerce a metadata value to ``int``, falling back to ``0``.
+
+    Booleans become 0/1, ints are returned unchanged and floats are truncated
+    toward zero.  Anything else, including NaN and infinite floats that
+    ``int()`` cannot convert, yields ``0`` so one malformed counter cannot
+    abort the whole aggregation.
+    """
+    if isinstance(value, bool):
+        return int(value)
+    if isinstance(value, int):
+        return value
+    if isinstance(value, float):
+        try:
+            return int(value)
+        except (ValueError, OverflowError):
+            # int(nan) raises ValueError; int(+/-inf) raises OverflowError.
+            return 0
+    return 0
+
+
+def _sorted_count_buckets(counts: dict[str, int], *, key_name: str) -> list[dict[str, Any]]:
+    """Render ``name -> count`` pairs as bucket dicts, largest count first.
+
+    Equal counts are ordered by ascending name.  The name is stored under
+    ``key_name`` and the tally under ``"count"``.
+    """
+    ordered = sorted(counts.items(), key=lambda pair: (-pair[1], pair[0]))
+    return [{key_name: label, "count": total} for label, total in ordered]
+
+
+def _iter_strings(value: Any) -> Iterable[str]:
+    """Yield the non-empty strings in ``value`` when it is a list, tuple or set.
+
+    Any other input (including a bare string) yields nothing.
+    """
+    if isinstance(value, list | tuple | set):
+        yield from (item for item in value if isinstance(item, str) and item)
+
+
+def summarize_executor_coverage_rows(
+    rows: Iterable[tuple[GenerationJobEvent, GenerationJob]],
+    *,
+    days: int | None = None,
+    plan_mode: str | None = None,
+    scope: str = "admin_internal_executor_coverage",
+) -> dict[str, Any]:
+    """Fold ``(event, job)`` pairs into an executor coverage summary.
+
+    Args:
+        rows: Pairs of an executor event and the job it belongs to.
+        days: Echoed back as ``window_days``; no filtering happens here.
+        plan_mode: When given, only events whose metadata ``plan_mode`` equals
+            this value are counted.
+        scope: Label stored under ``"scope"`` in the result.
+
+    Returns:
+        A dict with run and task totals, the executed/planned coverage ratio
+        (``0.0`` when nothing was planned), distinct job/story/user counts and
+        count buckets per plan mode, output mode, task key and result asset.
+    """
+
+    def _bump(bucket: dict[str, int], label: str) -> None:
+        bucket[label] = bucket.get(label, 0) + 1
+
+    runs = 0
+    planned = 0
+    executed = 0
+    ignored = 0
+    seen_jobs: set[str] = set()
+    seen_stories: set[int] = set()
+    seen_users: set[str] = set()
+    per_plan_mode: dict[str, int] = {}
+    per_output_mode: dict[str, int] = {}
+    executed_keys: dict[str, int] = {}
+    ignored_keys: dict[str, int] = {}
+    asset_counts: dict[str, int] = {}
+
+    for event, job in rows:
+        meta = event.event_metadata or {}
+        mode_label = str(meta.get("plan_mode") or "unknown")
+        if plan_mode is not None and mode_label != plan_mode:
+            continue
+
+        runs += 1
+        seen_jobs.add(job.id)
+        seen_users.add(job.user_id)
+        # Prefer the story recorded on the event, falling back to the job's.
+        story_ref = event.story_id if event.story_id is not None else job.story_id
+        if story_ref is not None:
+            seen_stories.add(int(story_ref))
+
+        _bump(per_plan_mode, mode_label)
+        _bump(per_output_mode, job.output_mode)
+
+        planned += _as_int(meta.get("planned_task_count"))
+        executed += _as_int(meta.get("executed_task_count"))
+        ignored += _as_int(meta.get("ignored_task_count"))
+
+        for task_key in _iter_strings(meta.get("executed_task_keys")):
+            _bump(executed_keys, task_key)
+        for task_key in _iter_strings(meta.get("ignored_task_keys")):
+            _bump(ignored_keys, task_key)
+        for asset_name in _iter_strings(meta.get("result_assets")):
+            _bump(asset_counts, asset_name)
+
+    if planned:
+        ratio = round(executed / planned, 4)
+    else:
+        ratio = 0.0
+
+    return {
+        "scope": scope,
+        "window_days": days,
+        "plan_mode": plan_mode,
+        "total_runs": runs,
+        "total_planned_tasks": planned,
+        "total_executed_tasks": executed,
+        "total_ignored_tasks": ignored,
+        "coverage_ratio": ratio,
+        "job_count": len(seen_jobs),
+        "story_count": len(seen_stories),
+        "user_count": len(seen_users),
+        "by_plan_mode": _sorted_count_buckets(per_plan_mode, key_name="plan_mode"),
+        "by_output_mode": _sorted_count_buckets(per_output_mode, key_name="output_mode"),
+        "executed_task_keys": _sorted_count_buckets(executed_keys, key_name="task_key"),
+        "ignored_task_keys": _sorted_count_buckets(ignored_keys, key_name="task_key"),
+        "result_assets": _sorted_count_buckets(asset_counts, key_name="asset"),
+    }
+
+
+async def get_admin_executor_coverage(
+    db: AsyncSession,
+    *,
+    days: int | None = None,
+    plan_mode: str | None = None,
+) -> dict[str, Any]:
+    """Load ``executor_completed`` events and summarize their coverage.
+
+    Args:
+        db: Async session used to query events joined with their jobs.
+        days: When given, restricts the query to events created within the
+            last ``days`` days relative to the current UTC time.
+        plan_mode: Forwarded to ``summarize_executor_coverage_rows``.
+
+    Returns:
+        The summary produced by ``summarize_executor_coverage_rows``.
+    """
+
+    statement = (
+        select(GenerationJobEvent, GenerationJob)
+        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
+        .where(GenerationJobEvent.event_type == "executor_completed")
+        .order_by(GenerationJobEvent.id)
+    )
+    if days is not None:
+        window_start = datetime.now(timezone.utc) - timedelta(days=days)
+        statement = statement.where(GenerationJobEvent.created_at >= window_start)
+
+    result = await db.execute(statement)
+    return summarize_executor_coverage_rows(
+        result.all(),
+        days=days,
+        plan_mode=plan_mode,
+    )
--- a/backend/app/services/admin_generation_trace.py
+++ b/backend/app/services/admin_generation_trace.py
@@ -0,0 +1,52 @@
+"""Admin-only generation trace detail service."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from fastapi import HTTPException
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.models import GenerationJob, GenerationJobEvent
+from app.services.admin_executor_coverage import summarize_executor_coverage_rows
+from app.services.generation_jobs import (
+    generation_event_to_response,
+    generation_job_to_summary,
+)
+
+
+async def get_admin_generation_job_trace(
+    db: AsyncSession,
+    *,
+    job_id: str,
+) -> dict[str, Any]:
+    """Assemble the full internal trace of one generation job.
+
+    Args:
+        db: Async session used to load the job and its events.
+        job_id: Identifier of the job to trace.
+
+    Returns:
+        The job summary extended with ``user_id``, the raw ``request_payload``,
+        an executor coverage summary for this job and every event in id order.
+
+    Raises:
+        HTTPException: 404 when no job has the given id.
+    """
+
+    job_result = await db.execute(
+        select(GenerationJob).where(GenerationJob.id == job_id)
+    )
+    job = job_result.scalar_one_or_none()
+    if job is None:
+        raise HTTPException(status_code=404, detail="Generation job not found")
+
+    event_result = await db.execute(
+        select(GenerationJobEvent)
+        .where(GenerationJobEvent.job_id == job.id)
+        .order_by(GenerationJobEvent.id)
+    )
+    events = event_result.scalars().all()
+
+    # The coverage summarizer expects (event, job) pairs.
+    executor_rows = []
+    for event in events:
+        if event.event_type == "executor_completed":
+            executor_rows.append((event, job))
+
+    trace: dict[str, Any] = dict(generation_job_to_summary(job))
+    trace["user_id"] = job.user_id
+    trace["request_payload"] = job.request_payload or {}
+    trace["executor_coverage"] = summarize_executor_coverage_rows(
+        executor_rows,
+        scope="admin_internal_job_executor_coverage",
+    )
+    trace["events"] = [generation_event_to_response(event) for event in events]
+    return trace
--- a/backend/app/services/admin_harness_readiness.py
+++ b/backend/app/services/admin_harness_readiness.py
@@ -0,0 +1,262 @@
+"""Admin-only readiness audit for harness-driven generation."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics
+from app.services.admin_executor_coverage import get_admin_executor_coverage
+from app.services.harness.evaluation_replay import replay_evaluation_golden_cases
+
+# Golden evaluation cases replayed by the readiness audit, resolved relative
+# to this module as harness/fixtures/evaluation_golden_cases.json.
+_GOLDEN_CASES_PATH = (
+    Path(__file__).resolve().parent
+    / "harness"
+    / "fixtures"
+    / "evaluation_golden_cases.json"
+)
+
+# Readiness thresholds: compared against the analytics summaries by the check
+# helpers in this module and echoed back under "thresholds" in the report.
+_MIN_RUNTIME_EVALUATIONS = 1
+_MIN_EXECUTOR_RUNS = 1
+_MIN_EVALUATION_PASS_RATE = 0.7
+_MIN_EVALUATION_AVERAGE_SCORE = 0.7
+_MIN_EXECUTOR_COVERAGE_RATIO = 0.2
+
+
+def _check(
+    *,
+    code: str,
+    status: str,
+    message: str,
+    details: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Build one readiness check record; missing or empty ``details`` become ``{}``."""
+    record: dict[str, Any] = {"code": code, "status": status, "message": message}
+    record["details"] = details or {}
+    return record
+
+
+def _overall_status(checks: list[dict[str, Any]]) -> str:
+    """Return the most severe status among ``checks``.
+
+    "blocked" outranks "needs_attention"; when neither occurs the result is
+    "ready".
+    """
+    seen = {entry["status"] for entry in checks}
+    for severity in ("blocked", "needs_attention"):
+        if severity in seen:
+            return severity
+    return "ready"
+
+
+def _run_golden_replay() -> dict[str, Any]:
+    """Replay the golden evaluation cases and return a plain-dict summary.
+
+    When the fixture file does not exist, a failed summary with zero cases and
+    the single failed id "fixture_missing" is returned instead.
+    """
+    if _GOLDEN_CASES_PATH.exists():
+        replay = replay_evaluation_golden_cases(_GOLDEN_CASES_PATH)
+        return {
+            "passed": replay.passed,
+            "total_cases": len(replay.cases),
+            "failed_case_ids": list(replay.failed_case_ids),
+            "coverage_summary": replay.coverage_summary(),
+        }
+
+    return {
+        "passed": False,
+        "total_cases": 0,
+        "failed_case_ids": ["fixture_missing"],
+        "coverage_summary": {},
+    }
+
+
+def _golden_replay_check(golden_replay: dict[str, Any]) -> dict[str, Any]:
+    """Turn the golden replay summary into a readiness check.
+
+    The check is "ready" only when the replay passed and ran at least one
+    case; otherwise it is "blocked" and lists the failed case ids.
+    """
+    replay_passed = golden_replay["passed"]
+    total_cases = golden_replay["total_cases"]
+    failed_ids = golden_replay["failed_case_ids"]
+    details: dict[str, Any] = {
+        "total_cases": total_cases,
+        "failed_case_count": len(failed_ids),
+    }
+
+    if replay_passed and total_cases > 0:
+        return _check(
+            code="golden_replay",
+            status="ready",
+            message="内部 golden replay 全部通过。",
+            details=details,
+        )
+
+    details["failed_case_ids"] = failed_ids
+    return _check(
+        code="golden_replay",
+        status="blocked",
+        message="内部 golden replay 未通过，暂停扩大 harness 接管范围。",
+        details=details,
+    )
+
+
+def _evaluation_sample_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
+    """Check that the window holds at least the minimum number of evaluations.
+
+    The result is "ready" when ``total_evaluations`` reaches
+    ``_MIN_RUNTIME_EVALUATIONS`` and "needs_attention" otherwise.
+    """
+    total = int(evaluation_analytics["total_evaluations"])
+    if total >= _MIN_RUNTIME_EVALUATIONS:
+        status = "ready"
+        message = "当前窗口已有内部 evaluation 运行样本。"
+    else:
+        status = "needs_attention"
+        message = "当前窗口缺少内部 evaluation 运行样本，建议先跑生成烟测。"
+
+    return _check(
+        code="runtime_evaluation_samples",
+        status=status,
+        message=message,
+        details={
+            "total_evaluations": total,
+            "min_required": _MIN_RUNTIME_EVALUATIONS,
+        },
+    )
+
+
+def _evaluation_quality_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
+    """Check runtime evaluation pass rate and average score against thresholds.
+
+    With no evaluations the result is "needs_attention".  It is "blocked"
+    when the pass rate is below ``_MIN_EVALUATION_PASS_RATE`` or a present
+    average score is below ``_MIN_EVALUATION_AVERAGE_SCORE``; a missing
+    average score (``None``) does not block.  Otherwise it is "ready".
+    """
+    total = int(evaluation_analytics["total_evaluations"])
+    pass_rate = float(evaluation_analytics["pass_rate"])
+    average_score = evaluation_analytics["average_score"]
+
+    if total == 0:
+        return _check(
+            code="runtime_evaluation_quality",
+            status="needs_attention",
+            message="暂无运行期 evaluation 质量样本。",
+            details={
+                "total_evaluations": total,
+                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
+                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
+            },
+        )
+
+    below_threshold = pass_rate < _MIN_EVALUATION_PASS_RATE
+    if not below_threshold and average_score is not None:
+        below_threshold = float(average_score) < _MIN_EVALUATION_AVERAGE_SCORE
+
+    details: dict[str, Any] = {
+        "pass_rate": pass_rate,
+        "average_score": average_score,
+        "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
+    }
+    if below_threshold:
+        details["min_pass_rate"] = _MIN_EVALUATION_PASS_RATE
+        details["min_average_score"] = _MIN_EVALUATION_AVERAGE_SCORE
+        return _check(
+            code="runtime_evaluation_quality",
+            status="blocked",
+            message="运行期 evaluation 质量未达到内部 readiness 门槛。",
+            details=details,
+        )
+
+    return _check(
+        code="runtime_evaluation_quality",
+        status="ready",
+        message="运行期 evaluation 通过率和平均分达到内部 readiness 门槛。",
+        details=details,
+    )
+
+
+def _executor_sample_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
+    """Check that the window holds at least the minimum number of executor runs.
+
+    The result is "ready" when ``total_runs`` reaches ``_MIN_EXECUTOR_RUNS``
+    and "needs_attention" otherwise.
+    """
+    total_runs = int(executor_coverage["total_runs"])
+    if total_runs >= _MIN_EXECUTOR_RUNS:
+        status = "ready"
+        message = "当前窗口已有 executor coverage 运行样本。"
+    else:
+        status = "needs_attention"
+        message = "当前窗口缺少 executor coverage 样本，建议先跑资产生成或重试烟测。"
+
+    return _check(
+        code="executor_coverage_samples",
+        status=status,
+        message=message,
+        details={
+            "total_runs": total_runs,
+            "min_required": _MIN_EXECUTOR_RUNS,
+        },
+    )
+
+
+def _executor_ratio_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
+    """Check the executor coverage ratio against its readiness threshold.
+
+    With no runs the result is "needs_attention"; a ratio below
+    ``_MIN_EXECUTOR_COVERAGE_RATIO`` is "blocked"; otherwise it is "ready".
+    """
+    total_runs = int(executor_coverage["total_runs"])
+    coverage_ratio = float(executor_coverage["coverage_ratio"])
+
+    if total_runs == 0:
+        return _check(
+            code="executor_coverage_ratio",
+            status="needs_attention",
+            message="暂无 executor coverage 运行样本。",
+            details={
+                "total_runs": total_runs,
+                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
+            },
+        )
+
+    task_totals = {
+        "total_planned_tasks": executor_coverage["total_planned_tasks"],
+        "total_executed_tasks": executor_coverage["total_executed_tasks"],
+    }
+
+    if coverage_ratio < _MIN_EXECUTOR_COVERAGE_RATIO:
+        return _check(
+            code="executor_coverage_ratio",
+            status="blocked",
+            message="executor coverage ratio 未达到内部 readiness 门槛。",
+            details={
+                "coverage_ratio": coverage_ratio,
+                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
+                **task_totals,
+            },
+        )
+
+    return _check(
+        code="executor_coverage_ratio",
+        status="ready",
+        message="executor coverage ratio 达到内部 readiness 门槛。",
+        details={"coverage_ratio": coverage_ratio, **task_totals},
+    )
+
+
+async def get_admin_harness_readiness(
+    db: AsyncSession,
+    *,
+    days: int | None = None,
+) -> dict[str, Any]:
+    """Run every readiness check and bundle the results for the admin plane.
+
+    Args:
+        db: Async session passed to the evaluation and executor analytics.
+        days: Optional window, in days, forwarded to both analytics queries.
+
+    Returns:
+        A dict with the overall status, the thresholds in force, the
+        individual checks and the raw summaries the checks were derived from.
+    """
+
+    golden_replay = _run_golden_replay()
+    evaluation_analytics = await get_admin_evaluation_analytics(db, days=days)
+    executor_coverage = await get_admin_executor_coverage(db, days=days)
+
+    checks = [_golden_replay_check(golden_replay)]
+    checks.append(_evaluation_sample_check(evaluation_analytics))
+    checks.append(_evaluation_quality_check(evaluation_analytics))
+    checks.append(_executor_sample_check(executor_coverage))
+    checks.append(_executor_ratio_check(executor_coverage))
+
+    thresholds = {
+        "min_runtime_evaluations": _MIN_RUNTIME_EVALUATIONS,
+        "min_executor_runs": _MIN_EXECUTOR_RUNS,
+        "min_evaluation_pass_rate": _MIN_EVALUATION_PASS_RATE,
+        "min_evaluation_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
+        "min_executor_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
+    }
+
+    return {
+        "scope": "admin_internal_harness_readiness",
+        "window_days": days,
+        "status": _overall_status(checks),
+        "thresholds": thresholds,
+        "checks": checks,
+        "golden_replay": golden_replay,
+        "evaluation_analytics": evaluation_analytics,
+        "executor_coverage": executor_coverage,
+    }
--- a/backend/app/services/generation_jobs.py
+++ b/backend/app/services/generation_jobs.py
@@ -90,11 +90,13 @@ def _job_progress(job: GenerationJob) -> dict[str, Any]:

    progress_map: dict[str, tuple[int, str]] = {
        "request_accepted": (5, "已接收请求"),
+        "workflow_planned": (8, "工作流已规划"),
        "retry_queued": (8, "重新排队中"),
        "worker_started": (12, "后台任务已开始"),
        "cancel_requested": (15, "已请求取消"),
        "context_prepared": (20, "上下文已准备"),
        "narrative_generated": (45, "正文已生成"),
+        "evaluation_completed": (52, "内容评测已完成"),
        "story_saved": (60, "主记录已保存"),
        "provider_call_started": (65, "Provider 调用中"),
        "provider_call_succeeded": (72, "Provider 调用成功"),
@@ -307,6 +309,137 @@ def generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any]:
    }


+# Allow-list of event metadata keys that public_generation_event_metadata
+# copies into user-facing event responses; keys not listed here are dropped.
+_PUBLIC_EVENT_METADATA_KEYS = {
+    "adapter",
+    "artifact",
+    "asset",
+    "assets",
+    "attempted_cover",
+    "audio_status",
+    "blocks_main_result",
+    "capability",
+    "completed_pages",
+    "cover_prompt_present",
+    "estimated_cost_usd",
+    "failed_pages",
+    "failure_category",
+    "generation_status",
+    "has_memory_context",
+    "image_status",
+    "input_type",
+    "latency_ms",
+    "mode",
+    "output_mode",
+    "page_count",
+    "page_number",
+    "recoverable",
+    "requested_from_step",
+    "retryable",
+    "scope",
+    "stale_after_minutes",
+    "status",
+    "step",
+    "strategy",
+    "text_status",
+}
+
+# Allow-list of request payload keys that public_generation_request_payload
+# exposes in user-facing job details; keys not listed here are dropped.
+_PUBLIC_REQUEST_PAYLOAD_KEYS = {
+    "assets",
+    "child_profile_id",
+    "generate_images",
+    "input_type",
+    "output_mode",
+    "page_count",
+    "story_id",
+    "type",
+    "universe_id",
+}
+
+
+def _public_metadata_value(value: Any) -> Any:
+    """Reduce ``value`` to the part that may appear in user-facing output.
+
+    Scalars (``str``, ``int``, ``float``, ``bool``) and ``None`` are returned
+    unchanged.  A list is returned with only its scalar/``None`` items kept.
+    Every other type (dicts, tuples, ...) yields ``None``.
+    """
+
+    scalar_types = (str, int, float, bool, type(None))
+    if isinstance(value, scalar_types):
+        return value
+    if not isinstance(value, list):
+        return None
+    return [item for item in value if isinstance(item, scalar_types)]
+
+
+def public_generation_request_payload(job: GenerationJob) -> dict[str, Any]:
+    """Project a job's request payload onto its user-facing fields.
+
+    Only keys in ``_PUBLIC_REQUEST_PAYLOAD_KEYS`` are considered, in sorted
+    order; a key is omitted when ``_public_metadata_value`` reduces its value
+    to ``None``.
+    """
+
+    raw_payload = job.request_payload or {}
+    return {
+        key: safe_value
+        for key in sorted(_PUBLIC_REQUEST_PAYLOAD_KEYS)
+        if key in raw_payload
+        and (safe_value := _public_metadata_value(raw_payload[key])) is not None
+    }
+
+
+def _public_plan_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
+    """Summarize ``metadata["plan"]`` without exposing its task details.
+
+    Returns ``plan_mode`` when the plan's ``mode`` is a string, and
+    ``planned_task_count`` / ``recoverable_task_count`` when its ``tasks`` is
+    a list.  A task counts as recoverable only if it is a dict whose
+    ``recoverable`` value is literally ``True``.  A non-dict plan yields ``{}``.
+    """
+
+    plan = metadata.get("plan")
+    if not isinstance(plan, dict):
+        return {}
+
+    summary: dict[str, Any] = {}
+
+    plan_mode = plan.get("mode")
+    if isinstance(plan_mode, str):
+        summary["plan_mode"] = plan_mode
+
+    task_list = plan.get("tasks")
+    if isinstance(task_list, list):
+        recoverable = [
+            entry
+            for entry in task_list
+            if isinstance(entry, dict) and entry.get("recoverable") is True
+        ]
+        summary["planned_task_count"] = len(task_list)
+        summary["recoverable_task_count"] = len(recoverable)
+
+    return summary
+
+
+def public_generation_event_metadata(event: GenerationJobEvent) -> dict[str, Any]:
+    """Project an event's metadata onto its user-facing fields.
+
+    Keys in ``_PUBLIC_EVENT_METADATA_KEYS`` are copied in sorted order unless
+    ``_public_metadata_value`` reduces their value to ``None``.  For
+    ``workflow_planned`` events the coarse plan summary from
+    ``_public_plan_metadata`` is merged in as well.
+    """
+
+    raw_metadata = event.event_metadata or {}
+    exposed: dict[str, Any] = {
+        key: safe_value
+        for key in sorted(_PUBLIC_EVENT_METADATA_KEYS)
+        if key in raw_metadata
+        and (safe_value := _public_metadata_value(raw_metadata[key])) is not None
+    }
+
+    if event.event_type == "workflow_planned":
+        exposed.update(_public_plan_metadata(raw_metadata))
+
+    return exposed
+
+
+def public_generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any] | None:
+    """Build the user-facing response for an event, or ``None`` to hide it.
+
+    ``evaluation_completed`` and ``executor_completed`` events are hidden.
+    For all others the regular response is produced and its
+    ``event_metadata`` replaced with the filtered public metadata.
+    """
+
+    internal_only = {"evaluation_completed", "executor_completed"}
+    if event.event_type not in internal_only:
+        public_response = generation_event_to_response(event)
+        public_response["event_metadata"] = public_generation_event_metadata(event)
+        return public_response
+    return None
+
+
 def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
    """Convert a generation job ORM object to an API summary dict."""

@@ -328,6 +461,23 @@ def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
    }


+def public_generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
+    """Summarize a job for user-facing APIs, masking internal-only steps.
+
+    When the current step is ``evaluation_completed`` or
+    ``executor_completed`` the summary is rewritten to report a public step
+    with its progress percent and label, and ``is_terminal`` is forced to
+    ``False``.  Any other step is passed through untouched.
+    """
+
+    # internal step -> (public step, progress percent, progress label)
+    masked_steps = {
+        "evaluation_completed": ("narrative_generated", 45, "正文已生成"),
+        "executor_completed": ("workflow_planned", 8, "工作流已规划"),
+    }
+
+    summary = generation_job_to_summary(job)
+    replacement = masked_steps.get(summary["current_step"])
+    if replacement is not None:
+        public_step, percent, label = replacement
+        summary.update(
+            current_step=public_step,
+            progress_percent=percent,
+            progress_label=label,
+            is_terminal=False,
+        )
+    return summary
+
+
 async def get_generation_job_for_user(
    db: AsyncSession,
    *,
@@ -362,13 +512,13 @@ async def request_generation_job_cancel(
        raise HTTPException(status_code=409, detail="当前任务不支持取消")

    if job.status == "canceled":
-        return generation_job_to_summary(job)
+        return public_generation_job_to_summary(job)

    if _is_terminal_status(job.status):
        raise HTTPException(status_code=409, detail="当前任务已终止，无法取消")

    if job.current_step == "cancel_requested":
-        return generation_job_to_summary(job)
+        return public_generation_job_to_summary(job)

    if job.current_step in {"request_accepted", "retry_queued"}:
        story = None
@@ -391,7 +541,7 @@ async def request_generation_job_cancel(
            error_message="Generation canceled by user before worker execution started.",
            message="Generation job was canceled before worker execution started.",
        )
-        return generation_job_to_summary(job)
+        return public_generation_job_to_summary(job)

    previous_step = job.current_step
    job.error_message = "Cancellation requested by user."
@@ -407,7 +557,7 @@ async def request_generation_job_cancel(
    )
    await db.commit()
    await db.refresh(job)
-    return generation_job_to_summary(job)
+    return public_generation_job_to_summary(job)


 async def get_generation_job_detail(
@@ -437,9 +587,13 @@ async def get_generation_job_detail(
    ).scalars().all()

    return {
-        **generation_job_to_summary(job),
-        "request_payload": job.request_payload or {},
-        "events": [generation_event_to_response(event) for event in events],
+        **public_generation_job_to_summary(job),
+        "request_payload": public_generation_request_payload(job),
+        "events": [
+            response
+            for event in events
+            if (response := public_generation_event_to_response(event)) is not None
+        ],
    }


@@ -461,7 +615,7 @@ async def list_story_generation_jobs(
            .order_by(desc(GenerationJob.created_at), desc(GenerationJob.id))
        )
    ).scalars().all()
-    return [generation_job_to_summary(job) for job in jobs]
+    return [public_generation_job_to_summary(job) for job in jobs]


 async def get_active_story_generation_job(
@@ -513,6 +667,59 @@ def _as_float(value: Any) -> float | None:
    return None


+def _sorted_buckets(counts: dict[str, int]) -> list[dict[str, Any]]:
+    """Return name/count buckets, highest count first and ties ordered by name."""
+
+    def rank(entry: tuple[str, int]) -> tuple[int, str]:
+        return -entry[1], entry[0]
+
+    ordered = sorted(counts.items(), key=rank)
+    return [{"name": label, "count": total} for label, total in ordered]
+
+
+def _aggregate_trace_events(events: list[GenerationJobEvent]) -> dict[str, Any]:
+    """Summarize workflow trace events by step, artifact, and failure category.
+
+    ``evaluation_completed`` and ``executor_completed`` events are not counted.
+    """
+    skipped_types = {"evaluation_completed", "executor_completed"}
+    step_counts: dict[str, int] = {}
+    artifact_counts: dict[str, int] = {}
+    category_counts: dict[str, int] = {}
+    failed_total = 0
+    counted_total = 0
+
+    for event in events:
+        if event.event_type in skipped_types:
+            continue
+        counted_total += 1
+        details = event.event_metadata or {}
+        step = details.get("step")
+        if isinstance(step, str) and step:
+            step_counts[step] = step_counts.get(step, 0) + 1
+
+        artifact = details.get("artifact")
+        # The literal "none" is kept out of the artifact buckets.
+        if isinstance(artifact, str) and artifact not in {"", "none"}:
+            artifact_counts[artifact] = artifact_counts.get(artifact, 0) + 1
+
+        if event.status != "failed":
+            continue
+        failed_total += 1
+        category = details.get("failure_category")
+        if not (isinstance(category, str) and category):
+            category = "unknown_error"
+        category_counts[category] = category_counts.get(category, 0) + 1
+
+    return {
+        "total_events": counted_total,
+        "failed_events": failed_total,
+        "by_step": _sorted_buckets(step_counts),
+        "by_artifact": _sorted_buckets(artifact_counts),
+        "failure_categories": _sorted_buckets(category_counts),
+    }
+
+
 def _aggregate_provider_events(
    events: list[GenerationJobEvent],
    *,
@@ -679,6 +886,38 @@ async def get_story_provider_stats(
    }


+async def get_story_trace_summary(
+    db: AsyncSession,
+    *,
+    story_id: int,
+    user_id: str,
+    days: int | None = None,
+) -> dict[str, Any]:
+    """Summarize trace events of the jobs for one story owned by ``user_id``.
+
+    When ``days`` is given, only events created within that many days are used.
+    """
+    ownership = (
+        GenerationJob.story_id == story_id,
+        GenerationJob.user_id == user_id,
+    )
+
+    statement = (
+        select(GenerationJobEvent)
+        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
+        .where(*ownership)
+        .order_by(GenerationJobEvent.id)
+    )
+
+    if days is not None:
+        window_start = datetime.now(timezone.utc) - timedelta(days=days)
+        statement = statement.where(GenerationJobEvent.created_at >= window_start)
+
+    result = await db.execute(statement)
+    summary = _aggregate_trace_events(result.scalars().all())
+    return {"story_id": story_id, "window_days": days, **summary}
+
+
 async def get_user_provider_analytics(
    db: AsyncSession,
    *,
--- a/backend/app/services/harness/evaluation_replay.py
+++ b/backend/app/services/harness/evaluation_replay.py
@@ -0,0 +1,322 @@
+"""Internal golden-case replay support for harness evaluations.
+
+The replay helpers are intentionally not wired to user-facing APIs. They exist
+to make evaluation behavior reproducible in tests and internal tooling.
+"""
+
+import json
+from collections import Counter
+from dataclasses import dataclass, field
+from enum import StrEnum
+from pathlib import Path
+from typing import Any, Iterable
+
+from app.services.adapters.storybook.primary import Storybook, StorybookPage
+from app.services.adapters.text.models import StoryOutput
+from app.services.harness.evaluators import (
+    EvaluationDimension,
+    EvaluationResult,
+    evaluate_story_output,
+    evaluate_storybook_output,
+)
+
+
+class EvaluationReplayArtifact(StrEnum):
+    """Artifact kinds of a replay case, as named in its payload ``artifact`` field."""
+
+    STORY = "story"
+    STORYBOOK = "storybook"
+
+
+@dataclass(frozen=True)
+class ExpectedEvaluation:
+    """Outcome that a golden case expects the evaluator to produce."""
+
+    passed: bool
+    blocking: bool
+    min_overall_score: float | None = None
+    max_overall_score: float | None = None
+    required_dimensions: tuple[EvaluationDimension, ...] = field(default_factory=tuple)
+    quality_gate_codes: tuple[str, ...] = field(default_factory=tuple)
+    warning_substrings: tuple[str, ...] = field(default_factory=tuple)
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> "ExpectedEvaluation":
+        """Create expectations from the ``expected`` mapping of a replay case.
+
+        ``passed`` and ``blocking`` are mandatory keys; the rest are optional.
+        """
+        dimension_names = payload.get("required_dimensions", [])
+        return cls(
+            passed=bool(payload["passed"]),
+            blocking=bool(payload["blocking"]),
+            min_overall_score=payload.get("min_overall_score"),
+            max_overall_score=payload.get("max_overall_score"),
+            required_dimensions=tuple(map(EvaluationDimension, dimension_names)),
+            quality_gate_codes=tuple(payload.get("quality_gate_codes", [])),
+            warning_substrings=tuple(payload.get("warning_substrings", [])),
+        )
+
+
+@dataclass(frozen=True)
+class EvaluationReplayCoverage:
+    """Internal coverage labels for one golden replay case."""
+
+    age_band: str = "unknown"
+    content_shape: str = "unknown"
+    risk_area: str = "unknown"
+    tags: tuple[str, ...] = field(default_factory=tuple)
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any] | None) -> "EvaluationReplayCoverage":
+        """Build coverage labels from a JSON payload; a bare-string ``tags`` is one tag."""
+        payload = payload or {}
+        tags = payload.get("tags") or ()  # an explicit JSON null means "no tags"
+        return cls(
+            age_band=str(payload.get("age_band", "unknown")),
+            content_shape=str(payload.get("content_shape", "unknown")),
+            risk_area=str(payload.get("risk_area", "unknown")),
+            tags=tuple(map(str, (tags,) if isinstance(tags, str) else tags)),
+        )
+
+
+@dataclass(frozen=True)
+class EvaluationReplayCase:
+    """A golden case: a recorded output plus the evaluation it should receive."""
+
+    case_id: str
+    artifact: EvaluationReplayArtifact
+    output_payload: dict[str, Any]
+    expected: ExpectedEvaluation
+    education_theme: str | None = None
+    minimum_score: float = 0.7
+    description: str = ""
+    input_payload: dict[str, Any] = field(default_factory=dict)
+    coverage: EvaluationReplayCoverage = field(default_factory=EvaluationReplayCoverage)
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> "EvaluationReplayCase":
+        """Create a case from one fixture entry.
+
+        ``minimum_score``/``education_theme`` come from ``input``, else the top level.
+        """
+        inputs = dict(payload.get("input", {}))
+
+        def input_or_top_level(key: str, default: Any = None) -> Any:
+            return inputs.get(key, payload.get(key, default))
+
+        return cls(
+            case_id=str(payload["id"]),
+            artifact=EvaluationReplayArtifact(payload["artifact"]),
+            description=str(payload.get("description", "")),
+            input_payload=inputs,
+            output_payload=dict(payload["output"]),
+            education_theme=input_or_top_level("education_theme"),
+            minimum_score=float(input_or_top_level("minimum_score", 0.7)),
+            expected=ExpectedEvaluation.from_payload(payload["expected"]),
+            coverage=EvaluationReplayCoverage.from_payload(payload.get("coverage")),
+        )
+
+    def evaluate(self) -> EvaluationResult:
+        """Score ``output_payload`` with the evaluator matching ``artifact``."""
+        settings = {
+            "education_theme": self.education_theme,
+            "minimum_score": self.minimum_score,
+        }
+        if self.artifact == EvaluationReplayArtifact.STORY:
+            story = _story_output_from_payload(self.output_payload)
+            return evaluate_story_output(story, **settings)
+
+        # Any artifact other than STORY is evaluated as a storybook.
+        storybook = _storybook_from_payload(self.output_payload)
+        return evaluate_storybook_output(storybook, **settings)
+
+    def replay(self) -> "EvaluationReplayCaseResult":
+        """Run the evaluator and record every mismatch against ``expected``."""
+        outcome = self.evaluate()
+        mismatches = _compare_evaluation(self, outcome)
+        return EvaluationReplayCaseResult(
+            case_id=self.case_id,
+            artifact=self.artifact,
+            coverage=self.coverage,
+            evaluation=outcome,
+            failures=tuple(mismatches),
+        )
+
+
+@dataclass(frozen=True)
+class EvaluationReplayCaseResult:
+    """What replaying one golden case produced."""
+
+    case_id: str
+    artifact: EvaluationReplayArtifact
+    coverage: EvaluationReplayCoverage
+    evaluation: EvaluationResult
+    failures: tuple[str, ...] = field(default_factory=tuple)
+
+    @property
+    def expectations_met(self) -> bool:
+        """True when the replay recorded no expectation mismatches."""
+        mismatch_count = len(self.failures)
+        return mismatch_count == 0
+
+
+@dataclass(frozen=True)
+class EvaluationReplaySuiteResult:
+    """Outcome of replaying a collection of golden cases."""
+
+    cases: tuple[EvaluationReplayCaseResult, ...]
+
+    @property
+    def passed(self) -> bool:
+        """True when no case has an expectation mismatch (also with zero cases)."""
+        return not self.failed_case_ids
+
+    @property
+    def failed_case_ids(self) -> tuple[str, ...]:
+        """IDs of the cases whose replay recorded at least one mismatch."""
+        mismatched = filter(lambda result: not result.expectations_met, self.cases)
+        return tuple(result.case_id for result in mismatched)
+
+    def failure_report(self) -> str:
+        """Format every mismatch as a ``case_id: failure`` line."""
+        return "\n".join(
+            f"{result.case_id}: {mismatch}"
+            for result in self.cases
+            for mismatch in result.failures
+        )
+
+    def coverage_summary(self) -> dict[str, dict[str, int]]:
+        """Count cases per artifact, coverage label, tag, and evaluator outcome."""
+        results = self.cases
+
+        def label_counts(label: str) -> dict[str, int]:
+            return _count_values(getattr(item.coverage, label) for item in results)
+
+        return {
+            "artifact": _count_values(item.artifact.value for item in results),
+            "age_band": label_counts("age_band"),
+            "content_shape": label_counts("content_shape"),
+            "risk_area": label_counts("risk_area"),
+            "tags": _count_values(
+                tag for item in results for tag in item.coverage.tags
+            ),
+            # Outcome reflects the evaluator verdict, not whether expectations matched.
+            "outcome": _count_values(
+                "passed" if item.evaluation.passed else "blocked" for item in results
+            ),
+        }
+
+
+def load_evaluation_replay_cases(path: str | Path) -> tuple[EvaluationReplayCase, ...]:
+    """Parse a UTF-8 JSON fixture into replay cases; the top level must be an array."""
+    fixture_text = Path(path).read_text(encoding="utf-8")
+    entries = json.loads(fixture_text)
+    if isinstance(entries, list):
+        return tuple(map(EvaluationReplayCase.from_payload, entries))
+    raise ValueError("Evaluation replay fixture must be a JSON array.")
+
+
+def run_evaluation_replay_cases(
+    cases: Iterable[EvaluationReplayCase],
+) -> EvaluationReplaySuiteResult:
+    """Replay each case in iteration order and collect the results into a suite."""
+    replayed = [case.replay() for case in cases]
+    return EvaluationReplaySuiteResult(cases=tuple(replayed))
+
+
+def replay_evaluation_golden_cases(path: str | Path) -> EvaluationReplaySuiteResult:
+    """Load the golden cases stored at ``path`` and replay all of them."""
+    golden_cases = load_evaluation_replay_cases(path)
+    return run_evaluation_replay_cases(golden_cases)
+
+
+def _story_output_from_payload(payload: dict[str, Any]) -> StoryOutput:
+    """Build a ``StoryOutput``; missing text fields default to an empty string."""
+    text_fields = ("title", "story_text", "cover_prompt_suggestion")
+    return StoryOutput(
+        mode=payload.get("mode", "generated"),
+        **{name: payload.get(name, "") for name in text_fields},
+    )
+
+
+def _storybook_from_payload(payload: dict[str, Any]) -> Storybook:
+    """Build a ``Storybook``; pages lacking ``page_number`` get their 1-based position."""
+    def to_page(position: int, raw_page: dict[str, Any]) -> StorybookPage:
+        return StorybookPage(
+            page_number=raw_page.get("page_number", position),
+            text=raw_page.get("text", ""),
+            image_prompt=raw_page.get("image_prompt", ""),
+            image_url=raw_page.get("image_url"),
+        )
+
+    raw_pages = enumerate(payload.get("pages", []), start=1)
+    return Storybook(
+        title=payload.get("title", ""),
+        main_character=payload.get("main_character", ""),
+        art_style=payload.get("art_style", ""),
+        pages=[to_page(position, raw_page) for position, raw_page in raw_pages],
+        cover_prompt=payload.get("cover_prompt", ""),
+        cover_url=payload.get("cover_url"),
+    )
+
+
+def _count_values(values: Iterable[str]) -> dict[str, int]:
+    """Count non-empty values, ordered by descending count and then by value."""
+    return dict(sorted(Counter(filter(None, values)).items(), key=lambda kv: (-kv[1], kv[0])))
+
+
+def _compare_evaluation(
+    case: EvaluationReplayCase,
+    evaluation: EvaluationResult,
+) -> list[str]:
+    """List every way ``evaluation`` departs from ``case.expected``.
+
+    An empty list means the case matched. Messages follow a fixed order: flags,
+    score bounds, dimensions, quality gate codes, then warnings.
+    """
+    expected = case.expected
+    mismatches: list[str] = []
+
+    # Verdict flags must match exactly.
+    for flag in ("passed", "blocking"):
+        wanted, actual = getattr(expected, flag), getattr(evaluation, flag)
+        if actual != wanted:
+            mismatches.append(f"expected {flag}={wanted}, got {actual}")
+
+    # Score bounds are optional and inclusive.
+    overall = evaluation.overall_score
+    lower, upper = expected.min_overall_score, expected.max_overall_score
+    if lower is not None and overall < lower:
+        mismatches.append(
+            f"expected overall_score >= {lower}, got {overall}"
+        )
+    if upper is not None and overall > upper:
+        mismatches.append(
+            f"expected overall_score <= {upper}, got {overall}"
+        )
+
+    # Every required dimension must have been scored.
+    scored = {score.dimension for score in evaluation.scores}
+    absent = [dim.value for dim in expected.required_dimensions if dim not in scored]
+    if absent:
+        mismatches.append(f"missing dimensions: {', '.join(absent)}")
+
+    # Gate codes are compared in order; no gate error means no codes.
+    gate_error = evaluation.gate_error
+    if gate_error is None:
+        actual_codes: tuple[str, ...] = ()
+    else:
+        actual_codes = tuple(issue.code.value for issue in gate_error.issues)
+    if actual_codes != expected.quality_gate_codes:
+        mismatches.append(
+            "expected quality_gate_codes="
+            f"{list(expected.quality_gate_codes)}, got {list(actual_codes)}"
+        )
+
+    # Each expected substring must occur in at least one warning.
+    for needle in expected.warning_substrings:
+        if all(needle not in warning for warning in evaluation.warnings):
+            mismatches.append(f"missing warning containing: {needle}")
+
+    return mismatches
--- a/backend/app/services/harness/evaluators.py
+++ b/backend/app/services/harness/evaluators.py
@@ -0,0 +1,267 @@
+"""Deterministic evaluation helpers for generated child-facing content."""
+
+from dataclasses import dataclass, field
+from enum import StrEnum
+from typing import Any
+
+from app.services.adapters.storybook.primary import Storybook
+from app.services.adapters.text.models import StoryOutput
+from app.services.harness.quality_gates import (
+    QualityGateError,
+    validate_story_output,
+    validate_storybook_output,
+)
+
+
+class EvaluationDimension(StrEnum):
+    """Scoring dimensions of an evaluation; values are the names used in metadata."""
+
+    STRUCTURE = "structure"
+    SAFETY = "safety"
+    AGE_FIT = "age_fit"
+    EDUCATIONAL_VALUE = "educational_value"
+    READABILITY = "readability"
+
+
+@dataclass(frozen=True)
+class EvaluationScore:
+    """Score and explanation for a single evaluation dimension."""
+
+    dimension: EvaluationDimension
+    score: float
+    reason: str
+
+    def to_metadata(self) -> dict[str, Any]:
+        """Serialize to a plain dict, with the dimension as its string value."""
+
+        return dict(
+            dimension=self.dimension.value,
+            score=self.score,
+            reason=self.reason,
+        )
+
+
+@dataclass(frozen=True)
+class EvaluationResult:
+    """Verdict, per-dimension scores, and warnings for one evaluated artifact."""
+
+    overall_score: float
+    passed: bool
+    blocking: bool
+    scores: tuple[EvaluationScore, ...]
+    gate_error: QualityGateError | None = None
+    warnings: tuple[str, ...] = field(default_factory=tuple)
+
+    def to_metadata(self) -> dict[str, Any]:
+        """Serialize to a plain dict; ``quality_gate`` appears only with a gate error."""
+        gate_metadata: dict[str, Any] = {}
+        if self.gate_error is not None:
+            gate_metadata["quality_gate"] = self.gate_error.to_metadata()
+        return {
+            "overall_score": self.overall_score,
+            "passed": self.passed,
+            "blocking": self.blocking,
+            "scores": [score.to_metadata() for score in self.scores],
+            "warnings": list(self.warnings),
+            **gate_metadata,
+        }
+
+
+def _clamp_score(value: float) -> float:
+    return max(0.0, min(1.0, round(value, 2)))  # round to 2 decimals, then clamp to [0, 1]
+
+
+def _story_text_readability_score(story_text: str) -> float:
+    """Map stripped text length to a readability score for 3-8 year olds.
+
+    Under 30 characters scores 0.45, over 2500 scores 0.72, over 1800 scores
+    0.84, and every other length scores 0.96.
+    """
+    length = len(story_text.strip())
+    if length < 30:
+        return 0.45
+    long_text_tiers = ((2500, 0.72), (1800, 0.84))
+    return next((score for limit, score in long_text_tiers if length > limit), 0.96)
+
+
+def _educational_value_score(story_text: str, education_theme: str | None) -> float:
+    """Score theme use: 0.82 for no/blank theme, 0.96 if the text has it, else 0.88."""
+    theme = (education_theme or "").strip()  # "" is a substring of every text
+    return 0.82 if not theme else (0.96 if theme in story_text else 0.88)
+
+
+def _storybook_readability_score(page_texts: list[str]) -> float:
+    """Score page lengths: 0.0 with no pages, lower for very short or long pages."""
+    if not page_texts:
+        return 0.0
+
+    lengths = [len(text.strip()) for text in page_texts]
+    shortest, longest = min(lengths), max(lengths)
+    if shortest < 8:
+        return 0.62
+    if longest > 320:
+        return 0.78
+    return 0.88 if longest > 220 else 0.96
+
+
+def _storybook_educational_value_score(
+    page_texts: list[str],
+    education_theme: str | None,
+) -> float:
+    """Score theme use: 0.82 for no/blank theme, 0.96 if the pages have it, else 0.88."""
+    theme = (education_theme or "").strip()  # "" is a substring of every text
+    # Pages are joined with single spaces before the theme is searched for.
+    return 0.82 if not theme else (0.96 if theme in " ".join(page_texts) else 0.88)
+
+
+def evaluate_story_output(
+    output: StoryOutput,
+    *,
+    education_theme: str | None = None,
+    minimum_score: float = 0.7,
+) -> EvaluationResult:
+    """Score a generated text story against the deterministic checks.
+
+    A quality gate failure gives a blocking result with an overall score of 0.0
+    and the gate error attached. Otherwise five dimensions are scored, and the
+    result passes when their clamped mean is at least ``minimum_score`` and is
+    blocking when the mean is below it.
+    """
+    try:
+        validate_story_output(output)
+    except QualityGateError as gate_error:
+        # A gate failure zeroes structure and safety; nothing else is scored.
+        gate_failures = (
+            (EvaluationDimension.STRUCTURE, "故事结构未通过质量门。"),
+            (EvaluationDimension.SAFETY, "内容未通过儿童安全或结构完整性检查。"),
+        )
+        return EvaluationResult(
+            overall_score=0.0,
+            passed=False,
+            blocking=True,
+            scores=tuple(
+                EvaluationScore(dimension=dimension, score=0.0, reason=reason)
+                for dimension, reason in gate_failures
+            ),
+            gate_error=gate_error,
+        )
+
+    length_score = _story_text_readability_score(output.story_text)
+    theme_score = _educational_value_score(output.story_text, education_theme)
+
+    # Age fit and readability are both taken from the same length-based score.
+    graded = (
+        (EvaluationDimension.STRUCTURE, 1.0, "标题、正文和封面提示词完整。"),
+        (EvaluationDimension.SAFETY, 1.0, "未命中确定性儿童安全风险词。"),
+        (
+            EvaluationDimension.AGE_FIT,
+            length_score,
+            "根据正文长度估算低龄儿童阅读适配度。",
+        ),
+        (
+            EvaluationDimension.EDUCATIONAL_VALUE,
+            theme_score,
+            "根据教育主题是否清晰融入正文估算。",
+        ),
+        (
+            EvaluationDimension.READABILITY,
+            length_score,
+            "根据正文长度估算朗读和亲子共读流畅度。",
+        ),
+    )
+
+    scores = tuple(
+        EvaluationScore(dimension=dimension, score=score, reason=reason)
+        for dimension, score, reason in graded
+    )
+    overall_score = _clamp_score(sum(item.score for item in scores) / len(scores))
+
+    # Warn when the length-based score is below 0.8.
+    warnings: tuple[str, ...] = ()
+    if length_score < 0.8:
+        warnings = ("故事正文长度可能不适合 3-8 岁儿童的完整阅读体验。",)
+
+    return EvaluationResult(
+        overall_score=overall_score,
+        passed=overall_score >= minimum_score,
+        blocking=overall_score < minimum_score,
+        scores=scores,
+        warnings=warnings,
+    )
+
+
+def evaluate_storybook_output(
+    output: Storybook,
+    *,
+    education_theme: str | None = None,
+    minimum_score: float = 0.7,
+) -> EvaluationResult:
+    """Score a generated storybook against the deterministic checks.
+
+    A quality gate failure gives a blocking result with an overall score of 0.0
+    and the gate error attached. Otherwise five dimensions are scored, and the
+    result passes when their clamped mean is at least ``minimum_score`` and is
+    blocking when the mean is below it.
+    """
+    try:
+        validate_storybook_output(output)
+    except QualityGateError as gate_error:
+        # A gate failure zeroes structure and safety; nothing else is scored.
+        gate_failures = (
+            (EvaluationDimension.STRUCTURE, "绘本结构未通过质量门。"),
+            (EvaluationDimension.SAFETY, "绘本内容未通过儿童安全或结构完整性检查。"),
+        )
+        return EvaluationResult(
+            overall_score=0.0,
+            passed=False,
+            blocking=True,
+            scores=tuple(
+                EvaluationScore(dimension=dimension, score=0.0, reason=reason)
+                for dimension, reason in gate_failures
+            ),
+            gate_error=gate_error,
+        )
+
+    texts = [page.text for page in output.pages]
+    length_score = _storybook_readability_score(texts)
+    theme_score = _storybook_educational_value_score(texts, education_theme)
+
+    # Age fit and readability are both taken from the same length-based score.
+    graded = (
+        (EvaluationDimension.STRUCTURE, 1.0, "绘本标题、分页和页码结构完整。"),
+        (EvaluationDimension.SAFETY, 1.0, "未命中确定性儿童安全风险词。"),
+        (
+            EvaluationDimension.AGE_FIT,
+            length_score,
+            "根据每页正文长度估算低龄儿童翻页阅读适配度。",
+        ),
+        (
+            EvaluationDimension.EDUCATIONAL_VALUE,
+            theme_score,
+            "根据教育主题是否清晰融入分页正文估算。",
+        ),
+        (
+            EvaluationDimension.READABILITY,
+            length_score,
+            "根据分页正文长度估算亲子共读流畅度。",
+        ),
+    )
+
+    scores = tuple(
+        EvaluationScore(dimension=dimension, score=score, reason=reason)
+        for dimension, score, reason in graded
+    )
+    overall_score = _clamp_score(sum(item.score for item in scores) / len(scores))
+
+    # Warn when the length-based score is below 0.8.
+    warnings: tuple[str, ...] = ()
+    if length_score < 0.8:
+        warnings = ("绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。",)
+
+    return EvaluationResult(
+        overall_score=overall_score,
+        passed=overall_score >= minimum_score,
+        blocking=overall_score < minimum_score,
+        scores=scores,
+        warnings=warnings,
+    )
--- a/backend/app/services/harness/executor.py
+++ b/backend/app/services/harness/executor.py
@@ -0,0 +1,150 @@
+"""Small-step workflow executor helpers for generation harness adoption."""
+
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.services.harness.artifacts import AssetCompletionResult
+from app.services.harness.plans import WorkflowPlan
+from app.services.harness.trace import TraceRecorder
+from app.services.harness.types import ArtifactKind, WorkflowStep
+
+if TYPE_CHECKING:
+    from app.db.models import GenerationJob
+
+AssetTask = Callable[[], Awaitable[AssetCompletionResult]]
+
+
+@dataclass(frozen=True)
+class AssetPlanRunResult:
+    """Outcome of running the asset tasks of one workflow plan."""
+
+    task_results: tuple[AssetCompletionResult, ...]
+    executed_task_keys: tuple[str, ...]
+    ignored_task_keys: tuple[str, ...]
+
+    @property
+    def result_assets(self) -> tuple[str, ...]:
+        """The ``asset`` attribute of each task result, in result order."""
+        return tuple(completion.asset for completion in self.task_results)
+
+    def to_metadata(self, plan: WorkflowPlan) -> dict[str, Any]:
+        """Describe how much of ``plan`` was executed, as a plain dict."""
+        executed = list(self.executed_task_keys)
+        ignored = list(self.ignored_task_keys)
+        return {
+            "plan_mode": plan.mode.value,
+            "planned_task_count": len(plan.tasks),
+            "executed_task_count": len(executed),
+            "ignored_task_count": len(ignored),
+            "result_count": len(self.task_results),
+            "executed_task_keys": executed,
+            "ignored_task_keys": ignored,
+            "result_assets": list(self.result_assets),
+        }
+
+
+async def record_workflow_plan(
+    db: AsyncSession,
+    *,
+    job: "GenerationJob | None",
+    plan: WorkflowPlan,
+) -> None:
+    """Record a ``workflow_planned`` trace step carrying the plan snapshot."""
+    recorder = TraceRecorder(db)
+    await recorder.record_step(
+        job=job,
+        event_type="workflow_planned",
+        status="succeeded",
+        message="Workflow plan selected for this generation request.",
+        metadata={"plan": plan.to_snapshot()},
+        step=WorkflowStep.REQUEST_ACCEPTANCE,
+        artifact=ArtifactKind.NONE,
+        blocks_main_result=True,
+    )
+
+
+async def record_evaluation_result(
+    db: AsyncSession,
+    *,
+    job: "GenerationJob | None",
+    story_id: int | None = None,
+    metadata: dict[str, Any],
+    status: str,
+    artifact: ArtifactKind | str = ArtifactKind.STORY_TEXT,
+) -> None:
+    """Record an ``evaluation_completed`` step; it blocks unless status is "succeeded"."""
+    recorder = TraceRecorder(db)
+    await recorder.record_step(
+        job=job,
+        story_id=story_id,
+        event_type="evaluation_completed",
+        status=status,
+        message="Generated content evaluation completed.",
+        metadata=metadata,
+        step=WorkflowStep.EVALUATION,
+        artifact=artifact,
+        blocks_main_result=status != "succeeded",
+    )
+
+
+async def record_executor_result(
+    db: AsyncSession,
+    *,
+    job: "GenerationJob | None",
+    plan: WorkflowPlan,
+    result: AssetPlanRunResult,
+) -> None:
+    """Record a non-blocking ``executor_completed`` step with the run's coverage metadata."""
+    recorder = TraceRecorder(db)
+    await recorder.record_step(
+        job=job,
+        event_type="executor_completed",
+        status="succeeded",
+        message="Workflow executor completed planned asset tasks.",
+        metadata=result.to_metadata(plan),
+        step=WorkflowStep.UNKNOWN,
+        artifact=ArtifactKind.NONE,
+        blocks_main_result=False,
+    )
+
+
+async def run_asset_plan(
+    plan: WorkflowPlan,
+    *,
+    image_task: AssetTask | None = None,
+    audio_task: AssetTask | None = None,
+) -> AssetPlanRunResult:
+    """Run the image and audio handlers for ``plan.tasks`` in plan order.
+
+    Other task keys are skipped and reported as ignored. ``ValueError`` is raised
+    for a non-asset plan or when a planned image/audio task has no handler.
+    """
+    if plan.mode.value not in {"asset_generation", "asset_retry"}:
+        raise ValueError("run_asset_plan only supports asset workflow plans")
+
+    handlers = {
+        "complete_image_asset": ("image", image_task),
+        "complete_audio_asset": ("audio", audio_task),
+    }
+    completed: list[AssetCompletionResult] = []
+    executed: list[str] = []
+    ignored: list[str] = []
+
+    for task in plan.tasks:
+        if task.key not in handlers:
+            ignored.append(task.key)
+            continue
+        label, handler = handlers[task.key]
+        if handler is None:
+            raise ValueError(f"Asset workflow plan requires an {label} task handler")
+        completed.append(await handler())
+        executed.append(task.key)
+
+    return AssetPlanRunResult(
+        task_results=tuple(completed),
+        executed_task_keys=tuple(executed),
+        ignored_task_keys=tuple(ignored),
+    )
--- a/backend/app/services/harness/fixtures/evaluation_golden_cases.json
+++ b/backend/app/services/harness/fixtures/evaluation_golden_cases.json
@@ -0,0 +1,400 @@
+[
+  {
+    "id": "story-safe-theme-pass",
+    "artifact": "story",
+    "description": "完整、儿童安全且清晰包含教育主题的普通故事。",
+    "coverage": {
+      "age_band": "5-6",
+      "content_shape": "short_story",
+      "risk_area": "happy_path",
+      "tags": ["theme_present", "safe", "story"]
+    },
+    "input": {
+      "keywords": "小兔子, 月光花园",
+      "education_theme": "复盘"
+    },
+    "output": {
+      "mode": "generated",
+      "title": "小兔子的月光花园",
+      "story_text": "小兔子露露在月光花园里照顾一朵会发光的小花。她先给小花浇水，又邀请朋友一起观察花瓣的变化。晚上睡前，露露和朋友们坐在石凳上复盘今天的努力：下次要先分好小水壶，再轮流照顾花朵。大家都觉得，分享和复盘让花园变得更温暖。",
+      "cover_prompt_suggestion": "A gentle watercolor rabbit in a moonlit garden"
+    },
+    "expected": {
+      "passed": true,
+      "blocking": false,
+      "min_overall_score": 0.9,
+      "required_dimensions": [
+        "structure",
+        "safety",
+        "age_fit",
+        "educational_value",
+        "readability"
+      ],
+      "quality_gate_codes": []
+    }
+  },
+  {
+    "id": "story-long-safe-pass",
+    "artifact": "story",
+    "description": "较长但仍适合亲子共读的普通故事。",
+    "coverage": {
+      "age_band": "7-8",
+      "content_shape": "long_story",
+      "risk_area": "length_boundary",
+      "tags": ["theme_present", "long_text", "story"]
+    },
+    "input": {
+      "keywords": "小海豚, 图书馆",
+      "education_theme": "合作"
+    },
+    "output": {
+      "mode": "generated",
+      "title": "小海豚的蓝色图书馆",
+      "story_text": "小海豚多多住在一片安静的海湾里，那里有一座用贝壳和海草搭成的蓝色图书馆。每天傍晚，多多都会把漂来的故事贝壳整理好，放进不同的篮子。可是这一天，风浪把贝壳吹得到处都是，小章鱼、小海马和小螃蟹都赶来帮忙。大家先一起数贝壳，再按颜色排队，最后把每个故事放回合适的位置。多多发现，合作不是一个人做得最快，而是大家把自己的办法放在一起。夜晚来临时，蓝色图书馆重新亮起柔柔的光，小伙伴们围坐在门口，听多多讲今天学到的合作故事。",
+      "cover_prompt_suggestion": "A gentle dolphin organizing a blue underwater library"
+    },
+    "expected": {
+      "passed": true,
+      "blocking": false,
+      "min_overall_score": 0.9,
+      "required_dimensions": [
+        "structure",
+        "safety",
+        "age_fit",
+        "educational_value",
+        "readability"
+      ],
+      "quality_gate_codes": []
+    }
+  },
+  {
+    "id": "story-missing-text-blocks",
+    "artifact": "story",
+    "description": "故事正文缺失会被确定性质量门阻断。",
+    "coverage": {
+      "age_band": "unknown",
+      "content_shape": "empty_story",
+      "risk_area": "schema_error",
+      "tags": ["missing_text", "story", "blocking"]
+    },
+    "input": {
+      "keywords": "小熊, 星星"
+    },
+    "output": {
+      "mode": "generated",
+      "title": "小熊找星星",
+      "story_text": "",
+      "cover_prompt_suggestion": "A bear looking at friendly stars"
+    },
+    "expected": {
+      "passed": false,
+      "blocking": true,
+      "max_overall_score": 0.0,
+      "quality_gate_codes": [
+        "missing_story_text"
+      ]
+    }
+  },
+  {
+    "id": "story-missing-cover-prompt-blocks",
+    "artifact": "story",
+    "description": "故事正文完整但封面提示词缺失会被结构质量门阻断。",
+    "coverage": {
+      "age_band": "5-6",
+      "content_shape": "short_story",
+      "risk_area": "schema_error",
+      "tags": ["missing_cover_prompt", "story", "blocking"]
+    },
+    "input": {
+      "keywords": "小松鼠, 风筝",
+      "education_theme": "勇敢"
+    },
+    "output": {
+      "mode": "generated",
+      "title": "小松鼠的风筝",
+      "story_text": "小松鼠第一次放风筝时有点紧张。朋友们陪它一起数一二三，它鼓起勇敢的心，终于让风筝飞上蓝天。",
+      "cover_prompt_suggestion": ""
+    },
+    "expected": {
+      "passed": false,
+      "blocking": true,
+      "max_overall_score": 0.0,
+      "quality_gate_codes": [
+        "missing_cover_prompt"
+      ]
+    }
+  },
+  {
+    "id": "story-unsafe-term-blocks",
+    "artifact": "story",
+    "description": "明显不适合儿童的风险词会被安全质量门阻断。",
+    "coverage": {
+      "age_band": "3-4",
+      "content_shape": "short_story",
+      "risk_area": "safety_error",
+      "tags": ["unsafe_term", "story", "blocking"]
+    },
+    "input": {
+      "keywords": "小猫, 城堡"
+    },
+    "output": {
+      "mode": "generated",
+      "title": "小猫的城堡",
+      "story_text": "小猫在城堡里看到血腥场景，然后感到很害怕。",
+      "cover_prompt_suggestion": "A cat near a castle"
+    },
+    "expected": {
+      "passed": false,
+      "blocking": true,
+      "max_overall_score": 0.0,
+      "quality_gate_codes": [
+        "unsafe_child_content"
+      ]
+    }
+  },
+  {
+    "id": "story-short-high-threshold-blocks",
+    "artifact": "story",
+    "description": "结构合格但阅读体验偏短的故事在高阈值下会被内部评测阻断。",
+    "coverage": {
+      "age_band": "3-4",
+      "content_shape": "very_short_story",
+      "risk_area": "readability_warning",
+      "tags": ["short_text", "threshold_block", "story"]
+    },
+    "input": {
+      "keywords": "小鹿, 书签",
+      "education_theme": "耐心",
+      "minimum_score": 0.82
+    },
+    "output": {
+      "mode": "generated",
+      "title": "小鹿的书签",
+      "story_text": "小鹿学会了耐心等待。",
+      "cover_prompt_suggestion": "A deer with a golden bookmark"
+    },
+    "expected": {
+      "passed": false,
+      "blocking": true,
+      "min_overall_score": 0.7,
+      "max_overall_score": 0.8,
+      "required_dimensions": [
+        "structure",
+        "safety",
+        "readability"
+      ],
+      "quality_gate_codes": [],
+      "warning_substrings": [
+        "正文长度"
+      ]
+    }
+  },
+  {
+    "id": "storybook-safe-theme-pass",
+    "artifact": "storybook",
+    "description": "完整、儿童安全且包含教育主题的绘本分页输出。",
+    "coverage": {
+      "age_band": "5-6",
+      "content_shape": "storybook_3_pages",
+      "risk_area": "happy_path",
+      "tags": ["theme_present", "safe", "storybook"]
+    },
+    "input": {
+      "keywords": "小狐狸, 彩虹桥",
+      "education_theme": "合作"
+    },
+    "output": {
+      "title": "彩虹桥上的合作",
+      "main_character": "小狐狸米米",
+      "art_style": "温暖水彩",
+      "cover_prompt": "A warm watercolor fox near a rainbow bridge",
+      "pages": [
+        {
+          "page_number": 1,
+          "text": "小狐狸米米在雨后的森林里发现一座亮晶晶的彩虹桥。",
+          "image_prompt": "A little fox finds a rainbow bridge"
+        },
+        {
+          "page_number": 2,
+          "text": "桥边的小伙伴们一起商量办法，决定合作把落叶清理干净。",
+          "image_prompt": "Forest friends work together"
+        },
+        {
+          "page_number": 3,
+          "text": "大家轮流搬叶子、扶篮子，还互相说谢谢，彩虹桥终于露出笑脸。",
+          "image_prompt": "Friends carrying leaves together"
+        }
+      ]
+    },
+    "expected": {
+      "passed": true,
+      "blocking": false,
+      "min_overall_score": 0.9,
+      "required_dimensions": [
+        "structure",
+        "safety",
+        "age_fit",
+        "educational_value",
+        "readability"
+      ],
+      "quality_gate_codes": []
+    }
+  },
+  {
+    "id": "storybook-duplicate-page-blocks",
+    "artifact": "storybook",
+    "description": "重复页码的绘本结构会被质量门阻断。",
+    "coverage": {
+      "age_band": "5-6",
+      "content_shape": "storybook_invalid_pages",
+      "risk_area": "schema_error",
+      "tags": ["duplicate_page", "storybook", "blocking"]
+    },
+    "input": {
+      "keywords": "小熊, 森林"
+    },
+    "output": {
+      "title": "森林里的小熊",
+      "main_character": "小熊布布",
+      "art_style": "水彩",
+      "cover_prompt": "A bear in a forest",
+      "pages": [
+        {
+          "page_number": 1,
+          "text": "布布在森林里找到一颗松果。",
+          "image_prompt": "Bear finds a pinecone"
+        },
+        {
+          "page_number": 1,
+          "text": "布布把松果带给朋友一起观察。",
+          "image_prompt": "Bear shares the pinecone"
+        }
+      ]
+    },
+    "expected": {
+      "passed": false,
+      "blocking": true,
+      "max_overall_score": 0.0,
+      "quality_gate_codes": [
+        "invalid_storybook_page_number"
+      ]
+    }
+  },
+  {
+    "id": "storybook-missing-page-blocks",
+    "artifact": "storybook",
+    "description": "没有分页内容的绘本会被结构质量门阻断。",
+    "coverage": {
+      "age_band": "unknown",
+      "content_shape": "storybook_empty_pages",
+      "risk_area": "schema_error",
+      "tags": ["missing_page", "storybook", "blocking"]
+    },
+    "input": {
+      "keywords": "小鸟, 云朵"
+    },
+    "output": {
+      "title": "小鸟和云朵",
+      "main_character": "小鸟啾啾",
+      "art_style": "柔和水彩",
+      "cover_prompt": "A bird near soft clouds",
+      "pages": []
+    },
+    "expected": {
+      "passed": false,
+      "blocking": true,
+      "max_overall_score": 0.0,
+      "quality_gate_codes": [
+        "missing_storybook_page"
+      ]
+    }
+  },
+  {
+    "id": "storybook-unsafe-term-blocks",
+    "artifact": "storybook",
+    "description": "绘本分页文字包含明显不适龄风险词时会被安全质量门阻断。",
+    "coverage": {
+      "age_band": "3-4",
+      "content_shape": "storybook_2_pages",
+      "risk_area": "safety_error",
+      "tags": ["unsafe_term", "storybook", "blocking"]
+    },
+    "input": {
+      "keywords": "小兔子, 山洞"
+    },
+    "output": {
+      "title": "山洞里的声音",
+      "main_character": "小兔子米粒",
+      "art_style": "温暖水彩",
+      "cover_prompt": "A rabbit near a cave",
+      "pages": [
+        {
+          "page_number": 1,
+          "text": "米粒走到山洞边，听见奇怪的声音。",
+          "image_prompt": "Rabbit near a cave"
+        },
+        {
+          "page_number": 2,
+          "text": "洞里出现血腥画面，米粒吓得跑开。",
+          "image_prompt": "Rabbit running away"
+        }
+      ]
+    },
+    "expected": {
+      "passed": false,
+      "blocking": true,
+      "max_overall_score": 0.0,
+      "quality_gate_codes": [
+        "unsafe_child_content"
+      ]
+    }
+  },
+  {
+    "id": "storybook-short-page-warning",
+    "artifact": "storybook",
+    "description": "分页正文过短的绘本在高阈值下会被内部评测阻断，并保留内部警告，用于评测回归。",
+    "coverage": {
+      "age_band": "3-4",
+      "content_shape": "storybook_2_pages",
+      "risk_area": "readability_warning",
+      "tags": ["short_page_text", "threshold_block", "storybook"]
+    },
+    "input": {
+      "keywords": "小羊, 风铃",
+      "minimum_score": 0.85
+    },
+    "output": {
+      "title": "风铃响了",
+      "main_character": "小羊团团",
+      "art_style": "柔和蜡笔",
+      "cover_prompt": "A lamb listening to a wind chime",
+      "pages": [
+        {
+          "page_number": 1,
+          "text": "风响。",
+          "image_prompt": "Wind chime rings"
+        },
+        {
+          "page_number": 2,
+          "text": "团团笑。",
+          "image_prompt": "Lamb smiles"
+        }
+      ]
+    },
+    "expected": {
+      "passed": false,
+      "blocking": true,
+      "min_overall_score": 0.8,
+      "max_overall_score": 0.82,
+      "required_dimensions": [
+        "structure",
+        "safety",
+        "readability"
+      ],
+      "quality_gate_codes": [],
+      "warning_substrings": [
+        "分页正文长度"
+      ]
+    }
+  }
+]
--- a/backend/app/services/harness/plans.py
+++ b/backend/app/services/harness/plans.py
@@ -69,6 +69,11 @@ def build_story_plan(*, generate_images: bool) -> WorkflowPlan:
            step=WorkflowStep.NARRATIVE_GENERATION,
            artifact=ArtifactKind.STORY_TEXT,
        ),
+        WorkflowTask(
+            key="evaluate_narrative",
+            step=WorkflowStep.EVALUATION,
+            artifact=ArtifactKind.STORY_TEXT,
+        ),
        WorkflowTask(
            key="persist_story",
            step=WorkflowStep.STORY_PERSISTENCE,
@@ -124,6 +129,11 @@ def build_storybook_plan(*, generate_images: bool) -> WorkflowPlan:
            step=WorkflowStep.NARRATIVE_GENERATION,
            artifact=ArtifactKind.STORYBOOK_PAGES,
        ),
+        WorkflowTask(
+            key="evaluate_storybook_pages",
+            step=WorkflowStep.EVALUATION,
+            artifact=ArtifactKind.STORYBOOK_PAGES,
+        ),
    ]

    if generate_images:
--- a/backend/app/services/harness/types.py
+++ b/backend/app/services/harness/types.py
@@ -11,6 +11,7 @@ class WorkflowStep(StrEnum):
    WORKER_START = "worker_start"
    CONTEXT_PREPARATION = "context_preparation"
    NARRATIVE_GENERATION = "narrative_generation"
+    EVALUATION = "evaluation"
    STORY_PERSISTENCE = "story_persistence"
    PROVIDER_INVOCATION = "provider_invocation"
    IMAGE_GENERATION = "image_generation"
@@ -64,6 +65,8 @@ class StepStatus(StrEnum):

 EVENT_STEP_MAP: dict[str, WorkflowStep] = {
    "request_accepted": WorkflowStep.REQUEST_ACCEPTANCE,
+    "workflow_planned": WorkflowStep.REQUEST_ACCEPTANCE,
+    "executor_completed": WorkflowStep.UNKNOWN,
    "retry_queued": WorkflowStep.REQUEST_ACCEPTANCE,
    "worker_started": WorkflowStep.WORKER_START,
    "context_prepared": WorkflowStep.CONTEXT_PREPARATION,
@@ -73,6 +76,7 @@ EVENT_STEP_MAP: dict[str, WorkflowStep] = {
    "provider_call_succeeded": WorkflowStep.PROVIDER_INVOCATION,
    "provider_call_failed": WorkflowStep.PROVIDER_INVOCATION,
    "quality_gate_failed": WorkflowStep.NARRATIVE_GENERATION,
+    "evaluation_completed": WorkflowStep.EVALUATION,
    "cover_image_started": WorkflowStep.IMAGE_GENERATION,
    "cover_image_succeeded": WorkflowStep.IMAGE_GENERATION,
    "cover_image_failed": WorkflowStep.IMAGE_GENERATION,
@@ -100,6 +104,7 @@ EVENT_STEP_MAP: dict[str, WorkflowStep] = {
 EVENT_ARTIFACT_MAP: dict[str, ArtifactKind] = {
    "narrative_generated": ArtifactKind.STORY_TEXT,
    "quality_gate_failed": ArtifactKind.STORY_TEXT,
+    "evaluation_completed": ArtifactKind.STORY_TEXT,
    "cover_image_started": ArtifactKind.COVER_IMAGE,
    "cover_image_succeeded": ArtifactKind.COVER_IMAGE,
    "cover_image_failed": ArtifactKind.COVER_IMAGE,
--- a/backend/app/services/story_service.py
+++ b/backend/app/services/story_service.py
@@ -36,8 +36,8 @@ from app.services.generation_jobs import (
    ensure_no_active_story_generation_job,
    finish_generation_job,
    generation_job_can_retry,
-    generation_job_to_summary,
    get_generation_job_for_user,
+    public_generation_job_to_summary,
    record_generation_event,
 )
 from app.services.harness.artifacts import (
@@ -57,12 +57,27 @@ from app.services.harness.control import (
    ExecutionControl,
    GenerationJobCanceledError,
 )
+from app.services.harness.evaluators import (
+    EvaluationResult,
+    evaluate_story_output,
+    evaluate_storybook_output,
+)
+from app.services.harness.executor import (
+    record_evaluation_result,
+    record_executor_result,
+    record_workflow_plan,
+    run_asset_plan,
+)
+from app.services.harness.plans import (
+    build_asset_plan,
+    build_story_plan,
+    build_storybook_plan,
+)
 from app.services.harness.quality_gates import (
    QualityGateError,
-    validate_story_output,
-    validate_storybook_output,
 )
 from app.services.harness.trace import TraceRecorder
+from app.services.harness.types import ArtifactKind
 from app.services.memory_service import build_enhanced_memory_context
 from app.services.provider_router import (
    generate_image,
@@ -129,6 +144,24 @@ async def _record_quality_gate_failure_if_present(
    )


+async def _record_evaluation_result_if_present(
+    db: AsyncSession,
+    *,
+    job,  # forwarded unchanged to record_evaluation_result
+    evaluation: EvaluationResult,
+    artifact: ArtifactKind | str = ArtifactKind.STORY_TEXT,  # storybook path overrides this
+) -> None:
+    """Append deterministic evaluation metadata for tracked worker jobs."""
+    # NOTE(review): no job-is-None check here; confirm the callee tolerates a missing job.
+    await record_evaluation_result(
+        db,
+        job=job,
+        metadata=evaluation.to_metadata(),
+        status="succeeded" if evaluation.passed else "failed",  # mirrors evaluation.passed
+        artifact=artifact,
+    )
+
+
 def _asset_result_metadata(result: AssetCompletionResult) -> dict:
    """Build JSON-safe metadata for asset workflow events."""

@@ -643,18 +676,33 @@ async def generate_and_save_story(
            user_id=user_id,
            generation_job=job,
        )
-        validate_story_output(result)
-    except QualityGateError as exc:
-        await _record_quality_gate_failure_if_present(db, job=job, error=exc)
-        raise HTTPException(
-            status_code=502,
-            detail="Story generation failed quality checks, please try again.",
-        ) from exc
    except Exception as exc:
        raise HTTPException(
            status_code=502,
            detail="Story generation failed, please try again.",
        ) from exc
+
+    evaluation = evaluate_story_output(
+        result,
+        education_theme=request.education_theme,
+    )
+    if evaluation.gate_error is not None:
+        await _record_quality_gate_failure_if_present(
+            db,
+            job=job,
+            error=evaluation.gate_error,
+        )
+    await _record_evaluation_result_if_present(
+        db,
+        job=job,
+        evaluation=evaluation,
+    )
+    if evaluation.blocking:
+        raise HTTPException(
+            status_code=502,
+            detail="Story generation failed quality checks, please try again.",
+        )
+
    await _record_job_event_if_present(
        db,
        job=job,
@@ -758,13 +806,32 @@ async def generate_storybook_service(
            user_id=user_id,
            generation_job=job,
        )
-        validate_storybook_output(storybook)
-    except QualityGateError as exc:
-        await _record_quality_gate_failure_if_present(db, job=job, error=exc)
-        raise HTTPException(status_code=500, detail=f"故事书质量检查失败: {exc}") from exc
    except Exception as e:
        logger.error("storybook_generation_failed", error=str(e))
        raise HTTPException(status_code=500, detail=f"故事书生成失败: {e}")
+
+    evaluation = evaluate_storybook_output(
+        storybook,
+        education_theme=request.education_theme,
+    )
+    if evaluation.gate_error is not None:
+        await _record_quality_gate_failure_if_present(
+            db,
+            job=job,
+            error=evaluation.gate_error,
+        )
+    await _record_evaluation_result_if_present(
+        db,
+        job=job,
+        evaluation=evaluation,
+        artifact=ArtifactKind.STORYBOOK_PAGES,
+    )
+    if evaluation.blocking:
+        raise HTTPException(
+            status_code=500,
+            detail=f"故事书质量检查失败: {evaluation.gate_error or 'evaluation blocked'}",
+        )
+
    await _record_job_event_if_present(
        db,
        job=job,
@@ -1025,28 +1092,50 @@ async def _generate_asset_generation_service_with_job(
    if not requested_assets:
        raise HTTPException(status_code=400, detail="资源任务缺少 assets。")

+    plan = build_asset_plan(
+        output_mode="asset_generation",
+        assets=requested_assets,
+    )
+    await record_workflow_plan(
+        db,
+        job=job,
+        plan=plan,
+    )
+
    story = await get_story_detail(int(story_id), job.user_id, db)

-    if "image" in requested_assets:
+    async def complete_image() -> AssetCompletionResult:
        if story.mode == "storybook":
-            await _complete_storybook_image_assets(story, db, job=job)
-        else:
-            await _complete_cover_image_asset(
-                story,
-                db,
-                raise_on_failure=True,
-                log_event="cover_generation_failed",
-                job=job,
-            )
+            return await _complete_storybook_image_assets(story, db, job=job)

-    if "audio" in requested_assets:
-        await _complete_audio_asset(
+        return await _complete_cover_image_asset(
+            story,
+            db,
+            raise_on_failure=True,
+            log_event="cover_generation_failed",
+            job=job,
+        )
+
+    async def complete_audio() -> AssetCompletionResult:
+        return await _complete_audio_asset(
            story,
            db,
            raise_on_failure=True,
            job=job,
        )

+    asset_plan_result = await run_asset_plan(
+        plan,
+        image_task=complete_image if "image" in requested_assets else None,
+        audio_task=complete_audio if "audio" in requested_assets else None,
+    )
+    await record_executor_result(
+        db,
+        job=job,
+        plan=plan,
+        result=asset_plan_result,
+    )
+
    story = await get_story_detail(story.id, job.user_id, db)
    await finish_generation_job(
        db,
@@ -1096,7 +1185,7 @@ async def retry_generation_job_service(
    )
    await _dispatch_generation_job(db, job=retry_job)
    await db.refresh(retry_job)
-    return generation_job_to_summary(retry_job)
+    return public_generation_job_to_summary(retry_job)


 async def _generate_generation_service_with_job(
@@ -1109,6 +1198,11 @@ async def _generate_generation_service_with_job(
    """Run the unified generation workflow after the tracking job has been created."""

    if request.output_mode == "storybook":
+        await record_workflow_plan(
+            db,
+            job=job,
+            plan=build_storybook_plan(generate_images=request.generate_images),
+        )
        storybook = await generate_storybook_service(
            StorybookRequest(
                keywords=request.data,
@@ -1155,6 +1249,9 @@ async def _generate_generation_service_with_job(
            retryable_assets=saved_story.retryable_assets,
        )

+    if request.output_mode == "story" and not request.generate_images:
+        return await _execute_story_without_assets_plan(request, user_id, db, job=job)
+
    generate_request = GenerateRequest(
        type=request.type,
        data=request.data,
@@ -1164,6 +1261,11 @@ async def _generate_generation_service_with_job(
    )

    if request.generate_images:
+        await record_workflow_plan(
+            db,
+            job=job,
+            plan=build_story_plan(generate_images=True),
+        )
        story = await generate_full_story_service(generate_request, user_id, db, job=job)
        saved_story = await get_story_detail(story.id, user_id, db)
        await _record_postprocessing_event_if_needed(db, job=job, story=saved_story)
@@ -1222,6 +1324,54 @@ async def _generate_generation_service_with_job(
        universe_id=story.universe_id,
        retryable_assets=story.retryable_assets,
    )
+
+
+async def _execute_story_without_assets_plan(
+    request: GenerationRequest,
+    user_id: str,
+    db: AsyncSession,
+    *,
+    job,
+) -> GenerationResponse:
+    """Execute the minimal text-story workflow through an explicit plan."""
+    # The image-free story plan is recorded on the job before generation is started.
+    plan = build_story_plan(generate_images=False)
+    await record_workflow_plan(db, job=job, plan=plan)
+    # Copy the generation fields of the unified request onto a GenerateRequest.
+    generate_request = GenerateRequest(
+        type=request.type,
+        data=request.data,
+        education_theme=request.education_theme,
+        child_profile_id=request.child_profile_id,
+        universe_id=request.universe_id,
+    )
+    story = await generate_and_save_story(generate_request, user_id, db, job=job)
+    await _record_postprocessing_event_if_needed(db, job=job, story=story)
+    await finish_generation_job(  # reports current_step "generation_completed" for the job
+        db,
+        job=job,
+        story=story,
+        current_step="generation_completed",
+        message="Story generation completed with a persisted readable narrative.",
+    )
+    return GenerationResponse(  # mirrors the story's fields and adds the tracking job id
+        id=story.id,
+        generation_job_id=job.id,
+        title=story.title,
+        mode=story.mode,
+        story_text=story.story_text,
+        cover_prompt=story.cover_prompt,
+        image_url=story.image_url,
+        cover_url=story.image_url,  # same source value as image_url above
+        generation_status=story.generation_status,
+        text_status=story.text_status,
+        image_status=story.image_status,
+        audio_status=story.audio_status,
+        last_error=story.last_error,
+        child_profile_id=story.child_profile_id,
+        universe_id=story.universe_id,
+        retryable_assets=story.retryable_assets,
+    )


 async def list_stories(
@@ -1321,36 +1471,7 @@ async def queue_story_asset_generation(
    )
    await _dispatch_generation_job(db, job=job)
    await db.refresh(job)
-    return generation_job_to_summary(job)
-
-
-async def _retry_cover_image_asset(story: Story, db: AsyncSession, *, job=None) -> None:
-    """Retry cover generation for a text story."""
-
-    await _complete_cover_image_asset(
-        story,
-        db,
-        last_error_prefix="封面生成失败",
-        log_event="cover_asset_retry_failed",
-        job=job,
-    )
-
-
-async def _retry_storybook_image_assets(
-    story: Story,
-    db: AsyncSession,
-    *,
-    job=None,
-) -> None:
-    """Retry missing storybook cover/page images."""
-
-    await _complete_storybook_image_assets(story, db, job=job)
-
-
-async def _retry_audio_asset(story: Story, db: AsyncSession, *, job=None) -> None:
-    """Retry audio generation while preserving persisted status on provider failure."""
-
-    await _complete_audio_asset(story, db, raise_on_failure=False, job=job)
+    return public_generation_job_to_summary(job)


 async def retry_story_assets(
@@ -1374,6 +1495,15 @@ async def retry_story_assets(

    try:
        story = await get_story_detail(story_id, user_id, db)
+        plan = build_asset_plan(
+            output_mode="asset_retry",
+            assets=requested_assets,
+        )
+        await record_workflow_plan(
+            db,
+            job=job,
+            plan=plan,
+        )
        await record_generation_event(
            db,
            job=job,
@@ -1384,14 +1514,37 @@ async def retry_story_assets(
            metadata={"assets": requested_assets},
        )

-        if "image" in requested_assets:
+        async def retry_image() -> AssetCompletionResult:
            if story.mode == "storybook":
-                await _retry_storybook_image_assets(story, db, job=job)
-            else:
-                await _retry_cover_image_asset(story, db, job=job)
+                return await _complete_storybook_image_assets(story, db, job=job)

-        if "audio" in requested_assets:
-            await _retry_audio_asset(story, db, job=job)
+            return await _complete_cover_image_asset(
+                story,
+                db,
+                last_error_prefix="封面生成失败",
+                log_event="cover_asset_retry_failed",
+                job=job,
+            )
+
+        async def retry_audio() -> AssetCompletionResult:
+            return await _complete_audio_asset(
+                story,
+                db,
+                raise_on_failure=False,
+                job=job,
+            )
+
+        asset_plan_result = await run_asset_plan(
+            plan,
+            image_task=retry_image if "image" in requested_assets else None,
+            audio_task=retry_audio if "audio" in requested_assets else None,
+        )
+        await record_executor_result(
+            db,
+            job=job,
+            plan=plan,
+            result=asset_plan_result,
+        )

        story = await get_story_detail(story_id, user_id, db)
        await finish_generation_job(
@@ -1448,13 +1601,29 @@ async def generate_story_cover(

    try:
        story = await get_story_detail(story_id, user_id, db)
-        image_result = await _complete_cover_image_asset(
-            story,
+        plan = build_asset_plan(output_mode="asset_generation", assets=["image"])
+        await record_workflow_plan(
            db,
-            raise_on_failure=True,
-            log_event="cover_generation_failed",
            job=job,
+            plan=plan,
        )
+        asset_result = await run_asset_plan(
+            plan,
+            image_task=lambda: _complete_cover_image_asset(
+                story,
+                db,
+                raise_on_failure=True,
+                log_event="cover_generation_failed",
+                job=job,
+            ),
+        )
+        await record_executor_result(
+            db,
+            job=job,
+            plan=plan,
+            result=asset_result,
+        )
+        image_result = asset_result.task_results[0] if asset_result.task_results else None
        story = await get_story_detail(story_id, user_id, db)
        await finish_generation_job(
            db,
@@ -1464,7 +1633,11 @@ async def generate_story_cover(
            message="Cover image generation completed.",
            metadata={"assets": ["image"]},
        )
-        if image_result.succeeded and isinstance(image_result.value, str):
+        if (
+            image_result is not None
+            and image_result.succeeded
+            and isinstance(image_result.value, str)
+        ):
            return image_result.value
    except HTTPException as exc:
        await finish_generation_job(
@@ -1501,12 +1674,28 @@ async def generate_story_audio(

    try:
        story = await get_story_detail(story_id, user_id, db)
-        audio_result = await _complete_audio_asset(
-            story,
+        plan = build_asset_plan(output_mode="asset_generation", assets=["audio"])
+        await record_workflow_plan(
            db,
-            raise_on_failure=True,
            job=job,
+            plan=plan,
        )
+        asset_result = await run_asset_plan(
+            plan,
+            audio_task=lambda: _complete_audio_asset(
+                story,
+                db,
+                raise_on_failure=True,
+                job=job,
+            ),
+        )
+        await record_executor_result(
+            db,
+            job=job,
+            plan=plan,
+            result=asset_result,
+        )
+        audio_result = asset_result.task_results[0] if asset_result.task_results else None
        story = await get_story_detail(story_id, user_id, db)
        await finish_generation_job(
            db,
@@ -1516,7 +1705,11 @@ async def generate_story_audio(
            message="Story audio generation completed.",
            metadata={"assets": ["audio"]},
        )
-        if audio_result.succeeded and isinstance(audio_result.value, bytes):
+        if (
+            audio_result is not None
+            and audio_result.succeeded
+            and isinstance(audio_result.value, bytes)
+        ):
            return audio_result.value
    except HTTPException as exc:
        await finish_generation_job(