Expand generation harness observability

This commit is contained in:
2026-06-24 10:48:23 +08:00
parent 459ca9edef
commit 1f34d80083
35 changed files with 8003 additions and 112 deletions

View File

@@ -0,0 +1,400 @@
[
{
"id": "story-safe-theme-pass",
"artifact": "story",
"description": "完整、儿童安全且清晰包含教育主题的普通故事。",
"coverage": {
"age_band": "5-6",
"content_shape": "short_story",
"risk_area": "happy_path",
"tags": ["theme_present", "safe", "story"]
},
"input": {
"keywords": "小兔子, 月光花园",
"education_theme": "复盘"
},
"output": {
"mode": "generated",
"title": "小兔子的月光花园",
"story_text": "小兔子露露在月光花园里照顾一朵会发光的小花。她先给小花浇水,又邀请朋友一起观察花瓣的变化。晚上睡前,露露和朋友们坐在石凳上复盘今天的努力:下次要先分好小水壶,再轮流照顾花朵。大家都觉得,分享和复盘让花园变得更温暖。",
"cover_prompt_suggestion": "A gentle watercolor rabbit in a moonlit garden"
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "story-long-safe-pass",
"artifact": "story",
"description": "较长但仍适合亲子共读的普通故事。",
"coverage": {
"age_band": "7-8",
"content_shape": "long_story",
"risk_area": "length_boundary",
"tags": ["theme_present", "long_text", "story"]
},
"input": {
"keywords": "小海豚, 图书馆",
"education_theme": "合作"
},
"output": {
"mode": "generated",
"title": "小海豚的蓝色图书馆",
"story_text": "小海豚多多住在一片安静的海湾里,那里有一座用贝壳和海草搭成的蓝色图书馆。每天傍晚,多多都会把漂来的故事贝壳整理好,放进不同的篮子。可是这一天,风浪把贝壳吹得到处都是,小章鱼、小海马和小螃蟹都赶来帮忙。大家先一起数贝壳,再按颜色排队,最后把每个故事放回合适的位置。多多发现,合作不是一个人做得最快,而是大家把自己的办法放在一起。夜晚来临时,蓝色图书馆重新亮起柔柔的光,小伙伴们围坐在门口,听多多讲今天学到的合作故事。",
"cover_prompt_suggestion": "A gentle dolphin organizing a blue underwater library"
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "story-missing-text-blocks",
"artifact": "story",
"description": "故事正文缺失会被确定性质量门阻断。",
"coverage": {
"age_band": "unknown",
"content_shape": "empty_story",
"risk_area": "schema_error",
"tags": ["missing_text", "story", "blocking"]
},
"input": {
"keywords": "小熊, 星星"
},
"output": {
"mode": "generated",
"title": "小熊找星星",
"story_text": "",
"cover_prompt_suggestion": "A bear looking at friendly stars"
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_story_text"
]
}
},
{
"id": "story-missing-cover-prompt-blocks",
"artifact": "story",
"description": "故事正文完整但封面提示词缺失会被结构质量门阻断。",
"coverage": {
"age_band": "5-6",
"content_shape": "short_story",
"risk_area": "schema_error",
"tags": ["missing_cover_prompt", "story", "blocking"]
},
"input": {
"keywords": "小松鼠, 风筝",
"education_theme": "勇敢"
},
"output": {
"mode": "generated",
"title": "小松鼠的风筝",
"story_text": "小松鼠第一次放风筝时有点紧张。朋友们陪它一起数一二三,它鼓起勇敢的心,终于让风筝飞上蓝天。",
"cover_prompt_suggestion": ""
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_cover_prompt"
]
}
},
{
"id": "story-unsafe-term-blocks",
"artifact": "story",
"description": "明显不适合儿童的风险词会被安全质量门阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "short_story",
"risk_area": "safety_error",
"tags": ["unsafe_term", "story", "blocking"]
},
"input": {
"keywords": "小猫, 城堡"
},
"output": {
"mode": "generated",
"title": "小猫的城堡",
"story_text": "小猫在城堡里看到血腥场景,然后感到很害怕。",
"cover_prompt_suggestion": "A cat near a castle"
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"unsafe_child_content"
]
}
},
{
"id": "story-short-high-threshold-blocks",
"artifact": "story",
"description": "结构合格但阅读体验偏短的故事在高阈值下会被内部评测阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "very_short_story",
"risk_area": "readability_warning",
"tags": ["short_text", "threshold_block", "story"]
},
"input": {
"keywords": "小鹿, 书签",
"education_theme": "耐心",
"minimum_score": 0.82
},
"output": {
"mode": "generated",
"title": "小鹿的书签",
"story_text": "小鹿学会了耐心等待。",
"cover_prompt_suggestion": "A deer with a golden bookmark"
},
"expected": {
"passed": false,
"blocking": true,
"min_overall_score": 0.7,
"max_overall_score": 0.8,
"required_dimensions": [
"structure",
"safety",
"readability"
],
"quality_gate_codes": [],
"warning_substrings": [
"正文长度"
]
}
},
{
"id": "storybook-safe-theme-pass",
"artifact": "storybook",
"description": "完整、儿童安全且包含教育主题的绘本分页输出。",
"coverage": {
"age_band": "5-6",
"content_shape": "storybook_3_pages",
"risk_area": "happy_path",
"tags": ["theme_present", "safe", "storybook"]
},
"input": {
"keywords": "小狐狸, 彩虹桥",
"education_theme": "合作"
},
"output": {
"title": "彩虹桥上的合作",
"main_character": "小狐狸米米",
"art_style": "温暖水彩",
"cover_prompt": "A warm watercolor fox near a rainbow bridge",
"pages": [
{
"page_number": 1,
"text": "小狐狸米米在雨后的森林里发现一座亮晶晶的彩虹桥。",
"image_prompt": "A little fox finds a rainbow bridge"
},
{
"page_number": 2,
"text": "桥边的小伙伴们一起商量办法,决定合作把落叶清理干净。",
"image_prompt": "Forest friends work together"
},
{
"page_number": 3,
"text": "大家轮流搬叶子、扶篮子,还互相说谢谢,彩虹桥终于露出笑脸。",
"image_prompt": "Friends carrying leaves together"
}
]
},
"expected": {
"passed": true,
"blocking": false,
"min_overall_score": 0.9,
"required_dimensions": [
"structure",
"safety",
"age_fit",
"educational_value",
"readability"
],
"quality_gate_codes": []
}
},
{
"id": "storybook-duplicate-page-blocks",
"artifact": "storybook",
"description": "重复页码的绘本结构会被质量门阻断。",
"coverage": {
"age_band": "5-6",
"content_shape": "storybook_invalid_pages",
"risk_area": "schema_error",
"tags": ["duplicate_page", "storybook", "blocking"]
},
"input": {
"keywords": "小熊, 森林"
},
"output": {
"title": "森林里的小熊",
"main_character": "小熊布布",
"art_style": "水彩",
"cover_prompt": "A bear in a forest",
"pages": [
{
"page_number": 1,
"text": "布布在森林里找到一颗松果。",
"image_prompt": "Bear finds a pinecone"
},
{
"page_number": 1,
"text": "布布把松果带给朋友一起观察。",
"image_prompt": "Bear shares the pinecone"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"invalid_storybook_page_number"
]
}
},
{
"id": "storybook-missing-page-blocks",
"artifact": "storybook",
"description": "没有分页内容的绘本会被结构质量门阻断。",
"coverage": {
"age_band": "unknown",
"content_shape": "storybook_empty_pages",
"risk_area": "schema_error",
"tags": ["missing_page", "storybook", "blocking"]
},
"input": {
"keywords": "小鸟, 云朵"
},
"output": {
"title": "小鸟和云朵",
"main_character": "小鸟啾啾",
"art_style": "柔和水彩",
"cover_prompt": "A bird near soft clouds",
"pages": []
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"missing_storybook_page"
]
}
},
{
"id": "storybook-unsafe-term-blocks",
"artifact": "storybook",
"description": "绘本分页文字包含明显不适龄风险词时会被安全质量门阻断。",
"coverage": {
"age_band": "3-4",
"content_shape": "storybook_2_pages",
"risk_area": "safety_error",
"tags": ["unsafe_term", "storybook", "blocking"]
},
"input": {
"keywords": "小兔子, 山洞"
},
"output": {
"title": "山洞里的声音",
"main_character": "小兔子米粒",
"art_style": "温暖水彩",
"cover_prompt": "A rabbit near a cave",
"pages": [
{
"page_number": 1,
"text": "米粒走到山洞边,听见奇怪的声音。",
"image_prompt": "Rabbit near a cave"
},
{
"page_number": 2,
"text": "洞里出现血腥画面,米粒吓得跑开。",
"image_prompt": "Rabbit running away"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"max_overall_score": 0.0,
"quality_gate_codes": [
"unsafe_child_content"
]
}
},
{
"id": "storybook-short-page-warning",
"artifact": "storybook",
"description": "分页正文过短时保留内部警告,用于评测回归。",
"coverage": {
"age_band": "3-4",
"content_shape": "storybook_2_pages",
"risk_area": "readability_warning",
"tags": ["short_page_text", "threshold_block", "storybook"]
},
"input": {
"keywords": "小羊, 风铃",
"minimum_score": 0.85
},
"output": {
"title": "风铃响了",
"main_character": "小羊团团",
"art_style": "柔和蜡笔",
"cover_prompt": "A lamb listening to a wind chime",
"pages": [
{
"page_number": 1,
"text": "风响。",
"image_prompt": "Wind chime rings"
},
{
"page_number": 2,
"text": "团团笑。",
"image_prompt": "Lamb smiles"
}
]
},
"expected": {
"passed": false,
"blocking": true,
"min_overall_score": 0.8,
"max_overall_score": 0.82,
"required_dimensions": [
"structure",
"safety",
"readability"
],
"quality_gate_codes": [],
"warning_substrings": [
"分页正文长度"
]
}
}
]

View File

@@ -0,0 +1,610 @@
# Test Cases: Harness Evaluation Driven Generation
## Overview
- **Feature**: Harness evaluation driven generation
- **Requirements Source**: `docs/technical/harness-engineering-modernization.md`
- **Test Coverage**: evaluation scoring, blocking quality failures, workflow plan events, trace aggregation, state transitions, internal golden replay, admin-only analytics, admin-only executor coverage summary, admin-only harness readiness
- **Last Updated**: 2026-06-23
## Test Case Categories
### 1. Functional Tests
#### TC-F-001: 普通故事无图片生成写入评测事件
- **Requirement**: H7-3, H7-4
- **Priority**: High
- **Preconditions**:
- 用户已登录。
- 文本 provider 返回完整、儿童安全的故事。
- **Test Steps**:
1. 调用 `POST /api/generations`,设置 `output_mode=story``generate_images=false`
2. 执行 worker 任务。
3. 查询 job detail。
- **Expected Results**:
- job 状态为 `completed`
- event 顺序包含 `workflow_planned`
- event 顺序包含 `evaluation_completed`
- `evaluation_completed.event_metadata.passed=true`
- `evaluation_completed.event_metadata.overall_score >= 0.7`
- **Postconditions**: 故事已持久化,`story_id` 写入 job。
#### TC-F-003: 用户 Trace summary 不返回评测摘要
- **Requirement**: H7-4, H7B-1
- **Priority**: High
- **Preconditions**:
- 故事已有 `evaluation_completed` job event。
- **Test Steps**:
1. 调用 `GET /api/generations/{story_id}/trace-summary`
2. 检查响应字段。
- **Expected Results**:
- 响应不包含 `evaluation` 字段。
- `by_step` 不包含 `evaluation`
- `by_artifact` 不因 `evaluation_completed` 增加 `story_text` 计数。
- `failed_events` 不统计 `evaluation_completed`
- `total_events` 不统计 `evaluation_completed`,避免通过事件数量泄露内部评测步骤。
- **Postconditions**: 无数据修改。
#### TC-F-004: 用户 Job detail 不返回评测事件
- **Requirement**: H7-4, H7B-2
- **Priority**: High
- **Preconditions**:
- job 已记录 `evaluation_completed` 事件。
- **Test Steps**:
1. 调用 `GET /api/generations/jobs/{job_id}`
2. 检查 `events` 列表。
- **Expected Results**:
- `events` 不包含 `evaluation_completed`
- 响应不包含评测分数、维度分数、通过率或阻断阈值。
- **Postconditions**: 内部数据库事件不被删除。
#### TC-F-002: 完整故事输出获得通过评分
- **Requirement**: H7-1
- **Priority**: High
- **Preconditions**:
- 构造完整 `StoryOutput`
- **Test Steps**:
1. 调用 `evaluate_story_output`
2. 读取 `EvaluationResult`
- **Expected Results**:
- `passed=true`
- `blocking=false`
- scores 包含 `structure``safety``age_fit``educational_value``readability`
- **Postconditions**: 无持久化副作用。
#### TC-F-005: 完整绘本输出获得通过评分
- **Requirement**: H7-1, H7C-1
- **Priority**: High
- **Preconditions**:
- 构造完整 `Storybook`
- **Test Steps**:
1. 调用 `evaluate_storybook_output`
2. 读取 `EvaluationResult`
- **Expected Results**:
- `passed=true`
- `blocking=false`
- scores 包含 `structure``safety``age_fit``educational_value``readability`
- **Postconditions**: 无持久化副作用。
#### TC-F-006: 内部 golden cases 可回放且全部符合预期
- **Requirement**: H7-7, H7-8
- **Priority**: High
- **Preconditions**:
- `backend/app/services/harness/fixtures/evaluation_golden_cases.json` 存在。
- fixture 只由后端测试、内部工具或 admin-only readiness 读取。
- **Test Steps**:
1. 调用 `replay_evaluation_golden_cases`
2. 读取 `EvaluationReplaySuiteResult`
- **Expected Results**:
- `passed=true`
- `failed_case_ids` 为空。
- 普通故事和绘本样本都被覆盖。
- 样本覆盖完整普通故事、较长普通故事、空正文、缺失封面提示词、安全风险词、短文本阈值阻断、绘本重复页码、绘本缺页、绘本安全风险和绘本短分页。
- 结果不通过任何用户端 API 返回。
- **Postconditions**: 无持久化副作用。
#### TC-F-007: 内部 golden replay 覆盖摘要稳定
- **Requirement**: H7-8
- **Priority**: High
- **Preconditions**:
- golden replay suite 已执行。
- **Test Steps**:
1. 调用 `coverage_summary`
2. 检查 artifact、age_band、risk_area、tags 和 outcome 分布。
- **Expected Results**:
- artifact 覆盖 `story=6``storybook=5`
- age_band 覆盖 `3-4``5-6``7-8``unknown`
- risk_area 覆盖 `happy_path``schema_error``safety_error``readability_warning``length_boundary`
- outcome 覆盖 `passed=3``blocked=8`
- 覆盖摘要不通过任何用户端 API 返回。
- **Postconditions**: 无持久化副作用。
### 2. Edge Case Tests
#### TC-E-001: 很短故事通过结构但产生低龄阅读体验警告
- **Requirement**: H7-1
- **Priority**: Medium
- **Preconditions**:
- 构造标题、正文、封面提示词完整但正文很短的 `StoryOutput`
- **Test Steps**:
1. 调用 `evaluate_story_output`
2. 读取 warnings 和维度分数。
- **Expected Results**:
- 不触发质量门异常。
- `age_fit``readability` 分数低于完整故事。
- warnings 包含阅读体验提示。
- **Postconditions**: 无持久化副作用。
#### TC-E-002: 内部 golden replay 能报告预期不匹配
- **Requirement**: H7-7
- **Priority**: Medium
- **Preconditions**:
- 构造一个实际得分低于期望阈值的 `EvaluationReplayCase`
- **Test Steps**:
1. 调用 `run_evaluation_replay_cases`
2. 读取 `failure_report`
- **Expected Results**:
- `passed=false`
- `failed_case_ids` 包含该 case id。
- `failure_report` 包含 `overall_score` 差异。
- **Postconditions**: 无持久化副作用。
### 3. Error Handling Tests
#### TC-ERR-001: 空正文阻断持久化
- **Requirement**: H7-4
- **Priority**: High
- **Preconditions**:
- 文本 provider 返回空 `story_text`
- **Test Steps**:
1. 执行 worker 任务。
2. 查询 job 和 story 表。
3. 查询 job events。
- **Expected Results**:
- job 状态为 `failed`
- 没有 story 被持久化。
- events 包含 `quality_gate_failed`
- events 包含 `evaluation_completed`
- `evaluation_completed.event_metadata.blocking=true`
- **Postconditions**: 用户可重试该 job。
#### TC-ERR-002: 不适龄风险词阻断生成
- **Requirement**: H7-1
- **Priority**: High
- **Preconditions**:
- 构造包含明显不适龄风险词的 `StoryOutput`
- **Test Steps**:
1. 调用 `evaluate_story_output`
2. 读取 `quality_gate` metadata。
- **Expected Results**:
- `passed=false`
- `blocking=true`
- `quality_gate.issues[0].failure_category=safety_error`
- **Postconditions**: 无持久化副作用。
#### TC-ERR-003: 绘本结构错误阻断生成
- **Requirement**: H7-1, H7C-1
- **Priority**: High
- **Preconditions**:
- 构造页码重复或页面缺失的 `Storybook`
- **Test Steps**:
1. 调用 `evaluate_storybook_output`
2. 读取 `quality_gate` metadata。
- **Expected Results**:
- `passed=false`
- `blocking=true`
- `quality_gate.issues[0].code=invalid_storybook_page_number` 或对应结构错误。
- **Postconditions**: 无持久化副作用。
### 4. State Transition Tests
#### TC-ST-001: 普通故事无图片路径事件顺序稳定
- **Requirement**: H7-3
- **Priority**: High
- **Preconditions**:
- job 初始状态为 `running/request_accepted`
- **Test Steps**:
1. 执行 worker 任务。
2. 按 id 查询 events。
- **Expected Results**:
- event 顺序为 `request_accepted``worker_started``workflow_planned``context_prepared``evaluation_completed``narrative_generated``story_saved``generation_completed`
- **Postconditions**: job `current_step=generation_completed`
#### TC-ST-002: 普通故事带图片路径记录可恢复资产计划
- **Requirement**: H9-1, H9-3
- **Priority**: High
- **Preconditions**:
- job 初始状态为 `running/request_accepted`
- 请求设置 `output_mode=story``generate_images=true`
- 文本 provider 返回合格故事,图片 provider 返回封面 URL。
- **Test Steps**:
1. 执行 worker 任务。
2. 按 id 查询内部 events。
3. 读取 `workflow_planned.event_metadata.plan`
- **Expected Results**:
- event 顺序为 `request_accepted``worker_started``workflow_planned``context_prepared``evaluation_completed``narrative_generated``story_saved``cover_image_started``cover_image_succeeded``generation_completed`
- `plan.mode=story_with_assets`
- plan tasks 包含 `evaluate_narrative`
- plan tasks 包含 `generate_cover_image`
- `generate_cover_image.required=false`
- `generate_cover_image.recoverable=true`
- **Postconditions**: job `current_step=generation_completed`,故事 `image_status=ready`
#### TC-ST-003: 绘本路径记录绘本计划快照
- **Requirement**: H9-2, H9-3
- **Priority**: High
- **Preconditions**:
- job 初始状态为 `running/request_accepted`
- 请求设置 `output_mode=storybook`
- **Test Steps**:
1. 执行 worker 任务。
2. 按 id 查询内部 events。
3. 读取 `workflow_planned.event_metadata.plan`
- **Expected Results**:
- event 顺序包含 `workflow_planned`,且位于 `worker_started``context_prepared` 之间。
- `plan.mode=storybook`
- plan tasks 包含 `generate_storybook_pages`
- plan tasks 包含 `evaluate_storybook_pages`
-`generate_images=true`plan tasks 包含 `generate_storybook_images`
- `generate_storybook_images.required=false`
- `generate_storybook_images.recoverable=true`
- **Postconditions**: job `current_step=generation_completed`
#### TC-ST-004: 绘本生成内部记录评测但用户事件脱敏
- **Requirement**: H7C-1, H7B-2, H9-4
- **Priority**: High
- **Preconditions**:
- 绘本生成 job 已执行完成。
- **Test Steps**:
1. 直接查询内部 `generation_job_events`
2. 调用 `GET /api/generations/jobs/{job_id}`
- **Expected Results**:
- 内部事件包含 `evaluation_completed`
- 内部 `evaluation_completed.event_metadata.artifact=storybook_pages`
- 用户 API events 不包含 `evaluation_completed`
- 用户 API 响应不包含 `overall_score`、维度分数、阈值或 golden replay 字段。
- **Postconditions**: job 完成,绘本已持久化。
#### TC-ST-005: 资产生成和重试路径记录资产计划快照
- **Requirement**: H10-1, H10-2, H10-3
- **Priority**: High
- **Preconditions**:
- 故事已有可生成或可重试的图片/音频资源。
- **Test Steps**:
1. 执行 `asset_generation` worker 任务。
2. 调用 `/api/generations/{story_id}/retry-assets`
3. 按 id 查询内部 events。
- **Expected Results**:
- `asset_generation` 事件顺序包含 `workflow_planned`
- `asset_generation``plan.mode=asset_generation`
- `asset_retry` 事件顺序包含 `workflow_planned`
- `asset_retry``plan.mode=asset_retry`
- 图片和音频任务在 plan 中为 `required=false``recoverable=true`
- **Postconditions**: 资源状态按原有语义更新。
#### TC-ST-006: 用户事件 metadata 使用白名单脱敏
- **Requirement**: H10-4, H10-5
- **Priority**: High
- **Preconditions**:
- 内部 job events 包含原始 `plan.tasks``result_snapshot`、内部阈值或内部错误详情。
- **Test Steps**:
1. 调用 `GET /api/generations/jobs/{job_id}`
2. 检查 `events[*].event_metadata`
- **Expected Results**:
- 用户响应保留 `step``artifact``asset``assets``failure_category` 等可解释字段。
- `workflow_planned` 只返回 `plan_mode``planned_task_count``recoverable_task_count`
- 用户响应不包含原始 `plan``tasks``result_snapshot`、内部阈值、内部错误原文。
- 用户响应仍不包含 `evaluation_completed``overall_score`、维度分数或 golden replay 字段。
- **Postconditions**: 内部数据库事件不被修改。
#### TC-ST-007: 用户 request payload 使用白名单脱敏
- **Requirement**: H11-1, H11-4
- **Priority**: High
- **Preconditions**:
- 生成 job 的 `request_payload` 同时包含用户输入、公开控制字段、内部调度 token、Provider override 和评测策略。
- **Test Steps**:
1. 调用 `GET /api/generations/jobs/{job_id}`
2. 检查响应中的 `request_payload`
- **Expected Results**:
- 用户响应只保留 `output_mode``input_type``type``story_id``assets``page_count``generate_images` 等安全控制字段。
- 用户响应不包含原始 `data``education_theme`、内部调度 token、Provider override 或 evaluation policy。
- 内部数据库中的完整 request payload 不被修改。
- **Postconditions**: 用户端仍可根据公开字段展示任务进度和可用操作。
#### TC-ST-008: 资产 plan runner 按 WorkflowPlan 顺序执行任务
- **Requirement**: H12-1, H12-5
- **Priority**: High
- **Preconditions**:
- 构造 `asset_generation``asset_retry` plan包含图片和音频 task。
- **Test Steps**:
1. 调用 `run_asset_plan(...)`
2. 记录 image/audio handler 的调用顺序。
3. 检查 runner 返回的 executed/ignored task keys。
- **Expected Results**:
- 图片和音频 handler 按 plan 中 `WorkflowTask` 顺序执行。
- `start_asset_*``complete_asset_*` 这类非资产生产 task 被记录为 ignored不触发 provider handler。
- 未知非资产 task 默认 ignored不影响已知资产 task。
- **Postconditions**: 无数据库修改。
#### TC-ST-009: 后台资产生成由 plan runner 执行组合资产
- **Requirement**: H12-2, H12-5
- **Priority**: High
- **Preconditions**:
- 已持久化故事同时具备可生成图片和音频的输入。
- 创建 `asset_generation` job`assets=["audio", "image"]`
- **Test Steps**:
1. 调用 worker 执行该 job。
2. 查询 job events 和 story 状态。
- **Expected Results**:
- event stream 为 `workflow_planned` 后依次出现音频和图片生成事件。
- plan tasks 顺序包含 `complete_audio_asset``complete_image_asset`
- story 的 `audio_status``image_status` 均为 `ready`
- 用户 API 仍只暴露 coarse plan metadata不返回原始 `plan.tasks`
- **Postconditions**: job 完成,资源状态与原有语义一致。
#### TC-ST-010: 用户侧过滤 executor coverage 内部事件
- **Requirement**: H13-4, H13-5
- **Priority**: High
- **Preconditions**:
- 生成 job 包含内部 `executor_completed` 事件。
- `executor_completed.event_metadata` 包含 task keys 和 result assets。
- **Test Steps**:
1. 调用 `GET /api/generations/jobs/{job_id}`
2. 调用 `GET /api/generations/{story_id}/jobs`
3. 调用 `GET /api/generations/{story_id}/trace-summary`
- **Expected Results**:
- 用户 job detail 不包含 `executor_completed`
- 用户 job detail 不包含 `executed_task_keys``ignored_task_keys` 或具体 task key。
- 当 job 当前步骤短暂停留在 `executor_completed` 时,用户 summary 显示为安全公开的 `workflow_planned` 进度。
- 用户 trace summary 不包含 `executor_completed` 或具体 task key。
- 用户 trace summary 的 `total_events` 不统计内部 `executor_completed`
- **Postconditions**: 内部数据库事件不被修改。
### 5. Admin-Only Analytics Tests
#### TC-ADM-001: 管理端评测 analytics 聚合内部评测事件
- **Requirement**: H8-1, H8-2
- **Priority**: High
- **Preconditions**:
- 数据库存在多个用户的 `evaluation_completed` 事件。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/evaluations/analytics`
2. 检查聚合结果。
- **Expected Results**:
- 返回通过数、阻断数、通过率和平均分。
- 返回 artifact、output mode、score band、dimension score、quality gate issue、failure category 和 warning 聚合。
- 不返回故事正文、prompt、单条 evaluation event 或评分 reason。
- **Postconditions**: 无数据修改。
#### TC-ADM-002: 管理端评测 analytics 支持过滤
- **Requirement**: H8-3
- **Priority**: Medium
- **Preconditions**:
- 数据库存在新旧评测事件以及不同 artifact。
- **Test Steps**:
1. 调用 `GET /admin/evaluations/analytics?days=7`
2. 调用 `GET /admin/evaluations/analytics?artifact=story_text`
3. 调用非法 artifact。
- **Expected Results**:
- `days` 过滤只统计窗口内事件。
- `artifact` 过滤只统计对应 artifact。
- 非法 artifact 返回 `422`
- **Postconditions**: 无数据修改。
#### TC-ADM-003: 管理端评测 analytics 需要 admin 鉴权
- **Requirement**: H8-2
- **Priority**: High
- **Preconditions**:
- 未提供 admin Basic Auth。
- **Test Steps**:
1. 调用 `GET /admin/evaluations/analytics`
- **Expected Results**:
- 返回 `401`
- 不返回任何评测统计。
- **Postconditions**: 无数据修改。
#### TC-ADM-004: 管理端完整生成 trace 返回内部事件流
- **Requirement**: H11-2, H11-3, H11-4
- **Priority**: High
- **Preconditions**:
- 数据库存在包含 `workflow_planned``evaluation_completed` 的生成 job。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/generations/jobs/{job_id}/trace`
2. 检查 request payload 与 event stream。
- **Expected Results**:
- 返回完整 request payload包括原始用户输入和内部调度字段。
- 返回完整 `workflow_planned.event_metadata.plan.tasks`
- 返回 `evaluation_completed` 事件及其内部评分 metadata。
- 响应包含 `user_id`,便于管理控制面审计。
- **Postconditions**: 无数据修改。
#### TC-ADM-005: 管理端完整生成 trace 需要 admin 鉴权
- **Requirement**: H11-3
- **Priority**: High
- **Preconditions**:
- 未提供 admin Basic Auth。
- **Test Steps**:
1. 调用 `GET /admin/generations/jobs/{job_id}/trace`
- **Expected Results**:
- 返回 `401`
- 不返回 request payload 或内部 event metadata。
- **Postconditions**: 无数据修改。
#### TC-ADM-006: 管理端 executor coverage 聚合内部执行事件
- **Requirement**: H13-1, H13-2, H13-3, H13-5
- **Priority**: High
- **Preconditions**:
- 数据库存在多个 `executor_completed` 事件。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/executors/coverage`
2. 调用 `GET /admin/executors/coverage?plan_mode=asset_retry`
3. 调用非法 plan mode。
- **Expected Results**:
- 返回 total runs、planned/executed/ignored task counts 和 coverage ratio。
- 返回 plan mode、output mode、executed task keys、ignored task keys 和 result assets 聚合。
- `plan_mode` 过滤只统计对应 executor run。
- 非法 plan mode 返回 `422`
- **Postconditions**: 无数据修改。
#### TC-ADM-007: 管理端 executor coverage 需要 admin 鉴权
- **Requirement**: H13-3
- **Priority**: High
- **Preconditions**:
- 未提供 admin Basic Auth。
- **Test Steps**:
1. 调用 `GET /admin/executors/coverage`
- **Expected Results**:
- 返回 `401`
- 不返回 executor task keys 或 coverage metadata。
- **Postconditions**: 无数据修改。
#### TC-ADM-008: 管理端完整生成 trace 返回单 job executor coverage 摘要
- **Requirement**: H14-1, H14-2, H14-4
- **Priority**: High
- **Preconditions**:
- 数据库存在包含 `executor_completed` 事件的生成 job。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/generations/jobs/{job_id}/trace`
2. 检查 `executor_coverage`
- **Expected Results**:
- 响应包含 `executor_coverage.scope=admin_internal_job_executor_coverage`
- `executor_coverage` 只统计当前 job 的 runs、planned/executed/ignored task counts 和 coverage ratio。
- `executor_coverage.executed_task_keys``ignored_task_keys``result_assets` 与当前 job 的内部 executor event 一致。
- 完整 event stream 仍保留 `executor_completed`,便于 admin 调试。
- **Postconditions**: 无数据修改。
#### TC-ADM-009: 管理端 harness readiness 聚合内部质量门
- **Requirement**: H15-1, H15-2, H15-3, H15-4
- **Priority**: High
- **Preconditions**:
- app 内部 harness fixture 存在 golden replay cases。
- 数据库存在至少一条通过的 `evaluation_completed` 事件。
- 数据库存在至少一条 `executor_completed` 事件。
- 请求通过 admin guard。
- **Test Steps**:
1. 调用 `GET /admin/harness/readiness`
2. 检查 readiness status、checks 和聚合摘要。
- **Expected Results**:
- `status=ready`
- checks 包含 `golden_replay``runtime_evaluation_samples``runtime_evaluation_quality``executor_coverage_samples``executor_coverage_ratio`
- golden replay 显示全部通过。
- evaluation analytics 与 executor coverage 只以聚合形式返回。
- 响应不包含故事标题、正文、prompt、score reason 或 quality gate message。
- **Postconditions**: 无数据修改。
#### TC-ADM-010: 管理端 harness readiness 阻断低质量运行样本并需要 admin 鉴权
- **Requirement**: H15-2, H15-3, H15-4, H15-5
- **Priority**: High
- **Preconditions**:
- 数据库存在低质量或 blocking 的 `evaluation_completed` 事件。
- executor coverage 运行样本缺失或不足。
- **Test Steps**:
1. 通过 admin guard 调用 `GET /admin/harness/readiness`
2. 未提供 admin Basic Auth 调用同一路径。
- **Expected Results**:
- 有 admin 权限时返回 `status=blocked`
- `runtime_evaluation_quality.status=blocked`
- executor 样本缺失时对应 check 为 `needs_attention`
- 无 admin 权限时返回 `401`
- 响应不包含 quality gate message 或单条事件明细。
- **Postconditions**: 无数据修改。
## Test Coverage Matrix
| Requirement ID | Test Cases | Coverage Status |
| --- | --- | --- |
| H7-1 | TC-F-002, TC-F-005, TC-E-001, TC-ERR-002, TC-ERR-003 | Complete |
| H7-2 | TC-F-001, TC-ST-001 | Complete |
| H7-3 | TC-F-001, TC-ST-001 | Complete |
| H7-4 | TC-F-003, TC-ERR-001 | Complete |
| H7-5 | This document | Complete |
| H7-7 | TC-F-006, TC-E-002 | Complete |
| H7-8 | TC-F-006, TC-F-007 | Complete |
| H7B-1 | TC-F-003 | Complete |
| H7B-2 | TC-F-004 | Complete |
| H7C-1 | TC-F-005, TC-ERR-003, TC-ST-002 | Complete |
| H8-1 | TC-ADM-001 | Complete |
| H8-2 | TC-ADM-001, TC-ADM-003 | Complete |
| H8-3 | TC-ADM-002 | Complete |
| H8-4 | TC-F-003, TC-F-004, TC-ADM-001 | Complete |
| H9-1 | TC-ST-002 | Complete |
| H9-2 | TC-ST-003 | Complete |
| H9-3 | TC-ST-001, TC-ST-002, TC-ST-003 | Complete |
| H9-4 | TC-F-003, TC-F-004, TC-ST-004 | Complete |
| H10-1 | TC-ST-005 | Complete |
| H10-2 | TC-ST-005 | Complete |
| H10-3 | TC-ST-005 | Complete |
| H10-4 | TC-ST-006 | Complete |
| H10-5 | TC-ST-005, TC-ST-006 | Complete |
| H11-1 | TC-ST-007 | Complete |
| H11-2 | TC-ADM-004 | Complete |
| H11-3 | TC-ADM-004, TC-ADM-005 | Complete |
| H11-4 | TC-ST-007, TC-ADM-004, TC-ADM-005 | Complete |
| H11-5 | This document, `docs/planning/harness-stage-11-report.md` | Complete |
| H12-1 | TC-ST-008 | Complete |
| H12-2 | TC-ST-009 | Complete |
| H12-3 | TC-ST-005, TC-ST-008 | Complete |
| H12-4 | TC-ST-005, backend story endpoint regression tests | Complete |
| H12-5 | TC-ST-008, TC-ST-009 | Complete |
| H13-1 | TC-ADM-006 | Complete |
| H13-2 | TC-ST-009, TC-ADM-006 | Complete |
| H13-3 | TC-ADM-006, TC-ADM-007 | Complete |
| H13-4 | TC-ST-010 | Complete |
| H13-5 | TC-ST-010, TC-ADM-006, TC-ADM-007 | Complete |
| H14-1 | TC-ADM-006, TC-ADM-008 | Complete |
| H14-2 | TC-ADM-008 | Complete |
| H14-3 | TC-ST-010 | Complete |
| H14-4 | TC-ST-010, TC-ADM-008 | Complete |
| H14-5 | This document, `docs/planning/harness-stage-14-report.md` | Complete |
| H15-1 | TC-F-006, TC-ADM-009 | Complete |
| H15-2 | TC-ADM-009, TC-ADM-010 | Complete |
| H15-3 | TC-ADM-009, TC-ADM-010 | Complete |
| H15-4 | TC-ADM-009, TC-ADM-010 | Complete |
| H15-5 | This document, `docs/planning/harness-stage-15-report.md` | Complete |
## Notes
- 当前自动化已覆盖 TC-F-001、TC-F-002、TC-F-003、TC-F-004、TC-F-005、TC-F-006、TC-F-007、TC-E-002、TC-ERR-001、TC-ERR-002、TC-ERR-003、TC-ST-001、TC-ST-002、TC-ST-003、TC-ST-004、TC-ST-005、TC-ST-006、TC-ST-007、TC-ST-008、TC-ST-009、TC-ST-010、TC-ADM-001、TC-ADM-002、TC-ADM-003、TC-ADM-004、TC-ADM-005、TC-ADM-006、TC-ADM-007、TC-ADM-008、TC-ADM-009、TC-ADM-010。
- TC-E-001 可在下一轮补成显式单测。
- 所有 `evaluation_completed`、golden replay 和评分维度数据均按内部质量资产处理,不应进入用户端接口或用户前端。
- `GET /admin/evaluations/analytics` 只允许 admin-only 聚合摘要不应返回原始内容、prompt、单条事件或评分 reason。
- `GET /admin/generations/jobs/{job_id}/trace` 是 admin-only 调试和审查接口,可返回完整内部链路,不应被用户前端调用。
- `GET /admin/executors/coverage` 是 admin-only executor 覆盖率接口,可返回 task keys 和 result assets不应被用户前端调用。
- `GET /admin/generations/jobs/{job_id}/trace` 可返回当前 job 的 `executor_coverage` 摘要;该摘要与 task keys 一样属于内部执行资产。
- `GET /admin/harness/readiness` 是 admin-only harness 上线前审查摘要,可返回聚合 readiness、thresholds、golden coverage、evaluation analytics 和 executor coverage不应返回正文、prompt、score reason、quality gate message 或单条事件明细。

View File

@@ -27,6 +27,17 @@ def _build_admin_test_app(db_session) -> FastAPI:
return app
def _build_admin_auth_required_test_app(db_session) -> FastAPI:
app = FastAPI()
app.include_router(admin_providers.router, prefix="/admin")
async def override_get_db():
yield db_session
app.dependency_overrides[get_db] = override_get_db
return app
async def _create_story(
db_session,
*,
@@ -51,6 +62,38 @@ async def _create_story(
return story
async def _record_evaluation_event(
db_session,
*,
user_id: str,
story_id: int,
output_mode: str,
artifact: str,
status: str,
metadata: dict,
):
job = await create_generation_job(
db_session,
user_id=user_id,
output_mode=output_mode,
input_type="keywords",
request_payload={"data": "测试"},
story_id=story_id,
)
return await record_generation_event(
db_session,
job=job,
story_id=story_id,
event_type="evaluation_completed",
status=status,
metadata={
"step": "evaluation",
"artifact": artifact,
**metadata,
},
)
async def test_admin_provider_analytics_aggregate_across_users(db_session, test_user):
second_user = User(
id="github:67890",
@@ -197,6 +240,616 @@ async def test_admin_provider_analytics_aggregate_across_users(db_session, test_
]
async def test_admin_evaluation_analytics_aggregate_internal_events(
db_session,
test_user,
):
second_user = User(
id="google:evaluation-user",
name="Evaluation User",
avatar_url="https://example.com/eval.png",
provider="google",
)
db_session.add(second_user)
await db_session.commit()
story = await _create_story(db_session, user_id=test_user.id, title="评测故事")
storybook = await _create_story(
db_session,
user_id=second_user.id,
title="评测绘本",
mode="storybook",
)
await _record_evaluation_event(
db_session,
user_id=test_user.id,
story_id=story.id,
output_mode="story",
artifact="story_text",
status="succeeded",
metadata={
"overall_score": 0.92,
"passed": True,
"blocking": False,
"scores": [
{"dimension": "structure", "score": 1.0, "reason": "完整"},
{"dimension": "readability", "score": 0.84, "reason": "可读"},
],
"warnings": [],
},
)
await _record_evaluation_event(
db_session,
user_id=second_user.id,
story_id=storybook.id,
output_mode="storybook",
artifact="storybook_pages",
status="failed",
metadata={
"overall_score": 0.0,
"passed": False,
"blocking": True,
"scores": [
{"dimension": "structure", "score": 0.0, "reason": "结构失败"},
{"dimension": "safety", "score": 0.0, "reason": "安全失败"},
],
"quality_gate": {
"issues": [
{
"code": "unsafe_child_content",
"message": "风险词",
"failure_category": "safety_error",
"field": "pages",
}
]
},
"warnings": ["绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。"],
},
)
admin_app = _build_admin_test_app(db_session)
transport = ASGITransport(app=admin_app)
async with AsyncClient(transport=transport, base_url="http://test") as client:
response = await client.get("/admin/evaluations/analytics")
assert response.status_code == 200
data = response.json()
assert data["scope"] == "admin_internal_evaluations"
assert data["total_evaluations"] == 2
assert data["passed_evaluations"] == 1
assert data["blocked_evaluations"] == 1
assert data["pass_rate"] == 0.5
assert data["average_score"] == 0.46
assert data["job_count"] == 2
assert data["story_count"] == 2
assert data["user_count"] == 2
assert data["by_artifact"] == [
{"artifact": "story_text", "count": 1},
{"artifact": "storybook_pages", "count": 1},
]
assert data["by_output_mode"] == [
{"output_mode": "story", "count": 1},
{"output_mode": "storybook", "count": 1},
]
assert data["score_bands"] == [
{"band": "blocked_quality_gate", "count": 1},
{"band": "excellent", "count": 1},
]
assert data["dimension_scores"] == [
{"dimension": "structure", "average_score": 0.5, "count": 2},
{"dimension": "readability", "average_score": 0.84, "count": 1},
{"dimension": "safety", "average_score": 0.0, "count": 1},
]
assert data["quality_gate_issues"] == [
{"code": "unsafe_child_content", "count": 1},
]
assert data["failure_categories"] == [
{"category": "safety_error", "count": 1},
]
assert data["warnings"] == [
{
"message": "绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。",
"count": 1,
},
]
assert "评测故事" not in str(data)
assert "风险词" not in str(data)
assert "完整" not in str(data)
async def test_admin_evaluation_analytics_support_days_and_artifact_filters(
db_session,
test_user,
):
story = await _create_story(db_session, user_id=test_user.id, title="旧评测")
storybook = await _create_story(
db_session,
user_id=test_user.id,
title="新评测",
mode="storybook",
)
old_event = await _record_evaluation_event(
db_session,
user_id=test_user.id,
story_id=story.id,
output_mode="story",
artifact="story_text",
status="succeeded",
metadata={
"overall_score": 0.96,
"passed": True,
"blocking": False,
"scores": [{"dimension": "structure", "score": 1.0, "reason": "完整"}],
"warnings": [],
},
)
old_event.created_at = datetime.now(timezone.utc) - timedelta(days=10)
await db_session.commit()
await _record_evaluation_event(
db_session,
user_id=test_user.id,
story_id=storybook.id,
output_mode="storybook",
artifact="storybook_pages",
status="failed",
metadata={
"overall_score": 0.72,
"passed": False,
"blocking": True,
"scores": [{"dimension": "readability", "score": 0.62, "reason": "过短"}],
"warnings": ["分页正文长度偏短"],
},
)
admin_app = _build_admin_test_app(db_session)
transport = ASGITransport(app=admin_app)
async with AsyncClient(transport=transport, base_url="http://test") as client:
response = await client.get("/admin/evaluations/analytics?days=7")
assert response.status_code == 200
data = response.json()
assert data["window_days"] == 7
assert data["total_evaluations"] == 1
assert data["artifact"] is None
assert data["by_artifact"] == [{"artifact": "storybook_pages", "count": 1}]
response = await client.get(
"/admin/evaluations/analytics?artifact=story_text"
)
assert response.status_code == 200
data = response.json()
assert data["artifact"] == "story_text"
assert data["total_evaluations"] == 1
assert data["average_score"] == 0.96
response = await client.get("/admin/evaluations/analytics?artifact=image")
assert response.status_code == 422
async def test_admin_evaluation_analytics_requires_admin_auth(db_session):
admin_app = _build_admin_auth_required_test_app(db_session)
transport = ASGITransport(app=admin_app)
async with AsyncClient(transport=transport, base_url="http://test") as client:
response = await client.get("/admin/evaluations/analytics")
assert response.status_code == 401
async def test_admin_generation_job_trace_returns_internal_event_stream(
db_session,
test_user,
):
story = await _create_story(db_session, user_id=test_user.id, title="内部链路故事")
job = await create_generation_job(
db_session,
user_id=test_user.id,
output_mode="story",
input_type="keywords",
request_payload={
"output_mode": "story",
"type": "keywords",
"data": "月亮森林",
"internal_dispatch_token": "admin-visible-token",
"provider_override": "internal-provider",
"evaluation_policy": {"threshold": 0.9},
},
story_id=story.id,
)
await record_generation_event(
db_session,
job=job,
story_id=story.id,
event_type="workflow_planned",
status="succeeded",
metadata={
"step": "request_acceptance",
"artifact": "none",
"plan": {
"mode": "story",
"tasks": [
{
"key": "generate_narrative",
"step": "text_generation",
"artifact": "story_text",
"required": True,
"recoverable": False,
}
],
},
"internal_threshold": 0.9,
},
)
await record_generation_event(
db_session,
job=job,
story_id=story.id,
event_type="evaluation_completed",
status="succeeded",
metadata={
"step": "evaluation",
"artifact": "story_text",
"overall_score": 0.94,
"passed": True,
"blocking": False,
"scores": [{"dimension": "structure", "score": 1.0}],
},
)
await record_generation_event(
db_session,
job=job,
story_id=story.id,
event_type="executor_completed",
status="succeeded",
metadata={
"plan_mode": "asset_generation",
"planned_task_count": 3,
"executed_task_count": 1,
"ignored_task_count": 2,
"executed_task_keys": ["complete_image_asset"],
"ignored_task_keys": [
"start_asset_generation",
"complete_asset_generation",
],
"result_assets": ["cover_image"],
},
)
admin_app = _build_admin_test_app(db_session)
transport = ASGITransport(app=admin_app)
async with AsyncClient(transport=transport, base_url="http://test") as client:
response = await client.get(f"/admin/generations/jobs/{job.id}/trace")
assert response.status_code == 200
data = response.json()
assert data["id"] == job.id
assert data["user_id"] == test_user.id
assert data["request_payload"]["data"] == "月亮森林"
assert data["request_payload"]["internal_dispatch_token"] == "admin-visible-token"
assert data["request_payload"]["evaluation_policy"] == {"threshold": 0.9}
event_types = [event["event_type"] for event in data["events"]]
assert event_types == [
"request_accepted",
"workflow_planned",
"evaluation_completed",
"executor_completed",
]
workflow_event = data["events"][1]
assert workflow_event["event_metadata"]["plan"]["tasks"][0]["key"] == (
"generate_narrative"
)
assert workflow_event["event_metadata"]["internal_threshold"] == 0.9
evaluation_event = data["events"][2]
assert evaluation_event["event_metadata"]["overall_score"] == 0.94
assert evaluation_event["event_metadata"]["scores"] == [
{"dimension": "structure", "score": 1.0}
]
executor_event = data["events"][3]
assert executor_event["event_metadata"]["executed_task_keys"] == [
"complete_image_asset"
]
assert executor_event["event_metadata"]["result_assets"] == ["cover_image"]
executor_coverage = data["executor_coverage"]
assert executor_coverage["scope"] == "admin_internal_job_executor_coverage"
assert executor_coverage["total_runs"] == 1
assert executor_coverage["total_planned_tasks"] == 3
assert executor_coverage["total_executed_tasks"] == 1
assert executor_coverage["total_ignored_tasks"] == 2
assert executor_coverage["coverage_ratio"] == 0.3333
assert executor_coverage["job_count"] == 1
assert executor_coverage["story_count"] == 1
assert executor_coverage["user_count"] == 1
assert executor_coverage["by_plan_mode"] == [
{"plan_mode": "asset_generation", "count": 1}
]
assert executor_coverage["by_output_mode"] == [
{"output_mode": "story", "count": 1}
]
assert executor_coverage["executed_task_keys"] == [
{"task_key": "complete_image_asset", "count": 1}
]
assert executor_coverage["ignored_task_keys"] == [
{"task_key": "complete_asset_generation", "count": 1},
{"task_key": "start_asset_generation", "count": 1},
]
assert executor_coverage["result_assets"] == [
{"asset": "cover_image", "count": 1}
]
async def test_admin_generation_job_trace_requires_admin_auth(db_session):
admin_app = _build_admin_auth_required_test_app(db_session)
transport = ASGITransport(app=admin_app)
async with AsyncClient(transport=transport, base_url="http://test") as client:
response = await client.get("/admin/generations/jobs/missing-job/trace")
assert response.status_code == 401
async def test_admin_executor_coverage_aggregates_internal_events(
db_session,
test_user,
):
story = await _create_story(db_session, user_id=test_user.id, title="执行器覆盖故事")
asset_job = await create_generation_job(
db_session,
user_id=test_user.id,
output_mode="asset_generation",
input_type="audio,image",
request_payload={"story_id": story.id, "assets": ["audio", "image"]},
story_id=story.id,
)
await record_generation_event(
db_session,
job=asset_job,
story_id=story.id,
event_type="executor_completed",
status="succeeded",
metadata={
"plan_mode": "asset_generation",
"planned_task_count": 4,
"executed_task_count": 2,
"ignored_task_count": 2,
"executed_task_keys": ["complete_audio_asset", "complete_image_asset"],
"ignored_task_keys": [
"start_asset_generation",
"complete_asset_generation",
],
"result_assets": ["audio", "cover_image"],
},
)
retry_job = await create_generation_job(
db_session,
user_id=test_user.id,
output_mode="asset_retry",
input_type="image",
request_payload={"story_id": story.id, "assets": ["image"]},
story_id=story.id,
)
await record_generation_event(
db_session,
job=retry_job,
story_id=story.id,
event_type="executor_completed",
status="succeeded",
metadata={
"plan_mode": "asset_retry",
"planned_task_count": 3,
"executed_task_count": 1,
"ignored_task_count": 2,
"executed_task_keys": ["complete_image_asset"],
"ignored_task_keys": ["start_asset_retry", "complete_asset_retry"],
"result_assets": ["cover_image"],
},
)
admin_app = _build_admin_test_app(db_session)
transport = ASGITransport(app=admin_app)
async with AsyncClient(transport=transport, base_url="http://test") as client:
response = await client.get("/admin/executors/coverage")
assert response.status_code == 200
data = response.json()
assert data["scope"] == "admin_internal_executor_coverage"
assert data["total_runs"] == 2
assert data["total_planned_tasks"] == 7
assert data["total_executed_tasks"] == 3
assert data["total_ignored_tasks"] == 4
assert data["coverage_ratio"] == 0.4286
assert data["job_count"] == 2
assert data["story_count"] == 1
assert data["user_count"] == 1
assert data["by_plan_mode"] == [
{"plan_mode": "asset_generation", "count": 1},
{"plan_mode": "asset_retry", "count": 1},
]
assert data["executed_task_keys"] == [
{"task_key": "complete_image_asset", "count": 2},
{"task_key": "complete_audio_asset", "count": 1},
]
assert data["result_assets"] == [
{"asset": "cover_image", "count": 2},
{"asset": "audio", "count": 1},
]
response = await client.get("/admin/executors/coverage?plan_mode=asset_retry")
assert response.status_code == 200
data = response.json()
assert data["plan_mode"] == "asset_retry"
assert data["total_runs"] == 1
assert data["total_planned_tasks"] == 3
assert data["total_executed_tasks"] == 1
response = await client.get("/admin/executors/coverage?plan_mode=story")
assert response.status_code == 422
async def test_admin_executor_coverage_requires_admin_auth(db_session):
admin_app = _build_admin_auth_required_test_app(db_session)
transport = ASGITransport(app=admin_app)
async with AsyncClient(transport=transport, base_url="http://test") as client:
response = await client.get("/admin/executors/coverage")
assert response.status_code == 401
async def test_admin_harness_readiness_returns_ready_when_internal_gates_pass(
db_session,
test_user,
):
story = await _create_story(db_session, user_id=test_user.id, title="readiness 故事")
await _record_evaluation_event(
db_session,
user_id=test_user.id,
story_id=story.id,
output_mode="story",
artifact="story_text",
status="succeeded",
metadata={
"overall_score": 0.92,
"passed": True,
"blocking": False,
"scores": [
{"dimension": "structure", "score": 1.0, "reason": "内部 reason"},
{"dimension": "readability", "score": 0.84, "reason": "内部 reason"},
],
"warnings": [],
},
)
asset_job = await create_generation_job(
db_session,
user_id=test_user.id,
output_mode="asset_generation",
input_type="image",
request_payload={"story_id": story.id, "assets": ["image"]},
story_id=story.id,
)
await record_generation_event(
db_session,
job=asset_job,
story_id=story.id,
event_type="executor_completed",
status="succeeded",
metadata={
"plan_mode": "asset_generation",
"planned_task_count": 3,
"executed_task_count": 1,
"ignored_task_count": 2,
"executed_task_keys": ["complete_image_asset"],
"ignored_task_keys": [
"start_asset_generation",
"complete_asset_generation",
],
"result_assets": ["cover_image"],
},
)
admin_app = _build_admin_test_app(db_session)
transport = ASGITransport(app=admin_app)
async with AsyncClient(transport=transport, base_url="http://test") as client:
response = await client.get("/admin/harness/readiness")
assert response.status_code == 200
data = response.json()
assert data["scope"] == "admin_internal_harness_readiness"
assert data["status"] == "ready"
assert data["thresholds"] == {
"min_runtime_evaluations": 1,
"min_executor_runs": 1,
"min_evaluation_pass_rate": 0.7,
"min_evaluation_average_score": 0.7,
"min_executor_coverage_ratio": 0.2,
}
assert {check["code"]: check["status"] for check in data["checks"]} == {
"golden_replay": "ready",
"runtime_evaluation_samples": "ready",
"runtime_evaluation_quality": "ready",
"executor_coverage_samples": "ready",
"executor_coverage_ratio": "ready",
}
assert data["golden_replay"]["passed"] is True
assert data["golden_replay"]["total_cases"] == 11
assert data["evaluation_analytics"]["total_evaluations"] == 1
assert data["evaluation_analytics"]["pass_rate"] == 1.0
assert data["executor_coverage"]["total_runs"] == 1
assert data["executor_coverage"]["coverage_ratio"] == 0.3333
assert "内部 reason" not in str(data)
assert "readiness 故事" not in str(data)
async def test_admin_harness_readiness_blocks_low_runtime_quality(
db_session,
test_user,
):
story = await _create_story(db_session, user_id=test_user.id, title="低质量 readiness")
await _record_evaluation_event(
db_session,
user_id=test_user.id,
story_id=story.id,
output_mode="story",
artifact="story_text",
status="failed",
metadata={
"overall_score": 0.0,
"passed": False,
"blocking": True,
"scores": [{"dimension": "structure", "score": 0.0, "reason": "缺失"}],
"quality_gate": {
"issues": [
{
"code": "missing_story_text",
"message": "正文缺失",
"failure_category": "schema_error",
"field": "story_text",
}
]
},
"warnings": [],
},
)
admin_app = _build_admin_test_app(db_session)
transport = ASGITransport(app=admin_app)
async with AsyncClient(transport=transport, base_url="http://test") as client:
response = await client.get("/admin/harness/readiness")
assert response.status_code == 200
data = response.json()
assert data["status"] == "blocked"
checks = {check["code"]: check for check in data["checks"]}
assert checks["golden_replay"]["status"] == "ready"
assert checks["runtime_evaluation_samples"]["status"] == "ready"
assert checks["runtime_evaluation_quality"]["status"] == "blocked"
assert checks["executor_coverage_samples"]["status"] == "needs_attention"
assert checks["executor_coverage_ratio"]["status"] == "needs_attention"
assert data["evaluation_analytics"]["blocked_evaluations"] == 1
assert data["executor_coverage"]["total_runs"] == 0
assert "正文缺失" not in str(data)
assert "低质量 readiness" not in str(data)
async def test_admin_harness_readiness_requires_admin_auth(db_session):
admin_app = _build_admin_auth_required_test_app(db_session)
transport = ASGITransport(app=admin_app)
async with AsyncClient(transport=transport, base_url="http://test") as client:
response = await client.get("/admin/harness/readiness")
assert response.status_code == 401
async def test_admin_provider_analytics_support_days_and_capability_filters(
db_session,
test_user,

View File

@@ -123,14 +123,19 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"evaluation_completed",
"narrative_generated",
"story_saved",
"generation_completed",
]
assert events[2].event_metadata["has_memory_context"] is False
assert events[3].event_metadata["title"] == "小兔子的冒险"
assert events[4].story_id == job.story_id
assert events[2].event_metadata["plan"]["mode"] == "story"
assert events[3].event_metadata["has_memory_context"] is False
assert events[4].event_metadata["passed"] is True
assert events[4].event_metadata["overall_score"] >= 0.7
assert events[5].event_metadata["title"] == "小兔子的冒险"
assert events[6].story_id == job.story_id
detail_response = await client.get(f"/api/generations/jobs/{job.id}")
assert detail_response.status_code == 200
@@ -143,11 +148,16 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event
assert [event["event_type"] for event in detail["events"]] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"narrative_generated",
"story_saved",
"generation_completed",
]
assert all(
event["event_type"] != "evaluation_completed"
for event in detail["events"]
)
story_response = await client.get(f"/api/generations/{job.story_id}")
assert story_response.status_code == 200
@@ -161,6 +171,13 @@ async def test_unified_generation_is_queued_then_worker_persists_story_and_event
assert [item["id"] for item in job_list] == [job.id]
assert job_list[0]["progress_percent"] == 100
assert job_list[0]["is_terminal"] is True
trace_response = await client.get(
f"/api/generations/{job.story_id}/trace-summary"
)
assert trace_response.status_code == 200
trace = trace_response.json()
assert "evaluation" not in trace
finally:
app.dependency_overrides.clear()
@@ -220,13 +237,88 @@ async def test_generation_worker_records_quality_gate_failure_without_persisting
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"quality_gate_failed",
"evaluation_completed",
"generation_failed",
]
quality_event = events[3]
quality_event = events[4]
assert quality_event.event_metadata["step"] == "narrative_generation"
assert quality_event.event_metadata["issues"][0]["code"] == "missing_story_text"
evaluation_event = events[5]
assert evaluation_event.event_metadata["step"] == "evaluation"
assert evaluation_event.event_metadata["passed"] is False
assert evaluation_event.event_metadata["blocking"] is True
async def test_story_with_images_worker_records_plan_before_assets(
db_session,
test_user,
mock_text_provider,
mock_image_provider,
):
job = await create_generation_job(
db_session,
user_id=test_user.id,
output_mode="story",
input_type="keywords",
request_payload={
"output_mode": "story",
"type": "keywords",
"data": "小兔子, 森林",
"generate_images": True,
},
)
await run_generation_job_service(job.id, db_session)
refreshed_job = (
await db_session.execute(select(GenerationJob).where(GenerationJob.id == job.id))
).scalar_one()
assert refreshed_job.story_id is not None
assert refreshed_job.status == "completed"
assert refreshed_job.current_step == "generation_completed"
assert refreshed_job.result_snapshot["image_status"] == "ready"
events = (
await db_session.execute(
select(GenerationJobEvent)
.where(GenerationJobEvent.job_id == job.id)
.order_by(GenerationJobEvent.id)
)
).scalars().all()
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"evaluation_completed",
"narrative_generated",
"story_saved",
"cover_image_started",
"cover_image_succeeded",
"generation_completed",
]
plan = events[2].event_metadata["plan"]
assert plan["mode"] == "story_with_assets"
assert [task["key"] for task in plan["tasks"]] == [
"prepare_context",
"generate_narrative",
"evaluate_narrative",
"persist_story",
"generate_cover_image",
"queue_postprocessing",
"complete_generation",
]
cover_task = next(task for task in plan["tasks"] if task["key"] == "generate_cover_image")
assert cover_task["required"] is False
assert cover_task["recoverable"] is True
assert events[4].event_metadata["passed"] is True
assert events[8].event_metadata["asset"] == "cover_image"
mock_text_provider.assert_called_once()
mock_image_provider.assert_called_once()
async def test_asset_retry_records_job_events_and_updates_retryable_assets(
@@ -279,12 +371,30 @@ async def test_asset_retry_records_job_events_and_updates_retryable_assets(
).scalars().all()
assert [event.event_type for event in events] == [
"request_accepted",
"workflow_planned",
"asset_retry_started",
"cover_image_started",
"cover_image_succeeded",
"executor_completed",
"asset_retry_completed",
]
assert events[3].event_metadata["asset"] == "cover_image"
plan = events[1].event_metadata["plan"]
assert plan["mode"] == "asset_retry"
assert [task["key"] for task in plan["tasks"]] == [
"start_asset_retry",
"complete_image_asset",
"complete_asset_retry",
]
image_task = next(
task for task in plan["tasks"] if task["key"] == "complete_image_asset"
)
assert image_task["required"] is False
assert image_task["recoverable"] is True
assert events[4].event_metadata["asset"] == "cover_image"
assert events[5].event_metadata["plan_mode"] == "asset_retry"
assert events[5].event_metadata["executed_task_keys"] == [
"complete_image_asset"
]
finally:
app.dependency_overrides.clear()
@@ -365,10 +475,110 @@ async def test_asset_generation_job_worker_completes_cover_image(
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"cover_image_started",
"cover_image_succeeded",
"executor_completed",
"asset_generation_completed",
]
plan = events[2].event_metadata["plan"]
assert plan["mode"] == "asset_generation"
assert [task["key"] for task in plan["tasks"]] == [
"start_asset_generation",
"complete_image_asset",
"complete_asset_generation",
]
image_task = next(
task for task in plan["tasks"] if task["key"] == "complete_image_asset"
)
assert image_task["required"] is False
assert image_task["recoverable"] is True
executor_event = events[5]
assert executor_event.event_metadata["plan_mode"] == "asset_generation"
assert executor_event.event_metadata["executed_task_keys"] == [
"complete_image_asset"
]
assert executor_event.event_metadata["ignored_task_keys"] == [
"start_asset_generation",
"complete_asset_generation",
]
assert executor_event.event_metadata["result_assets"] == ["cover_image"]
async def test_asset_generation_job_worker_executes_assets_in_plan_order(
db_session,
test_story,
mock_tts_provider,
):
job = await create_generation_job(
db_session,
user_id=test_story.user_id,
output_mode="asset_generation",
input_type="audio,image",
request_payload={"story_id": test_story.id, "assets": ["audio", "image"]},
story_id=test_story.id,
)
with patch(
"app.services.story_service.generate_image",
new_callable=AsyncMock,
) as mock_generate_image:
mock_generate_image.return_value = "https://example.com/plan-cover.png"
await run_generation_job_service(job.id, db_session)
refreshed_job = (
await db_session.execute(select(GenerationJob).where(GenerationJob.id == job.id))
).scalar_one()
assert refreshed_job.status == "completed"
assert refreshed_job.current_step == "asset_generation_completed"
assert refreshed_job.result_snapshot["image_status"] == "ready"
assert refreshed_job.result_snapshot["audio_status"] == "ready"
story = (
await db_session.execute(
select(Story).where(Story.id == test_story.id)
)
).scalar_one()
assert story.image_url == "https://example.com/plan-cover.png"
assert story.audio_status == "ready"
assert story.audio_path is not None
events = (
await db_session.execute(
select(GenerationJobEvent)
.where(GenerationJobEvent.job_id == job.id)
.order_by(GenerationJobEvent.id)
)
).scalars().all()
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"audio_started",
"audio_succeeded",
"cover_image_started",
"cover_image_succeeded",
"executor_completed",
"asset_generation_completed",
]
plan = events[2].event_metadata["plan"]
assert plan["mode"] == "asset_generation"
assert [task["key"] for task in plan["tasks"]] == [
"start_asset_generation",
"complete_audio_asset",
"complete_image_asset",
"complete_asset_generation",
]
assert events[4].event_metadata["asset"] == "audio"
assert events[6].event_metadata["asset"] == "cover_image"
assert events[7].event_metadata["executed_task_keys"] == [
"complete_audio_asset",
"complete_image_asset",
]
assert events[7].event_metadata["result_assets"] == ["audio", "cover_image"]
mock_tts_provider.assert_awaited_once()
mock_generate_image.assert_awaited_once()
async def test_cancel_queued_asset_generation_job_marks_it_canceled(
@@ -538,7 +748,9 @@ async def test_storybook_generation_is_queued_then_worker_records_page_image_eve
assert [event.event_type for event in events] == [
"request_accepted",
"worker_started",
"workflow_planned",
"context_prepared",
"evaluation_completed",
"narrative_generated",
"storybook_images_started",
"storybook_cover_image_succeeded",
@@ -548,13 +760,45 @@ async def test_storybook_generation_is_queued_then_worker_records_page_image_eve
"story_saved",
"generation_completed",
]
plan = events[2].event_metadata["plan"]
assert plan["mode"] == "storybook"
assert [task["key"] for task in plan["tasks"]] == [
"prepare_context",
"generate_storybook_pages",
"evaluate_storybook_pages",
"generate_storybook_images",
"persist_storybook",
"queue_postprocessing",
"complete_generation",
]
image_task = next(
task
for task in plan["tasks"]
if task["key"] == "generate_storybook_images"
)
assert image_task["required"] is False
assert image_task["recoverable"] is True
assert events[4].event_metadata["passed"] is True
assert events[4].event_metadata["artifact"] == "storybook_pages"
page_events = [
event
for event in events
if event.event_type == "storybook_page_image_succeeded"
]
assert [event.event_metadata["page_number"] for event in page_events] == [1, 2]
assert events[8].event_metadata["completed_pages"] == [1, 2]
assert events[10].event_metadata["completed_pages"] == [1, 2]
async with AsyncClient(transport=transport, base_url="http://test") as client:
client.cookies.set("access_token", auth_token)
detail_response = await client.get(
f"/api/generations/jobs/{job.id}"
)
assert detail_response.status_code == 200
detail = detail_response.json()
assert "evaluation_completed" not in [
event["event_type"] for event in detail["events"]
]
finally:
app.dependency_overrides.clear()
@@ -716,6 +960,414 @@ async def test_story_provider_stats_aggregate_job_events(
app.dependency_overrides.clear()
async def test_story_trace_summary_aggregates_steps_artifacts_and_failure_categories(
db_session,
auth_token,
degraded_story_with_text,
):
async def override_get_db():
yield db_session
app.dependency_overrides[get_db] = override_get_db
job = await create_generation_job(
db_session,
user_id=degraded_story_with_text.user_id,
output_mode="asset_retry",
input_type="image",
request_payload={"assets": ["image"]},
story_id=degraded_story_with_text.id,
)
await record_generation_event(
db_session,
job=job,
story_id=degraded_story_with_text.id,
event_type="cover_image_started",
status="running",
metadata={
"step": "image_generation",
"artifact": "cover_image",
"failure_category": None,
},
)
await record_generation_event(
db_session,
job=job,
story_id=degraded_story_with_text.id,
event_type="cover_image_failed",
status="failed",
metadata={
"step": "image_generation",
"artifact": "cover_image",
"failure_category": "provider_error",
},
)
await record_generation_event(
db_session,
job=job,
story_id=degraded_story_with_text.id,
event_type="quality_gate_failed",
status="failed",
metadata={
"step": "narrative_generation",
"artifact": "story_text",
"failure_category": "schema_error",
},
)
await record_generation_event(
db_session,
job=job,
story_id=degraded_story_with_text.id,
event_type="evaluation_completed",
status="failed",
metadata={
"step": "evaluation",
"artifact": "story_text",
"failure_category": "schema_error",
"overall_score": 0.0,
"passed": False,
"blocking": True,
"scores": [
{
"dimension": "structure",
"score": 0.0,
"reason": "故事结构未通过质量门。",
},
{
"dimension": "safety",
"score": 0.0,
"reason": "内容未通过儿童安全或结构完整性检查。",
},
],
},
)
transport = ASGITransport(app=app)
try:
async with AsyncClient(transport=transport, base_url="http://test") as client:
client.cookies.set("access_token", auth_token)
response = await client.get(
f"/api/generations/{degraded_story_with_text.id}/trace-summary"
)
assert response.status_code == 200
data = response.json()
assert data["story_id"] == degraded_story_with_text.id
assert data["total_events"] == 4
assert data["failed_events"] == 2
assert data["by_step"] == [
{"name": "image_generation", "count": 2},
{"name": "narrative_generation", "count": 1},
]
assert data["by_artifact"] == [
{"name": "cover_image", "count": 2},
{"name": "story_text", "count": 1},
]
assert data["failure_categories"] == [
{"name": "provider_error", "count": 1},
{"name": "schema_error", "count": 1},
]
assert "evaluation" not in data
assert "overall_score" not in str(data)
finally:
app.dependency_overrides.clear()
async def test_user_generation_job_detail_hides_internal_evaluation_step(
db_session,
auth_token,
test_user,
):
async def override_get_db():
yield db_session
app.dependency_overrides[get_db] = override_get_db
transport = ASGITransport(app=app)
job = await create_generation_job(
db_session,
user_id=test_user.id,
output_mode="story",
input_type="keywords",
request_payload={
"output_mode": "story",
"type": "keywords",
"data": "小兔子",
"generate_images": False,
},
)
await record_generation_event(
db_session,
job=job,
event_type="evaluation_completed",
status="succeeded",
metadata={
"step": "evaluation",
"artifact": "story_text",
"overall_score": 0.96,
"passed": True,
"blocking": False,
"scores": [
{"dimension": "structure", "score": 1.0, "reason": "完整。"},
],
},
)
try:
async with AsyncClient(transport=transport, base_url="http://test") as client:
client.cookies.set("access_token", auth_token)
response = await client.get(f"/api/generations/jobs/{job.id}")
assert response.status_code == 200
data = response.json()
assert data["current_step"] == "narrative_generated"
assert data["progress_label"] == "正文已生成"
assert [event["event_type"] for event in data["events"]] == [
"request_accepted"
]
assert "evaluation_completed" not in str(data)
assert "overall_score" not in str(data)
finally:
app.dependency_overrides.clear()
async def test_user_generation_job_detail_sanitizes_request_payload(
db_session,
auth_token,
test_user,
):
async def override_get_db():
yield db_session
app.dependency_overrides[get_db] = override_get_db
transport = ASGITransport(app=app)
job = await create_generation_job(
db_session,
user_id=test_user.id,
output_mode="story",
input_type="keywords",
request_payload={
"output_mode": "story",
"input_type": "keywords",
"type": "keywords",
"data": "不要回传原始关键词",
"education_theme": "勇气",
"generate_images": True,
"page_count": 6,
"child_profile_id": "child-public-id",
"universe_id": "universe-public-id",
"internal_dispatch_token": "secret-dispatch-token",
"provider_override": "internal-provider",
"evaluation_policy": {"threshold": 0.9},
},
)
try:
async with AsyncClient(transport=transport, base_url="http://test") as client:
client.cookies.set("access_token", auth_token)
response = await client.get(f"/api/generations/jobs/{job.id}")
assert response.status_code == 200
data = response.json()
assert data["request_payload"] == {
"child_profile_id": "child-public-id",
"generate_images": True,
"input_type": "keywords",
"output_mode": "story",
"page_count": 6,
"type": "keywords",
"universe_id": "universe-public-id",
}
payload_dump = str(data["request_payload"])
assert "不要回传原始关键词" not in payload_dump
assert "education_theme" not in payload_dump
assert "secret-dispatch-token" not in payload_dump
assert "internal-provider" not in payload_dump
assert "evaluation_policy" not in payload_dump
finally:
app.dependency_overrides.clear()
async def test_user_generation_job_detail_sanitizes_public_event_metadata(
db_session,
auth_token,
degraded_story_with_text,
):
async def override_get_db():
yield db_session
app.dependency_overrides[get_db] = override_get_db
transport = ASGITransport(app=app)
job = await create_generation_job(
db_session,
user_id=degraded_story_with_text.user_id,
output_mode="asset_generation",
input_type="image",
request_payload={"story_id": degraded_story_with_text.id, "assets": ["image"]},
story_id=degraded_story_with_text.id,
)
await record_generation_event(
db_session,
job=job,
story_id=degraded_story_with_text.id,
event_type="workflow_planned",
status="succeeded",
metadata={
"step": "request_acceptance",
"artifact": "none",
"plan": {
"mode": "asset_generation",
"tasks": [
{
"key": "complete_image_asset",
"step": "image_generation",
"artifact": "image",
"required": False,
"recoverable": True,
}
],
},
"internal_threshold": 0.72,
},
)
await record_generation_event(
db_session,
job=job,
story_id=degraded_story_with_text.id,
event_type="asset_generation_completed",
status="completed",
metadata={
"assets": ["image"],
"result_snapshot": {
"story_id": degraded_story_with_text.id,
"last_error": "internal provider detail",
},
"error": "internal provider detail",
},
)
await record_generation_event(
db_session,
job=job,
story_id=degraded_story_with_text.id,
event_type="executor_completed",
status="succeeded",
metadata={
"plan_mode": "asset_generation",
"planned_task_count": 3,
"executed_task_keys": ["complete_image_asset"],
"ignored_task_keys": [
"start_asset_generation",
"complete_asset_generation",
],
"result_assets": ["cover_image"],
},
)
try:
async with AsyncClient(transport=transport, base_url="http://test") as client:
client.cookies.set("access_token", auth_token)
response = await client.get(f"/api/generations/jobs/{job.id}")
assert response.status_code == 200
data = response.json()
workflow_event = next(
event for event in data["events"] if event["event_type"] == "workflow_planned"
)
assert workflow_event["event_metadata"] == {
"artifact": "none",
"plan_mode": "asset_generation",
"planned_task_count": 1,
"recoverable_task_count": 1,
"step": "request_acceptance",
}
completion_event = next(
event
for event in data["events"]
if event["event_type"] == "asset_generation_completed"
)
assert completion_event["event_metadata"] == {"assets": ["image"]}
assert "plan" not in workflow_event["event_metadata"]
assert "tasks" not in str(data["events"])
assert "internal_threshold" not in str(data["events"])
assert "result_snapshot" not in str(data["events"])
assert "internal provider detail" not in str(data["events"])
assert "executor_completed" not in str(data["events"])
assert "complete_image_asset" not in str(data["events"])
finally:
app.dependency_overrides.clear()
async def test_user_generation_job_summary_hides_internal_executor_step(
db_session,
auth_token,
degraded_story_with_text,
):
async def override_get_db():
yield db_session
app.dependency_overrides[get_db] = override_get_db
transport = ASGITransport(app=app)
job = await create_generation_job(
db_session,
user_id=degraded_story_with_text.user_id,
output_mode="asset_generation",
input_type="image",
request_payload={"story_id": degraded_story_with_text.id, "assets": ["image"]},
story_id=degraded_story_with_text.id,
)
await record_generation_event(
db_session,
job=job,
story_id=degraded_story_with_text.id,
event_type="executor_completed",
status="succeeded",
metadata={
"plan_mode": "asset_generation",
"executed_task_keys": ["complete_image_asset"],
},
)
try:
async with AsyncClient(transport=transport, base_url="http://test") as client:
client.cookies.set("access_token", auth_token)
detail_response = await client.get(f"/api/generations/jobs/{job.id}")
list_response = await client.get(
f"/api/generations/{degraded_story_with_text.id}/jobs"
)
trace_summary_response = await client.get(
f"/api/generations/{degraded_story_with_text.id}/trace-summary"
)
assert detail_response.status_code == 200
detail = detail_response.json()
assert detail["current_step"] == "workflow_planned"
assert detail["progress_label"] == "工作流已规划"
assert "executor_completed" not in str(detail)
assert "complete_image_asset" not in str(detail)
assert list_response.status_code == 200
listed_job = next(item for item in list_response.json() if item["id"] == job.id)
assert listed_job["current_step"] == "workflow_planned"
assert listed_job["progress_label"] == "工作流已规划"
assert trace_summary_response.status_code == 200
trace_summary = trace_summary_response.json()
assert "executor_completed" not in str(trace_summary)
assert "complete_image_asset" not in str(trace_summary)
assert trace_summary["total_events"] == 1
finally:
app.dependency_overrides.clear()
async def test_user_provider_analytics_aggregate_across_stories(
db_session,
auth_token,

View File

@@ -1,5 +1,7 @@
"""Tests for generation harness runtime support."""
from pathlib import Path
import pytest
from sqlalchemy import select
@@ -7,8 +9,21 @@ from app.db.models import GenerationJob, GenerationJobEvent
from app.services.adapters.storybook.primary import Storybook, StorybookPage
from app.services.adapters.text.models import StoryOutput
from app.services.generation_jobs import create_generation_job, record_generation_event
from app.services.harness.artifacts import AssetCompletionResult
from app.services.harness.control import ExecutionControl, GenerationJobCanceledError
from app.services.harness.evaluation_replay import (
EvaluationReplayArtifact,
EvaluationReplayCase,
ExpectedEvaluation,
replay_evaluation_golden_cases,
run_evaluation_replay_cases,
)
from app.services.harness.evaluators import evaluate_story_output, evaluate_storybook_output
from app.services.harness.executor import run_asset_plan
from app.services.harness.plans import (
WorkflowMode,
WorkflowPlan,
WorkflowTask,
build_asset_plan,
build_story_plan,
build_storybook_plan,
@@ -27,12 +42,18 @@ from app.services.harness.types import (
normalize_trace_metadata,
step_for_event,
)
from app.services.story_status import StoryAssetStatus
FIXTURES_DIR = (
Path(__file__).parents[1] / "app" / "services" / "harness" / "fixtures"
)
def test_event_type_maps_to_standard_workflow_step():
assert step_for_event("request_accepted") == WorkflowStep.REQUEST_ACCEPTANCE
assert step_for_event("context_prepared") == WorkflowStep.CONTEXT_PREPARATION
assert step_for_event("narrative_generated") == WorkflowStep.NARRATIVE_GENERATION
assert step_for_event("evaluation_completed") == WorkflowStep.EVALUATION
assert step_for_event("story_saved") == WorkflowStep.STORY_PERSISTENCE
assert step_for_event("provider_call_succeeded") == WorkflowStep.PROVIDER_INVOCATION
assert step_for_event("quality_gate_failed") == WorkflowStep.NARRATIVE_GENERATION
@@ -46,6 +67,7 @@ def test_event_type_maps_to_standard_workflow_step():
def test_event_type_maps_to_standard_artifact():
assert artifact_for_event("narrative_generated") == ArtifactKind.STORY_TEXT
assert artifact_for_event("quality_gate_failed") == ArtifactKind.STORY_TEXT
assert artifact_for_event("evaluation_completed") == ArtifactKind.STORY_TEXT
assert artifact_for_event("cover_image_succeeded") == ArtifactKind.COVER_IMAGE
assert artifact_for_event("storybook_page_image_failed") == ArtifactKind.PAGE_IMAGE
assert artifact_for_event("audio_cache_hit") == ArtifactKind.AUDIO
@@ -108,6 +130,13 @@ def test_story_plan_without_assets_snapshot():
"required": True,
"recoverable": False,
},
{
"key": "evaluate_narrative",
"step": "evaluation",
"artifact": "story_text",
"required": True,
"recoverable": False,
},
{
"key": "persist_story",
"step": "story_persistence",
@@ -137,7 +166,7 @@ def test_story_plan_with_assets_marks_cover_recoverable():
plan = build_story_plan(generate_images=True).to_snapshot()
assert plan["mode"] == "story_with_assets"
assert plan["tasks"][3] == {
assert plan["tasks"][4] == {
"key": "generate_cover_image",
"step": "image_generation",
"artifact": "cover_image",
@@ -153,13 +182,14 @@ def test_storybook_plan_with_images_marks_storybook_images_recoverable():
assert [task["key"] for task in plan["tasks"]] == [
"prepare_context",
"generate_storybook_pages",
"evaluate_storybook_pages",
"generate_storybook_images",
"persist_storybook",
"queue_postprocessing",
"complete_generation",
]
assert plan["tasks"][2]["artifact"] == "image"
assert plan["tasks"][2]["recoverable"] is True
assert plan["tasks"][3]["artifact"] == "image"
assert plan["tasks"][3]["recoverable"] is True
def test_asset_retry_plan_deduplicates_assets():
@@ -200,6 +230,86 @@ def test_asset_retry_plan_deduplicates_assets():
}
@pytest.mark.asyncio
async def test_run_asset_plan_executes_asset_tasks_in_plan_order():
calls: list[str] = []
async def image_task() -> AssetCompletionResult:
calls.append("image")
return AssetCompletionResult(
asset="cover_image",
status=StoryAssetStatus.READY,
value="https://example.com/cover.png",
)
async def audio_task() -> AssetCompletionResult:
calls.append("audio")
return AssetCompletionResult(
asset="audio",
status=StoryAssetStatus.READY,
value=b"audio",
)
result = await run_asset_plan(
build_asset_plan(output_mode="asset_generation", assets=["audio", "image"]),
image_task=image_task,
audio_task=audio_task,
)
assert calls == ["audio", "image"]
assert result.executed_task_keys == ("complete_audio_asset", "complete_image_asset")
assert result.ignored_task_keys == (
"start_asset_generation",
"complete_asset_generation",
)
assert [item.asset for item in result.task_results] == ["audio", "cover_image"]
@pytest.mark.asyncio
async def test_run_asset_plan_ignores_unknown_non_asset_tasks():
calls: list[str] = []
plan = WorkflowPlan(
mode=WorkflowMode.ASSET_RETRY,
tasks=(
WorkflowTask(
key="start_asset_retry",
step=WorkflowStep.ASSET_RETRY,
artifact=ArtifactKind.NONE,
),
WorkflowTask(
key="complete_video_asset",
step=WorkflowStep.UNKNOWN,
artifact=ArtifactKind.UNKNOWN,
required=False,
recoverable=True,
),
WorkflowTask(
key="complete_asset_retry",
step=WorkflowStep.ASSET_RETRY,
artifact=ArtifactKind.NONE,
),
),
)
async def image_task() -> AssetCompletionResult:
calls.append("image")
return AssetCompletionResult(
asset="cover_image",
status=StoryAssetStatus.READY,
)
result = await run_asset_plan(plan, image_task=image_task)
assert calls == []
assert result.task_results == ()
assert result.executed_task_keys == ()
assert result.ignored_task_keys == (
"start_asset_retry",
"complete_video_asset",
"complete_asset_retry",
)
def test_story_quality_gate_accepts_complete_child_safe_story():
validate_story_output(
StoryOutput(
@@ -211,6 +321,166 @@ def test_story_quality_gate_accepts_complete_child_safe_story():
)
def test_story_evaluator_scores_complete_child_safe_story():
result = evaluate_story_output(
StoryOutput(
mode="generated",
title="小兔子的月光花园",
story_text="小兔子在花园里学会了和朋友轮流分享水壶,也学会了复盘今天的努力。",
cover_prompt_suggestion="A gentle moonlit garden with a rabbit",
),
education_theme="复盘",
)
assert result.passed is True
assert result.blocking is False
assert result.overall_score >= 0.9
assert result.to_metadata()["scores"][0]["dimension"] == "structure"
def test_story_evaluator_blocks_quality_gate_failure():
result = evaluate_story_output(
StoryOutput(
mode="generated",
title="空白故事",
story_text="",
cover_prompt_suggestion="A cover",
)
)
assert result.passed is False
assert result.blocking is True
assert result.overall_score == 0.0
assert result.gate_error is not None
assert result.to_metadata()["quality_gate"]["issues"][0]["code"] == "missing_story_text"
def test_storybook_evaluator_scores_complete_child_safe_storybook():
result = evaluate_storybook_output(
Storybook(
title="森林里的复盘星星",
main_character="小兔子露露",
art_style="温暖水彩",
cover_prompt="A warm watercolor forest cover",
pages=[
StorybookPage(
page_number=1,
text="露露在森林里发现一颗会提醒她复盘的小星星。",
image_prompt="Lulu finds a star",
),
StorybookPage(
page_number=2,
text="她回想今天的努力,学会下次先和朋友商量。",
image_prompt="Lulu thinking with friends",
),
],
),
education_theme="复盘",
)
assert result.passed is True
assert result.blocking is False
assert result.overall_score >= 0.9
def test_storybook_evaluator_blocks_quality_gate_failure():
result = evaluate_storybook_output(
Storybook(
title="森林绘本",
main_character="小兔子",
art_style="水彩",
cover_prompt="A forest cover",
pages=[
StorybookPage(page_number=1, text="第一页。", image_prompt="page 1"),
StorybookPage(page_number=1, text="第二页。", image_prompt="page 2"),
],
)
)
assert result.passed is False
assert result.blocking is True
assert result.gate_error is not None
assert result.to_metadata()["quality_gate"]["issues"][0]["code"] == (
"invalid_storybook_page_number"
)
def test_evaluation_golden_cases_replay_successfully():
result = replay_evaluation_golden_cases(
FIXTURES_DIR / "evaluation_golden_cases.json"
)
assert result.passed is True, result.failure_report()
assert result.failed_case_ids == ()
assert len(result.cases) == 11
assert {
case.artifact
for case in result.cases
} == {
EvaluationReplayArtifact.STORY,
EvaluationReplayArtifact.STORYBOOK,
}
def test_evaluation_golden_cases_report_internal_coverage_summary():
result = replay_evaluation_golden_cases(
FIXTURES_DIR / "evaluation_golden_cases.json"
)
summary = result.coverage_summary()
assert summary["artifact"] == {
"storybook": 5,
"story": 6,
}
assert summary["age_band"] == {
"3-4": 4,
"5-6": 4,
"unknown": 2,
"7-8": 1,
}
assert summary["risk_area"] == {
"schema_error": 4,
"happy_path": 2,
"readability_warning": 2,
"safety_error": 2,
"length_boundary": 1,
}
assert summary["outcome"] == {
"blocked": 8,
"passed": 3,
}
assert summary["tags"]["story"] == 6
assert summary["tags"]["storybook"] == 5
assert summary["tags"]["blocking"] == 6
assert summary["tags"]["threshold_block"] == 2
def test_evaluation_replay_reports_expectation_mismatch():
case = EvaluationReplayCase(
case_id="expectation-mismatch",
artifact=EvaluationReplayArtifact.STORY,
input_payload={"keywords": "小兔子"},
output_payload={
"mode": "generated",
"title": "小兔子的花园",
"story_text": "小兔子学会了和朋友分享水壶。",
"cover_prompt_suggestion": "A rabbit sharing a watering can",
},
expected=ExpectedEvaluation(
passed=True,
blocking=False,
min_overall_score=0.99,
),
)
result = run_evaluation_replay_cases([case])
assert result.passed is False
assert result.failed_case_ids == ("expectation-mismatch",)
assert "expected overall_score >=" in result.failure_report()
def test_story_quality_gate_rejects_missing_story_text():
output = StoryOutput(
mode="generated",