Expand generation harness observability

This commit is contained in:
2026-06-24 10:48:23 +08:00
parent 459ca9edef
commit 1f34d80083
35 changed files with 8003 additions and 112 deletions

View File

@@ -1,5 +1,7 @@
"""Tests for generation harness runtime support."""
from pathlib import Path
import pytest
from sqlalchemy import select
@@ -7,8 +9,21 @@ from app.db.models import GenerationJob, GenerationJobEvent
from app.services.adapters.storybook.primary import Storybook, StorybookPage
from app.services.adapters.text.models import StoryOutput
from app.services.generation_jobs import create_generation_job, record_generation_event
from app.services.harness.artifacts import AssetCompletionResult
from app.services.harness.control import ExecutionControl, GenerationJobCanceledError
from app.services.harness.evaluation_replay import (
EvaluationReplayArtifact,
EvaluationReplayCase,
ExpectedEvaluation,
replay_evaluation_golden_cases,
run_evaluation_replay_cases,
)
from app.services.harness.evaluators import evaluate_story_output, evaluate_storybook_output
from app.services.harness.executor import run_asset_plan
from app.services.harness.plans import (
WorkflowMode,
WorkflowPlan,
WorkflowTask,
build_asset_plan,
build_story_plan,
build_storybook_plan,
@@ -27,12 +42,18 @@ from app.services.harness.types import (
normalize_trace_metadata,
step_for_event,
)
from app.services.story_status import StoryAssetStatus
# Fixture directory for the harness tests, resolved relative to this file:
# two levels up, then app/services/harness/fixtures.
FIXTURES_DIR = Path(__file__).parents[1].joinpath(
    "app", "services", "harness", "fixtures"
)
def test_event_type_maps_to_standard_workflow_step():
assert step_for_event("request_accepted") == WorkflowStep.REQUEST_ACCEPTANCE
assert step_for_event("context_prepared") == WorkflowStep.CONTEXT_PREPARATION
assert step_for_event("narrative_generated") == WorkflowStep.NARRATIVE_GENERATION
assert step_for_event("evaluation_completed") == WorkflowStep.EVALUATION
assert step_for_event("story_saved") == WorkflowStep.STORY_PERSISTENCE
assert step_for_event("provider_call_succeeded") == WorkflowStep.PROVIDER_INVOCATION
assert step_for_event("quality_gate_failed") == WorkflowStep.NARRATIVE_GENERATION
@@ -46,6 +67,7 @@ def test_event_type_maps_to_standard_workflow_step():
def test_event_type_maps_to_standard_artifact():
assert artifact_for_event("narrative_generated") == ArtifactKind.STORY_TEXT
assert artifact_for_event("quality_gate_failed") == ArtifactKind.STORY_TEXT
assert artifact_for_event("evaluation_completed") == ArtifactKind.STORY_TEXT
assert artifact_for_event("cover_image_succeeded") == ArtifactKind.COVER_IMAGE
assert artifact_for_event("storybook_page_image_failed") == ArtifactKind.PAGE_IMAGE
assert artifact_for_event("audio_cache_hit") == ArtifactKind.AUDIO
@@ -108,6 +130,13 @@ def test_story_plan_without_assets_snapshot():
"required": True,
"recoverable": False,
},
{
"key": "evaluate_narrative",
"step": "evaluation",
"artifact": "story_text",
"required": True,
"recoverable": False,
},
{
"key": "persist_story",
"step": "story_persistence",
@@ -137,7 +166,7 @@ def test_story_plan_with_assets_marks_cover_recoverable():
plan = build_story_plan(generate_images=True).to_snapshot()
assert plan["mode"] == "story_with_assets"
assert plan["tasks"][3] == {
assert plan["tasks"][4] == {
"key": "generate_cover_image",
"step": "image_generation",
"artifact": "cover_image",
@@ -153,13 +182,14 @@ def test_storybook_plan_with_images_marks_storybook_images_recoverable():
assert [task["key"] for task in plan["tasks"]] == [
"prepare_context",
"generate_storybook_pages",
"evaluate_storybook_pages",
"generate_storybook_images",
"persist_storybook",
"queue_postprocessing",
"complete_generation",
]
assert plan["tasks"][2]["artifact"] == "image"
assert plan["tasks"][2]["recoverable"] is True
assert plan["tasks"][3]["artifact"] == "image"
assert plan["tasks"][3]["recoverable"] is True
def test_asset_retry_plan_deduplicates_assets():
@@ -200,6 +230,86 @@ def test_asset_retry_plan_deduplicates_assets():
}
@pytest.mark.asyncio
async def test_run_asset_plan_executes_asset_tasks_in_plan_order():
    """Check that asset callbacks fire in the order the plan lists them.

    The plan is built with audio ahead of image, so the audio callback is
    expected to run first even though ``image_task`` is passed first.
    """
    invoked: list[str] = []

    async def image_task() -> AssetCompletionResult:
        invoked.append("image")
        return AssetCompletionResult(
            asset="cover_image",
            status=StoryAssetStatus.READY,
            value="https://example.com/cover.png",
        )

    async def audio_task() -> AssetCompletionResult:
        invoked.append("audio")
        return AssetCompletionResult(
            asset="audio",
            status=StoryAssetStatus.READY,
            value=b"audio",
        )

    plan = build_asset_plan(output_mode="asset_generation", assets=["audio", "image"])
    outcome = await run_asset_plan(
        plan,
        image_task=image_task,
        audio_task=audio_task,
    )

    assert invoked == ["audio", "image"]
    assert outcome.executed_task_keys == ("complete_audio_asset", "complete_image_asset")
    # The bracketing start/complete tasks are reported as ignored, not executed.
    assert outcome.ignored_task_keys == (
        "start_asset_generation",
        "complete_asset_generation",
    )
    produced_assets = [entry.asset for entry in outcome.task_results]
    assert produced_assets == ["audio", "cover_image"]
@pytest.mark.asyncio
async def test_run_asset_plan_ignores_unknown_non_asset_tasks():
    """Check that a plan with no recognised asset task runs no callback.

    The hand-built plan places a ``complete_video_asset`` task (unknown step
    and artifact) between two asset-retry tasks. All three keys are expected
    in ``ignored_task_keys`` and the image callback must never be invoked.
    """
    invoked: list[str] = []

    start_task = WorkflowTask(
        key="start_asset_retry",
        step=WorkflowStep.ASSET_RETRY,
        artifact=ArtifactKind.NONE,
    )
    video_task = WorkflowTask(
        key="complete_video_asset",
        step=WorkflowStep.UNKNOWN,
        artifact=ArtifactKind.UNKNOWN,
        required=False,
        recoverable=True,
    )
    finish_task = WorkflowTask(
        key="complete_asset_retry",
        step=WorkflowStep.ASSET_RETRY,
        artifact=ArtifactKind.NONE,
    )
    plan = WorkflowPlan(
        mode=WorkflowMode.ASSET_RETRY,
        tasks=(start_task, video_task, finish_task),
    )

    async def image_task() -> AssetCompletionResult:
        invoked.append("image")
        return AssetCompletionResult(
            asset="cover_image",
            status=StoryAssetStatus.READY,
        )

    outcome = await run_asset_plan(plan, image_task=image_task)

    assert invoked == []
    assert outcome.task_results == ()
    assert outcome.executed_task_keys == ()
    assert outcome.ignored_task_keys == (
        "start_asset_retry",
        "complete_video_asset",
        "complete_asset_retry",
    )
def test_story_quality_gate_accepts_complete_child_safe_story():
validate_story_output(
StoryOutput(
@@ -211,6 +321,166 @@ def test_story_quality_gate_accepts_complete_child_safe_story():
)
def test_story_evaluator_scores_complete_child_safe_story():
    """Check that a complete story with a matching theme passes with a high score."""
    story = StoryOutput(
        mode="generated",
        title="小兔子的月光花园",
        story_text="小兔子在花园里学会了和朋友轮流分享水壶,也学会了复盘今天的努力。",
        cover_prompt_suggestion="A gentle moonlit garden with a rabbit",
    )

    verdict = evaluate_story_output(story, education_theme="复盘")

    assert verdict.passed is True
    assert verdict.blocking is False
    assert verdict.overall_score >= 0.9
    # The first reported score entry is expected to be the "structure" dimension.
    leading_score = verdict.to_metadata()["scores"][0]
    assert leading_score["dimension"] == "structure"
def test_story_evaluator_blocks_quality_gate_failure():
    """Check that a story with empty text is blocked with a zero score."""
    empty_story = StoryOutput(
        mode="generated",
        title="空白故事",
        story_text="",
        cover_prompt_suggestion="A cover",
    )

    verdict = evaluate_story_output(empty_story)

    assert verdict.passed is False
    assert verdict.blocking is True
    assert verdict.overall_score == 0.0
    assert verdict.gate_error is not None
    # The first quality-gate issue is expected to name the missing text.
    gate_issues = verdict.to_metadata()["quality_gate"]["issues"]
    assert gate_issues[0]["code"] == "missing_story_text"
def test_storybook_evaluator_scores_complete_child_safe_storybook():
    """Check that a two-page storybook with a matching theme passes with a high score."""
    pages = [
        StorybookPage(
            page_number=1,
            text="露露在森林里发现一颗会提醒她复盘的小星星。",
            image_prompt="Lulu finds a star",
        ),
        StorybookPage(
            page_number=2,
            text="她回想今天的努力,学会下次先和朋友商量。",
            image_prompt="Lulu thinking with friends",
        ),
    ]
    storybook = Storybook(
        title="森林里的复盘星星",
        main_character="小兔子露露",
        art_style="温暖水彩",
        cover_prompt="A warm watercolor forest cover",
        pages=pages,
    )

    verdict = evaluate_storybook_output(storybook, education_theme="复盘")

    assert verdict.passed is True
    assert verdict.blocking is False
    assert verdict.overall_score >= 0.9
def test_storybook_evaluator_blocks_quality_gate_failure():
    """Check that a storybook whose pages share a page number is blocked."""
    # Both pages deliberately carry page_number=1.
    duplicate_numbered_pages = [
        StorybookPage(page_number=1, text="第一页。", image_prompt="page 1"),
        StorybookPage(page_number=1, text="第二页。", image_prompt="page 2"),
    ]
    storybook = Storybook(
        title="森林绘本",
        main_character="小兔子",
        art_style="水彩",
        cover_prompt="A forest cover",
        pages=duplicate_numbered_pages,
    )

    verdict = evaluate_storybook_output(storybook)

    assert verdict.passed is False
    assert verdict.blocking is True
    assert verdict.gate_error is not None
    gate_issues = verdict.to_metadata()["quality_gate"]["issues"]
    assert gate_issues[0]["code"] == "invalid_storybook_page_number"
def test_evaluation_golden_cases_replay_successfully():
    """Replay the golden-case fixture and expect all 11 cases to pass."""
    golden_path = FIXTURES_DIR / "evaluation_golden_cases.json"

    replay = replay_evaluation_golden_cases(golden_path)

    # The failure report is only rendered as the assertion message on failure.
    assert replay.passed is True, replay.failure_report()
    assert replay.failed_case_ids == ()
    assert len(replay.cases) == 11
    # Both artifact kinds must be represented in the fixture.
    covered_artifacts = {case.artifact for case in replay.cases}
    assert covered_artifacts == {
        EvaluationReplayArtifact.STORY,
        EvaluationReplayArtifact.STORYBOOK,
    }
def test_evaluation_golden_cases_report_internal_coverage_summary():
    """Pin the coverage summary reported for the golden-case fixture."""
    replay = replay_evaluation_golden_cases(
        FIXTURES_DIR / "evaluation_golden_cases.json"
    )
    summary = replay.coverage_summary()

    # Sections that must match exactly, checked in this order.
    expected_sections = {
        "artifact": {
            "storybook": 5,
            "story": 6,
        },
        "age_band": {
            "3-4": 4,
            "5-6": 4,
            "unknown": 2,
            "7-8": 1,
        },
        "risk_area": {
            "schema_error": 4,
            "happy_path": 2,
            "readability_warning": 2,
            "safety_error": 2,
            "length_boundary": 1,
        },
        "outcome": {
            "blocked": 8,
            "passed": 3,
        },
    }
    for section, expected_counts in expected_sections.items():
        assert summary[section] == expected_counts

    # Only these tag counts are pinned; other tags are not checked.
    expected_tag_counts = {
        "story": 6,
        "storybook": 5,
        "blocking": 6,
        "threshold_block": 2,
    }
    for tag, expected_count in expected_tag_counts.items():
        assert summary["tags"][tag] == expected_count
def test_evaluation_replay_reports_expectation_mismatch():
    """Check that a case whose expectation is not met is reported as failed.

    The case demands ``min_overall_score=0.99``; the test expects the replay
    to fail for that case and the report to mention the score expectation.
    """
    story_payload = {
        "mode": "generated",
        "title": "小兔子的花园",
        "story_text": "小兔子学会了和朋友分享水壶。",
        "cover_prompt_suggestion": "A rabbit sharing a watering can",
    }
    expectation = ExpectedEvaluation(
        passed=True,
        blocking=False,
        min_overall_score=0.99,
    )
    mismatching_case = EvaluationReplayCase(
        case_id="expectation-mismatch",
        artifact=EvaluationReplayArtifact.STORY,
        input_payload={"keywords": "小兔子"},
        output_payload=story_payload,
        expected=expectation,
    )

    replay = run_evaluation_replay_cases([mismatching_case])

    assert replay.passed is False
    assert replay.failed_case_ids == ("expectation-mismatch",)
    assert "expected overall_score >=" in replay.failure_report()
def test_story_quality_gate_rejects_missing_story_text():
output = StoryOutput(
mode="generated",