Expand generation harness observability
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
"""Tests for generation harness runtime support."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from sqlalchemy import select
|
||||
|
||||
@@ -7,8 +9,21 @@ from app.db.models import GenerationJob, GenerationJobEvent
|
||||
from app.services.adapters.storybook.primary import Storybook, StorybookPage
|
||||
from app.services.adapters.text.models import StoryOutput
|
||||
from app.services.generation_jobs import create_generation_job, record_generation_event
|
||||
from app.services.harness.artifacts import AssetCompletionResult
|
||||
from app.services.harness.control import ExecutionControl, GenerationJobCanceledError
|
||||
from app.services.harness.evaluation_replay import (
|
||||
EvaluationReplayArtifact,
|
||||
EvaluationReplayCase,
|
||||
ExpectedEvaluation,
|
||||
replay_evaluation_golden_cases,
|
||||
run_evaluation_replay_cases,
|
||||
)
|
||||
from app.services.harness.evaluators import evaluate_story_output, evaluate_storybook_output
|
||||
from app.services.harness.executor import run_asset_plan
|
||||
from app.services.harness.plans import (
|
||||
WorkflowMode,
|
||||
WorkflowPlan,
|
||||
WorkflowTask,
|
||||
build_asset_plan,
|
||||
build_story_plan,
|
||||
build_storybook_plan,
|
||||
@@ -27,12 +42,18 @@ from app.services.harness.types import (
|
||||
normalize_trace_metadata,
|
||||
step_for_event,
|
||||
)
|
||||
from app.services.story_status import StoryAssetStatus
|
||||
|
||||
# Fixture directory for the harness, resolved relative to this test module.
FIXTURES_DIR = Path(__file__).parents[1].joinpath(
    "app", "services", "harness", "fixtures"
)
|
||||
|
||||
|
||||
def test_event_type_maps_to_standard_workflow_step():
|
||||
assert step_for_event("request_accepted") == WorkflowStep.REQUEST_ACCEPTANCE
|
||||
assert step_for_event("context_prepared") == WorkflowStep.CONTEXT_PREPARATION
|
||||
assert step_for_event("narrative_generated") == WorkflowStep.NARRATIVE_GENERATION
|
||||
assert step_for_event("evaluation_completed") == WorkflowStep.EVALUATION
|
||||
assert step_for_event("story_saved") == WorkflowStep.STORY_PERSISTENCE
|
||||
assert step_for_event("provider_call_succeeded") == WorkflowStep.PROVIDER_INVOCATION
|
||||
assert step_for_event("quality_gate_failed") == WorkflowStep.NARRATIVE_GENERATION
|
||||
@@ -46,6 +67,7 @@ def test_event_type_maps_to_standard_workflow_step():
|
||||
def test_event_type_maps_to_standard_artifact():
|
||||
assert artifact_for_event("narrative_generated") == ArtifactKind.STORY_TEXT
|
||||
assert artifact_for_event("quality_gate_failed") == ArtifactKind.STORY_TEXT
|
||||
assert artifact_for_event("evaluation_completed") == ArtifactKind.STORY_TEXT
|
||||
assert artifact_for_event("cover_image_succeeded") == ArtifactKind.COVER_IMAGE
|
||||
assert artifact_for_event("storybook_page_image_failed") == ArtifactKind.PAGE_IMAGE
|
||||
assert artifact_for_event("audio_cache_hit") == ArtifactKind.AUDIO
|
||||
@@ -108,6 +130,13 @@ def test_story_plan_without_assets_snapshot():
|
||||
"required": True,
|
||||
"recoverable": False,
|
||||
},
|
||||
{
|
||||
"key": "evaluate_narrative",
|
||||
"step": "evaluation",
|
||||
"artifact": "story_text",
|
||||
"required": True,
|
||||
"recoverable": False,
|
||||
},
|
||||
{
|
||||
"key": "persist_story",
|
||||
"step": "story_persistence",
|
||||
@@ -137,7 +166,7 @@ def test_story_plan_with_assets_marks_cover_recoverable():
|
||||
plan = build_story_plan(generate_images=True).to_snapshot()
|
||||
|
||||
assert plan["mode"] == "story_with_assets"
|
||||
assert plan["tasks"][3] == {
|
||||
assert plan["tasks"][4] == {
|
||||
"key": "generate_cover_image",
|
||||
"step": "image_generation",
|
||||
"artifact": "cover_image",
|
||||
@@ -153,13 +182,14 @@ def test_storybook_plan_with_images_marks_storybook_images_recoverable():
|
||||
assert [task["key"] for task in plan["tasks"]] == [
|
||||
"prepare_context",
|
||||
"generate_storybook_pages",
|
||||
"evaluate_storybook_pages",
|
||||
"generate_storybook_images",
|
||||
"persist_storybook",
|
||||
"queue_postprocessing",
|
||||
"complete_generation",
|
||||
]
|
||||
assert plan["tasks"][2]["artifact"] == "image"
|
||||
assert plan["tasks"][2]["recoverable"] is True
|
||||
assert plan["tasks"][3]["artifact"] == "image"
|
||||
assert plan["tasks"][3]["recoverable"] is True
|
||||
|
||||
|
||||
def test_asset_retry_plan_deduplicates_assets():
|
||||
@@ -200,6 +230,86 @@ def test_asset_retry_plan_deduplicates_assets():
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_run_asset_plan_executes_asset_tasks_in_plan_order():
    """Asset callbacks run in the order requested by the plan (audio, then image).

    The start/complete bookkeeping tasks are expected to be reported as
    ignored rather than executed.
    """
    invocation_order: list[str] = []

    async def _complete_audio() -> AssetCompletionResult:
        invocation_order.append("audio")
        return AssetCompletionResult(
            asset="audio",
            status=StoryAssetStatus.READY,
            value=b"audio",
        )

    async def _complete_image() -> AssetCompletionResult:
        invocation_order.append("image")
        return AssetCompletionResult(
            asset="cover_image",
            status=StoryAssetStatus.READY,
            value="https://example.com/cover.png",
        )

    asset_plan = build_asset_plan(
        output_mode="asset_generation",
        assets=["audio", "image"],
    )
    outcome = await run_asset_plan(
        asset_plan,
        image_task=_complete_image,
        audio_task=_complete_audio,
    )

    assert invocation_order == ["audio", "image"]
    assert outcome.executed_task_keys == (
        "complete_audio_asset",
        "complete_image_asset",
    )
    assert outcome.ignored_task_keys == (
        "start_asset_generation",
        "complete_asset_generation",
    )
    produced_assets = [entry.asset for entry in outcome.task_results]
    assert produced_assets == ["audio", "cover_image"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_run_asset_plan_ignores_unknown_non_asset_tasks():
    """A plan with no recognised asset task runs no callback.

    Every task key, including the unknown ``complete_video_asset``, is
    expected to be listed as ignored.
    """
    invoked: list[str] = []

    start_task = WorkflowTask(
        key="start_asset_retry",
        step=WorkflowStep.ASSET_RETRY,
        artifact=ArtifactKind.NONE,
    )
    unknown_task = WorkflowTask(
        key="complete_video_asset",
        step=WorkflowStep.UNKNOWN,
        artifact=ArtifactKind.UNKNOWN,
        required=False,
        recoverable=True,
    )
    finish_task = WorkflowTask(
        key="complete_asset_retry",
        step=WorkflowStep.ASSET_RETRY,
        artifact=ArtifactKind.NONE,
    )
    plan = WorkflowPlan(
        mode=WorkflowMode.ASSET_RETRY,
        tasks=(start_task, unknown_task, finish_task),
    )

    async def image_task() -> AssetCompletionResult:
        # Must never be reached: the plan contains no image task.
        invoked.append("image")
        return AssetCompletionResult(
            asset="cover_image",
            status=StoryAssetStatus.READY,
        )

    outcome = await run_asset_plan(plan, image_task=image_task)

    assert invoked == []
    assert outcome.task_results == ()
    assert outcome.executed_task_keys == ()
    assert outcome.ignored_task_keys == (
        "start_asset_retry",
        "complete_video_asset",
        "complete_asset_retry",
    )
|
||||
|
||||
|
||||
def test_story_quality_gate_accepts_complete_child_safe_story():
|
||||
validate_story_output(
|
||||
StoryOutput(
|
||||
@@ -211,6 +321,166 @@ def test_story_quality_gate_accepts_complete_child_safe_story():
|
||||
)
|
||||
|
||||
|
||||
def test_story_evaluator_scores_complete_child_safe_story():
    """A complete story matching its education theme passes with a high score."""
    story = StoryOutput(
        mode="generated",
        title="小兔子的月光花园",
        story_text="小兔子在花园里学会了和朋友轮流分享水壶,也学会了复盘今天的努力。",
        cover_prompt_suggestion="A gentle moonlit garden with a rabbit",
    )

    evaluation = evaluate_story_output(story, education_theme="复盘")

    assert evaluation.passed is True
    assert evaluation.blocking is False
    assert evaluation.overall_score >= 0.9
    # The first reported score dimension is expected to be "structure".
    first_score = evaluation.to_metadata()["scores"][0]
    assert first_score["dimension"] == "structure"
|
||||
|
||||
|
||||
def test_story_evaluator_blocks_quality_gate_failure():
    """An empty ``story_text`` yields a blocking failure with a zero score."""
    empty_story = StoryOutput(
        mode="generated",
        title="空白故事",
        story_text="",
        cover_prompt_suggestion="A cover",
    )

    evaluation = evaluate_story_output(empty_story)

    assert evaluation.passed is False
    assert evaluation.blocking is True
    assert evaluation.overall_score == 0.0
    assert evaluation.gate_error is not None
    first_issue = evaluation.to_metadata()["quality_gate"]["issues"][0]
    assert first_issue["code"] == "missing_story_text"
|
||||
|
||||
|
||||
def test_storybook_evaluator_scores_complete_child_safe_storybook():
    """A two-page storybook matching its education theme passes with a high score."""
    first_page = StorybookPage(
        page_number=1,
        text="露露在森林里发现一颗会提醒她复盘的小星星。",
        image_prompt="Lulu finds a star",
    )
    second_page = StorybookPage(
        page_number=2,
        text="她回想今天的努力,学会下次先和朋友商量。",
        image_prompt="Lulu thinking with friends",
    )
    storybook = Storybook(
        title="森林里的复盘星星",
        main_character="小兔子露露",
        art_style="温暖水彩",
        cover_prompt="A warm watercolor forest cover",
        pages=[first_page, second_page],
    )

    evaluation = evaluate_storybook_output(storybook, education_theme="复盘")

    assert evaluation.passed is True
    assert evaluation.blocking is False
    assert evaluation.overall_score >= 0.9
|
||||
|
||||
|
||||
def test_storybook_evaluator_blocks_quality_gate_failure():
    """Two pages sharing ``page_number=1`` yield a blocking gate failure."""
    # Both pages deliberately carry page_number=1 to trigger the gate.
    duplicated_pages = [
        StorybookPage(page_number=1, text="第一页。", image_prompt="page 1"),
        StorybookPage(page_number=1, text="第二页。", image_prompt="page 2"),
    ]
    storybook = Storybook(
        title="森林绘本",
        main_character="小兔子",
        art_style="水彩",
        cover_prompt="A forest cover",
        pages=duplicated_pages,
    )

    evaluation = evaluate_storybook_output(storybook)

    assert evaluation.passed is False
    assert evaluation.blocking is True
    assert evaluation.gate_error is not None
    first_issue = evaluation.to_metadata()["quality_gate"]["issues"][0]
    assert first_issue["code"] == "invalid_storybook_page_number"
|
||||
|
||||
|
||||
def test_evaluation_golden_cases_replay_successfully():
    """The golden-case fixture replays cleanly and covers both artifact kinds."""
    golden_path = FIXTURES_DIR / "evaluation_golden_cases.json"

    replay = replay_evaluation_golden_cases(golden_path)

    # On failure, the replay's own report is shown as the assertion message.
    assert replay.passed is True, replay.failure_report()
    assert replay.failed_case_ids == ()
    assert len(replay.cases) == 11
    artifacts_seen = {case.artifact for case in replay.cases}
    assert artifacts_seen == {
        EvaluationReplayArtifact.STORY,
        EvaluationReplayArtifact.STORYBOOK,
    }
|
||||
|
||||
|
||||
def test_evaluation_golden_cases_report_internal_coverage_summary():
    """The golden-case coverage summary reports the expected per-category counts."""
    golden_path = FIXTURES_DIR / "evaluation_golden_cases.json"
    summary = replay_evaluation_golden_cases(golden_path).coverage_summary()

    expected_artifacts = {"storybook": 5, "story": 6}
    expected_age_bands = {"3-4": 4, "5-6": 4, "unknown": 2, "7-8": 1}
    expected_risk_areas = {
        "schema_error": 4,
        "happy_path": 2,
        "readability_warning": 2,
        "safety_error": 2,
        "length_boundary": 1,
    }
    expected_outcomes = {"blocked": 8, "passed": 3}

    assert summary["artifact"] == expected_artifacts
    assert summary["age_band"] == expected_age_bands
    assert summary["risk_area"] == expected_risk_areas
    assert summary["outcome"] == expected_outcomes

    # Only a subset of tags is pinned; other tags in the summary are not checked.
    pinned_tag_counts = (
        ("story", 6),
        ("storybook", 5),
        ("blocking", 6),
        ("threshold_block", 2),
    )
    for tag, expected_count in pinned_tag_counts:
        assert summary["tags"][tag] == expected_count
|
||||
|
||||
|
||||
def test_evaluation_replay_reports_expectation_mismatch():
    """A case with a 0.99 minimum score expectation is reported as failed.

    The failure report is expected to mention the unmet ``overall_score``
    expectation.
    """
    story_payload = {
        "mode": "generated",
        "title": "小兔子的花园",
        "story_text": "小兔子学会了和朋友分享水壶。",
        "cover_prompt_suggestion": "A rabbit sharing a watering can",
    }
    expectation = ExpectedEvaluation(
        passed=True,
        blocking=False,
        min_overall_score=0.99,
    )
    mismatching_case = EvaluationReplayCase(
        case_id="expectation-mismatch",
        artifact=EvaluationReplayArtifact.STORY,
        input_payload={"keywords": "小兔子"},
        output_payload=story_payload,
        expected=expectation,
    )

    replay = run_evaluation_replay_cases([mismatching_case])

    assert replay.passed is False
    assert replay.failed_case_ids == ("expectation-mismatch",)
    report = replay.failure_report()
    assert "expected overall_score >=" in report
|
||||
|
||||
|
||||
def test_story_quality_gate_rejects_missing_story_text():
|
||||
output = StoryOutput(
|
||||
mode="generated",
|
||||
|
||||
Reference in New Issue
Block a user