Expand generation harness observability

This commit is contained in:
2026-06-24 10:48:23 +08:00
parent 459ca9edef
commit 1f34d80083
35 changed files with 8003 additions and 112 deletions

View File

@@ -36,8 +36,8 @@ from app.services.generation_jobs import (
ensure_no_active_story_generation_job,
finish_generation_job,
generation_job_can_retry,
generation_job_to_summary,
get_generation_job_for_user,
public_generation_job_to_summary,
record_generation_event,
)
from app.services.harness.artifacts import (
@@ -57,12 +57,27 @@ from app.services.harness.control import (
ExecutionControl,
GenerationJobCanceledError,
)
from app.services.harness.evaluators import (
EvaluationResult,
evaluate_story_output,
evaluate_storybook_output,
)
from app.services.harness.executor import (
record_evaluation_result,
record_executor_result,
record_workflow_plan,
run_asset_plan,
)
from app.services.harness.plans import (
build_asset_plan,
build_story_plan,
build_storybook_plan,
)
from app.services.harness.quality_gates import (
QualityGateError,
validate_story_output,
validate_storybook_output,
)
from app.services.harness.trace import TraceRecorder
from app.services.harness.types import ArtifactKind
from app.services.memory_service import build_enhanced_memory_context
from app.services.provider_router import (
generate_image,
@@ -129,6 +144,24 @@ async def _record_quality_gate_failure_if_present(
)
async def _record_evaluation_result_if_present(
    db: AsyncSession,
    *,
    job,
    evaluation: EvaluationResult,
    artifact: ArtifactKind | str = ArtifactKind.STORY_TEXT,
) -> None:
    """Record an evaluation outcome against a tracked generation job.

    The evaluation is serialised with ``evaluation.to_metadata()`` and handed
    to ``record_evaluation_result`` together with a status string derived from
    ``evaluation.passed``: ``"succeeded"`` when it passed, ``"failed"``
    otherwise.

    Args:
        db: Async database session forwarded to the recorder.
        job: Generation job the evaluation belongs to.
        evaluation: Evaluation result to persist.
        artifact: Artifact kind the evaluation refers to; defaults to the
            story text artifact.
    """
    # NOTE(review): there is no local ``job is None`` guard here; presumably
    # ``record_evaluation_result`` tolerates a missing job -- confirm.
    payload = evaluation.to_metadata()
    outcome = "succeeded" if evaluation.passed else "failed"
    await record_evaluation_result(
        db,
        job=job,
        metadata=payload,
        status=outcome,
        artifact=artifact,
    )
def _asset_result_metadata(result: AssetCompletionResult) -> dict:
"""Build JSON-safe metadata for asset workflow events."""
@@ -643,18 +676,33 @@ async def generate_and_save_story(
user_id=user_id,
generation_job=job,
)
validate_story_output(result)
except QualityGateError as exc:
await _record_quality_gate_failure_if_present(db, job=job, error=exc)
raise HTTPException(
status_code=502,
detail="Story generation failed quality checks, please try again.",
) from exc
except Exception as exc:
raise HTTPException(
status_code=502,
detail="Story generation failed, please try again.",
) from exc
evaluation = evaluate_story_output(
result,
education_theme=request.education_theme,
)
if evaluation.gate_error is not None:
await _record_quality_gate_failure_if_present(
db,
job=job,
error=evaluation.gate_error,
)
await _record_evaluation_result_if_present(
db,
job=job,
evaluation=evaluation,
)
if evaluation.blocking:
raise HTTPException(
status_code=502,
detail="Story generation failed quality checks, please try again.",
)
await _record_job_event_if_present(
db,
job=job,
@@ -758,13 +806,32 @@ async def generate_storybook_service(
user_id=user_id,
generation_job=job,
)
validate_storybook_output(storybook)
except QualityGateError as exc:
await _record_quality_gate_failure_if_present(db, job=job, error=exc)
raise HTTPException(status_code=500, detail=f"故事书质量检查失败: {exc}") from exc
except Exception as e:
logger.error("storybook_generation_failed", error=str(e))
raise HTTPException(status_code=500, detail=f"故事书生成失败: {e}")
evaluation = evaluate_storybook_output(
storybook,
education_theme=request.education_theme,
)
if evaluation.gate_error is not None:
await _record_quality_gate_failure_if_present(
db,
job=job,
error=evaluation.gate_error,
)
await _record_evaluation_result_if_present(
db,
job=job,
evaluation=evaluation,
artifact=ArtifactKind.STORYBOOK_PAGES,
)
if evaluation.blocking:
raise HTTPException(
status_code=500,
detail=f"故事书质量检查失败: {evaluation.gate_error or 'evaluation blocked'}",
)
await _record_job_event_if_present(
db,
job=job,
@@ -1025,28 +1092,50 @@ async def _generate_asset_generation_service_with_job(
if not requested_assets:
raise HTTPException(status_code=400, detail="资源任务缺少 assets。")
plan = build_asset_plan(
output_mode="asset_generation",
assets=requested_assets,
)
await record_workflow_plan(
db,
job=job,
plan=plan,
)
story = await get_story_detail(int(story_id), job.user_id, db)
if "image" in requested_assets:
async def complete_image() -> AssetCompletionResult:
if story.mode == "storybook":
await _complete_storybook_image_assets(story, db, job=job)
else:
await _complete_cover_image_asset(
story,
db,
raise_on_failure=True,
log_event="cover_generation_failed",
job=job,
)
return await _complete_storybook_image_assets(story, db, job=job)
if "audio" in requested_assets:
await _complete_audio_asset(
return await _complete_cover_image_asset(
story,
db,
raise_on_failure=True,
log_event="cover_generation_failed",
job=job,
)
async def complete_audio() -> AssetCompletionResult:
return await _complete_audio_asset(
story,
db,
raise_on_failure=True,
job=job,
)
asset_plan_result = await run_asset_plan(
plan,
image_task=complete_image if "image" in requested_assets else None,
audio_task=complete_audio if "audio" in requested_assets else None,
)
await record_executor_result(
db,
job=job,
plan=plan,
result=asset_plan_result,
)
story = await get_story_detail(story.id, job.user_id, db)
await finish_generation_job(
db,
@@ -1096,7 +1185,7 @@ async def retry_generation_job_service(
)
await _dispatch_generation_job(db, job=retry_job)
await db.refresh(retry_job)
return generation_job_to_summary(retry_job)
return public_generation_job_to_summary(retry_job)
async def _generate_generation_service_with_job(
@@ -1109,6 +1198,11 @@ async def _generate_generation_service_with_job(
"""Run the unified generation workflow after the tracking job has been created."""
if request.output_mode == "storybook":
await record_workflow_plan(
db,
job=job,
plan=build_storybook_plan(generate_images=request.generate_images),
)
storybook = await generate_storybook_service(
StorybookRequest(
keywords=request.data,
@@ -1155,6 +1249,9 @@ async def _generate_generation_service_with_job(
retryable_assets=saved_story.retryable_assets,
)
if request.output_mode == "story" and not request.generate_images:
return await _execute_story_without_assets_plan(request, user_id, db, job=job)
generate_request = GenerateRequest(
type=request.type,
data=request.data,
@@ -1164,6 +1261,11 @@ async def _generate_generation_service_with_job(
)
if request.generate_images:
await record_workflow_plan(
db,
job=job,
plan=build_story_plan(generate_images=True),
)
story = await generate_full_story_service(generate_request, user_id, db, job=job)
saved_story = await get_story_detail(story.id, user_id, db)
await _record_postprocessing_event_if_needed(db, job=job, story=saved_story)
@@ -1222,6 +1324,54 @@ async def _generate_generation_service_with_job(
universe_id=story.universe_id,
retryable_assets=story.retryable_assets,
)
async def _execute_story_without_assets_plan(
    request: GenerationRequest,
    user_id: str,
    db: AsyncSession,
    *,
    job,
) -> GenerationResponse:
    """Run the text-only story workflow under an explicitly recorded plan.

    Sequence:
        1. Build the story plan with ``generate_images=False`` and record it
           on the job.
        2. Generate and save the story from the incoming request fields.
        3. Record a post-processing event if one is needed.
        4. Finish the generation job with the ``generation_completed`` step.
        5. Return a ``GenerationResponse`` populated from the saved story.

    Args:
        request: Unified generation request supplying type, data, education
            theme, child profile and universe.
        user_id: Owner of the story being generated.
        db: Async database session.
        job: Tracking job that receives the plan, events and final status.

    Returns:
        The response describing the persisted story and its asset statuses.
    """
    text_only_plan = build_story_plan(generate_images=False)
    await record_workflow_plan(db, job=job, plan=text_only_plan)

    story = await generate_and_save_story(
        GenerateRequest(
            type=request.type,
            data=request.data,
            education_theme=request.education_theme,
            child_profile_id=request.child_profile_id,
            universe_id=request.universe_id,
        ),
        user_id,
        db,
        job=job,
    )
    await _record_postprocessing_event_if_needed(db, job=job, story=story)

    await finish_generation_job(
        db,
        job=job,
        story=story,
        current_step="generation_completed",
        message="Story generation completed with a persisted readable narrative.",
    )

    # The cover URL mirrors the image URL; both come from ``story.image_url``.
    response_fields = {
        "id": story.id,
        "generation_job_id": job.id,
        "title": story.title,
        "mode": story.mode,
        "story_text": story.story_text,
        "cover_prompt": story.cover_prompt,
        "image_url": story.image_url,
        "cover_url": story.image_url,
        "generation_status": story.generation_status,
        "text_status": story.text_status,
        "image_status": story.image_status,
        "audio_status": story.audio_status,
        "last_error": story.last_error,
        "child_profile_id": story.child_profile_id,
        "universe_id": story.universe_id,
        "retryable_assets": story.retryable_assets,
    }
    return GenerationResponse(**response_fields)
async def list_stories(
@@ -1321,36 +1471,7 @@ async def queue_story_asset_generation(
)
await _dispatch_generation_job(db, job=job)
await db.refresh(job)
return generation_job_to_summary(job)
async def _retry_cover_image_asset(
    story: Story,
    db: AsyncSession,
    *,
    job=None,
) -> None:
    """Re-run cover generation for a text story.

    Delegates to ``_complete_cover_image_asset`` with the retry-specific
    error prefix and log event name; the helper's result is discarded.
    """
    retry_options = {
        "last_error_prefix": "封面生成失败",
        "log_event": "cover_asset_retry_failed",
        "job": job,
    }
    await _complete_cover_image_asset(story, db, **retry_options)
async def _retry_storybook_image_assets(story: Story, db: AsyncSession, *, job=None) -> None:
    """Re-run storybook image completion (missing cover/page images).

    Thin wrapper over ``_complete_storybook_image_assets``; its result is
    intentionally not returned.
    """
    _ = await _complete_storybook_image_assets(story, db, job=job)
async def _retry_audio_asset(
    story: Story,
    db: AsyncSession,
    *,
    job=None,
) -> None:
    """Re-run audio generation for a story.

    Calls ``_complete_audio_asset`` with ``raise_on_failure=False``; per the
    original note, this keeps the persisted status when the provider fails.
    The helper's result is discarded.
    """
    _ = await _complete_audio_asset(
        story,
        db,
        job=job,
        raise_on_failure=False,
    )
return public_generation_job_to_summary(job)
async def retry_story_assets(
@@ -1374,6 +1495,15 @@ async def retry_story_assets(
try:
story = await get_story_detail(story_id, user_id, db)
plan = build_asset_plan(
output_mode="asset_retry",
assets=requested_assets,
)
await record_workflow_plan(
db,
job=job,
plan=plan,
)
await record_generation_event(
db,
job=job,
@@ -1384,14 +1514,37 @@ async def retry_story_assets(
metadata={"assets": requested_assets},
)
if "image" in requested_assets:
async def retry_image() -> AssetCompletionResult:
if story.mode == "storybook":
await _retry_storybook_image_assets(story, db, job=job)
else:
await _retry_cover_image_asset(story, db, job=job)
return await _complete_storybook_image_assets(story, db, job=job)
if "audio" in requested_assets:
await _retry_audio_asset(story, db, job=job)
return await _complete_cover_image_asset(
story,
db,
last_error_prefix="封面生成失败",
log_event="cover_asset_retry_failed",
job=job,
)
async def retry_audio() -> AssetCompletionResult:
return await _complete_audio_asset(
story,
db,
raise_on_failure=False,
job=job,
)
asset_plan_result = await run_asset_plan(
plan,
image_task=retry_image if "image" in requested_assets else None,
audio_task=retry_audio if "audio" in requested_assets else None,
)
await record_executor_result(
db,
job=job,
plan=plan,
result=asset_plan_result,
)
story = await get_story_detail(story_id, user_id, db)
await finish_generation_job(
@@ -1448,13 +1601,29 @@ async def generate_story_cover(
try:
story = await get_story_detail(story_id, user_id, db)
image_result = await _complete_cover_image_asset(
story,
plan = build_asset_plan(output_mode="asset_generation", assets=["image"])
await record_workflow_plan(
db,
raise_on_failure=True,
log_event="cover_generation_failed",
job=job,
plan=plan,
)
asset_result = await run_asset_plan(
plan,
image_task=lambda: _complete_cover_image_asset(
story,
db,
raise_on_failure=True,
log_event="cover_generation_failed",
job=job,
),
)
await record_executor_result(
db,
job=job,
plan=plan,
result=asset_result,
)
image_result = asset_result.task_results[0] if asset_result.task_results else None
story = await get_story_detail(story_id, user_id, db)
await finish_generation_job(
db,
@@ -1464,7 +1633,11 @@ async def generate_story_cover(
message="Cover image generation completed.",
metadata={"assets": ["image"]},
)
if image_result.succeeded and isinstance(image_result.value, str):
if (
image_result is not None
and image_result.succeeded
and isinstance(image_result.value, str)
):
return image_result.value
except HTTPException as exc:
await finish_generation_job(
@@ -1501,12 +1674,28 @@ async def generate_story_audio(
try:
story = await get_story_detail(story_id, user_id, db)
audio_result = await _complete_audio_asset(
story,
plan = build_asset_plan(output_mode="asset_generation", assets=["audio"])
await record_workflow_plan(
db,
raise_on_failure=True,
job=job,
plan=plan,
)
asset_result = await run_asset_plan(
plan,
audio_task=lambda: _complete_audio_asset(
story,
db,
raise_on_failure=True,
job=job,
),
)
await record_executor_result(
db,
job=job,
plan=plan,
result=asset_result,
)
audio_result = asset_result.task_results[0] if asset_result.task_results else None
story = await get_story_detail(story_id, user_id, db)
await finish_generation_job(
db,
@@ -1516,7 +1705,11 @@ async def generate_story_audio(
message="Story audio generation completed.",
metadata={"assets": ["audio"]},
)
if audio_result.succeeded and isinstance(audio_result.value, bytes):
if (
audio_result is not None
and audio_result.succeeded
and isinstance(audio_result.value, bytes)
):
return audio_result.value
except HTTPException as exc:
await finish_generation_job(