feat: improve generation analytics and maintenance

This commit is contained in:
2026-04-19 09:03:40 +08:00
parent d5a173aa0d
commit 5318de670f
21 changed files with 1155 additions and 57 deletions

View File

@@ -1,5 +1,6 @@
"""Generation job tracking tests."""
from datetime import datetime, timedelta, timezone
from unittest.mock import AsyncMock, patch
import pytest
@@ -12,7 +13,11 @@ from app.main import app
from app.services.adapters import AdapterConfig
from app.services.adapters.storybook.primary import Storybook, StorybookPage
from app.services.adapters.text.models import StoryOutput
from app.services.generation_jobs import create_generation_job, record_generation_event
from app.services.generation_jobs import (
create_generation_job,
mark_stale_generation_jobs,
record_generation_event,
)
pytestmark = pytest.mark.asyncio
@@ -520,6 +525,7 @@ async def test_user_provider_analytics_aggregate_across_stories(
assert data["failed_calls"] == 1
assert data["avg_latency_ms"] == 60.0
assert data["estimated_cost_usd"] == 0.013
assert data["failure_reasons"] == [{"reason": "timeout", "count": 1}]
assert data["by_provider"] == [
{
"capability": "image",
@@ -551,3 +557,249 @@ async def test_user_provider_analytics_aggregate_across_stories(
]
finally:
app.dependency_overrides.clear()
async def test_provider_analytics_support_days_and_capability_filters(
    db_session,
    auth_token,
    degraded_story_with_text,
    test_story,
):
    """Verify the ``days`` and ``capability`` filters of the provider analytics APIs.

    Two provider-call events are seeded: a failed image call whose
    ``created_at`` is backdated by 10 days, and a succeeded TTS call that keeps
    its default timestamp. The test then checks that:

    * ``?days=7`` on the user-level endpoint reports a single call and no
      failure reasons (presumably only the TTS call falls inside the window);
    * ``?capability=image`` on the user-level endpoint reports the single
      failed image call with its ``timeout`` failure reason;
    * ``?capability=image`` on the per-story ``provider-stats`` endpoint echoes
      the capability and the same failure reason.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # Seeding happens inside ``try`` so the dependency override is always
    # removed, even when creating the jobs/events below raises.
    try:
        image_job = await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_retry",
            input_type="image",
            request_payload={"assets": ["image"]},
            story_id=degraded_story_with_text.id,
        )
        old_event = await record_generation_event(
            db_session,
            job=image_job,
            story_id=degraded_story_with_text.id,
            event_type="provider_call_failed",
            status="failed",
            metadata={
                "capability": "image",
                "adapter": "cqtai",
                "strategy": "priority",
                "latency_ms": 120,
                "error": "timeout",
            },
        )
        # Backdate the failed image call so it is older than the 7-day window
        # requested below.
        old_event.created_at = datetime.now(timezone.utc) - timedelta(days=10)
        await db_session.commit()

        tts_job = await create_generation_job(
            db_session,
            user_id=test_story.user_id,
            output_mode="asset_retry",
            input_type="audio",
            request_payload={"assets": ["audio"]},
            story_id=test_story.id,
        )
        await record_generation_event(
            db_session,
            job=tts_job,
            story_id=test_story.id,
            event_type="provider_call_succeeded",
            status="succeeded",
            metadata={
                "capability": "tts",
                "adapter": "edge_tts",
                "strategy": "priority",
                "latency_ms": 18,
                "estimated_cost_usd": 0.003,
            },
        )

        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)

            # Time-window filter: the 10-day-old failure must not be counted.
            response = await client.get("/api/generations/provider-analytics?days=7")
            assert response.status_code == 200
            data = response.json()
            assert data["window_days"] == 7
            assert data["total_calls"] == 1
            assert data["job_count"] == 1
            assert data["story_count"] == 1
            assert data["failure_reasons"] == []

            # Capability filter without a window: only the image failure remains.
            response = await client.get(
                "/api/generations/provider-analytics?capability=image"
            )
            assert response.status_code == 200
            data = response.json()
            assert data["capability"] == "image"
            assert data["total_calls"] == 1
            assert data["failed_calls"] == 1
            assert data["job_count"] == 1
            assert data["story_count"] == 1
            assert data["failure_reasons"] == [{"reason": "timeout", "count": 1}]

            # Same capability filter on the per-story stats endpoint.
            response = await client.get(
                f"/api/generations/{degraded_story_with_text.id}/provider-stats?capability=image"
            )
            assert response.status_code == 200
            data = response.json()
            assert data["capability"] == "image"
            assert data["failure_reasons"] == [{"reason": "timeout", "count": 1}]
    finally:
        app.dependency_overrides.clear()
async def test_generation_ops_summary_exposes_running_stale_and_recent_failures(
    db_session,
    auth_token,
    degraded_story_with_text,
    test_story,
):
    """Verify the counters and recent-failure list of the ops-summary endpoint.

    Four jobs are seeded and then adjusted in place:

    * ``running_job`` - status left as created, last updated 10 minutes ago;
    * ``stale_job`` - status left as created, last updated 3 hours ago;
    * ``failed_job`` - marked ``failed`` with an error message, updated 1 hour ago;
    * ``degraded_job`` - marked ``degraded_completed``, updated 30 minutes ago.

    The endpoint is queried with ``hours=48`` and the test asserts the
    aggregate counters plus the single entry in ``recent_failures``.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # Seeding happens inside ``try`` so the dependency override is always
    # removed, even when creating or updating the jobs below raises.
    try:
        running_job = await create_generation_job(
            db_session,
            user_id=test_story.user_id,
            output_mode="story",
            input_type="keywords",
            request_payload={"data": "星星"},
            story_id=test_story.id,
        )
        stale_job = await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_generation",
            input_type="image",
            request_payload={"story_id": degraded_story_with_text.id},
            story_id=degraded_story_with_text.id,
        )
        failed_job = await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_retry",
            input_type="image",
            request_payload={"assets": ["image"]},
            story_id=degraded_story_with_text.id,
        )
        degraded_job = await create_generation_job(
            db_session,
            user_id=test_story.user_id,
            output_mode="storybook",
            input_type="keywords",
            request_payload={"data": "月亮"},
            story_id=test_story.id,
        )

        # One shared reference instant keeps the relative ages of the four jobs
        # exact instead of drifting by the time between successive now() calls.
        now = datetime.now(timezone.utc)

        # NOTE(review): the staleness threshold lives in the endpoint and is not
        # visible here; 3 hours is expected to exceed it, 10 minutes is not.
        stale_job.updated_at = now - timedelta(hours=3)

        failed_job.status = "failed"
        failed_job.current_step = "asset_retry_failed"
        failed_job.error_message = "image timeout"
        failed_job.updated_at = now - timedelta(hours=1)

        degraded_job.status = "degraded_completed"
        degraded_job.current_step = "generation_completed"
        degraded_job.updated_at = now - timedelta(minutes=30)

        running_job.updated_at = now - timedelta(minutes=10)
        await db_session.commit()

        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)
            response = await client.get("/api/generations/ops-summary?hours=48")

        assert response.status_code == 200
        data = response.json()
        assert data["window_hours"] == 48
        # running_job and stale_job both keep their initial (active) status.
        assert data["active_jobs"] == 2
        assert data["stale_running_jobs"] == 1
        assert data["failed_jobs"] == 1
        assert data["degraded_jobs"] == 1
        # NOTE(review): only failed_job uses output_mode="asset_retry" in this
        # test; the expected 2 presumably also counts the "asset_generation"
        # job or a fixture-created job - confirm against the endpoint.
        assert data["asset_retry_jobs"] == 2
        assert len(data["recent_failures"]) == 1
        assert data["recent_failures"][0]["job_id"] == failed_job.id
        assert data["recent_failures"][0]["story_title"] == degraded_story_with_text.title
        assert data["recent_failures"][0]["failure_label"] == "资源重试失败"
    finally:
        app.dependency_overrides.clear()
async def test_mark_stale_generation_jobs_marks_old_running_jobs_failed(
    db_session,
    degraded_story_with_text,
):
    """Check that ``mark_stale_generation_jobs`` fails a job idle past the threshold.

    A job is created, its ``updated_at`` is pushed two hours into the past,
    and the maintenance routine is run with a 30-minute threshold. The test
    asserts the returned summary, the job's final status/step/error message,
    and that the last recorded event describes the stale failure.
    """
    job = await create_generation_job(
        db_session,
        user_id=degraded_story_with_text.user_id,
        output_mode="story",
        input_type="keywords",
        request_payload={"data": "超时任务"},
        story_id=degraded_story_with_text.id,
    )
    two_hours_ago = datetime.now(timezone.utc) - timedelta(hours=2)
    job.updated_at = two_hours_ago
    await db_session.commit()

    summary = await mark_stale_generation_jobs(db_session, stale_after_minutes=30)

    assert summary == {"running": 1, "marked_stale": 1, "stale_after_minutes": 30}

    # Re-read the job through a query and check the failure fields.
    job_query = select(GenerationJob).where(GenerationJob.id == job.id)
    job_result = await db_session.execute(job_query)
    reloaded = job_result.scalar_one()
    assert reloaded.status == "failed"
    assert reloaded.current_step == "generation_stale_failed"
    assert reloaded.error_message == "Generation job exceeded 30 minutes without progress."

    # The newest event (highest id) must be the stale-failure marker.
    events_query = (
        select(GenerationJobEvent)
        .where(GenerationJobEvent.job_id == job.id)
        .order_by(GenerationJobEvent.id)
    )
    events_result = await db_session.execute(events_query)
    history = events_result.scalars().all()
    latest_event = history[-1]
    assert latest_event.event_type == "generation_stale_failed"
    assert latest_event.event_metadata["stale_after_minutes"] == 30
async def test_retry_assets_rejects_when_story_has_active_job(
    db_session,
    auth_token,
    degraded_story_with_text,
):
    """Verify ``retry-assets`` answers 409 when the story already has a job.

    A generation job is created for the story and left in its initial state,
    then an asset retry is requested for the same story. The endpoint must
    respond with HTTP 409 and a ``detail`` containing the "already running"
    message.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db
    # Seeding happens inside ``try`` so the dependency override is always
    # removed, even when creating the job below raises.
    try:
        # Existing job for the same story; its status is whatever
        # create_generation_job assigns by default.
        await create_generation_job(
            db_session,
            user_id=degraded_story_with_text.user_id,
            output_mode="asset_generation",
            input_type="image",
            request_payload={"story_id": degraded_story_with_text.id},
            story_id=degraded_story_with_text.id,
        )

        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            client.cookies.set("access_token", auth_token)
            response = await client.post(
                f"/api/generations/{degraded_story_with_text.id}/retry-assets",
                json={"assets": ["image"]},
            )

        assert response.status_code == 409
        assert "已有运行中的任务" in response.json()["detail"]
    finally:
        app.dependency_overrides.clear()