feat: improve generation analytics and maintenance

This commit is contained in:
2026-04-19 09:03:40 +08:00
parent d5a173aa0d
commit 5318de670f
21 changed files with 1155 additions and 57 deletions

View File

@@ -134,7 +134,8 @@ npm run build
| GET | `/api/generations/jobs/{job_id}` | 查询生成任务事件流 | | GET | `/api/generations/jobs/{job_id}` | 查询生成任务事件流 |
| GET | `/api/generations/{story_id}/jobs` | 查询故事生成与重试历史 | | GET | `/api/generations/{story_id}/jobs` | 查询故事生成与重试历史 |
| GET | `/api/generations/{story_id}/provider-stats` | 查询 Provider 调用聚合指标 | | GET | `/api/generations/{story_id}/provider-stats` | 查询 Provider 调用聚合指标 |
| GET | `/api/generations/provider-analytics` | 查询当前用户跨故事 Provider 运营摘要 | | GET | `/api/generations/ops-summary` | 查询最近任务运行概览、失败摘要和超时阈值 |
| GET | `/api/generations/provider-analytics` | 查询当前用户跨故事 Provider 运营摘要,支持 `days` / `capability` 筛选 |
| GET | `/api/audio/{story_id}/status` | 查询音频缓存状态,不触发生成 | | GET | `/api/audio/{story_id}/status` | 查询音频缓存状态,不触发生成 |
| DELETE | `/api/audio/{story_id}/cache` | 清理故事音频缓存 | | DELETE | `/api/audio/{story_id}/cache` | 清理故事音频缓存 |
| GET | `/api/stories` | 故事列表 | | GET | `/api/stories` | 故事列表 |
@@ -164,4 +165,4 @@ npm run build
## 当前取舍 ## 当前取舍
仓库只保留一个 Docker Compose 入口:`docker-compose.yml`。生产部署、HA 演练、旧 Claude 原型和历史归档已从主仓库移除,避免干扰当前求职演示主线。 仓库只保留一个 Docker Compose 入口:`docker-compose.yml`。生产部署、HA 演练、旧 Claude 原型和历史归档已从主仓库移除,避免干扰当前求职演示主线。音频缓存默认按 `STORY_AUDIO_CACHE_TTL_DAYS=30` 做后台清理，Celery beat 会每日执行一次 prune；生成任务默认按 `GENERATION_JOB_STALE_MINUTES=60` 判定卡住,后台会定时自动收敛为失败态,避免故事长期显示“永远在跑”。

View File

@@ -1,5 +1,5 @@
<script setup lang="ts"> <script setup lang="ts">
import { computed, ref, onMounted } from 'vue' import { computed, ref, onMounted, watch } from 'vue'
import { useRouter } from 'vue-router' import { useRouter } from 'vue-router'
import { api } from '../api/client' import { api } from '../api/client'
import BaseButton from '../components/ui/BaseButton.vue' import BaseButton from '../components/ui/BaseButton.vue'
@@ -45,6 +45,8 @@ interface GenerationProviderStat {
} }
interface GenerationProviderAnalytics { interface GenerationProviderAnalytics {
window_days: number | null
capability: string | null
total_calls: number total_calls: number
successful_calls: number successful_calls: number
failed_calls: number failed_calls: number
@@ -53,14 +55,43 @@ interface GenerationProviderAnalytics {
job_count: number job_count: number
story_count: number story_count: number
by_provider: GenerationProviderStat[] by_provider: GenerationProviderStat[]
failure_reasons: Array<{
reason: string
count: number
}>
}
/** One failed generation job, as listed in `GenerationOpsSummary.recent_failures`. */
interface GenerationRecentFailure {
// Used as the `v-for` key when the failure list is rendered.
job_id: string
story_id: number | null
// Preferred card title; when empty the template falls back to the formatted output mode.
story_title: string | null
// Raw mode string; translated for display by formatOutputMode.
output_mode: string
current_step: string
// When null the template shows a generic "open the job trace" hint instead.
error_message: string | null
// Short category label rendered in front of the error message.
failure_label: string
// Timestamp string handed to formatDate for display.
updated_at: string
}
/** Response shape of `GET /api/generations/ops-summary`, rendered in the job operations overview card. */
interface GenerationOpsSummary {
// Length of the look-back window, in hours, that the summary covers.
window_hours: number
// Minutes after which a running job is described as stuck in the card subtitle.
stale_threshold_minutes: number
active_jobs: number
// Highlighted in amber when non-zero.
stale_running_jobs: number
// Highlighted in rose when non-zero.
failed_jobs: number
// A notice line is shown only when this is non-zero.
degraded_jobs: number
// Shown under the asset-task tile of the card.
asset_retry_jobs: number
// When empty the card shows the "no recent failures" message instead of the list.
recent_failures: GenerationRecentFailure[]
} }
const router = useRouter() const router = useRouter()
const stories = ref<StoryItem[]>([]) const stories = ref<StoryItem[]>([])
const providerAnalytics = ref<GenerationProviderAnalytics | null>(null) const providerAnalytics = ref<GenerationProviderAnalytics | null>(null)
const opsSummary = ref<GenerationOpsSummary | null>(null)
const loading = ref(true) const loading = ref(true)
const error = ref('') const error = ref('')
const showCreateModal = ref(false) const showCreateModal = ref(false)
const selectedWindow = ref<'7' | '30' | 'all'>('30')
const selectedCapability = ref<'all' | 'text' | 'image' | 'tts' | 'storybook'>('all')
const readableCount = computed(() => const readableCount = computed(() =>
stories.value.filter((story) => isReadableGenerationStatus(story.generation_status)).length, stories.value.filter((story) => isReadableGenerationStatus(story.generation_status)).length,
) )
@@ -74,15 +105,30 @@ const providerSuccessRate = computed(() => {
) )
}) })
const topProvider = computed(() => providerAnalytics.value?.by_provider[0] ?? null) const topProvider = computed(() => providerAnalytics.value?.by_provider[0] ?? null)
const topFailureReason = computed(() => providerAnalytics.value?.failure_reasons[0] ?? null)
/**
 * Build the provider-analytics request path for the currently selected filters.
 *
 * A filter left on `'all'` is omitted from the query string; when both are
 * `'all'` the bare endpoint path is returned.
 */
function buildProviderAnalyticsPath() {
  const basePath = '/api/generations/provider-analytics'
  const windowDays = selectedWindow.value
  const capability = selectedCapability.value

  const search = new URLSearchParams()
  if (windowDays !== 'all') search.set('days', windowDays)
  if (capability !== 'all') search.set('capability', capability)

  const serialized = search.toString()
  return serialized ? `${basePath}?${serialized}` : `${basePath}`
}
async function fetchStories() { async function fetchStories() {
try { try {
const [storyList, analytics] = await Promise.all([ const [storyList, analytics, ops] = await Promise.all([
api.get<StoryItem[]>('/api/stories'), api.get<StoryItem[]>('/api/stories'),
api.get<GenerationProviderAnalytics>('/api/generations/provider-analytics'), api.get<GenerationProviderAnalytics>(buildProviderAnalyticsPath()),
api.get<GenerationOpsSummary>('/api/generations/ops-summary'),
]) ])
stories.value = storyList stories.value = storyList
providerAnalytics.value = analytics providerAnalytics.value = analytics
opsSummary.value = ops
} catch (e) { } catch (e) {
error.value = e instanceof Error ? e.message : '加载失败' error.value = e instanceof Error ? e.message : '加载失败'
} finally { } finally {
@@ -123,6 +169,27 @@ function formatCost(value?: number | null) {
return typeof value === 'number' ? `$${value.toFixed(4)}` : '$0.0000' return typeof value === 'number' ? `$${value.toFixed(4)}` : '$0.0000'
} }
/**
 * Translate a job's `output_mode` into the short label shown in the UI.
 *
 * Modes without a dedicated label (including unknown strings) fall back to
 * the generic story label.
 */
function formatOutputMode(value: string) {
  // A Map (rather than a plain object) keeps inherited keys such as
  // 'constructor' from ever matching.
  const labelByMode = new Map<string, string>([
    ['storybook', '绘本'],
    ['asset_retry', '资源重试'],
    ['asset_generation', '资源生成'],
  ])
  return labelByMode.get(value) ?? '故事'
}
// Click handler for the time-window buttons: stores the chosen window, which
// buildProviderAnalyticsPath later sends as the `days` query parameter.
function setWindow(value: '7' | '30' | 'all') {
selectedWindow.value = value
}
// Click handler for the capability buttons: stores the chosen capability, which
// buildProviderAnalyticsPath later sends as the `capability` query parameter.
function setCapability(value: 'all' | 'text' | 'image' | 'tts' | 'storybook') {
selectedCapability.value = value
}
onMounted(() => { onMounted(() => {
fetchStories() fetchStories()
if (router.currentRoute.value.query.openCreate) { if (router.currentRoute.value.query.openCreate) {
@@ -130,6 +197,10 @@ onMounted(() => {
router.replace({ query: { ...router.currentRoute.value.query, openCreate: undefined } }) router.replace({ query: { ...router.currentRoute.value.query, openCreate: undefined } })
} }
}) })
// Reload whenever either analytics filter changes. fetchStories requests the
// story list and ops summary together with the analytics, so all three are
// refreshed. `void` marks the returned promise as intentionally not awaited.
watch([selectedWindow, selectedCapability], () => {
void fetchStories()
})
</script> </script>
<template> <template>
@@ -213,6 +284,18 @@ onMounted(() => {
<p class="mt-2 text-sm leading-6 text-gray-500"> <p class="mt-2 text-sm leading-6 text-gray-500">
生成资源补全和失败恢复留下的供应商调用轨迹 生成资源补全和失败恢复留下的供应商调用轨迹
</p> </p>
<div class="mt-4 flex flex-wrap gap-2">
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedWindow === '7' ? 'border-gray-900 bg-gray-900 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setWindow('7')">最近 7 天</button>
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedWindow === '30' ? 'border-gray-900 bg-gray-900 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setWindow('30')">最近 30 天</button>
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedWindow === 'all' ? 'border-gray-900 bg-gray-900 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setWindow('all')">全部</button>
</div>
<div class="mt-3 flex flex-wrap gap-2">
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedCapability === 'all' ? 'border-emerald-600 bg-emerald-600 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setCapability('all')">全部能力</button>
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedCapability === 'text' ? 'border-emerald-600 bg-emerald-600 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setCapability('text')">文本</button>
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedCapability === 'image' ? 'border-emerald-600 bg-emerald-600 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setCapability('image')">图片</button>
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedCapability === 'tts' ? 'border-emerald-600 bg-emerald-600 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setCapability('tts')">语音</button>
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedCapability === 'storybook' ? 'border-emerald-600 bg-emerald-600 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setCapability('storybook')">绘本</button>
</div>
</div> </div>
<div class="grid grid-cols-2 gap-3 sm:grid-cols-4 lg:min-w-[520px]"> <div class="grid grid-cols-2 gap-3 sm:grid-cols-4 lg:min-w-[520px]">
<div class="rounded-lg border border-gray-100 bg-gray-50 px-3 py-3"> <div class="rounded-lg border border-gray-100 bg-gray-50 px-3 py-3">
@@ -236,6 +319,70 @@ onMounted(() => {
<p v-if="topProvider" class="mt-4 text-sm text-gray-500"> <p v-if="topProvider" class="mt-4 text-sm text-gray-500">
当前样本中最前面的能力组合是 {{ topProvider.capability }} / {{ topProvider.adapter }}成功 {{ topProvider.success_count }} 失败 {{ topProvider.failure_count }} 当前样本中最前面的能力组合是 {{ topProvider.capability }} / {{ topProvider.adapter }}成功 {{ topProvider.success_count }} 失败 {{ topProvider.failure_count }}
</p> </p>
<p v-if="topFailureReason" class="mt-2 text-sm text-rose-600">
最常见失败原因：{{ topFailureReason.reason }}（{{ topFailureReason.count }} 次）
</p>
</BaseCard>
<BaseCard
v-if="opsSummary"
class="mb-8"
padding="lg"
>
<div class="flex flex-col gap-5 lg:flex-row lg:items-center lg:justify-between">
<div>
<h2 class="text-xl font-bold text-gray-800">任务运行概览</h2>
<p class="mt-2 text-sm leading-6 text-gray-500">
最近 {{ opsSummary.window_hours }} 小时的任务健康度；运行超过
{{ opsSummary.stale_threshold_minutes }} 分钟会被视为卡住。
</p>
</div>
<div class="grid grid-cols-2 gap-3 sm:grid-cols-4 lg:min-w-[520px]">
<div class="rounded-lg border border-gray-100 bg-gray-50 px-3 py-3">
<div class="text-xs text-gray-500">运行中</div>
<div class="mt-1 text-lg font-semibold text-gray-800">{{ opsSummary.active_jobs }}</div>
</div>
<div class="rounded-lg border border-gray-100 bg-gray-50 px-3 py-3">
<div class="text-xs text-gray-500">超时待收敛</div>
<div class="mt-1 text-lg font-semibold" :class="opsSummary.stale_running_jobs ? 'text-amber-600' : 'text-gray-800'">
{{ opsSummary.stale_running_jobs }}
</div>
</div>
<div class="rounded-lg border border-gray-100 bg-gray-50 px-3 py-3">
<div class="text-xs text-gray-500">最近失败</div>
<div class="mt-1 text-lg font-semibold" :class="opsSummary.failed_jobs ? 'text-rose-600' : 'text-gray-800'">
{{ opsSummary.failed_jobs }}
</div>
</div>
<div class="rounded-lg border border-gray-100 bg-gray-50 px-3 py-3">
<div class="text-xs text-gray-500">资源任务</div>
<div class="mt-1 text-lg font-semibold text-gray-800">{{ opsSummary.asset_retry_jobs }}</div>
</div>
</div>
</div>
<p v-if="opsSummary.degraded_jobs" class="mt-4 text-sm text-amber-600">
最近 {{ opsSummary.window_hours }} 小时有 {{ opsSummary.degraded_jobs }} 个任务以降级完成收尾
</p>
<div v-if="opsSummary.recent_failures.length" class="mt-4 space-y-3">
<div
v-for="failure in opsSummary.recent_failures"
:key="failure.job_id"
class="rounded-lg border border-rose-100 bg-rose-50 px-4 py-3"
>
<div class="flex flex-wrap items-center justify-between gap-3">
<div class="text-sm font-semibold text-gray-800">
{{ failure.story_title || `${formatOutputMode(failure.output_mode)}任务` }}
</div>
<div class="text-xs text-gray-500">{{ formatDate(failure.updated_at) }}</div>
</div>
<div class="mt-1 text-xs text-rose-600">
{{ failure.failure_label }} · {{ failure.error_message || '请打开任务轨迹查看原因' }}
</div>
</div>
</div>
<p v-else class="mt-4 text-sm text-emerald-600">
最近 {{ opsSummary.window_hours }} 小时没有失败任务，当前链路比较稳定。
</p>
</BaseCard> </BaseCard>
<!-- 故事网格 --> <!-- 故事网格 -->

View File

@@ -48,6 +48,8 @@ STORYBOOK_PROVIDERS=["storybook_primary"]
TEXT_MODEL=gemini-2.0-flash TEXT_MODEL=gemini-2.0-flash
IMAGE_MODEL=nano-banana IMAGE_MODEL=nano-banana
IMAGE_RESOLUTION=1K IMAGE_RESOLUTION=1K
STORY_AUDIO_CACHE_TTL_DAYS=30
GENERATION_JOB_STALE_MINUTES=60
# TTS_MODEL=speech-2.6-turbo (MiniMax) / zh-CN-XiaoxiaoNeural (Edge) # TTS_MODEL=speech-2.6-turbo (MiniMax) / zh-CN-XiaoxiaoNeural (Edge)
# [API 密钥池] # [API 密钥池]

View File

@@ -4,7 +4,7 @@ import json
import uuid import uuid
from typing import AsyncGenerator from typing import AsyncGenerator
from fastapi import APIRouter, Depends, Response from fastapi import APIRouter, Depends, Query, Response
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from sse_starlette.sse import EventSourceResponse from sse_starlette.sse import EventSourceResponse
@@ -19,6 +19,7 @@ from app.schemas.story_schemas import (
GenerateRequest, GenerateRequest,
GenerationJobDetailResponse, GenerationJobDetailResponse,
GenerationJobSummaryResponse, GenerationJobSummaryResponse,
GenerationOpsSummaryResponse,
GenerationProviderAnalyticsResponse, GenerationProviderAnalyticsResponse,
GenerationProviderStatsResponse, GenerationProviderStatsResponse,
GenerationRequest, GenerationRequest,
@@ -36,6 +37,7 @@ from app.services import story_service
from app.services.generation_jobs import ( from app.services.generation_jobs import (
get_generation_job_detail, get_generation_job_detail,
get_story_provider_stats, get_story_provider_stats,
get_user_generation_ops_summary,
get_user_provider_analytics, get_user_provider_analytics,
list_story_generation_jobs, list_story_generation_jobs,
) )
@@ -86,16 +88,36 @@ async def get_generation_job(
return await get_generation_job_detail(db, job_id=job_id, user_id=user.id) return await get_generation_job_detail(db, job_id=job_id, user_id=user.id)
# Read-only endpoint: returns the calling user's recent generation health
# summary by delegating to get_user_generation_ops_summary.
@router.get(
"/generations/ops-summary",
response_model=GenerationOpsSummaryResponse,
)
async def get_generation_ops_summary(
# Look-back window in hours; the Query constraints restrict it to 1..168 (default 24).
hours: int = Query(default=24, ge=1, le=168),
# The summary is scoped to this user's id.
user: User = Depends(require_user),
db: AsyncSession = Depends(get_db),
):
"""Get a compact recent operations summary for generation workflows."""
return await get_user_generation_ops_summary(db, user_id=user.id, hours=hours)
@router.get( @router.get(
"/generations/provider-analytics", "/generations/provider-analytics",
response_model=GenerationProviderAnalyticsResponse, response_model=GenerationProviderAnalyticsResponse,
) )
async def get_generation_provider_analytics( async def get_generation_provider_analytics(
days: int | None = Query(default=None, ge=1, le=365),
capability: str | None = Query(default=None),
user: User = Depends(require_user), user: User = Depends(require_user),
db: AsyncSession = Depends(get_db), db: AsyncSession = Depends(get_db),
): ):
"""Get provider call stats aggregated across the user's generation history.""" """Get provider call stats aggregated across the user's generation history."""
return await get_user_provider_analytics(db, user_id=user.id) return await get_user_provider_analytics(
db,
user_id=user.id,
days=days,
capability=capability,
)
@router.get( @router.get(
@@ -117,11 +139,19 @@ async def list_generation_jobs(
) )
async def get_generation_provider_stats( async def get_generation_provider_stats(
story_id: int, story_id: int,
days: int | None = Query(default=None, ge=1, le=365),
capability: str | None = Query(default=None),
user: User = Depends(require_user), user: User = Depends(require_user),
db: AsyncSession = Depends(get_db), db: AsyncSession = Depends(get_db),
): ):
"""Get provider call stats aggregated from generation job events.""" """Get provider call stats aggregated from generation job events."""
return await get_story_provider_stats(db, story_id=story_id, user_id=user.id) return await get_story_provider_stats(
db,
story_id=story_id,
user_id=user.id,
days=days,
capability=capability,
)
@router.get("/generations/{story_id}", response_model=StoryDetailResponse) @router.get("/generations/{story_id}", response_model=StoryDetailResponse)

View File

@@ -49,6 +49,14 @@ celery_app.conf.update(
"task": "app.tasks.memory.prune_memories_task", "task": "app.tasks.memory.prune_memories_task",
"schedule": crontab(minute="0", hour="3"), # Daily at 03:00 "schedule": crontab(minute="0", hour="3"), # Daily at 03:00
}, },
"prune_story_audio_cache": {
"task": "app.tasks.audio_cache.prune_story_audio_cache_task",
"schedule": crontab(minute="30", hour="3"), # Daily at 03:30
},
"prune_stale_generation_jobs": {
"task": "app.tasks.generation_maintenance.prune_stale_generation_jobs_task",
"schedule": crontab(minute="*/30"),
},
}, },
) )

View File

@@ -66,6 +66,14 @@ class Settings(BaseSettings):
"storage/audio", "storage/audio",
description="Directory for cached story audio files", description="Directory for cached story audio files",
) )
story_audio_cache_ttl_days: int = Field(
30,
description="TTL in days before cached story audio is pruned",
)
generation_job_stale_minutes: int = Field(
60,
description="Minutes before a running generation job is considered stale",
)
# Celery (Redis) # Celery (Redis)
celery_broker_url: str = Field("redis://localhost:6379/0") celery_broker_url: str = Field("redis://localhost:6379/0")

View File

@@ -220,21 +220,33 @@ class GenerationProviderStatResponse(BaseModel):
estimated_cost_usd: float = 0.0 estimated_cost_usd: float = 0.0
# Element type of the ``failure_reasons`` list on the provider stats and
# provider analytics responses.
class GenerationProviderFailureReasonResponse(BaseModel):
"""Aggregated failed provider call reason."""
# Reason text that failed provider calls are grouped by.
reason: str
# Number of failed provider calls counted under this reason.
count: int
class GenerationProviderStatsResponse(BaseModel): class GenerationProviderStatsResponse(BaseModel):
"""Provider call stats aggregated from generation job events.""" """Provider call stats aggregated from generation job events."""
story_id: int story_id: int
window_days: int | None = None
capability: str | None = None
total_calls: int total_calls: int
successful_calls: int successful_calls: int
failed_calls: int failed_calls: int
avg_latency_ms: float | None = None avg_latency_ms: float | None = None
estimated_cost_usd: float = 0.0 estimated_cost_usd: float = 0.0
by_provider: list[GenerationProviderStatResponse] = Field(default_factory=list) by_provider: list[GenerationProviderStatResponse] = Field(default_factory=list)
failure_reasons: list[GenerationProviderFailureReasonResponse] = Field(default_factory=list)
class GenerationProviderAnalyticsResponse(BaseModel): class GenerationProviderAnalyticsResponse(BaseModel):
"""Provider call stats aggregated across one user's generation history.""" """Provider call stats aggregated across one user's generation history."""
window_days: int | None = None
capability: str | None = None
total_calls: int total_calls: int
successful_calls: int successful_calls: int
failed_calls: int failed_calls: int
@@ -243,6 +255,33 @@ class GenerationProviderAnalyticsResponse(BaseModel):
job_count: int job_count: int
story_count: int story_count: int
by_provider: list[GenerationProviderStatResponse] = Field(default_factory=list) by_provider: list[GenerationProviderStatResponse] = Field(default_factory=list)
failure_reasons: list[GenerationProviderFailureReasonResponse] = Field(default_factory=list)
# Element type of ``GenerationOpsSummaryResponse.recent_failures``.
class GenerationRecentFailureResponse(BaseModel):
"""One recent failed generation task for operations summary."""
job_id: str
# Optional: a job may have no associated story.
story_id: int | None = None
# Optional: no title is available when the job has no matching story.
story_title: str | None = None
output_mode: str
current_step: str
error_message: str | None = None
# Short human-readable failure category for display.
failure_label: str
updated_at: datetime
# Response model of the ``/generations/ops-summary`` endpoint.
class GenerationOpsSummaryResponse(BaseModel):
"""Recent generation health summary for one user."""
# Size of the look-back window, in hours.
window_hours: int
# Minutes after which a running job counts as stale.
stale_threshold_minutes: int
active_jobs: int
stale_running_jobs: int
failed_jobs: int
degraded_jobs: int
asset_retry_jobs: int
# Defaults to an empty list when there are no recent failures.
recent_failures: list[GenerationRecentFailureResponse] = Field(default_factory=list)
class AchievementItem(BaseModel): class AchievementItem(BaseModel):

View File

@@ -2,14 +2,19 @@
from __future__ import annotations from __future__ import annotations
from datetime import datetime, timedelta, timezone
from typing import Any from typing import Any
from fastapi import HTTPException from fastapi import HTTPException
from sqlalchemy import desc, distinct, func, select from sqlalchemy import desc, select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.core.config import settings
from app.core.logging import get_logger
from app.db.models import GenerationJob, GenerationJobEvent, Story from app.db.models import GenerationJob, GenerationJobEvent, Story
logger = get_logger(__name__)
def _story_snapshot(story: Story | None) -> dict[str, Any]: def _story_snapshot(story: Story | None) -> dict[str, Any]:
if story is None: if story is None:
@@ -68,6 +73,7 @@ def _job_progress(job: GenerationJob) -> dict[str, Any]:
"asset_generation_completed": (100, "资源已完成"), "asset_generation_completed": (100, "资源已完成"),
"asset_retry_completed": (100, "资源重试完成"), "asset_retry_completed": (100, "资源重试完成"),
"generation_completed": (100, "生成完成"), "generation_completed": (100, "生成完成"),
"generation_stale_failed": (100, "任务超时已收敛"),
} }
percent, label = progress_map.get(job.current_step, (10, "生成处理中")) percent, label = progress_map.get(job.current_step, (10, "生成处理中"))
return { return {
@@ -77,6 +83,27 @@ def _job_progress(job: GenerationJob) -> dict[str, Any]:
} }
def _normalize_datetime(value: datetime) -> datetime:
if value.tzinfo is None:
return value.replace(tzinfo=timezone.utc)
return value.astimezone(timezone.utc)
def _is_stale_job(job: GenerationJob, *, stale_after_minutes: int) -> bool:
    """Tell whether a running job has gone too long without an update.

    Only jobs whose status is ``"running"`` can be stale. Such a job is stale
    when its ``updated_at`` (normalised to UTC) is at least
    ``stale_after_minutes`` minutes in the past.
    """
    if job.status != "running":
        return False
    oldest_allowed = datetime.now(timezone.utc) - timedelta(minutes=stale_after_minutes)
    last_update = _normalize_datetime(job.updated_at)
    return last_update <= oldest_allowed
def _failure_label(job: GenerationJob) -> str:
    """Pick the short display label describing why/how a job failed.

    A job whose current step is ``generation_stale_failed`` is always labelled
    as a timeout; otherwise the label depends on the job's output mode, with a
    generic label for any other mode.
    """
    if job.current_step == "generation_stale_failed":
        return "任务超时"

    label_by_mode = {
        "asset_retry": "资源重试失败",
        "asset_generation": "资源生成失败",
    }
    return label_by_mode.get(job.output_mode, "生成失败")
async def create_generation_job( async def create_generation_job(
db: AsyncSession, db: AsyncSession,
*, *,
@@ -266,16 +293,64 @@ async def list_story_generation_jobs(
return [generation_job_to_summary(job) for job in jobs] return [generation_job_to_summary(job) for job in jobs]
async def get_active_story_generation_job(
    db: AsyncSession,
    *,
    story_id: int,
    user_id: str,
) -> GenerationJob | None:
    """Look up the newest job still in ``running`` status for one user's story.

    Returns the job with the latest ``updated_at`` (ties broken by highest id),
    or ``None`` when the story has no running job.
    """
    newest_running_stmt = (
        select(GenerationJob)
        .where(
            GenerationJob.story_id == story_id,
            GenerationJob.user_id == user_id,
            GenerationJob.status == "running",
        )
        .order_by(desc(GenerationJob.updated_at), desc(GenerationJob.id))
        .limit(1)
    )
    rows = await db.execute(newest_running_stmt)
    return rows.scalar_one_or_none()
async def ensure_no_active_story_generation_job(
    db: AsyncSession,
    *,
    story_id: int,
    user_id: str,
) -> None:
    """Reject new asset work while the story already has a running job.

    Args:
        db: Session used to look up the story's running job.
        story_id: Story about to receive new generation work.
        user_id: Owner of the story; only this user's jobs are considered.

    Raises:
        HTTPException: 409 when a running job exists. The detail names the
            running job's current progress label.
    """
    active_job = await get_active_story_generation_job(db, story_id=story_id, user_id=user_id)
    if active_job is None:
        return

    progress = _job_progress(active_job)
    raise HTTPException(
        status_code=409,
        detail=(
            # Close the parenthesis and separate the two clauses; the message
            # previously ran the label straight into the next sentence.
            f"当前故事已有运行中的任务({progress['progress_label']})，"
            "请等待当前任务完成后再试。"
        ),
    )
def _as_float(value: Any) -> float | None: def _as_float(value: Any) -> float | None:
if isinstance(value, int | float): if isinstance(value, int | float):
return float(value) return float(value)
return None return None
def _aggregate_provider_events(events: list[GenerationJobEvent]) -> dict[str, Any]: def _aggregate_provider_events(
events: list[GenerationJobEvent],
*,
capability: str | None = None,
) -> dict[str, Any]:
"""Aggregate provider telemetry from provider call events.""" """Aggregate provider telemetry from provider call events."""
by_key: dict[tuple[str, str], dict[str, Any]] = {} by_key: dict[tuple[str, str], dict[str, Any]] = {}
failure_reasons: dict[str, int] = {}
total_latency = 0.0 total_latency = 0.0
latency_count = 0 latency_count = 0
total_cost = 0.0 total_cost = 0.0
@@ -284,13 +359,16 @@ def _aggregate_provider_events(events: list[GenerationJobEvent]) -> dict[str, An
for event in events: for event in events:
metadata = event.event_metadata or {} metadata = event.event_metadata or {}
capability = str(metadata.get("capability") or "unknown") event_capability = str(metadata.get("capability") or "unknown")
if capability is not None and event_capability != capability:
continue
adapter = str(metadata.get("adapter") or "unknown") adapter = str(metadata.get("adapter") or "unknown")
key = (capability, adapter) key = (event_capability, adapter)
bucket = by_key.setdefault( bucket = by_key.setdefault(
key, key,
{ {
"capability": capability, "capability": event_capability,
"adapter": adapter, "adapter": adapter,
"call_count": 0, "call_count": 0,
"success_count": 0, "success_count": 0,
@@ -318,6 +396,8 @@ def _aggregate_provider_events(events: list[GenerationJobEvent]) -> dict[str, An
else: else:
bucket["failure_count"] += 1 bucket["failure_count"] += 1
failed_calls += 1 failed_calls += 1
reason = str(metadata.get("error") or "unknown_error")
failure_reasons[reason] = failure_reasons.get(reason, 0) + 1
by_provider = [] by_provider = []
for bucket in by_key.values(): for bucket in by_key.values():
@@ -349,67 +429,243 @@ def _aggregate_provider_events(events: list[GenerationJobEvent]) -> dict[str, An
"avg_latency_ms": round(total_latency / latency_count, 2) if latency_count else None, "avg_latency_ms": round(total_latency / latency_count, 2) if latency_count else None,
"estimated_cost_usd": round(total_cost, 6), "estimated_cost_usd": round(total_cost, 6),
"by_provider": by_provider, "by_provider": by_provider,
"failure_reasons": [
{"reason": reason, "count": count}
for reason, count in sorted(
failure_reasons.items(),
key=lambda item: (-item[1], item[0]),
)
],
} }
def _provider_events_query(
    *,
    user_id: str,
    story_id: int | None = None,
    days: int | None = None,
):
    """Build the SELECT for one user's provider-call events, ordered by event id.

    Only ``provider_call_succeeded`` / ``provider_call_failed`` events of jobs
    owned by ``user_id`` are selected. ``story_id`` narrows the result to one
    story's jobs; ``days`` keeps only events created within that many days
    before now (UTC).
    """
    provider_event_types = ["provider_call_succeeded", "provider_call_failed"]
    filters = [
        GenerationJob.user_id == user_id,
        GenerationJobEvent.event_type.in_(provider_event_types),
    ]
    if story_id is not None:
        filters.append(GenerationJob.story_id == story_id)
    if days is not None:
        window_start = datetime.now(timezone.utc) - timedelta(days=days)
        filters.append(GenerationJobEvent.created_at >= window_start)

    return (
        select(GenerationJobEvent)
        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
        .where(*filters)
        .order_by(GenerationJobEvent.id)
    )
async def get_story_provider_stats( async def get_story_provider_stats(
db: AsyncSession, db: AsyncSession,
*, *,
story_id: int, story_id: int,
user_id: str, user_id: str,
days: int | None = None,
capability: str | None = None,
) -> dict[str, Any]: ) -> dict[str, Any]:
"""Aggregate provider call telemetry from all user-owned jobs for one story.""" """Aggregate provider call telemetry from all user-owned jobs for one story."""
events = ( events = (
await db.execute( await db.execute(
select(GenerationJobEvent) _provider_events_query(
.join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id) user_id=user_id,
.where( story_id=story_id,
GenerationJob.story_id == story_id, days=days,
GenerationJob.user_id == user_id,
GenerationJobEvent.event_type.in_(
["provider_call_succeeded", "provider_call_failed"]
),
) )
.order_by(GenerationJobEvent.id)
) )
).scalars().all() ).scalars().all()
return {"story_id": story_id, **_aggregate_provider_events(events)} return {
"story_id": story_id,
"window_days": days,
"capability": capability,
**_aggregate_provider_events(events, capability=capability),
}
async def get_user_provider_analytics( async def get_user_provider_analytics(
db: AsyncSession, db: AsyncSession,
*, *,
user_id: str, user_id: str,
days: int | None = None,
capability: str | None = None,
) -> dict[str, Any]: ) -> dict[str, Any]:
"""Aggregate provider telemetry across all stories owned by one user.""" """Aggregate provider telemetry across all stories owned by one user."""
events = ( events = (
await db.execute( await db.execute(
select(GenerationJobEvent) _provider_events_query(
.join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id) user_id=user_id,
days=days,
)
)
).scalars().all()
filtered_event_job_ids = {
event.job_id
for event in events
if capability is None
or str((event.event_metadata or {}).get("capability") or "unknown") == capability
}
filtered_story_ids = {
event.story_id
for event in events
if event.story_id is not None
and (
capability is None
or str((event.event_metadata or {}).get("capability") or "unknown") == capability
)
}
return {
"window_days": days,
"capability": capability,
**_aggregate_provider_events(events, capability=capability),
"job_count": len(filtered_event_job_ids),
"story_count": len(filtered_story_ids),
}
async def get_user_generation_ops_summary(
db: AsyncSession,
*,
user_id: str,
hours: int = 24,
recent_failure_limit: int = 5,
) -> dict[str, Any]:
"""Summarize recent generation health for one user."""
stale_after_minutes = settings.generation_job_stale_minutes
recent_cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
running_jobs = (
await db.execute(
select(GenerationJob)
.where( .where(
GenerationJob.user_id == user_id, GenerationJob.user_id == user_id,
GenerationJobEvent.event_type.in_( GenerationJob.status == "running",
["provider_call_succeeded", "provider_call_failed"]
),
) )
.order_by(GenerationJobEvent.id) .order_by(desc(GenerationJob.updated_at), desc(GenerationJob.id))
) )
).scalars().all() ).scalars().all()
job_count, story_count = ( recent_jobs = (
await db.execute( await db.execute(
select( select(GenerationJob, Story.title)
func.count(GenerationJob.id), .outerjoin(Story, Story.id == GenerationJob.story_id)
func.count(distinct(GenerationJob.story_id)), .where(
).where(GenerationJob.user_id == user_id) GenerationJob.user_id == user_id,
GenerationJob.updated_at >= recent_cutoff,
)
.order_by(desc(GenerationJob.updated_at), desc(GenerationJob.id))
) )
).one() ).all()
recent_failures: list[dict[str, Any]] = []
failed_jobs = 0
degraded_jobs = 0
asset_retry_jobs = 0
for job, story_title in recent_jobs:
if job.status == "failed":
failed_jobs += 1
if len(recent_failures) < recent_failure_limit:
recent_failures.append(
{
"job_id": job.id,
"story_id": job.story_id,
"story_title": story_title,
"output_mode": job.output_mode,
"current_step": job.current_step,
"error_message": job.error_message,
"failure_label": _failure_label(job),
"updated_at": job.updated_at,
}
)
elif job.status == "degraded_completed":
degraded_jobs += 1
if job.output_mode in {"asset_retry", "asset_generation"}:
asset_retry_jobs += 1
return { return {
**_aggregate_provider_events(events), "window_hours": hours,
"job_count": job_count, "stale_threshold_minutes": stale_after_minutes,
"story_count": story_count, "active_jobs": len(running_jobs),
"stale_running_jobs": sum(
1 for job in running_jobs if _is_stale_job(job, stale_after_minutes=stale_after_minutes)
),
"failed_jobs": failed_jobs,
"degraded_jobs": degraded_jobs,
"asset_retry_jobs": asset_retry_jobs,
"recent_failures": recent_failures,
}
async def mark_stale_generation_jobs(
    db: AsyncSession,
    *,
    stale_after_minutes: int | None = None,
) -> dict[str, int]:
    """Fail running generation jobs that have exceeded the stale threshold.

    Every job in ``running`` status is loaded (oldest ``updated_at`` first)
    and each one judged stale is finished as ``failed`` at step
    ``generation_stale_failed``, with a warning logged per job.

    Args:
        db: Session used for the lookups and passed on to
            ``finish_generation_job``.
        stale_after_minutes: Threshold override. A falsy value (``None`` or
            ``0``) falls back to ``settings.generation_job_stale_minutes``.

    Returns:
        Counts under ``running`` (running jobs inspected), ``marked_stale``
        (jobs failed by this call) and ``stale_after_minutes`` (threshold used).
    """
    threshold = stale_after_minutes or settings.generation_job_stale_minutes

    running_stmt = (
        select(GenerationJob)
        .where(GenerationJob.status == "running")
        .order_by(GenerationJob.updated_at, GenerationJob.id)
    )
    running_jobs = (await db.execute(running_stmt)).scalars().all()

    marked_stale = 0
    for job in running_jobs:
        if not _is_stale_job(job, stale_after_minutes=threshold):
            continue

        # The story is looked up only when the job references one, and only
        # if it belongs to the job's owner; otherwise None is passed on.
        story = None
        if job.story_id is not None:
            story_stmt = select(Story).where(
                Story.id == job.story_id,
                Story.user_id == job.user_id,
            )
            story = (await db.execute(story_stmt)).scalar_one_or_none()

        await finish_generation_job(
            db,
            job=job,
            story=story,
            status="failed",
            current_step="generation_stale_failed",
            error_message=f"Generation job exceeded {threshold} minutes without progress.",
            message="Generation job was marked failed after exceeding the stale threshold.",
            metadata={"stale_after_minutes": threshold},
        )
        marked_stale += 1
        logger.warning(
            "generation_job_marked_stale",
            job_id=job.id,
            story_id=job.story_id,
            output_mode=job.output_mode,
            stale_after_minutes=threshold,
        )

    summary = {
        "running": len(running_jobs),
        "marked_stale": marked_stale,
        "stale_after_minutes": threshold,
    }
    return summary

View File

@@ -2,6 +2,7 @@
import asyncio import asyncio
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Literal from typing import Literal
from fastapi import HTTPException from fastapi import HTTPException
@@ -9,6 +10,7 @@ from sqlalchemy import desc, select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import joinedload from sqlalchemy.orm import joinedload
from app.core.config import settings
from app.core.logging import get_logger from app.core.logging import get_logger
from app.db.models import ChildProfile, Story, StoryUniverse from app.db.models import ChildProfile, Story, StoryUniverse
from app.schemas.story_schemas import ( from app.schemas.story_schemas import (
@@ -32,6 +34,7 @@ from app.services.audio_storage import (
) )
from app.services.generation_jobs import ( from app.services.generation_jobs import (
create_generation_job, create_generation_job,
ensure_no_active_story_generation_job,
finish_generation_job, finish_generation_job,
record_generation_event, record_generation_event,
) )
@@ -1369,6 +1372,7 @@ async def retry_story_assets(
db: AsyncSession, db: AsyncSession,
) -> Story: ) -> Story:
"""Retry selected assets through one workflow-level endpoint.""" """Retry selected assets through one workflow-level endpoint."""
await ensure_no_active_story_generation_job(db, story_id=story_id, user_id=user_id)
requested_assets = list(dict.fromkeys(assets)) requested_assets = list(dict.fromkeys(assets))
job = await create_generation_job( job = await create_generation_job(
db, db,
@@ -1443,6 +1447,7 @@ async def generate_story_cover(
db: AsyncSession, db: AsyncSession,
) -> str: ) -> str:
"""Generate cover image for an existing story.""" """Generate cover image for an existing story."""
await ensure_no_active_story_generation_job(db, story_id=story_id, user_id=user_id)
job = await create_generation_job( job = await create_generation_job(
db, db,
user_id=user_id, user_id=user_id,
@@ -1495,6 +1500,7 @@ async def generate_story_audio(
db: AsyncSession, db: AsyncSession,
) -> bytes: ) -> bytes:
"""Generate audio for a story.""" """Generate audio for a story."""
await ensure_no_active_story_generation_job(db, story_id=story_id, user_id=user_id)
job = await create_generation_job( job = await create_generation_job(
db, db,
user_id=user_id, user_id=user_id,
@@ -1597,6 +1603,50 @@ async def clear_story_audio_cache(
return await get_story_audio_status(story_id, user_id, db) return await get_story_audio_status(story_id, user_id, db)
async def prune_story_audio_cache(db: AsyncSession) -> dict[str, int]:
"""Prune expired audio cache files and repair story metadata.

Scans every story that has an `audio_path`. Stories whose cache file no
longer exists get `audio_path` cleared; stories whose cache file is older
than the TTL have the file deleted and their audio fields reset. All
changes are committed once at the end.

Args:
db: Session used to load the stories and commit the updates.

Returns:
A dict with "scanned" (stories inspected), "pruned" (expired files
deleted) and "repaired" (stories whose cache file was already missing).
"""
# The configured TTL is clamped to at least one day.
ttl_days = max(1, settings.story_audio_cache_ttl_days)
cutoff = datetime.now(timezone.utc) - timedelta(days=ttl_days)
result = await db.execute(select(Story).where(Story.audio_path.is_not(None)))
stories = result.scalars().all()
scanned = 0
pruned = 0
repaired = 0
for story in stories:
scanned += 1
metadata = get_audio_cache_metadata(story.audio_path)
if not metadata.exists:
# The database points at a file that is gone: drop the dangling path.
story.audio_path = None
if story.audio_status == StoryAssetStatus.READY.value:
sync_story_status(story, audio_status=StoryAssetStatus.NOT_REQUESTED)
# NOTE(review): indentation is lost in this view; presumably this counts every
# missing file, not only stories that were READY — confirm.
repaired += 1
continue
# NOTE(review): cutoff is timezone-aware UTC, so this assumes metadata.updated_at
# is timezone-aware as well — confirm against get_audio_cache_metadata.
if metadata.updated_at and metadata.updated_at < cutoff:
delete_audio_cache(story.audio_path)
story.audio_path = None
sync_story_status(
story,
audio_status=StoryAssetStatus.NOT_REQUESTED,
last_error=None,
)
pruned += 1
await db.commit()
logger.info(
"story_audio_cache_pruned",
scanned=scanned,
pruned=pruned,
repaired=repaired,
ttl_days=ttl_days,
)
return {"scanned": scanned, "pruned": pruned, "repaired": repaired}
async def get_story_achievements( async def get_story_achievements(
story_id: int, story_id: int,
user_id: str, user_id: str,

View File

@@ -0,0 +1,29 @@
"""Celery tasks for story audio cache maintenance."""
import asyncio
from app.core.celery_app import celery_app
from app.core.logging import get_logger
from app.db.database import _get_session_factory
from app.services.story_service import prune_story_audio_cache
logger = get_logger(__name__)
@celery_app.task
def prune_story_audio_cache_task():
    """Celery entry point that prunes expired story audio cache files.

    Runs `prune_story_audio_cache` inside a fresh event loop with a session
    taken from the session factory, logs the resulting counters and returns
    them. Any exception is logged and re-raised so the task is reported as
    failed.
    """
    logger.info("prune_story_audio_cache_task_started")

    async def _prune_in_session():
        # NOTE(review): asyncio.run creates a new loop on every invocation; if the
        # session factory caches an engine bound to an earlier loop this may
        # misbehave on repeated runs in one worker — confirm.
        async with _get_session_factory()() as session:
            return await prune_story_audio_cache(session)

    try:
        summary = asyncio.run(_prune_in_session())
        logger.info("prune_story_audio_cache_task_completed", **summary)
    except Exception as exc:
        logger.error("prune_story_audio_cache_task_failed", error=str(exc))
        raise
    return summary

View File

@@ -0,0 +1,30 @@
"""Generation job maintenance tasks."""
import asyncio
from app.core.celery_app import celery_app
from app.core.logging import get_logger
from app.db.database import _get_session_factory
from app.services.generation_jobs import mark_stale_generation_jobs
logger = get_logger(__name__)
@celery_app.task
def prune_stale_generation_jobs_task():
    """Celery entry point that fails generation jobs stuck in the running state.

    Runs `mark_stale_generation_jobs` inside a fresh event loop with a session
    taken from the session factory, logs the resulting counters and returns
    them. Any exception is logged and re-raised so the task is reported as
    failed.
    """
    logger.info("prune_stale_generation_jobs_task_started")

    async def _mark_in_session():
        # NOTE(review): asyncio.run creates a new loop on every invocation; if the
        # session factory caches an engine bound to an earlier loop this may
        # misbehave on repeated runs in one worker — confirm.
        async with _get_session_factory()() as session:
            return await mark_stale_generation_jobs(session)

    try:
        summary = asyncio.run(_mark_in_session())
        logger.info("prune_stale_generation_jobs_task_completed", **summary)
    except Exception as exc:
        logger.error("prune_stale_generation_jobs_task_failed", error=str(exc))
        raise
    return summary

View File

@@ -0,0 +1,65 @@
"""Story audio cache maintenance tests."""
import os
from datetime import datetime, timedelta, timezone
from pathlib import Path
import pytest
from fastapi import HTTPException
from sqlalchemy import select
from app.core.config import settings
from app.db.models import Story
from app.services.generation_jobs import create_generation_job
from app.services.story_service import generate_story_audio, prune_story_audio_cache
pytestmark = pytest.mark.asyncio
async def test_prune_story_audio_cache_removes_expired_audio(
db_session,
test_story,
mock_tts_provider,
monkeypatch,
):
"""An audio cache file older than the TTL is deleted and the story's audio fields are reset."""
# Generate audio first so a real cache file exists on disk for this story.
await generate_story_audio(test_story.id, test_story.user_id, db_session)
cached_audio_path = Path(settings.story_audio_cache_dir) / f"story-{test_story.id}.mp3"
assert cached_audio_path.is_file()
# Backdate the file's access/modification times to 10 days ago, past the 7-day TTL set below.
old_time = datetime.now(timezone.utc) - timedelta(days=10)
timestamp = old_time.timestamp()
os.utime(cached_audio_path, (timestamp, timestamp))
monkeypatch.setattr(settings, "story_audio_cache_ttl_days", 7)
result = await prune_story_audio_cache(db_session)
assert result == {"scanned": 1, "pruned": 1, "repaired": 0}
assert not cached_audio_path.exists()
story = (
await db_session.execute(select(Story).where(Story.id == test_story.id))
).scalar_one()
assert story.audio_path is None
assert story.audio_status == "not_requested"
assert story.generation_status == "partial_ready"
async def test_generate_story_audio_rejects_when_story_has_active_job(
db_session,
test_story,
):
"""Audio generation raises HTTP 409 while the story already has an unfinished job."""
# A freshly created job is left unfinished, so it counts as the story's active job.
await create_generation_job(
db_session,
user_id=test_story.user_id,
output_mode="asset_retry",
input_type="audio",
request_payload={"story_id": test_story.id},
story_id=test_story.id,
)
with pytest.raises(HTTPException) as exc_info:
await generate_story_audio(test_story.id, test_story.user_id, db_session)
assert exc_info.value.status_code == 409
assert "已有运行中的任务" in str(exc_info.value.detail)

View File

@@ -1,5 +1,6 @@
"""Generation job tracking tests.""" """Generation job tracking tests."""
from datetime import datetime, timedelta, timezone
from unittest.mock import AsyncMock, patch from unittest.mock import AsyncMock, patch
import pytest import pytest
@@ -12,7 +13,11 @@ from app.main import app
from app.services.adapters import AdapterConfig from app.services.adapters import AdapterConfig
from app.services.adapters.storybook.primary import Storybook, StorybookPage from app.services.adapters.storybook.primary import Storybook, StorybookPage
from app.services.adapters.text.models import StoryOutput from app.services.adapters.text.models import StoryOutput
from app.services.generation_jobs import create_generation_job, record_generation_event from app.services.generation_jobs import (
create_generation_job,
mark_stale_generation_jobs,
record_generation_event,
)
pytestmark = pytest.mark.asyncio pytestmark = pytest.mark.asyncio
@@ -520,6 +525,7 @@ async def test_user_provider_analytics_aggregate_across_stories(
assert data["failed_calls"] == 1 assert data["failed_calls"] == 1
assert data["avg_latency_ms"] == 60.0 assert data["avg_latency_ms"] == 60.0
assert data["estimated_cost_usd"] == 0.013 assert data["estimated_cost_usd"] == 0.013
assert data["failure_reasons"] == [{"reason": "timeout", "count": 1}]
assert data["by_provider"] == [ assert data["by_provider"] == [
{ {
"capability": "image", "capability": "image",
@@ -551,3 +557,249 @@ async def test_user_provider_analytics_aggregate_across_stories(
] ]
finally: finally:
app.dependency_overrides.clear() app.dependency_overrides.clear()
async def test_provider_analytics_support_days_and_capability_filters(
db_session,
auth_token,
degraded_story_with_text,
test_story,
):
"""Provider analytics honour the `days` window and the `capability` filter.

Seeds one failed image call backdated by 10 days and one recent successful
TTS call, then checks that `days=7` excludes the old failure, that
`capability=image` returns only the image failure, and that the per-story
provider-stats endpoint accepts the same capability filter.
"""
async def override_get_db():
yield db_session
app.dependency_overrides[get_db] = override_get_db
image_job = await create_generation_job(
db_session,
user_id=degraded_story_with_text.user_id,
output_mode="asset_retry",
input_type="image",
request_payload={"assets": ["image"]},
story_id=degraded_story_with_text.id,
)
old_event = await record_generation_event(
db_session,
job=image_job,
story_id=degraded_story_with_text.id,
event_type="provider_call_failed",
status="failed",
metadata={
"capability": "image",
"adapter": "cqtai",
"strategy": "priority",
"latency_ms": 120,
"error": "timeout",
},
)
# Push the failed image call outside the 7-day window queried below.
old_event.created_at = datetime.now(timezone.utc) - timedelta(days=10)
await db_session.commit()
tts_job = await create_generation_job(
db_session,
user_id=test_story.user_id,
output_mode="asset_retry",
input_type="audio",
request_payload={"assets": ["audio"]},
story_id=test_story.id,
)
await record_generation_event(
db_session,
job=tts_job,
story_id=test_story.id,
event_type="provider_call_succeeded",
status="succeeded",
metadata={
"capability": "tts",
"adapter": "edge_tts",
"strategy": "priority",
"latency_ms": 18,
"estimated_cost_usd": 0.003,
},
)
transport = ASGITransport(app=app)
try:
async with AsyncClient(transport=transport, base_url="http://test") as client:
client.cookies.set("access_token", auth_token)
# Window filter: only the recent TTS success remains.
response = await client.get("/api/generations/provider-analytics?days=7")
assert response.status_code == 200
data = response.json()
assert data["window_days"] == 7
assert data["total_calls"] == 1
assert data["job_count"] == 1
assert data["story_count"] == 1
assert data["failure_reasons"] == []
# Capability filter without a window: only the old image failure remains.
response = await client.get(
"/api/generations/provider-analytics?capability=image"
)
assert response.status_code == 200
data = response.json()
assert data["capability"] == "image"
assert data["total_calls"] == 1
assert data["failed_calls"] == 1
assert data["job_count"] == 1
assert data["story_count"] == 1
assert data["failure_reasons"] == [{"reason": "timeout", "count": 1}]
# The per-story endpoint takes the same capability filter.
response = await client.get(
f"/api/generations/{degraded_story_with_text.id}/provider-stats?capability=image"
)
assert response.status_code == 200
data = response.json()
assert data["capability"] == "image"
assert data["failure_reasons"] == [{"reason": "timeout", "count": 1}]
finally:
app.dependency_overrides.clear()
async def test_generation_ops_summary_exposes_running_stale_and_recent_failures(
db_session,
auth_token,
degraded_story_with_text,
test_story,
):
"""The ops summary reports running, stale, failed, degraded and asset jobs in the window.

Creates four jobs (one recent running, one running but last updated three
hours ago, one failed, one degraded_completed) and checks every counter and
the single recent-failure entry returned for a 48-hour window.
"""
async def override_get_db():
yield db_session
app.dependency_overrides[get_db] = override_get_db
running_job = await create_generation_job(
db_session,
user_id=test_story.user_id,
output_mode="story",
input_type="keywords",
request_payload={"data": "星星"},
story_id=test_story.id,
)
stale_job = await create_generation_job(
db_session,
user_id=degraded_story_with_text.user_id,
output_mode="asset_generation",
input_type="image",
request_payload={"story_id": degraded_story_with_text.id},
story_id=degraded_story_with_text.id,
)
failed_job = await create_generation_job(
db_session,
user_id=degraded_story_with_text.user_id,
output_mode="asset_retry",
input_type="image",
request_payload={"assets": ["image"]},
story_id=degraded_story_with_text.id,
)
degraded_job = await create_generation_job(
db_session,
user_id=test_story.user_id,
output_mode="storybook",
input_type="keywords",
request_payload={"data": "月亮"},
story_id=test_story.id,
)
# Still running, but untouched for 3 hours: expected to be counted as stale below.
stale_job.updated_at = datetime.now(timezone.utc) - timedelta(hours=3)
failed_job.status = "failed"
failed_job.current_step = "asset_retry_failed"
failed_job.error_message = "image timeout"
failed_job.updated_at = datetime.now(timezone.utc) - timedelta(hours=1)
degraded_job.status = "degraded_completed"
degraded_job.current_step = "generation_completed"
degraded_job.updated_at = datetime.now(timezone.utc) - timedelta(minutes=30)
running_job.updated_at = datetime.now(timezone.utc) - timedelta(minutes=10)
await db_session.commit()
transport = ASGITransport(app=app)
try:
async with AsyncClient(transport=transport, base_url="http://test") as client:
client.cookies.set("access_token", auth_token)
response = await client.get("/api/generations/ops-summary?hours=48")
assert response.status_code == 200
data = response.json()
assert data["window_hours"] == 48
# running_job and stale_job are both still in the running state.
assert data["active_jobs"] == 2
assert data["stale_running_jobs"] == 1
assert data["failed_jobs"] == 1
assert data["degraded_jobs"] == 1
# stale_job (asset_generation) and failed_job (asset_retry).
assert data["asset_retry_jobs"] == 2
assert len(data["recent_failures"]) == 1
assert data["recent_failures"][0]["job_id"] == failed_job.id
assert data["recent_failures"][0]["story_title"] == degraded_story_with_text.title
assert data["recent_failures"][0]["failure_label"] == "资源重试失败"
finally:
app.dependency_overrides.clear()
async def test_mark_stale_generation_jobs_marks_old_running_jobs_failed(
db_session,
degraded_story_with_text,
):
"""A running job last updated beyond the threshold is failed and gets a stale event."""
stale_job = await create_generation_job(
db_session,
user_id=degraded_story_with_text.user_id,
output_mode="story",
input_type="keywords",
request_payload={"data": "超时任务"},
story_id=degraded_story_with_text.id,
)
# Two hours without an update, against the 30-minute threshold passed below.
stale_job.updated_at = datetime.now(timezone.utc) - timedelta(hours=2)
await db_session.commit()
result = await mark_stale_generation_jobs(db_session, stale_after_minutes=30)
assert result == {"running": 1, "marked_stale": 1, "stale_after_minutes": 30}
refreshed_job = (
await db_session.execute(select(GenerationJob).where(GenerationJob.id == stale_job.id))
).scalar_one()
assert refreshed_job.status == "failed"
assert refreshed_job.current_step == "generation_stale_failed"
assert refreshed_job.error_message == "Generation job exceeded 30 minutes without progress."
events = (
await db_session.execute(
select(GenerationJobEvent)
.where(GenerationJobEvent.job_id == stale_job.id)
.order_by(GenerationJobEvent.id)
)
).scalars().all()
# The newest event on the job must be the stale-failure marker carrying the threshold.
assert events[-1].event_type == "generation_stale_failed"
assert events[-1].event_metadata["stale_after_minutes"] == 30
async def test_retry_assets_rejects_when_story_has_active_job(
db_session,
auth_token,
degraded_story_with_text,
):
"""The retry-assets endpoint answers HTTP 409 while the story already has an unfinished job."""
async def override_get_db():
yield db_session
app.dependency_overrides[get_db] = override_get_db
# A freshly created job is left unfinished, so it counts as the story's active job.
await create_generation_job(
db_session,
user_id=degraded_story_with_text.user_id,
output_mode="asset_generation",
input_type="image",
request_payload={"story_id": degraded_story_with_text.id},
story_id=degraded_story_with_text.id,
)
transport = ASGITransport(app=app)
try:
async with AsyncClient(transport=transport, base_url="http://test") as client:
client.cookies.set("access_token", auth_token)
response = await client.post(
f"/api/generations/{degraded_story_with_text.id}/retry-assets",
json={"assets": ["image"]},
)
assert response.status_code == 409
assert "已有运行中的任务" in response.json()["detail"]
finally:
app.dependency_overrides.clear()

View File

@@ -16,6 +16,8 @@
- `./scripts/demo_smoke.sh` 已覆盖音频缓存状态查询。 - `./scripts/demo_smoke.sh` 已覆盖音频缓存状态查询。
- Week 4 Demo 包装已完成新增架构说明、Demo 包装文档、Week 4 sprint review用户端和管理端绘本阅读器支持阅读位置恢复。 - Week 4 Demo 包装已完成新增架构说明、Demo 包装文档、Week 4 sprint review用户端和管理端绘本阅读器支持阅读位置恢复。
- Week 4 最终回归通过:后端全量测试 85 passedruff 通过,用户端/管理端构建通过,`docker compose up -d --build``./scripts/demo_smoke.sh` 通过。 - Week 4 最终回归通过:后端全量测试 85 passedruff 通过,用户端/管理端构建通过,`docker compose up -d --build``./scripts/demo_smoke.sh` 通过。
- 继续优化后再次验证：Provider analytics 已支持时间窗口与能力筛选、失败原因摘要；音频缓存已加入 TTL 配置和后台 prune 任务。
- 新一轮优化验证通过:新增 `GET /api/generations/ops-summary`,故事库已展示最近失败与卡住任务摘要;生成任务已支持 stale 自动收敛和重复资产任务保护。
- 后端新增 `partial_ready``text_status` 与迁移 `0012_story_text_status` 后,`backend/.venv/bin/python -m pytest backend/tests -q` 通过82 个测试通过。 - 后端新增 `partial_ready``text_status` 与迁移 `0012_story_text_status` 后,`backend/.venv/bin/python -m pytest backend/tests -q` 通过82 个测试通过。
- `backend/.venv/bin/python -m ruff check backend/app backend/tests backend/alembic/versions/0012_add_story_text_status_and_partial_ready.py` 通过。 - `backend/.venv/bin/python -m ruff check backend/app backend/tests backend/alembic/versions/0012_add_story_text_status_and_partial_ready.py` 通过。
- 用户端与管理端 `npm run build` 均通过。 - 用户端与管理端 `npm run build` 均通过。

View File

@@ -66,6 +66,11 @@ Week 2 已完成演示闭环、统一生成工作流、generation job/event、
| W4-03 | Demo | 求职版 Demo 包装 | `docs/planning/demo-package.md` | P0 | Done | | W4-03 | Demo | 求职版 Demo 包装 | `docs/planning/demo-package.md` | P0 | Done |
| W4-04 | QA | 全量回归与验证记录 | pytest、ruff、前端 build、Docker smoke | P0 | Done | | W4-04 | QA | 全量回归与验证记录 | pytest、ruff、前端 build、Docker smoke | P0 | Done |
| W4-05 | Product | 项目复盘与下一阶段路线 | `docs/planning/week-4-sprint-review.md` | P1 | Done | | W4-05 | Product | 项目复盘与下一阶段路线 | `docs/planning/week-4-sprint-review.md` | P1 | Done |
| W4-06 | Ops | Provider analytics 支持时间窗口与失败原因 | `days` / `capability` 筛选 + failure reason summary | P1 | Done |
| W4-07 | Ops | 音频缓存后台清理 | TTL 配置 + Celery beat prune task | P1 | Done |
| W4-08 | Ops | 任务运行概览与失败摘要 | `GET /api/generations/ops-summary` + 最近失败列表 | P1 | Done |
| W4-09 | Workflow | 卡住任务自动收敛 | `GENERATION_JOB_STALE_MINUTES` + Celery beat stale job maintenance | P1 | Done |
| W4-10 | Workflow | 防止重复资产任务 | 运行中故事拒绝重复封面/音频/资产重试请求 | P1 | Done |
--- ---

View File

@@ -34,6 +34,7 @@ DreamWeaver 已经具备求职演示所需的完整闭环:
- generation job/event - generation job/event
- Provider failover 和聚合指标 - Provider failover 和聚合指标
- 跨故事 Provider analytics - 跨故事 Provider analytics
- 任务运行概览、最近失败摘要与卡住任务收敛
- 前端生成轨迹和自动轮询形态 - 前端生成轨迹和自动轮询形态
--- ---
@@ -42,7 +43,7 @@ DreamWeaver 已经具备求职演示所需的完整闭环:
最近一轮验证包括: 最近一轮验证包括:
- 后端全量测试:85 passed - 后端全量测试:91 passed
- 后端 ruff通过 - 后端 ruff通过
- 用户端生产构建:通过 - 用户端生产构建:通过
- 管理端生产构建:通过 - 管理端生产构建:通过
@@ -56,10 +57,9 @@ DreamWeaver 已经具备求职演示所需的完整闭环:
| Priority | Task | Why | | Priority | Task | Why |
| --- | --- | --- | | --- | --- | --- |
| P0 | 将同步生成迁移到 Celery worker | 支持真实长任务、断点恢复和后台进度 | | P0 | 将同步生成迁移到 Celery worker | 支持真实长任务、断点恢复和后台进度 |
| P0 | Provider analytics 加入时间窗口和失败原因 | 让运营分析可用于成本与稳定性决策 |
| P1 | 音频缓存过期策略与后台清理 | 控制磁盘占用和缓存生命周期 |
| P1 | 生成任务取消与重试队列 | 防止重复任务和用户误触造成浪费 | | P1 | 生成任务取消与重试队列 | 防止重复任务和用户误触造成浪费 |
| P1 | 监控告警与结构化 dashboard | 上线前需要可观测性闭环 | | P1 | 跨用户 / 跨环境 Provider dashboard | 当前已支持单用户摘要,后续要支持运营视角 |
| P1 | 监控告警与结构化 dashboard | 目前已有故事库级概览,后续要接入更完整观测体系 |
| P2 | 更细粒度叙事风格与音色策略 | 扩展体验,但不影响当前求职版主线 | | P2 | 更细粒度叙事风格与音色策略 | 扩展体验,但不影响当前求职版主线 |
--- ---

View File

@@ -64,6 +64,10 @@ DreamWeaver 当前同时支持普通故事生成、完整故事生成和绘本
- Provider 调用已可按故事聚合为成功率、平均耗时、预估成本和 adapter 明细 - Provider 调用已可按故事聚合为成功率、平均耗时、预估成本和 adapter 明细
- generation job 响应已提供 `progress_percent``progress_label``is_terminal`,前端可直接用于进度条和轮询 - generation job 响应已提供 `progress_percent``progress_label``is_terminal`,前端可直接用于进度条和轮询
- 已新增跨故事 Provider 运营摘要 `GET /api/generations/provider-analytics`,故事库可展示总调用、成功率、平均耗时、预估成本和任务/故事覆盖数 - 已新增跨故事 Provider 运营摘要 `GET /api/generations/provider-analytics`,故事库可展示总调用、成功率、平均耗时、预估成本和任务/故事覆盖数
- 跨故事 Provider 运营摘要已支持按时间窗口和 capability 筛选,并聚合失败原因
- 已新增任务运行概览 `GET /api/generations/ops-summary`,故事库可展示最近失败、运行中任务和超时待收敛任务
- 重复资产任务已加入保护:同一故事存在运行中 job 时,不再重复触发封面、音频或统一资产重试
- Celery beat 已支持定时收敛卡住的 generation job，避免任务长期停在 running
- 用户端与管理端生成轨迹组件会在任务未终止时自动轮询,为后续后台 worker 进度流保留前端形态 - 用户端与管理端生成轨迹组件会在任务未终止时自动轮询,为后续后台 worker 进度流保留前端形态
- `POST /api/generations` 响应已返回 `generation_job_id`smoke 脚本会验证 job 查询与 story job history - `POST /api/generations` 响应已返回 `generation_job_id`smoke 脚本会验证 job 查询与 story job history
- 用户端与管理端的故事详情页、绘本阅读页已接入生成轨迹,展示生成/重试任务、关键事件、Provider 调用结果和聚合指标 - 用户端与管理端的故事详情页、绘本阅读页已接入生成轨迹,展示生成/重试任务、关键事件、Provider 调用结果和聚合指标
@@ -74,7 +78,7 @@ DreamWeaver 当前同时支持普通故事生成、完整故事生成和绘本
- 普通故事、完整生成、绘本生成已有统一外部入口,内部 workflow 仍可继续减少兼容层分支 - 普通故事、完整生成、绘本生成已有统一外部入口,内部 workflow 仍可继续减少兼容层分支
- 统一资产重试入口已覆盖普通故事封面、绘本缺失插图和故事音频,后续可继续扩展更细的资产级审计 - 统一资产重试入口已覆盖普通故事封面、绘本缺失插图和故事音频,后续可继续扩展更细的资产级审计
- 后台异步 worker 执行、断点续跑、跨时间窗口筛选和更完整的 Provider 运营分析仍属于后续生产化增强 - 后台异步 worker 执行、断点续跑、跨用户/跨环境 Provider 分析,以及真正的取消/重试队列仍属于后续生产化增强
### What This Means ### What This Means

View File

@@ -19,9 +19,10 @@
- `GET /api/generations/jobs/{job_id}`:查询单次生成/补全任务及其事件流。 - `GET /api/generations/jobs/{job_id}`:查询单次生成/补全任务及其事件流。
- `GET /api/generations/{story_id}/jobs`:查询某个故事或绘本的生成与重试历史。 - `GET /api/generations/{story_id}/jobs`:查询某个故事或绘本的生成与重试历史。
- `GET /api/generations/{story_id}/provider-stats`:按故事聚合 Provider 调用成功率、平均耗时、预估成本和 adapter 明细。 - `GET /api/generations/{story_id}/provider-stats`:按故事聚合 Provider 调用成功率、平均耗时、预估成本和 adapter 明细。
- `GET /api/generations/provider-analytics`:按当前用户聚合跨故事 Provider 调用、任务数、故事数、成功率、平均耗时和预估成本。 - `GET /api/generations/provider-analytics`:按当前用户聚合跨故事 Provider 调用、任务数、故事数、成功率、平均耗时和预估成本,并支持 `days` / `capability` 筛选
- `GET /api/generations/ops-summary`:按当前用户聚合最近任务健康度,包括运行中数量、超时阈值、卡住任务数和最近失败摘要。
job 响应会返回 `progress_percent``progress_label``is_terminal`,用户端与管理端已经消费这些查询入口,在故事详情页和绘本阅读页展示最近任务、任务历史、事件时间线、进度条和 Provider 聚合指标;当任务未终止时,前端会自动轮询,为后台 worker 进度流预留体验形态。 job 响应会返回 `progress_percent``progress_label``is_terminal`,用户端与管理端已经消费这些查询入口,在故事详情页和绘本阅读页展示最近任务、任务历史、事件时间线、进度条和 Provider 聚合指标;当任务未终止时,前端会自动轮询,为后台 worker 进度流预留体验形态。当前 analytics 还会聚合失败原因便于快速解释“最近为什么失败”ops summary 会额外把“哪些任务卡住了、最近哪些任务失败了”压缩成故事库首页能直接看的摘要。
## 现有状态模型 ## 现有状态模型
@@ -37,6 +38,12 @@ job 响应会返回 `progress_percent`、`progress_label` 和 `is_terminal`
这些字段足够支撑前端展示、smoke 检查、失败降级、资产重试和生成轨迹解释。 这些字段足够支撑前端展示、smoke 检查、失败降级、资产重试和生成轨迹解释。
## 当前维护策略
- 音频缓存由 `STORY_AUDIO_CACHE_TTL_DAYS` 控制过期时间，Celery beat 会每日清理。
- 生成任务由 `GENERATION_JOB_STALE_MINUTES` 控制卡住阈值，Celery beat 会每 30 分钟扫描一次，将超时运行中的任务标记为 `generation_stale_failed`。
- 当某个故事已经有运行中的 job 时,封面补全、音频生成和统一资产重试会直接拒绝重复请求,避免用户连点造成重复成本。
## 什么时候需要落库 job ## 什么时候需要落库 job
如果后续进入真实生产化,需要扩展当前 job/event 模型: 如果后续进入真实生产化,需要扩展当前 job/event 模型:
@@ -52,7 +59,7 @@ job 响应会返回 `progress_percent`、`progress_label` 和 `is_terminal`
当前已有两层记录,未来可以继续扩展字段和事件颗粒度: 当前已有两层记录,未来可以继续扩展字段和事件颗粒度:
- 将同步生成请求迁移到真正异步 worker 后,继续复用现有 job 查询和前端轮询进度条。 - 将同步生成请求迁移到真正异步 worker 后,继续复用现有 job 查询和前端轮询进度条。
- 将当前跨故事 provider 指标扩展为跨时间窗口、跨用户和失败原因维度的运营分析。 - 将当前跨故事 provider 指标扩展为跨用户、跨环境和更细颗粒度的失败原因维度分析。
## 面试表达 ## 面试表达

View File

@@ -42,15 +42,23 @@ export interface GenerationProviderStat {
export interface GenerationProviderStats { export interface GenerationProviderStats {
story_id: number story_id: number
window_days: number | null
capability: string | null
total_calls: number total_calls: number
successful_calls: number successful_calls: number
failed_calls: number failed_calls: number
avg_latency_ms: number | null avg_latency_ms: number | null
estimated_cost_usd: number estimated_cost_usd: number
by_provider: GenerationProviderStat[] by_provider: GenerationProviderStat[]
failure_reasons: Array<{
reason: string
count: number
}>
} }
export interface GenerationProviderAnalytics { export interface GenerationProviderAnalytics {
window_days: number | null
capability: string | null
total_calls: number total_calls: number
successful_calls: number successful_calls: number
failed_calls: number failed_calls: number
@@ -59,4 +67,30 @@ export interface GenerationProviderAnalytics {
job_count: number job_count: number
story_count: number story_count: number
by_provider: GenerationProviderStat[] by_provider: GenerationProviderStat[]
failure_reasons: Array<{
reason: string
count: number
}>
}
/**
 * One failed generation job, as listed in `GenerationOpsSummary.recent_failures`.
 */
export interface GenerationRecentFailure {
job_id: string
/** Null when the job is not attached to a story. */
story_id: number | null
story_title: string | null
/** Job output mode, e.g. `story`, `storybook`, `asset_retry` or `asset_generation`. */
output_mode: string
/** Step the job had reached when it failed, e.g. `asset_retry_failed`. */
current_step: string
error_message: string | null
/** Display label for the failure, supplied by the backend. */
failure_label: string
/** Presumably an ISO-8601 timestamp of the job's last update — confirm against the API schema. */
updated_at: string
}
/**
 * Response shape of `GET /api/generations/ops-summary`: job health counters
 * for the current user plus the most recent failures.
 */
export interface GenerationOpsSummary {
/** Size of the reporting window in hours, echoing the `hours` query parameter. */
window_hours: number
/** Minutes after which a running job is treated as stale. */
stale_threshold_minutes: number
/** Number of jobs currently in the running state. */
active_jobs: number
/** Subset of running jobs that already exceed the stale threshold. */
stale_running_jobs: number
failed_jobs: number
/** Jobs that finished with status `degraded_completed`. */
degraded_jobs: number
/** Jobs whose output mode is `asset_retry` or `asset_generation`. */
asset_retry_jobs: number
recent_failures: GenerationRecentFailure[]
} }

View File

@@ -1,5 +1,5 @@
<script setup lang="ts"> <script setup lang="ts">
import { computed, onMounted, ref } from 'vue' import { computed, onMounted, ref, watch } from 'vue'
import { useRouter } from 'vue-router' import { useRouter } from 'vue-router'
import { api } from '../api/client' import { api } from '../api/client'
import CreateStoryModal from '../components/CreateStoryModal.vue' import CreateStoryModal from '../components/CreateStoryModal.vue'
@@ -7,7 +7,7 @@ import BaseButton from '../components/ui/BaseButton.vue'
import BaseCard from '../components/ui/BaseCard.vue' import BaseCard from '../components/ui/BaseCard.vue'
import EmptyState from '../components/ui/EmptyState.vue' import EmptyState from '../components/ui/EmptyState.vue'
import LoadingSpinner from '../components/ui/LoadingSpinner.vue' import LoadingSpinner from '../components/ui/LoadingSpinner.vue'
import type { GenerationProviderAnalytics } from '../types/generation' import type { GenerationOpsSummary, GenerationProviderAnalytics } from '../types/generation'
import { import {
getAssetStatusMeta, getAssetStatusMeta,
getGenerationStatusMeta, getGenerationStatusMeta,
@@ -39,9 +39,12 @@ interface StoryItem {
const router = useRouter() const router = useRouter()
const stories = ref<StoryItem[]>([]) const stories = ref<StoryItem[]>([])
const providerAnalytics = ref<GenerationProviderAnalytics | null>(null) const providerAnalytics = ref<GenerationProviderAnalytics | null>(null)
const opsSummary = ref<GenerationOpsSummary | null>(null)
const loading = ref(true) const loading = ref(true)
const error = ref('') const error = ref('')
const showCreateModal = ref(false) const showCreateModal = ref(false)
const selectedWindow = ref<'7' | '30' | 'all'>('30')
const selectedCapability = ref<'all' | 'text' | 'image' | 'tts' | 'storybook'>('all')
const readableCount = computed(() => const readableCount = computed(() =>
stories.value.filter((story) => isReadableGenerationStatus(story.generation_status)).length, stories.value.filter((story) => isReadableGenerationStatus(story.generation_status)).length,
@@ -57,15 +60,30 @@ const providerSuccessRate = computed(() => {
) )
}) })
const topProvider = computed(() => providerAnalytics.value?.by_provider[0] ?? null) const topProvider = computed(() => providerAnalytics.value?.by_provider[0] ?? null)
const topFailureReason = computed(() => providerAnalytics.value?.failure_reasons[0] ?? null)
/**
 * Build the provider-analytics request path for the current filter selection.
 * `days` and `capability` are added only when the matching filter is not
 * `'all'`; with both filters at `'all'` the bare endpoint path is returned.
 */
function buildProviderAnalyticsPath() {
  const basePath = '/api/generations/provider-analytics'
  const search = new URLSearchParams()
  const windowDays = selectedWindow.value
  const capability = selectedCapability.value

  if (windowDays !== 'all') search.set('days', windowDays)
  if (capability !== 'all') search.set('capability', capability)

  const serialized = search.toString()
  return serialized ? `${basePath}?${serialized}` : basePath
}
async function fetchStories() { async function fetchStories() {
try { try {
const [storyList, analytics] = await Promise.all([ const [storyList, analytics, ops] = await Promise.all([
api.get<StoryItem[]>('/api/stories'), api.get<StoryItem[]>('/api/stories'),
api.get<GenerationProviderAnalytics>('/api/generations/provider-analytics'), api.get<GenerationProviderAnalytics>(buildProviderAnalyticsPath()),
api.get<GenerationOpsSummary>('/api/generations/ops-summary'),
]) ])
stories.value = storyList stories.value = storyList
providerAnalytics.value = analytics providerAnalytics.value = analytics
opsSummary.value = ops
} catch (e) { } catch (e) {
error.value = e instanceof Error ? e.message : '加载失败' error.value = e instanceof Error ? e.message : '加载失败'
} finally { } finally {
@@ -106,6 +124,27 @@ function formatCost(value?: number | null) {
return typeof value === 'number' ? `$${value.toFixed(4)}` : '$0.0000' return typeof value === 'number' ? `$${value.toFixed(4)}` : '$0.0000'
} }
/**
 * Translate a job's `output_mode` into its display label.
 * Any value other than the three known modes falls back to the plain story label.
 */
function formatOutputMode(value: string) {
  if (value === 'storybook') return '绘本'
  if (value === 'asset_retry') return '资源重试'
  if (value === 'asset_generation') return '资源生成'
  return '故事'
}
/**
 * Select the analytics time window (`'7'` or `'30'` days, or `'all'`).
 * The value is stored in `selectedWindow`.
 */
function setWindow(value: '7' | '30' | 'all') {
selectedWindow.value = value
}
/**
 * Select the capability filter for provider analytics (`'all'` disables it).
 * The value is stored in `selectedCapability`.
 */
function setCapability(value: 'all' | 'text' | 'image' | 'tts' | 'storybook') {
selectedCapability.value = value
}
onMounted(() => { onMounted(() => {
void fetchStories() void fetchStories()
@@ -114,6 +153,10 @@ onMounted(() => {
router.replace({ query: { ...router.currentRoute.value.query, openCreate: undefined } }) router.replace({ query: { ...router.currentRoute.value.query, openCreate: undefined } })
} }
}) })
// Monotonic token: only the response belonging to the newest filter selection may be applied.
let providerAnalyticsRequestSeq = 0

/**
 * Reload provider analytics when the time window or capability filter changes.
 *
 * Only the analytics endpoint depends on these filters, so the story list and
 * the ops summary are not refetched here. Responses that arrive after a newer
 * selection has been made are discarded, so quick successive clicks cannot
 * leave the card showing data for an older filter.
 */
watch([selectedWindow, selectedCapability], async () => {
  const requestSeq = ++providerAnalyticsRequestSeq
  try {
    const analytics = await api.get<GenerationProviderAnalytics>(buildProviderAnalyticsPath())
    if (requestSeq === providerAnalyticsRequestSeq) {
      providerAnalytics.value = analytics
    }
  } catch (e) {
    if (requestSeq === providerAnalyticsRequestSeq) {
      error.value = e instanceof Error ? e.message : '加载失败'
    }
  }
})
</script> </script>
<template> <template>
@@ -191,6 +234,18 @@ onMounted(() => {
<p class="mt-2 text-sm leading-6 text-gray-500"> <p class="mt-2 text-sm leading-6 text-gray-500">
最近生成和资源补全留下的供应商调用轨迹 最近生成和资源补全留下的供应商调用轨迹
</p> </p>
<div class="mt-4 flex flex-wrap gap-2">
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedWindow === '7' ? 'border-gray-900 bg-gray-900 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setWindow('7')">最近 7 </button>
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedWindow === '30' ? 'border-gray-900 bg-gray-900 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setWindow('30')">最近 30 </button>
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedWindow === 'all' ? 'border-gray-900 bg-gray-900 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setWindow('all')">全部</button>
</div>
<div class="mt-3 flex flex-wrap gap-2">
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedCapability === 'all' ? 'border-emerald-600 bg-emerald-600 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setCapability('all')">全部能力</button>
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedCapability === 'text' ? 'border-emerald-600 bg-emerald-600 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setCapability('text')">文本</button>
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedCapability === 'image' ? 'border-emerald-600 bg-emerald-600 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setCapability('image')">图片</button>
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedCapability === 'tts' ? 'border-emerald-600 bg-emerald-600 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setCapability('tts')">语音</button>
<button type="button" class="rounded-lg border px-3 py-1.5 text-sm transition-colors" :class="selectedCapability === 'storybook' ? 'border-emerald-600 bg-emerald-600 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'" @click="setCapability('storybook')">绘本</button>
</div>
</div> </div>
<div class="grid grid-cols-2 gap-3 sm:grid-cols-4 lg:min-w-[520px]"> <div class="grid grid-cols-2 gap-3 sm:grid-cols-4 lg:min-w-[520px]">
<div class="rounded-lg border border-gray-100 bg-gray-50 px-3 py-3"> <div class="rounded-lg border border-gray-100 bg-gray-50 px-3 py-3">
@@ -214,6 +269,70 @@ onMounted(() => {
<p v-if="topProvider" class="mt-4 text-sm text-gray-500"> <p v-if="topProvider" class="mt-4 text-sm text-gray-500">
当前样本中最前面的能力组合是 {{ topProvider.capability }} / {{ topProvider.adapter }}成功 {{ topProvider.success_count }} 失败 {{ topProvider.failure_count }} 当前样本中最前面的能力组合是 {{ topProvider.capability }} / {{ topProvider.adapter }}成功 {{ topProvider.success_count }} 失败 {{ topProvider.failure_count }}
</p> </p>
<!--
  Rendered only when topFailureReason is truthy; prints its `reason` and
  `count` fields. NOTE(review): how topFailureReason is derived is not
  visible in this fragment; confirm it is the highest-count entry.
-->
<p v-if="topFailureReason" class="mt-2 text-sm text-rose-600">
最常见失败原因{{ topFailureReason.reason }}{{ topFailureReason.count }}
</p>
</BaseCard>
<!--
  Job operations overview card. Rendered only once opsSummary is truthy.
  Fields read from opsSummary here: window_hours, stale_threshold_minutes,
  active_jobs, stale_running_jobs, failed_jobs, asset_retry_jobs,
  degraded_jobs and recent_failures.
-->
<BaseCard
v-if="opsSummary"
class="mb-8"
padding="lg"
>
<div class="flex flex-col gap-5 lg:flex-row lg:items-center lg:justify-between">
<div>
<h2 class="text-xl font-bold text-gray-800">任务运行概览</h2>
<p class="mt-2 text-sm leading-6 text-gray-500">
最近 {{ opsSummary.window_hours }} 小时的任务健康度运行超过
{{ opsSummary.stale_threshold_minutes }} 分钟会被视为卡住
</p>
</div>
<!-- Four counter tiles: active, stale-running, failed, asset-retry jobs. -->
<div class="grid grid-cols-2 gap-3 sm:grid-cols-4 lg:min-w-[520px]">
<div class="rounded-lg border border-gray-100 bg-gray-50 px-3 py-3">
<div class="text-xs text-gray-500">运行中</div>
<div class="mt-1 text-lg font-semibold text-gray-800">{{ opsSummary.active_jobs }}</div>
</div>
<div class="rounded-lg border border-gray-100 bg-gray-50 px-3 py-3">
<div class="text-xs text-gray-500">超时待收敛</div>
<!-- Amber text when the stale-running count is non-zero, neutral gray otherwise. -->
<div class="mt-1 text-lg font-semibold" :class="opsSummary.stale_running_jobs ? 'text-amber-600' : 'text-gray-800'">
{{ opsSummary.stale_running_jobs }}
</div>
</div>
<div class="rounded-lg border border-gray-100 bg-gray-50 px-3 py-3">
<div class="text-xs text-gray-500">最近失败</div>
<!-- Rose text when the failed count is non-zero, neutral gray otherwise. -->
<div class="mt-1 text-lg font-semibold" :class="opsSummary.failed_jobs ? 'text-rose-600' : 'text-gray-800'">
{{ opsSummary.failed_jobs }}
</div>
</div>
<div class="rounded-lg border border-gray-100 bg-gray-50 px-3 py-3">
<div class="text-xs text-gray-500">资源任务</div>
<div class="mt-1 text-lg font-semibold text-gray-800">{{ opsSummary.asset_retry_jobs }}</div>
</div>
</div>
</div>
<!-- Degraded-completion notice; hidden when degraded_jobs is 0 or otherwise falsy. -->
<p v-if="opsSummary.degraded_jobs" class="mt-4 text-sm text-amber-600">
最近 {{ opsSummary.window_hours }} 小时有 {{ opsSummary.degraded_jobs }} 个任务以降级完成收尾
</p>
<!--
  Recent failures list, one row per entry keyed by job_id. The row title
  falls back to formatOutputMode(output_mode) plus a fixed suffix when
  story_title is empty, and the detail line falls back to a fixed hint
  when error_message is empty. When the list is empty the v-else
  paragraph below is shown instead.
-->
<div v-if="opsSummary.recent_failures.length" class="mt-4 space-y-3">
<div
v-for="failure in opsSummary.recent_failures"
:key="failure.job_id"
class="rounded-lg border border-rose-100 bg-rose-50 px-4 py-3"
>
<div class="flex flex-wrap items-center justify-between gap-3">
<div class="text-sm font-semibold text-gray-800">
{{ failure.story_title || `${formatOutputMode(failure.output_mode)}任务` }}
</div>
<div class="text-xs text-gray-500">{{ formatDate(failure.updated_at) }}</div>
</div>
<div class="mt-1 text-xs text-rose-600">
{{ failure.failure_label }} · {{ failure.error_message || '请打开任务轨迹查看原因' }}
</div>
</div>
</div>
<p v-else class="mt-4 text-sm text-emerald-600">
最近 {{ opsSummary.window_hours }} 小时没有失败任务当前链路比较稳定
</p>
</BaseCard> </BaseCard>
<div class="grid grid-cols-1 sm:grid-cols-2 lg:grid-cols-3 gap-6"> <div class="grid grid-cols-1 sm:grid-cols-2 lg:grid-cols-3 gap-6">

View File

@@ -165,6 +165,16 @@ provider_analytics_json="$(get_json "$APP_URL/api/generations/provider-analytics
assert_jq "$provider_analytics_json" '.total_calls >= 2 and .successful_calls >= 2 and .job_count >= 4 and .story_count >= 2 and (.by_provider | length) >= 1' "provider analytics should summarize calls across generated stories" assert_jq "$provider_analytics_json" '.total_calls >= 2 and .successful_calls >= 2 and .job_count >= 4 and .story_count >= 2 and (.by_provider | length) >= 1' "provider analytics should summarize calls across generated stories"
echo "$provider_analytics_json" | jq '{total_calls,successful_calls,failed_calls,job_count,story_count,avg_latency_ms,estimated_cost_usd}' echo "$provider_analytics_json" | jq '{total_calls,successful_calls,failed_calls,job_count,story_count,avg_latency_ms,estimated_cost_usd}'
# Smoke step: provider analytics with filters.
# Requests /api/generations/provider-analytics with days=7 and capability=text,
# then asserts the response echoes both filters (window_days == 7,
# capability == "text") and reports at least one call. The last line prints
# a subset of the response fields for the log.
# NOTE(review): say / get_json / assert_jq are helpers defined outside this
# fragment; presumably assert_jq aborts the script when the jq filter is false.
say "Checking filtered provider analytics"
filtered_provider_analytics_json="$(get_json "$APP_URL/api/generations/provider-analytics?days=7&capability=text")"
assert_jq "$filtered_provider_analytics_json" '.window_days == 7 and .capability == "text" and .total_calls >= 1' "filtered provider analytics should honor days/capability filters"
echo "$filtered_provider_analytics_json" | jq '{window_days,capability,total_calls,successful_calls,failed_calls,failure_reasons}'
# Smoke step: generation ops summary.
# Requests /api/generations/ops-summary with hours=24 and asserts that
# window_hours echoes 24, that the active/stale/failed counters are
# non-negative, and that asset_retry_jobs is at least 2.
# NOTE(review): the ">= 2" bound presumably relies on earlier steps of this
# script having created at least two asset retry jobs; confirm against them.
say "Checking generation ops summary"
ops_summary_json="$(get_json "$APP_URL/api/generations/ops-summary?hours=24")"
assert_jq "$ops_summary_json" '.window_hours == 24 and .active_jobs >= 0 and .stale_running_jobs >= 0 and .failed_jobs >= 0 and .asset_retry_jobs >= 2' "generation ops summary should expose recent task health"
echo "$ops_summary_json" | jq '{window_hours,stale_threshold_minutes,active_jobs,stale_running_jobs,failed_jobs,degraded_jobs,asset_retry_jobs,recent_failures}'
say "Checking story list" say "Checking story list"
list_json="$(get_json "$APP_URL/api/stories?limit=5")" list_json="$(get_json "$APP_URL/api/stories?limit=5")"
assert_jq "$list_json" "map(.id) | index($story_id) != null" "story list should include generated story" assert_jq "$list_json" "map(.id) | index($story_id) != null" "story list should include generated story"