From 395cdf4eddeae73ce3723fce56087f1a31586b54 Mon Sep 17 00:00:00 2001 From: Yuyan Date: Sun, 19 Apr 2026 18:56:17 +0800 Subject: [PATCH] feat: add admin provider analytics dashboard --- admin-frontend/src/views/AdminProviders.vue | 451 ++++++++++++++++-- backend/app/api/admin_providers.py | 59 ++- backend/app/services/generation_jobs.py | 110 ++++- backend/tests/test_admin_providers.py | 285 +++++++++++ docs/planning/demo-checklist.md | 2 +- docs/planning/demo-package.md | 4 +- docs/planning/interview-pitch.md | 2 +- .../planning/week-2-to-4-execution-backlog.md | 1 + docs/planning/week-4-sprint-review.md | 3 +- .../unified-generation-workflow-prd.md | 10 +- docs/technical/architecture.md | 8 +- docs/technical/generation-job-state.md | 2 +- 12 files changed, 886 insertions(+), 51 deletions(-) create mode 100644 backend/tests/test_admin_providers.py diff --git a/admin-frontend/src/views/AdminProviders.vue b/admin-frontend/src/views/AdminProviders.vue index dca49aa..6127f63 100644 --- a/admin-frontend/src/views/AdminProviders.vue +++ b/admin-frontend/src/views/AdminProviders.vue @@ -29,6 +29,248 @@ + +
+
+
+

当前环境 Provider 运营摘要

+ + 跨用户 / 当前环境 + +
+

+ 这里展示的是当前部署环境内所有生成任务留下的 Provider 调用轨迹,便于运营和排障。 + 跨环境对比仍需要后续独立汇聚层。 +

+
+ + + +
+
+ + + + + +
+
+ +
+
+
活跃用户
+
+ {{ analytics?.user_count ?? 0 }} +
+
+
+
总调用
+
+ {{ analytics?.total_calls ?? 0 }} +
+
+
+
成功率
+
+ {{ providerSuccessRate ?? '--' }}% +
+
+
+
预估成本
+
+ {{ formatCost(analytics?.estimated_cost_usd) }} +
+
+
+
+ +
+ 正在更新运营摘要... +
+
+ {{ analyticsError }} +
+ +
+
@@ -275,7 +517,7 @@ diff --git a/backend/app/api/admin_providers.py b/backend/app/api/admin_providers.py index 663fa45..1a1c7cb 100644 --- a/backend/app/api/admin_providers.py +++ b/backend/app/api/admin_providers.py @@ -1,4 +1,4 @@ -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException, Query from pydantic import BaseModel, ConfigDict, Field from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession @@ -8,6 +8,7 @@ from app.db.admin_models import Provider from app.db.database import get_db from app.services.adapters.registry import AdapterRegistry from app.services.cost_tracker import cost_tracker +from app.services.generation_jobs import get_admin_provider_analytics from app.services.provider_policy import DEFAULT_PROVIDERS, list_capability_policies from app.services.secret_service import SecretService @@ -56,6 +57,48 @@ class ProviderResponse(BaseModel): model_config = ConfigDict(from_attributes=True) + +class ProviderAnalyticsBucket(BaseModel): + capability: str + adapter: str + call_count: int + success_count: int + failure_count: int + avg_latency_ms: float | None = None + estimated_cost_usd: float + + +class ProviderAnalyticsUserBucket(BaseModel): + user_id: str + call_count: int + success_count: int + failure_count: int + job_count: int + story_count: int + estimated_cost_usd: float + + +class ProviderAnalyticsFailureReason(BaseModel): + reason: str + count: int + + +class ProviderAnalyticsResponse(BaseModel): + scope: str + window_days: int | None = None + capability: str | None = None + total_calls: int + successful_calls: int + failed_calls: int + avg_latency_ms: float | None = None + estimated_cost_usd: float + user_count: int + job_count: int + story_count: int + by_provider: list[ProviderAnalyticsBucket] + by_user: list[ProviderAnalyticsUserBucket] + failure_reasons: list[ProviderAnalyticsFailureReason] + @router.get("/providers/adapters") async def list_available_adapters(): """获取所有可用的适配器类型 (定义的类)。""" @@ -74,6 +117,20 @@ async def list_provider_capabilities(): return list_capability_policies() +@router.get("/providers/analytics", response_model=ProviderAnalyticsResponse) +async def get_provider_analytics( + days: int | None = Query(default=None, ge=1, le=365), + capability: str | None = Query(default=None), + db: AsyncSession = Depends(get_db), +): + """获取当前环境跨用户的 Provider 运营摘要。""" + return await get_admin_provider_analytics( + db, + days=days, + capability=capability, + ) + + @router.get("/providers", response_model=list[ProviderResponse]) async def list_providers(db: AsyncSession = Depends(get_db)): result = await db.execute(select(Provider)) diff --git a/backend/app/services/generation_jobs.py b/backend/app/services/generation_jobs.py index ace1975..2555492 100644 --- a/backend/app/services/generation_jobs.py +++ b/backend/app/services/generation_jobs.py @@ -606,23 +606,37 @@ def _aggregate_provider_events( } +def _event_matches_capability( + event: GenerationJobEvent, + capability: str | None = None, +) -> bool: + event_capability = str((event.event_metadata or {}).get("capability") or "unknown") + return capability is None or event_capability == capability + + def _provider_events_query( *, - user_id: str, + user_id: str | None = None, story_id: int | None = None, days: int | None = None, ): query = ( - select(GenerationJobEvent) + select( + GenerationJobEvent, + GenerationJob.user_id, + GenerationJob.story_id, + ) .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id) .where( - GenerationJob.user_id == user_id, GenerationJobEvent.event_type.in_( ["provider_call_succeeded", "provider_call_failed"] ), ) ) + if user_id is not None: + query = query.where(GenerationJob.user_id == user_id) + if story_id is not None: query = query.where(GenerationJob.story_id == story_id) @@ -681,17 +695,12 @@ async def get_user_provider_analytics( filtered_event_job_ids = { event.job_id for event in events - if capability is None - or str((event.event_metadata or {}).get("capability") or "unknown") == capability + if _event_matches_capability(event, capability) } filtered_story_ids = { event.story_id for event in events - if event.story_id is not None - and ( - capability is None - or str((event.event_metadata or {}).get("capability") or "unknown") == capability - ) + if event.story_id is not None and _event_matches_capability(event, capability) } return { @@ -703,6 +712,87 @@ async def get_user_provider_analytics( } +async def get_admin_provider_analytics( + db: AsyncSession, + *, + days: int | None = None, + capability: str | None = None, +) -> dict[str, Any]: + """Aggregate provider telemetry across every user in the current environment.""" + + rows = (await db.execute(_provider_events_query(days=days))).all() + events = [event for event, _, _ in rows] + filtered_rows = [ + (event, user_id, story_id) + for event, user_id, story_id in rows + if _event_matches_capability(event, capability) + ] + + by_user: dict[str, dict[str, Any]] = {} + filtered_job_ids = {event.job_id for event, _, _ in filtered_rows} + filtered_story_ids = { + story_id for _, _, story_id in filtered_rows if story_id is not None + } + filtered_user_ids = {user_id for _, user_id, _ in filtered_rows} + + for event, user_id, story_id in filtered_rows: + bucket = by_user.setdefault( + user_id, + { + "user_id": user_id, + "call_count": 0, + "success_count": 0, + "failure_count": 0, + "estimated_cost_usd": 0.0, + "job_ids": set(), + "story_ids": set(), + }, + ) + bucket["call_count"] += 1 + bucket["job_ids"].add(event.job_id) + if story_id is not None: + bucket["story_ids"].add(story_id) + + if event.event_type == "provider_call_succeeded": + bucket["success_count"] += 1 + bucket["estimated_cost_usd"] += ( + _as_float((event.event_metadata or {}).get("estimated_cost_usd")) or 0.0 + ) + else: + bucket["failure_count"] += 1 + + serialized_users = [ + { + "user_id": user_id, + "call_count": bucket["call_count"], + "success_count": bucket["success_count"], + "failure_count": bucket["failure_count"], + "job_count": len(bucket["job_ids"]), + "story_count": len(bucket["story_ids"]), + "estimated_cost_usd": round(bucket["estimated_cost_usd"], 6), + } + for user_id, bucket in by_user.items() + ] + serialized_users.sort( + key=lambda item: ( + -int(item["call_count"]), + -float(item["estimated_cost_usd"]), + str(item["user_id"]), + ) + ) + + return { + "scope": "current_environment", + "window_days": days, + "capability": capability, + **_aggregate_provider_events(events, capability=capability), + "user_count": len(filtered_user_ids), + "job_count": len(filtered_job_ids), + "story_count": len(filtered_story_ids), + "by_user": serialized_users, + } + + async def get_user_generation_ops_summary( db: AsyncSession, *, diff --git a/backend/tests/test_admin_providers.py b/backend/tests/test_admin_providers.py new file mode 100644 index 0000000..bcbb755 --- /dev/null +++ b/backend/tests/test_admin_providers.py @@ -0,0 +1,285 @@ +from datetime import datetime, timedelta, timezone + +from fastapi import FastAPI +from httpx import ASGITransport, AsyncClient + +from app.api import admin_providers +from app.core.admin_auth import admin_guard +from app.db.database import get_db +from app.db.models import Story, User +from app.services.generation_jobs import create_generation_job, record_generation_event + + +def _build_admin_test_app(db_session) -> FastAPI: + app = FastAPI() + app.include_router(admin_providers.router, prefix="/admin") + + async def override_get_db(): + yield db_session + + async def override_admin_guard(): + return True + + app.dependency_overrides[get_db] = override_get_db + app.dependency_overrides[admin_guard] = override_admin_guard + return app + + +async def _create_story( + db_session, + *, + user_id: str, + title: str, + mode: str = "generated", +) -> Story: + story = Story( + user_id=user_id, + title=title, + story_text="测试内容", + cover_prompt="A gentle moonlit forest", + mode=mode, + generation_status="partial_ready", + text_status="ready", + image_status="not_requested", + audio_status="not_requested", + ) + db_session.add(story) + await db_session.commit() + await db_session.refresh(story) + return story + + +async def test_admin_provider_analytics_aggregate_across_users(db_session, test_user): + second_user = User( + id="github:67890", + name="Another User", + avatar_url="https://example.com/avatar-2.png", + provider="github", + ) + db_session.add(second_user) + await db_session.commit() + + first_story = await _create_story(db_session, user_id=test_user.id, title="第一位用户的故事") + second_story = await _create_story( + db_session, + user_id=second_user.id, + title="第二位用户的故事", + ) + + image_job = await create_generation_job( + db_session, + user_id=test_user.id, + output_mode="asset_retry", + input_type="image", + request_payload={"assets": ["image"]}, + story_id=first_story.id, + ) + await record_generation_event( + db_session, + job=image_job, + story_id=first_story.id, + event_type="provider_call_succeeded", + status="succeeded", + metadata={ + "capability": "image", + "adapter": "demo", + "strategy": "priority", + "latency_ms": 42, + "estimated_cost_usd": 0.01, + }, + ) + await record_generation_event( + db_session, + job=image_job, + story_id=first_story.id, + event_type="provider_call_failed", + status="failed", + metadata={ + "capability": "image", + "adapter": "cqtai", + "strategy": "priority", + "latency_ms": 120, + "error": "timeout", + }, + ) + + audio_job = await create_generation_job( + db_session, + user_id=second_user.id, + output_mode="asset_retry", + input_type="audio", + request_payload={"assets": ["audio"]}, + story_id=second_story.id, + ) + await record_generation_event( + db_session, + job=audio_job, + story_id=second_story.id, + event_type="provider_call_succeeded", + status="succeeded", + metadata={ + "capability": "tts", + "adapter": "edge_tts", + "strategy": "priority", + "latency_ms": 18, + "estimated_cost_usd": 0.003, + }, + ) + + admin_app = _build_admin_test_app(db_session) + transport = ASGITransport(app=admin_app) + + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/admin/providers/analytics") + + assert response.status_code == 200 + data = response.json() + assert data["scope"] == "current_environment" + assert data["user_count"] == 2 + assert data["job_count"] == 2 + assert data["story_count"] == 2 + assert data["total_calls"] == 3 + assert data["successful_calls"] == 2 + assert data["failed_calls"] == 1 + assert data["avg_latency_ms"] == 60.0 + assert data["estimated_cost_usd"] == 0.013 + assert data["failure_reasons"] == [{"reason": "timeout", "count": 1}] + assert data["by_provider"] == [ + { + "capability": "image", + "adapter": "cqtai", + "call_count": 1, + "success_count": 0, + "failure_count": 1, + "avg_latency_ms": 120.0, + "estimated_cost_usd": 0.0, + }, + { + "capability": "image", + "adapter": "demo", + "call_count": 1, + "success_count": 1, + "failure_count": 0, + "avg_latency_ms": 42.0, + "estimated_cost_usd": 0.01, + }, + { + "capability": "tts", + "adapter": "edge_tts", + "call_count": 1, + "success_count": 1, + "failure_count": 0, + "avg_latency_ms": 18.0, + "estimated_cost_usd": 0.003, + }, + ] + assert data["by_user"] == [ + { + "user_id": test_user.id, + "call_count": 2, + "success_count": 1, + "failure_count": 1, + "job_count": 1, + "story_count": 1, + "estimated_cost_usd": 0.01, + }, + { + "user_id": second_user.id, + "call_count": 1, + "success_count": 1, + "failure_count": 0, + "job_count": 1, + "story_count": 1, + "estimated_cost_usd": 0.003, + }, + ] + + +async def test_admin_provider_analytics_support_days_and_capability_filters( + db_session, + test_user, +): + second_user = User( + id="google:22222", + name="Filter User", + avatar_url="https://example.com/avatar-3.png", + provider="google", + ) + db_session.add(second_user) + await db_session.commit() + + first_story = await _create_story(db_session, user_id=test_user.id, title="旧事件故事") + second_story = await _create_story(db_session, user_id=second_user.id, title="最近事件故事") + + image_job = await create_generation_job( + db_session, + user_id=test_user.id, + output_mode="asset_retry", + input_type="image", + request_payload={"assets": ["image"]}, + story_id=first_story.id, + ) + old_event = await record_generation_event( + db_session, + job=image_job, + story_id=first_story.id, + event_type="provider_call_failed", + status="failed", + metadata={ + "capability": "image", + "adapter": "cqtai", + "strategy": "priority", + "latency_ms": 120, + "error": "timeout", + }, + ) + old_event.created_at = datetime.now(timezone.utc) - timedelta(days=10) + await db_session.commit() + + audio_job = await create_generation_job( + db_session, + user_id=second_user.id, + output_mode="asset_retry", + input_type="audio", + request_payload={"assets": ["audio"]}, + story_id=second_story.id, + ) + await record_generation_event( + db_session, + job=audio_job, + story_id=second_story.id, + event_type="provider_call_succeeded", + status="succeeded", + metadata={ + "capability": "tts", + "adapter": "edge_tts", + "strategy": "priority", + "latency_ms": 18, + "estimated_cost_usd": 0.003, + }, + ) + + admin_app = _build_admin_test_app(db_session) + transport = ASGITransport(app=admin_app) + + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/admin/providers/analytics?days=7") + assert response.status_code == 200 + data = response.json() + assert data["window_days"] == 7 + assert data["total_calls"] == 1 + assert data["user_count"] == 1 + assert data["job_count"] == 1 + assert data["story_count"] == 1 + assert data["failure_reasons"] == [] + + response = await client.get("/admin/providers/analytics?capability=image") + assert response.status_code == 200 + data = response.json() + assert data["capability"] == "image" + assert data["total_calls"] == 1 + assert data["failed_calls"] == 1 + assert data["user_count"] == 1 + assert data["job_count"] == 1 + assert data["story_count"] == 1 + assert data["failure_reasons"] == [{"reason": "timeout", "count": 1}] diff --git a/docs/planning/demo-checklist.md b/docs/planning/demo-checklist.md index 1f6d1e1..f31a0f4 100644 --- a/docs/planning/demo-checklist.md +++ b/docs/planning/demo-checklist.md @@ -126,7 +126,7 @@ DreamWeaver 是面向 3-8 岁亲子场景的个性化 AI 绘本与陪伴式讲 ### 2:20 - 3:00 取舍与下一步 -求职版优先稳定闭环和可解释性,不做支付、多租户和复杂监控。现在 job/event 已能查询 workflow、资产补全、provider 调用轨迹和聚合指标,用户端和管理端也能展示生成轨迹与跨故事 Provider 运营摘要;统一生成也已经迁移到后台 worker,下一步是补取消/重试队列。 +求职版优先稳定闭环和可解释性,不做支付、多租户和复杂监控。现在 job/event 已能查询 workflow、资产补全、provider 调用轨迹和聚合指标,统一生成已迁移到后台 worker,取消/重试队列也已打通;用户端可看跨故事运营摘要,管理端可看当前环境跨用户 Provider dashboard。下一步应补跨环境汇聚、断点续跑和更完整监控。 --- diff --git a/docs/planning/demo-package.md b/docs/planning/demo-package.md index b2877d9..f4cd747 100644 --- a/docs/planning/demo-package.md +++ b/docs/planning/demo-package.md @@ -51,7 +51,7 @@ SMOKE_AUDIO=1 ./scripts/demo_smoke.sh - **AI 不确定性处理**:主内容和资产拆开,图片/音频失败不阻塞阅读。 - **Provider 产品化**:用户看到稳定能力,系统内部用 Capability / Provider / Adapter / Routing Policy 管供应链。 - **可观测性**:generation job/event 让生成过程、失败恢复和 Provider 成本可解释。 -- **可继续生产化**:统一生成已经迁移到 worker,前端轮询和任务事件模型也已打通,下一步是补取消/重试队列和更完整监控。 +- **可继续生产化**:统一生成已迁移到 worker,前端轮询、任务事件模型、取消/重试队列和管理台当前环境 dashboard 也已打通,下一步是补跨环境汇聚、断点续跑和更完整监控。 --- @@ -63,4 +63,4 @@ SMOKE_AUDIO=1 ./scripts/demo_smoke.sh | 图片生成失败 | 展示 `degraded_completed` 与资源重试 | | Docker 冷启动慢 | 演示前先跑 smoke 并保持容器运行 | | Provider 追问过深 | 回到 Capability / Provider / Adapter / Routing Policy 四层解释 | -| 生产化追问 | 说明下一步是取消/重试队列、监控告警、密钥治理和 Provider analytics 扩展 | +| 生产化追问 | 说明下一步是跨环境 Provider 汇聚、断点续跑、监控告警和密钥治理 | diff --git a/docs/planning/interview-pitch.md b/docs/planning/interview-pitch.md index 65d2c67..e5c8bbb 100644 --- a/docs/planning/interview-pitch.md +++ b/docs/planning/interview-pitch.md @@ -83,4 +83,4 @@ AI 生成产品最大的问题不是“能不能调模型”,而是结果不 ### 这个项目下一步怎么上线? -我已经把当前轻量 job/event 模型迁移到后台 worker,并打通了前端进度轮询;下一步会补取消/重试队列,再继续扩展跨时间窗口和跨用户维度的 provider 运营分析。生产上线前还需要补真实用户鉴权配置、密钥管理、监控告警和部署策略。 +我已经把当前轻量 job/event 模型迁移到后台 worker,并打通了前端进度轮询、取消/重试队列和管理台当前环境运营视图;下一步会补跨环境 Provider 汇聚、断点续跑和更完整监控。生产上线前还需要补真实用户鉴权配置、密钥管理和部署策略。 diff --git a/docs/planning/week-2-to-4-execution-backlog.md b/docs/planning/week-2-to-4-execution-backlog.md index 34cc3e0..fe0f9f6 100644 --- a/docs/planning/week-2-to-4-execution-backlog.md +++ b/docs/planning/week-2-to-4-execution-backlog.md @@ -72,6 +72,7 @@ Week 2 已完成演示闭环、统一生成工作流、generation job/event、 | W4-09 | Workflow | 卡住任务自动收敛 | `GENERATION_JOB_STALE_MINUTES` + Celery beat stale job maintenance | P1 | Done | | W4-10 | Workflow | 防止重复资产任务 | 运行中故事拒绝重复封面/音频/资产重试请求 | P1 | Done | | W4-11 | Workflow | 生成任务取消与重新排队 | 取消已提交任务,失败/取消任务可重新排队 | P1 | Done | +| W4-12 | Ops | 管理台当前环境跨用户 Provider dashboard | `GET /admin/providers/analytics` + admin console 运营摘要面板 | P1 | Done | --- diff --git a/docs/planning/week-4-sprint-review.md b/docs/planning/week-4-sprint-review.md index 0002e23..f15ec3a 100644 --- a/docs/planning/week-4-sprint-review.md +++ b/docs/planning/week-4-sprint-review.md @@ -40,6 +40,7 @@ DreamWeaver 已经具备求职演示所需的完整闭环: - Provider failover 和聚合指标 - 跨故事 Provider analytics - 任务运行概览、最近失败摘要与卡住任务收敛 +- 当前环境跨用户 Provider dashboard - 前端生成轨迹和自动轮询形态 --- @@ -61,7 +62,7 @@ DreamWeaver 已经具备求职演示所需的完整闭环: | Priority | Task | Why | | --- | --- | --- | -| P0 | 跨用户 / 跨环境 Provider dashboard | 当前已支持单用户摘要,后续要支持运营视角 | +| P0 | 跨环境 Provider dashboard | 当前环境跨用户摘要已落地,后续需要多部署汇聚视角 | | P1 | 监控告警与结构化 dashboard | 目前已有故事库级概览,后续要接入更完整观测体系 | | P1 | 断点续跑与更细粒度任务控制 | 让取消、重试和 worker 恢复更稳 | | P2 | 更细粒度叙事风格与音色策略 | 扩展体验,但不影响当前求职版主线 | diff --git a/docs/product/unified-generation-workflow-prd.md b/docs/product/unified-generation-workflow-prd.md index c754728..8c98940 100644 --- a/docs/product/unified-generation-workflow-prd.md +++ b/docs/product/unified-generation-workflow-prd.md @@ -77,16 +77,20 @@ DreamWeaver 当前同时支持普通故事生成、完整故事生成和绘本 - `POST /api/generations/jobs/{job_id}/cancel` - `POST /api/generations/jobs/{job_id}/retry` - 创建弹窗与生成轨迹都可触发取消或重新排队 +- 管理台已补当前环境的跨用户 Provider dashboard: + - `GET /admin/providers/analytics` + - 支持 `days` / `capability` 筛选 + - 可查看跨用户调用量、成功率、平均耗时、预估成本、失败原因和 Top 用户分布 ### Remaining Production Work - 普通故事、完整生成、绘本生成已有统一外部入口,内部 workflow 仍可继续减少兼容层分支 - 统一资产重试入口已覆盖普通故事封面、绘本缺失插图和故事音频,后续可继续扩展更细的资产级审计 -- 断点续跑、跨用户/跨环境 Provider 分析,以及更细粒度的任务控制策略仍属于后续生产化增强 +- 断点续跑、跨环境 Provider 汇聚视图,以及更细粒度的任务控制策略仍属于后续生产化增强 ### What This Means -这份 PRD 仍然保留目标态设计,但主干能力已经可在当前代码中演示。当前最适合的继续方式,是在已落地的 worker 化与任务控制基础上,把当前首版运营摘要扩展为可筛选、可对比的分析视角,并逐步补断点续跑和更完整监控,而不是继续扩大功能范围。 +这份 PRD 仍然保留目标态设计,但主干能力已经可在当前代码中演示。当前最适合的继续方式,是在已落地的 worker 化、任务控制和当前环境运营 dashboard 基础上,继续补断点续跑、跨环境汇聚和更完整监控,而不是继续扩大功能范围。 --- @@ -97,7 +101,7 @@ DreamWeaver 当前同时支持普通故事生成、完整故事生成和绘本 DreamWeaver 当前存在以下工作流层面问题: 1. **生成入口已建立,内部路径正在收束** - 当前前端已切到 `/api/generations`,旧的 `/api/stories/generate`、`/api/stories/generate/full`、`/api/storybook/generate` 仍作为兼容入口保留。service 内部已抽取上下文准备、主记录保存、封面补全、绘本插图补全和音频补全 helper,并用 `AssetCompletionResult` 表达资产补全结果。generation job/event 已落库并可查询,Provider 调用轨迹、单故事聚合指标和跨故事运营摘要也已进入用户端与管理端展示;统一生成请求现在已经交给后台 worker 执行。下一步重点是把取消/重试队列也接到这套事件模型上。 + 当前前端已切到 `/api/generations`,旧的 `/api/stories/generate`、`/api/stories/generate/full`、`/api/storybook/generate` 仍作为兼容入口保留。service 内部已抽取上下文准备、主记录保存、封面补全、绘本插图补全和音频补全 helper,并用 `AssetCompletionResult` 表达资产补全结果。generation job/event 已落库并可查询,Provider 调用轨迹、单故事聚合指标、跨故事运营摘要和管理台跨用户 dashboard 都已进入前端展示;统一生成请求现在已经交给后台 worker 执行。下一步重点应转向断点续跑和跨环境汇聚,而不是再补一套新的入口。 2. **保存与资产补全过程正在统一** 文本故事和绘本已拥有更清晰的主记录保存 helper;普通故事封面、绘本缺失插图、故事音频生成/缓存已共用各自的 asset completion helper。服务层已经能表达资产任务结果,并会把统一入口、资产重试、绘本逐页插图和音频生成的关键节点写入 job event。 diff --git a/docs/technical/architecture.md b/docs/technical/architecture.md index f030498..0ef96cb 100644 --- a/docs/technical/architecture.md +++ b/docs/technical/architecture.md @@ -100,7 +100,7 @@ flowchart LR 当前仍是求职版 MVP,不引入复杂工作流引擎。下一步生产化优先级: -1. 补齐生成任务取消与重新排队能力,减少误触和重复消耗。 -2. 基于现有 job 查询和前端轮询继续扩展真实异步进度与任务控制。 -3. 扩展 Provider analytics 的时间窗口、失败原因和跨用户维度。 -4. 继续补充部署、监控告警和密钥治理策略。 +1. 补跨环境 Provider 汇聚视图,避免每个部署环境各自成孤岛。 +2. 基于现有 job 查询和前端轮询继续扩展断点续跑与更细粒度任务控制。 +3. 在当前环境 dashboard 基础上继续扩展失败原因、监控告警和结构化观测能力。 +4. 继续补充部署与密钥治理策略。 diff --git a/docs/technical/generation-job-state.md b/docs/technical/generation-job-state.md index f2a7130..13c2689 100644 --- a/docs/technical/generation-job-state.md +++ b/docs/technical/generation-job-state.md @@ -61,7 +61,7 @@ job 响应会返回 `progress_percent`、`progress_label` 和 `is_terminal`, 当前已有两层记录,未来可以继续扩展字段和事件颗粒度: - 继续复用现有 job 查询和前端轮询进度条,为取消请求、重新排队和长任务通知提供统一入口。 -- 将当前跨故事 provider 指标扩展为跨用户、跨环境和更细颗粒度的失败原因维度分析。 +- 当前环境的跨用户 provider dashboard 已在 admin 端落地,下一步应补跨环境汇聚和更细颗粒度的失败原因维度分析。 ## 面试表达