feat: add generation trace and partial-ready workflow status

This commit is contained in:
2026-04-18 21:53:55 +08:00
parent 96dfc677e2
commit e99a7fbe14
36 changed files with 2597 additions and 144 deletions

View File

@@ -10,6 +10,7 @@ from app.core.logging import get_logger
from app.services.adapters import AdapterConfig, AdapterRegistry
from app.services.adapters.text.models import StoryOutput
from app.services.cost_tracker import cost_tracker
from app.services.generation_jobs import record_generation_event
from app.services.provider_cache import get_providers
from app.services.provider_metrics import health_checker, metrics_collector
from app.services.provider_policy import (
@@ -22,6 +23,7 @@ from app.services.provider_policy import (
if TYPE_CHECKING:
from app.db.admin_models import Provider
from app.db.models import GenerationJob
logger = get_logger(__name__)
@@ -36,6 +38,58 @@ _round_robin_counters: dict[ProviderType, int] = {
_latency_cache: dict[str, float] = {}
def _safe_estimated_cost(adapter) -> float:
    """Return an adapter cost value that is safe to serialize in job events.

    Args:
        adapter: Object expected to expose an ``estimated_cost`` attribute.

    Returns:
        ``adapter.estimated_cost`` coerced to ``float``. Falls back to ``0.0``
        when the attribute is missing, raises on access, cannot be coerced,
        or is NaN / infinite.
    """
    import math  # function-local so the helper does not depend on file-level imports

    try:
        cost = float(adapter.estimated_cost)
    except Exception:
        # Deliberately broad: ``estimated_cost`` may be a computed property,
        # and this helper is best-effort by design.
        return 0.0

    # NaN and +/-inf survive float() but are not valid JSON numbers, so they
    # would not be "safe to serialize" as the event metadata value.
    return cost if math.isfinite(cost) else 0.0
async def _record_provider_event_if_present(
    db: AsyncSession | None,
    *,
    job: "GenerationJob | None",
    event_type: str,
    status: str,
    provider_type: ProviderType,
    adapter_name: str,
    strategy: RoutingStrategy,
    provider_id: str | None = None,
    story_id: int | None = None,
    latency_ms: int | None = None,
    estimated_cost: float | None = None,
    error: str | None = None,
) -> None:
    """Record one provider-call event on the generation job, if there is one.

    This is a no-op unless both a database session and a job are supplied.
    Otherwise the call is forwarded to ``record_generation_event`` with a
    short summary message and a metadata dict describing the provider call.
    """
    can_record = db is not None and job is not None
    if not can_record:
        return

    # The summary says "failed" whenever an error string is present,
    # regardless of the ``status`` value passed in.
    if error is None:
        summary = f"{provider_type} provider {adapter_name} {status}."
    else:
        summary = f"{provider_type} provider {adapter_name} failed."

    call_details = {
        "capability": provider_type,
        "adapter": adapter_name,
        "provider_id": provider_id,
        "strategy": strategy.value,
        "latency_ms": latency_ms,
        "estimated_cost_usd": estimated_cost,
        "error": error,
    }

    await record_generation_event(
        db,
        job=job,
        story_id=story_id,
        event_type=event_type,
        status=status,
        message=summary,
        metadata=call_details,
    )
def _get_api_key(config_ref: str | None, adapter_name: str) -> str:
"""根据 config_ref 或适配器名称获取 API Key。"""
# 优先使用 config_ref
@@ -228,6 +282,8 @@ async def _route_with_failover(
strategy: RoutingStrategy = RoutingStrategy.PRIORITY,
db: AsyncSession | None = None,
user_id: str | None = None,
generation_job: "GenerationJob | None" = None,
story_id: int | None = None,
**kwargs,
) -> T:
"""通用 provider failover 路由。
@@ -237,6 +293,8 @@ async def _route_with_failover(
strategy: 路由策略
db: 数据库会话(可选,用于指标收集和熔断检查)
user_id: 用户 ID可选用于成本追踪和预算检查
generation_job: 生成任务(可选,用于记录 provider 调用轨迹)
story_id: 故事 ID可选用于关联 provider 事件)
**kwargs: 传递给适配器的参数
"""
providers = await _get_providers_with_config(provider_type)
@@ -274,7 +332,9 @@ async def _route_with_failover(
errors.append(f"{name}: 适配器未注册")
continue
provider_id = db_provider.id if db_provider else None
provider_id = str(db_provider.id) if db_provider else None
estimated_cost: float | None = None
start_time: float | None = None
try:
logger.debug(
@@ -285,6 +345,20 @@ async def _route_with_failover(
)
adapter = adapter_class(config)
estimated_cost = _safe_estimated_cost(adapter)
await _record_provider_event_if_present(
db,
job=generation_job,
story_id=story_id,
event_type="provider_call_started",
status="running",
provider_type=provider_type,
adapter_name=name,
provider_id=provider_id,
strategy=strategy,
estimated_cost=estimated_cost,
)
# 执行并计时
start_time = time.time()
@@ -301,7 +375,7 @@ async def _route_with_failover(
provider_id=provider_id,
success=True,
latency_ms=latency_ms,
cost_usd=adapter.estimated_cost,
cost_usd=estimated_cost,
)
await health_checker.record_call_result(db, provider_id, success=True)
@@ -312,10 +386,24 @@ async def _route_with_failover(
user_id=user_id,
provider_name=name,
capability=provider_type,
estimated_cost=adapter.estimated_cost,
estimated_cost=estimated_cost,
provider_id=provider_id,
)
await _record_provider_event_if_present(
db,
job=generation_job,
story_id=story_id,
event_type="provider_call_succeeded",
status="succeeded",
provider_type=provider_type,
adapter_name=name,
provider_id=provider_id,
strategy=strategy,
latency_ms=latency_ms,
estimated_cost=estimated_cost,
)
logger.info(
"provider_success",
provider_type=provider_type,
@@ -326,6 +414,11 @@ async def _route_with_failover(
except Exception as exc:
error_msg = str(exc)
latency_ms = (
int((time.time() - start_time) * 1000)
if start_time is not None
else None
)
logger.warning(
"provider_failed",
provider_type=provider_type,
@@ -346,6 +439,21 @@ async def _route_with_failover(
db, provider_id, success=False, error=error_msg
)
await _record_provider_event_if_present(
db,
job=generation_job,
story_id=story_id,
event_type="provider_call_failed",
status="failed",
provider_type=provider_type,
adapter_name=name,
provider_id=provider_id,
strategy=strategy,
latency_ms=latency_ms,
estimated_cost=estimated_cost,
error=error_msg,
)
raise ValueError(f"No {provider_type} provider succeeded. Errors: {' | '.join(errors)}")
@@ -356,12 +464,16 @@ async def generate_story_content(
memory_context: str | None = None,
strategy: RoutingStrategy = RoutingStrategy.PRIORITY,
db: AsyncSession | None = None,
user_id: str | None = None,
generation_job: "GenerationJob | None" = None,
) -> StoryOutput:
"""生成或润色故事,支持 failover。"""
return await _route_with_failover(
"text",
strategy=strategy,
db=db,
user_id=user_id,
generation_job=generation_job,
input_type=input_type,
data=data,
education_theme=education_theme,
@@ -373,19 +485,42 @@ async def generate_image(
prompt: str,
strategy: RoutingStrategy = RoutingStrategy.PRIORITY,
db: AsyncSession | None = None,
user_id: str | None = None,
generation_job: "GenerationJob | None" = None,
story_id: int | None = None,
**kwargs,
) -> str:
"""生成图片,返回 URL支持 failover。"""
return await _route_with_failover("image", strategy=strategy, db=db, prompt=prompt, **kwargs)
return await _route_with_failover(
"image",
strategy=strategy,
db=db,
user_id=user_id,
generation_job=generation_job,
story_id=story_id,
prompt=prompt,
**kwargs,
)
async def text_to_speech(
    text: str,
    strategy: RoutingStrategy = RoutingStrategy.PRIORITY,
    db: AsyncSession | None = None,
    user_id: str | None = None,
    generation_job: "GenerationJob | None" = None,
    story_id: int | None = None,
) -> bytes:
    """Convert text to speech and return MP3 bytes, with provider failover.

    Args:
        text: Text to synthesize.
        strategy: Routing strategy handed to ``_route_with_failover``.
        db: Optional database session, forwarded to the router.
        user_id: Optional user ID, forwarded to the router.
        generation_job: Optional generation job, forwarded so provider calls
            can be recorded against it.
        story_id: Optional story ID, forwarded so provider events can be
            associated with the story.

    Returns:
        The result of routing a ``"tts"`` request through
        ``_route_with_failover``.
    """
    # Exactly one routing call. The earlier short form
    # (`strategy`, `db`, `text` only) returned first and left this call
    # unreachable, silently dropping user_id / generation_job / story_id.
    return await _route_with_failover(
        "tts",
        strategy=strategy,
        db=db,
        user_id=user_id,
        generation_job=generation_job,
        story_id=story_id,
        text=text,
    )
async def generate_storybook(
@@ -395,6 +530,8 @@ async def generate_storybook(
memory_context: str | None = None,
strategy: RoutingStrategy = RoutingStrategy.PRIORITY,
db: AsyncSession | None = None,
user_id: str | None = None,
generation_job: "GenerationJob | None" = None,
):
"""生成分页故事书,支持 failover。"""
from app.services.adapters.storybook.primary import Storybook
@@ -403,6 +540,8 @@ async def generate_storybook(
"storybook",
strategy=strategy,
db=db,
user_id=user_id,
generation_job=generation_job,
keywords=keywords,
page_count=page_count,
education_theme=education_theme,