Expand generation harness observability

This commit is contained in:
2026-06-24 10:48:23 +08:00
parent 459ca9edef
commit 1f34d80083
35 changed files with 8003 additions and 112 deletions

View File

@@ -90,11 +90,13 @@ def _job_progress(job: GenerationJob) -> dict[str, Any]:
progress_map: dict[str, tuple[int, str]] = {
"request_accepted": (5, "已接收请求"),
"workflow_planned": (8, "工作流已规划"),
"retry_queued": (8, "重新排队中"),
"worker_started": (12, "后台任务已开始"),
"cancel_requested": (15, "已请求取消"),
"context_prepared": (20, "上下文已准备"),
"narrative_generated": (45, "正文已生成"),
"evaluation_completed": (52, "内容评测已完成"),
"story_saved": (60, "主记录已保存"),
"provider_call_started": (65, "Provider 调用中"),
"provider_call_succeeded": (72, "Provider 调用成功"),
@@ -307,6 +309,137 @@ def generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any]:
}
# Allowlist of event-metadata keys that may appear in user-facing event
# responses. public_generation_event_metadata() looks up only these keys in
# event.event_metadata; each value is additionally passed through
# _public_metadata_value() before it is exposed.
_PUBLIC_EVENT_METADATA_KEYS = {
"adapter",
"artifact",
"asset",
"assets",
"attempted_cover",
"audio_status",
"blocks_main_result",
"capability",
"completed_pages",
"cover_prompt_present",
"estimated_cost_usd",
"failed_pages",
"failure_category",
"generation_status",
"has_memory_context",
"image_status",
"input_type",
"latency_ms",
"mode",
"output_mode",
"page_count",
"page_number",
"recoverable",
"requested_from_step",
"retryable",
"scope",
"stale_after_minutes",
"status",
"step",
"strategy",
"text_status",
}
# Allowlist of request-payload keys that may appear in user-facing job
# details. public_generation_request_payload() copies only these keys from
# job.request_payload, after passing each value through
# _public_metadata_value().
_PUBLIC_REQUEST_PAYLOAD_KEYS = {
"assets",
"child_profile_id",
"generate_images",
"input_type",
"output_mode",
"page_count",
"story_id",
"type",
"universe_id",
}
def _public_metadata_value(value: Any) -> Any:
"""Return a JSON-safe public value or None when the value is internal."""
if isinstance(value, str | int | float | bool) or value is None:
return value
if isinstance(value, list):
public_items = [
item
for item in value
if isinstance(item, str | int | float | bool) or item is None
]
return public_items
return None
def public_generation_request_payload(job: GenerationJob) -> dict[str, Any]:
    """Build the user-facing subset of a job's request payload.

    Only keys listed in ``_PUBLIC_REQUEST_PAYLOAD_KEYS`` are considered, in
    sorted key order. Each value is passed through
    ``_public_metadata_value``; entries whose sanitized value is ``None``
    are left out. A missing or empty ``job.request_payload`` produces an
    empty dict.
    """
    raw_payload = job.request_payload or {}
    sanitized = {
        field: _public_metadata_value(raw_payload[field])
        for field in sorted(_PUBLIC_REQUEST_PAYLOAD_KEYS)
        if field in raw_payload
    }
    # Drop entries that were rejected (or were None to begin with).
    return {field: item for field, item in sanitized.items() if item is not None}
def _public_plan_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
"""Expose only coarse workflow plan metadata to user-facing responses."""
plan = metadata.get("plan")
if not isinstance(plan, dict):
return {}
public: dict[str, Any] = {}
mode = plan.get("mode")
if isinstance(mode, str):
public["plan_mode"] = mode
tasks = plan.get("tasks")
if isinstance(tasks, list):
public["planned_task_count"] = len(tasks)
public["recoverable_task_count"] = sum(
1
for task in tasks
if isinstance(task, dict) and task.get("recoverable") is True
)
return public
def public_generation_event_metadata(event: GenerationJobEvent) -> dict[str, Any]:
    """Build the user-facing subset of an event's metadata.

    Keys from ``_PUBLIC_EVENT_METADATA_KEYS`` are visited in sorted order;
    each present value is passed through ``_public_metadata_value`` and kept
    only when the result is not ``None``. For ``workflow_planned`` events the
    coarse plan summary from ``_public_plan_metadata`` is merged in as well.
    """
    raw = event.event_metadata or {}
    exposed: dict[str, Any] = {}
    for name in sorted(_PUBLIC_EVENT_METADATA_KEYS):
        if name in raw:
            cleaned = _public_metadata_value(raw[name])
            if cleaned is not None:
                exposed[name] = cleaned

    if event.event_type == "workflow_planned":
        # The raw plan itself is never copied, only derived counts/mode.
        exposed |= _public_plan_metadata(raw)
    return exposed
def public_generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any] | None:
    """Serialize an event for user-facing APIs, or return None to hide it.

    Events of type ``evaluation_completed`` and ``executor_completed`` are
    suppressed entirely. For every other event the regular response from
    ``generation_event_to_response`` is used, with its ``event_metadata``
    replaced by the filtered version from
    ``public_generation_event_metadata``.
    """
    is_hidden = event.event_type in {"evaluation_completed", "executor_completed"}
    if is_hidden:
        return None

    payload = generation_event_to_response(event)
    payload.update(event_metadata=public_generation_event_metadata(event))
    return payload
def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
"""Convert a generation job ORM object to an API summary dict."""
@@ -328,6 +461,23 @@ def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
}
def public_generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
    """Summarize a job for user-facing APIs, masking internal-only steps.

    Starts from ``generation_job_to_summary(job)``. When the current step is
    an internal one, the summary is rewritten to a public step instead:

    * ``evaluation_completed`` is shown as ``narrative_generated`` (45%);
    * ``executor_completed`` is shown as ``workflow_planned`` (8%).

    In both cases ``is_terminal`` is set to ``False``.
    NOTE(review): this overrides ``is_terminal`` even if the job has already
    reached a terminal status while on one of these steps - confirm intended.
    """
    # internal step -> (public step, progress percent, progress label)
    masked_steps = {
        "evaluation_completed": ("narrative_generated", 45, "正文已生成"),
        "executor_completed": ("workflow_planned", 8, "工作流已规划"),
    }

    summary = generation_job_to_summary(job)
    replacement = masked_steps.get(summary["current_step"])
    if replacement is not None:
        (
            summary["current_step"],
            summary["progress_percent"],
            summary["progress_label"],
        ) = replacement
        summary["is_terminal"] = False
    return summary
async def get_generation_job_for_user(
db: AsyncSession,
*,
@@ -362,13 +512,13 @@ async def request_generation_job_cancel(
raise HTTPException(status_code=409, detail="当前任务不支持取消")
if job.status == "canceled":
return generation_job_to_summary(job)
return public_generation_job_to_summary(job)
if _is_terminal_status(job.status):
raise HTTPException(status_code=409, detail="当前任务已终止,无法取消")
if job.current_step == "cancel_requested":
return generation_job_to_summary(job)
return public_generation_job_to_summary(job)
if job.current_step in {"request_accepted", "retry_queued"}:
story = None
@@ -391,7 +541,7 @@ async def request_generation_job_cancel(
error_message="Generation canceled by user before worker execution started.",
message="Generation job was canceled before worker execution started.",
)
return generation_job_to_summary(job)
return public_generation_job_to_summary(job)
previous_step = job.current_step
job.error_message = "Cancellation requested by user."
@@ -407,7 +557,7 @@ async def request_generation_job_cancel(
)
await db.commit()
await db.refresh(job)
return generation_job_to_summary(job)
return public_generation_job_to_summary(job)
async def get_generation_job_detail(
@@ -437,9 +587,13 @@ async def get_generation_job_detail(
).scalars().all()
return {
**generation_job_to_summary(job),
"request_payload": job.request_payload or {},
"events": [generation_event_to_response(event) for event in events],
**public_generation_job_to_summary(job),
"request_payload": public_generation_request_payload(job),
"events": [
response
for event in events
if (response := public_generation_event_to_response(event)) is not None
],
}
@@ -461,7 +615,7 @@ async def list_story_generation_jobs(
.order_by(desc(GenerationJob.created_at), desc(GenerationJob.id))
)
).scalars().all()
return [generation_job_to_summary(job) for job in jobs]
return [public_generation_job_to_summary(job) for job in jobs]
async def get_active_story_generation_job(
@@ -513,6 +667,59 @@ def _as_float(value: Any) -> float | None:
return None
def _sorted_buckets(counts: dict[str, int]) -> list[dict[str, Any]]:
return [
{"name": name, "count": count}
for name, count in sorted(
counts.items(),
key=lambda item: (-item[1], item[0]),
)
]
def _aggregate_trace_events(events: list[GenerationJobEvent]) -> dict[str, Any]:
    """Count trace events by step, artifact and failure category.

    Events of type ``evaluation_completed`` and ``executor_completed`` are
    ignored entirely. For every remaining event:

    * a non-empty string ``step`` in its metadata increments ``by_step``;
    * a non-empty string ``artifact`` other than ``"none"`` increments
      ``by_artifact``;
    * a ``status`` of ``"failed"`` increments ``failed_events`` and the
      ``failure_categories`` bucket named by the metadata's
      ``failure_category`` (``"unknown_error"`` when that is missing, empty
      or not a string).

    Returns a dict with ``total_events``, ``failed_events`` and the three
    bucket lists, each ordered by ``_sorted_buckets``.
    """
    hidden_types = {"evaluation_completed", "executor_completed"}
    step_counts: dict[str, int] = {}
    artifact_counts: dict[str, int] = {}
    failure_counts: dict[str, int] = {}
    counted = 0
    failures = 0

    def bump(tally: dict[str, int], key: str) -> None:
        tally[key] = tally.get(key, 0) + 1

    for event in events:
        if event.event_type in hidden_types:
            continue
        counted += 1
        details = event.event_metadata or {}

        step_name = details.get("step")
        if isinstance(step_name, str) and step_name:
            bump(step_counts, step_name)

        artifact_name = details.get("artifact")
        if isinstance(artifact_name, str) and artifact_name not in ("", "none"):
            bump(artifact_counts, artifact_name)

        if event.status != "failed":
            continue
        failures += 1
        reason = details.get("failure_category")
        if not (isinstance(reason, str) and reason):
            # Failed events without a usable category still get counted.
            reason = "unknown_error"
        bump(failure_counts, reason)

    return {
        "total_events": counted,
        "failed_events": failures,
        "by_step": _sorted_buckets(step_counts),
        "by_artifact": _sorted_buckets(artifact_counts),
        "failure_categories": _sorted_buckets(failure_counts),
    }
def _aggregate_provider_events(
events: list[GenerationJobEvent],
*,
@@ -679,6 +886,38 @@ async def get_story_provider_stats(
}
async def get_story_trace_summary(
    db: AsyncSession,
    *,
    story_id: int,
    user_id: str,
    days: int | None = None,
) -> dict[str, Any]:
    """Return aggregated trace statistics for one story's generation jobs.

    Selects every ``GenerationJobEvent`` whose job matches both ``story_id``
    and ``user_id``, ordered by event id. When ``days`` is given, only events
    created at or after ``now(UTC) - days`` are included.

    The result contains ``story_id``, ``window_days`` (the ``days`` argument
    as passed) and every field produced by ``_aggregate_trace_events``.
    """
    filters = [
        GenerationJob.story_id == story_id,
        GenerationJob.user_id == user_id,
    ]
    if days is not None:
        window_start = datetime.now(timezone.utc) - timedelta(days=days)
        filters.append(GenerationJobEvent.created_at >= window_start)

    # Multiple criteria passed to where() are combined with AND.
    statement = (
        select(GenerationJobEvent)
        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
        .where(*filters)
        .order_by(GenerationJobEvent.id)
    )
    result = await db.execute(statement)
    matched_events = result.scalars().all()

    summary: dict[str, Any] = {"story_id": story_id, "window_days": days}
    summary.update(_aggregate_trace_events(matched_events))
    return summary
async def get_user_provider_analytics(
db: AsyncSession,
*,