Expand generation harness observability
This commit is contained in:
@@ -90,11 +90,13 @@ def _job_progress(job: GenerationJob) -> dict[str, Any]:
|
||||
|
||||
progress_map: dict[str, tuple[int, str]] = {
|
||||
"request_accepted": (5, "已接收请求"),
|
||||
"workflow_planned": (8, "工作流已规划"),
|
||||
"retry_queued": (8, "重新排队中"),
|
||||
"worker_started": (12, "后台任务已开始"),
|
||||
"cancel_requested": (15, "已请求取消"),
|
||||
"context_prepared": (20, "上下文已准备"),
|
||||
"narrative_generated": (45, "正文已生成"),
|
||||
"evaluation_completed": (52, "内容评测已完成"),
|
||||
"story_saved": (60, "主记录已保存"),
|
||||
"provider_call_started": (65, "Provider 调用中"),
|
||||
"provider_call_succeeded": (72, "Provider 调用成功"),
|
||||
@@ -307,6 +309,137 @@ def generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any]:
|
||||
}
|
||||
|
||||
|
||||
# Allowlist of event-metadata keys that may be copied into user-facing event
# payloads. Any key not listed here is dropped by the public event serializer.
# Frozen so the allowlist cannot be widened by accident at runtime.
_PUBLIC_EVENT_METADATA_KEYS = frozenset(
    {
        "adapter",
        "artifact",
        "asset",
        "assets",
        "attempted_cover",
        "audio_status",
        "blocks_main_result",
        "capability",
        "completed_pages",
        "cover_prompt_present",
        "estimated_cost_usd",
        "failed_pages",
        "failure_category",
        "generation_status",
        "has_memory_context",
        "image_status",
        "input_type",
        "latency_ms",
        "mode",
        "output_mode",
        "page_count",
        "page_number",
        "recoverable",
        "requested_from_step",
        "retryable",
        "scope",
        "stale_after_minutes",
        "status",
        "step",
        "strategy",
        "text_status",
    }
)
|
||||
|
||||
# Allowlist of request-payload keys that may be echoed back in user-facing job
# details. Frozen so the allowlist cannot be widened by accident at runtime.
_PUBLIC_REQUEST_PAYLOAD_KEYS = frozenset(
    {
        "assets",
        "child_profile_id",
        "generate_images",
        "input_type",
        "output_mode",
        "page_count",
        "story_id",
        "type",
        "universe_id",
    }
)
|
||||
|
||||
|
||||
def _public_metadata_value(value: Any) -> Any:
|
||||
"""Return a JSON-safe public value or None when the value is internal."""
|
||||
|
||||
if isinstance(value, str | int | float | bool) or value is None:
|
||||
return value
|
||||
if isinstance(value, list):
|
||||
public_items = [
|
||||
item
|
||||
for item in value
|
||||
if isinstance(item, str | int | float | bool) or item is None
|
||||
]
|
||||
return public_items
|
||||
return None
|
||||
|
||||
|
||||
def public_generation_request_payload(job: GenerationJob) -> dict[str, Any]:
    """Return the subset of a job's request payload that is safe to show users.

    Only keys listed in ``_PUBLIC_REQUEST_PAYLOAD_KEYS`` are considered, in
    sorted key order. Each value is passed through ``_public_metadata_value``;
    keys whose filtered value is ``None`` are left out. A missing or empty
    ``job.request_payload`` yields an empty dict.
    """

    raw_payload = job.request_payload or {}
    return {
        field: cleaned
        for field in sorted(_PUBLIC_REQUEST_PAYLOAD_KEYS)
        if field in raw_payload
        and (cleaned := _public_metadata_value(raw_payload[field])) is not None
    }
|
||||
|
||||
|
||||
def _public_plan_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
    """Summarise the workflow plan in ``metadata`` without exposing its details.

    Returns an empty dict when ``metadata["plan"]`` is absent or not a dict.
    Otherwise the result may contain:

    * ``plan_mode`` - the plan's ``mode``, only when it is a string.
    * ``planned_task_count`` - length of the plan's ``tasks`` list.
    * ``recoverable_task_count`` - number of dict tasks whose ``recoverable``
      value is exactly ``True``.

    The two counts are only present when ``tasks`` is a list.
    """

    workflow_plan = metadata.get("plan")
    if not isinstance(workflow_plan, dict):
        return {}

    summary: dict[str, Any] = {}

    plan_mode = workflow_plan.get("mode")
    if isinstance(plan_mode, str):
        summary["plan_mode"] = plan_mode

    task_list = workflow_plan.get("tasks")
    if isinstance(task_list, list):
        # Identity check on True: truthy non-bool values must not be counted.
        recoverable_tasks = [
            entry
            for entry in task_list
            if isinstance(entry, dict) and entry.get("recoverable") is True
        ]
        summary["planned_task_count"] = len(task_list)
        summary["recoverable_task_count"] = len(recoverable_tasks)

    return summary
|
||||
|
||||
|
||||
def public_generation_event_metadata(event: GenerationJobEvent) -> dict[str, Any]:
    """Return the parts of an event's metadata that are safe for user-facing streams.

    Keys from ``_PUBLIC_EVENT_METADATA_KEYS`` are copied in sorted order after
    filtering each value with ``_public_metadata_value``; keys whose filtered
    value is ``None`` are omitted. For ``workflow_planned`` events the coarse
    plan summary from ``_public_plan_metadata`` is merged in afterwards.
    """

    raw_metadata = event.event_metadata or {}
    exposed: dict[str, Any] = {
        field: cleaned
        for field in sorted(_PUBLIC_EVENT_METADATA_KEYS)
        if field in raw_metadata
        and (cleaned := _public_metadata_value(raw_metadata[field])) is not None
    }

    if event.event_type == "workflow_planned":
        plan_summary = _public_plan_metadata(raw_metadata)
        exposed.update(plan_summary)

    return exposed
|
||||
|
||||
|
||||
def public_generation_event_to_response(event: GenerationJobEvent) -> dict[str, Any] | None:
    """Serialize an event for user-facing APIs, or return None to hide it.

    Events of type ``evaluation_completed`` and ``executor_completed`` are
    hidden (``None``). Every other event is serialized with
    ``generation_event_to_response`` and its ``event_metadata`` is replaced by
    the filtered output of ``public_generation_event_metadata``.
    """

    hidden_event_types = {"evaluation_completed", "executor_completed"}
    if event.event_type not in hidden_event_types:
        payload = generation_event_to_response(event)
        payload["event_metadata"] = public_generation_event_metadata(event)
        return payload
    return None
|
||||
|
||||
|
||||
def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
|
||||
"""Convert a generation job ORM object to an API summary dict."""
|
||||
|
||||
@@ -328,6 +461,23 @@ def generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
|
||||
}
|
||||
|
||||
|
||||
def public_generation_job_to_summary(job: GenerationJob) -> dict[str, Any]:
    """Build the user-facing summary for a job, masking internal-only steps.

    Starts from ``generation_job_to_summary(job)``. When the current step is
    ``evaluation_completed`` or ``executor_completed``, the step, progress
    percent, progress label and terminal flag are overwritten with the fixed
    public stand-in values below. Any other step is returned unchanged.
    """

    # Internal step -> (public step, progress percent, progress label).
    masked_steps = {
        "evaluation_completed": ("narrative_generated", 45, "正文已生成"),
        "executor_completed": ("workflow_planned", 8, "工作流已规划"),
    }

    result = generation_job_to_summary(job)
    stand_in = masked_steps.get(result["current_step"])
    if stand_in is None:
        return result

    public_step, percent, label = stand_in
    result["current_step"] = public_step
    result["progress_percent"] = percent
    result["progress_label"] = label
    # NOTE(review): the terminal flag is forced to False without looking at
    # job.status - confirm a job can never be terminal while on a masked step.
    result["is_terminal"] = False
    return result
|
||||
|
||||
|
||||
async def get_generation_job_for_user(
|
||||
db: AsyncSession,
|
||||
*,
|
||||
@@ -362,13 +512,13 @@ async def request_generation_job_cancel(
|
||||
raise HTTPException(status_code=409, detail="当前任务不支持取消")
|
||||
|
||||
if job.status == "canceled":
|
||||
return generation_job_to_summary(job)
|
||||
return public_generation_job_to_summary(job)
|
||||
|
||||
if _is_terminal_status(job.status):
|
||||
raise HTTPException(status_code=409, detail="当前任务已终止,无法取消")
|
||||
|
||||
if job.current_step == "cancel_requested":
|
||||
return generation_job_to_summary(job)
|
||||
return public_generation_job_to_summary(job)
|
||||
|
||||
if job.current_step in {"request_accepted", "retry_queued"}:
|
||||
story = None
|
||||
@@ -391,7 +541,7 @@ async def request_generation_job_cancel(
|
||||
error_message="Generation canceled by user before worker execution started.",
|
||||
message="Generation job was canceled before worker execution started.",
|
||||
)
|
||||
return generation_job_to_summary(job)
|
||||
return public_generation_job_to_summary(job)
|
||||
|
||||
previous_step = job.current_step
|
||||
job.error_message = "Cancellation requested by user."
|
||||
@@ -407,7 +557,7 @@ async def request_generation_job_cancel(
|
||||
)
|
||||
await db.commit()
|
||||
await db.refresh(job)
|
||||
return generation_job_to_summary(job)
|
||||
return public_generation_job_to_summary(job)
|
||||
|
||||
|
||||
async def get_generation_job_detail(
|
||||
@@ -437,9 +587,13 @@ async def get_generation_job_detail(
|
||||
).scalars().all()
|
||||
|
||||
return {
|
||||
**generation_job_to_summary(job),
|
||||
"request_payload": job.request_payload or {},
|
||||
"events": [generation_event_to_response(event) for event in events],
|
||||
**public_generation_job_to_summary(job),
|
||||
"request_payload": public_generation_request_payload(job),
|
||||
"events": [
|
||||
response
|
||||
for event in events
|
||||
if (response := public_generation_event_to_response(event)) is not None
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@@ -461,7 +615,7 @@ async def list_story_generation_jobs(
|
||||
.order_by(desc(GenerationJob.created_at), desc(GenerationJob.id))
|
||||
)
|
||||
).scalars().all()
|
||||
return [generation_job_to_summary(job) for job in jobs]
|
||||
return [public_generation_job_to_summary(job) for job in jobs]
|
||||
|
||||
|
||||
async def get_active_story_generation_job(
|
||||
@@ -513,6 +667,59 @@ def _as_float(value: Any) -> float | None:
|
||||
return None
|
||||
|
||||
|
||||
def _sorted_buckets(counts: dict[str, int]) -> list[dict[str, Any]]:
    """Turn a name -> count mapping into a ranked list of bucket dicts.

    Buckets are ordered by count, highest first; ties are broken by name in
    ascending order. Each bucket has the shape ``{"name": ..., "count": ...}``.
    """

    def _rank(entry: tuple[str, int]) -> tuple[int, str]:
        label, tally = entry
        # Negated count gives descending counts with ascending names on ties.
        return (-tally, label)

    ranked = sorted(counts.items(), key=_rank)
    return [{"name": label, "count": tally} for label, tally in ranked]
|
||||
|
||||
|
||||
def _aggregate_trace_events(events: list[GenerationJobEvent]) -> dict[str, Any]:
    """Roll up workflow trace metadata over a collection of job events.

    Events of type ``evaluation_completed`` and ``executor_completed`` are
    ignored entirely. For the remaining events the result reports:

    * ``total_events`` - how many events were counted.
    * ``failed_events`` - how many of those have ``status == "failed"``.
    * ``by_step`` - buckets keyed by the metadata ``step`` (non-empty strings).
    * ``by_artifact`` - buckets keyed by the metadata ``artifact`` (non-empty
      strings other than ``"none"``).
    * ``failure_categories`` - buckets over failed events keyed by the metadata
      ``failure_category``, falling back to ``"unknown_error"``.

    Bucket lists are ordered by ``_sorted_buckets``.
    """

    hidden_event_types = {"evaluation_completed", "executor_completed"}

    step_counts: dict[str, int] = {}
    artifact_counts: dict[str, int] = {}
    category_counts: dict[str, int] = {}
    counted = 0
    failures = 0

    def _bump(tally: dict[str, int], name: str) -> None:
        tally[name] = tally.get(name, 0) + 1

    for entry in events:
        if entry.event_type in hidden_event_types:
            continue

        counted += 1
        details = entry.event_metadata or {}
        step_name = details.get("step")
        artifact_name = details.get("artifact")
        reported_category = details.get("failure_category")

        if isinstance(step_name, str) and step_name:
            _bump(step_counts, step_name)

        if isinstance(artifact_name, str) and artifact_name and artifact_name != "none":
            _bump(artifact_counts, artifact_name)

        if entry.status != "failed":
            continue

        failures += 1
        if isinstance(reported_category, str) and reported_category:
            _bump(category_counts, reported_category)
        else:
            # Failed events without a usable category share one bucket.
            _bump(category_counts, "unknown_error")

    return {
        "total_events": counted,
        "failed_events": failures,
        "by_step": _sorted_buckets(step_counts),
        "by_artifact": _sorted_buckets(artifact_counts),
        "failure_categories": _sorted_buckets(category_counts),
    }
|
||||
|
||||
|
||||
def _aggregate_provider_events(
|
||||
events: list[GenerationJobEvent],
|
||||
*,
|
||||
@@ -679,6 +886,38 @@ async def get_story_provider_stats(
|
||||
}
|
||||
|
||||
|
||||
async def get_story_trace_summary(
    db: AsyncSession,
    *,
    story_id: int,
    user_id: str,
    days: int | None = None,
) -> dict[str, Any]:
    """Aggregate workflow trace metadata over one story's jobs owned by a user.

    Selects every ``GenerationJobEvent`` whose job matches both ``story_id``
    and ``user_id``, ordered by event id. When ``days`` is given, only events
    created at or after ``now (UTC) - days`` are included. The events are then
    summarised with ``_aggregate_trace_events``.

    Returns a dict with ``story_id``, ``window_days`` (the ``days`` argument as
    passed) and the aggregated fields.
    """

    filters = [
        GenerationJob.story_id == story_id,
        GenerationJob.user_id == user_id,
    ]
    if days is not None:
        window_start = datetime.now(timezone.utc) - timedelta(days=days)
        filters.append(GenerationJobEvent.created_at >= window_start)

    statement = (
        select(GenerationJobEvent)
        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
        .where(*filters)
        .order_by(GenerationJobEvent.id)
    )
    result = await db.execute(statement)
    trace_events = result.scalars().all()

    summary: dict[str, Any] = {"story_id": story_id, "window_days": days}
    summary.update(_aggregate_trace_events(trace_events))
    return summary
|
||||
|
||||
|
||||
async def get_user_provider_analytics(
|
||||
db: AsyncSession,
|
||||
*,
|
||||
|
||||
Reference in New Issue
Block a user