Expand generation harness observability

This commit is contained in:
2026-06-24 10:48:23 +08:00
parent 459ca9edef
commit 1f34d80083
35 changed files with 8003 additions and 112 deletions

View File

@@ -27,6 +27,17 @@ def _build_admin_test_app(db_session) -> FastAPI:
return app
def _build_admin_auth_required_test_app(db_session) -> FastAPI:
    """Build a FastAPI app that mounts the admin router under ``/admin``.

    Only the ``get_db`` dependency is overridden (it yields ``db_session``);
    no other dependency is replaced here. The tests that use this app expect
    unauthenticated requests to be answered with 401.
    """

    async def _yield_test_session():
        # Hand the shared test session to every request.
        yield db_session

    application = FastAPI()
    application.include_router(admin_providers.router, prefix="/admin")
    application.dependency_overrides[get_db] = _yield_test_session
    return application
async def _create_story(
db_session,
*,
@@ -51,6 +62,38 @@ async def _create_story(
return story
async def _record_evaluation_event(
    db_session,
    *,
    user_id: str,
    story_id: int,
    output_mode: str,
    artifact: str,
    status: str,
    metadata: dict,
):
    """Create a generation job and record one ``evaluation_completed`` event on it.

    The job is created with ``input_type="keywords"`` and a fixed request
    payload. The event metadata starts with ``step`` and ``artifact`` and is
    then extended with the caller's ``metadata``.

    Returns:
        Whatever ``record_generation_event`` returns for the new event.
    """
    evaluation_job = await create_generation_job(
        db_session,
        user_id=user_id,
        output_mode=output_mode,
        input_type="keywords",
        request_payload={"data": "测试"},
        story_id=story_id,
    )

    # Caller-supplied keys are merged last, so they win on a key collision.
    event_metadata = {"step": "evaluation", "artifact": artifact}
    event_metadata.update(metadata)

    recorded_event = await record_generation_event(
        db_session,
        job=evaluation_job,
        story_id=story_id,
        event_type="evaluation_completed",
        status=status,
        metadata=event_metadata,
    )
    return recorded_event
async def test_admin_provider_analytics_aggregate_across_users(db_session, test_user):
second_user = User(
id="github:67890",
@@ -197,6 +240,616 @@ async def test_admin_provider_analytics_aggregate_across_users(db_session, test_
]
async def test_admin_evaluation_analytics_aggregate_internal_events(
    db_session,
    test_user,
):
    """Check that evaluation analytics aggregate events recorded for two users.

    One passing ``story_text`` evaluation and one blocking ``storybook_pages``
    evaluation are recorded, then ``GET /admin/evaluations/analytics`` is
    expected to return the combined counts, averages and breakdowns. The last
    assertions verify that a story title, a quality-gate issue message and a
    per-dimension reason do not show up in the response.
    """
    length_warning = "绘本分页正文长度可能不适合 3-8 岁儿童的翻页阅读体验。"

    # A second account, so the aggregate spans more than one user.
    other_user = User(
        id="google:evaluation-user",
        name="Evaluation User",
        avatar_url="https://example.com/eval.png",
        provider="google",
    )
    db_session.add(other_user)
    await db_session.commit()

    text_story = await _create_story(
        db_session, user_id=test_user.id, title="评测故事"
    )
    picture_story = await _create_story(
        db_session,
        user_id=other_user.id,
        title="评测绘本",
        mode="storybook",
    )

    passing_metadata = {
        "overall_score": 0.92,
        "passed": True,
        "blocking": False,
        "scores": [
            {"dimension": "structure", "score": 1.0, "reason": "完整"},
            {"dimension": "readability", "score": 0.84, "reason": "可读"},
        ],
        "warnings": [],
    }
    blocking_metadata = {
        "overall_score": 0.0,
        "passed": False,
        "blocking": True,
        "scores": [
            {"dimension": "structure", "score": 0.0, "reason": "结构失败"},
            {"dimension": "safety", "score": 0.0, "reason": "安全失败"},
        ],
        "quality_gate": {
            "issues": [
                {
                    "code": "unsafe_child_content",
                    "message": "风险词",
                    "failure_category": "safety_error",
                    "field": "pages",
                }
            ]
        },
        "warnings": [length_warning],
    }

    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=text_story.id,
        output_mode="story",
        artifact="story_text",
        status="succeeded",
        metadata=passing_metadata,
    )
    await _record_evaluation_event(
        db_session,
        user_id=other_user.id,
        story_id=picture_story.id,
        output_mode="storybook",
        artifact="storybook_pages",
        status="failed",
        metadata=blocking_metadata,
    )

    transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        reply = await http.get("/admin/evaluations/analytics")

    assert reply.status_code == 200
    report = reply.json()

    # Scalar totals across both recorded evaluations.
    expected_totals = {
        "scope": "admin_internal_evaluations",
        "total_evaluations": 2,
        "passed_evaluations": 1,
        "blocked_evaluations": 1,
        "pass_rate": 0.5,
        "average_score": 0.46,
        "job_count": 2,
        "story_count": 2,
        "user_count": 2,
    }
    assert {field: report[field] for field in expected_totals} == expected_totals

    assert report["by_artifact"] == [
        {"artifact": "story_text", "count": 1},
        {"artifact": "storybook_pages", "count": 1},
    ]
    assert report["by_output_mode"] == [
        {"output_mode": "story", "count": 1},
        {"output_mode": "storybook", "count": 1},
    ]
    assert report["score_bands"] == [
        {"band": "blocked_quality_gate", "count": 1},
        {"band": "excellent", "count": 1},
    ]
    assert report["dimension_scores"] == [
        {"dimension": "structure", "average_score": 0.5, "count": 2},
        {"dimension": "readability", "average_score": 0.84, "count": 1},
        {"dimension": "safety", "average_score": 0.0, "count": 1},
    ]
    assert report["quality_gate_issues"] == [
        {"code": "unsafe_child_content", "count": 1},
    ]
    assert report["failure_categories"] == [
        {"category": "safety_error", "count": 1},
    ]
    assert report["warnings"] == [
        {"message": length_warning, "count": 1},
    ]

    # Story title, issue message and score reason must stay out of the payload.
    rendered = str(report)
    for private_text in ("评测故事", "风险词", "完整"):
        assert private_text not in rendered
async def test_admin_evaluation_analytics_support_days_and_artifact_filters(
    db_session,
    test_user,
):
    """Check the ``days`` and ``artifact`` query filters of evaluation analytics.

    Two evaluations are recorded and the first is backdated by ten days. The
    test then expects ``days=7`` to count only the newer event,
    ``artifact=story_text`` to count only the story-text event, and an
    ``artifact=image`` value to be rejected with 422.
    """
    stale_story = await _create_story(
        db_session, user_id=test_user.id, title="旧评测"
    )
    recent_storybook = await _create_story(
        db_session,
        user_id=test_user.id,
        title="新评测",
        mode="storybook",
    )

    stale_metadata = {
        "overall_score": 0.96,
        "passed": True,
        "blocking": False,
        "scores": [{"dimension": "structure", "score": 1.0, "reason": "完整"}],
        "warnings": [],
    }
    stale_event = await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=stale_story.id,
        output_mode="story",
        artifact="story_text",
        status="succeeded",
        metadata=stale_metadata,
    )
    # Backdate the first event so it falls outside a 7-day window.
    ten_days_ago = datetime.now(timezone.utc) - timedelta(days=10)
    stale_event.created_at = ten_days_ago
    await db_session.commit()

    recent_metadata = {
        "overall_score": 0.72,
        "passed": False,
        "blocking": True,
        "scores": [{"dimension": "readability", "score": 0.62, "reason": "过短"}],
        "warnings": ["分页正文长度偏短"],
    }
    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=recent_storybook.id,
        output_mode="storybook",
        artifact="storybook_pages",
        status="failed",
        metadata=recent_metadata,
    )

    transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        # Window filter: only the storybook event is expected to remain.
        windowed_reply = await http.get("/admin/evaluations/analytics?days=7")
        assert windowed_reply.status_code == 200
        windowed = windowed_reply.json()
        assert windowed["window_days"] == 7
        assert windowed["total_evaluations"] == 1
        assert windowed["artifact"] is None
        assert windowed["by_artifact"] == [
            {"artifact": "storybook_pages", "count": 1}
        ]

        # Artifact filter: only the story-text event is expected to remain.
        filtered_reply = await http.get(
            "/admin/evaluations/analytics?artifact=story_text"
        )
        assert filtered_reply.status_code == 200
        filtered = filtered_reply.json()
        assert filtered["artifact"] == "story_text"
        assert filtered["total_evaluations"] == 1
        assert filtered["average_score"] == 0.96

        # An artifact value the endpoint does not accept fails validation.
        rejected_reply = await http.get("/admin/evaluations/analytics?artifact=image")
        assert rejected_reply.status_code == 422
async def test_admin_evaluation_analytics_requires_admin_auth(db_session):
    """Evaluation analytics must answer 401 when the app has no auth override."""
    transport = ASGITransport(app=_build_admin_auth_required_test_app(db_session))
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        unauthenticated = await http.get("/admin/evaluations/analytics")

    assert unauthenticated.status_code == 401
async def test_admin_generation_job_trace_returns_internal_event_stream(
    db_session,
    test_user,
):
    """Check that the admin job trace exposes the full payload and event stream.

    A job is created with internal-only request fields, three events are
    recorded on it, and ``GET /admin/generations/jobs/{id}/trace`` is expected
    to return the request payload unredacted, the events in order with their
    metadata, and a per-job executor coverage summary.
    """
    traced_story = await _create_story(
        db_session, user_id=test_user.id, title="内部链路故事"
    )

    internal_payload = {
        "output_mode": "story",
        "type": "keywords",
        "data": "月亮森林",
        "internal_dispatch_token": "admin-visible-token",
        "provider_override": "internal-provider",
        "evaluation_policy": {"threshold": 0.9},
    }
    traced_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="story",
        input_type="keywords",
        request_payload=internal_payload,
        story_id=traced_story.id,
    )

    planning_metadata = {
        "step": "request_acceptance",
        "artifact": "none",
        "plan": {
            "mode": "story",
            "tasks": [
                {
                    "key": "generate_narrative",
                    "step": "text_generation",
                    "artifact": "story_text",
                    "required": True,
                    "recoverable": False,
                }
            ],
        },
        "internal_threshold": 0.9,
    }
    await record_generation_event(
        db_session,
        job=traced_job,
        story_id=traced_story.id,
        event_type="workflow_planned",
        status="succeeded",
        metadata=planning_metadata,
    )

    evaluation_metadata = {
        "step": "evaluation",
        "artifact": "story_text",
        "overall_score": 0.94,
        "passed": True,
        "blocking": False,
        "scores": [{"dimension": "structure", "score": 1.0}],
    }
    await record_generation_event(
        db_session,
        job=traced_job,
        story_id=traced_story.id,
        event_type="evaluation_completed",
        status="succeeded",
        metadata=evaluation_metadata,
    )

    executor_metadata = {
        "plan_mode": "asset_generation",
        "planned_task_count": 3,
        "executed_task_count": 1,
        "ignored_task_count": 2,
        "executed_task_keys": ["complete_image_asset"],
        "ignored_task_keys": [
            "start_asset_generation",
            "complete_asset_generation",
        ],
        "result_assets": ["cover_image"],
    }
    await record_generation_event(
        db_session,
        job=traced_job,
        story_id=traced_story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata=executor_metadata,
    )

    transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        reply = await http.get(f"/admin/generations/jobs/{traced_job.id}/trace")

    assert reply.status_code == 200
    trace = reply.json()
    assert trace["id"] == traced_job.id
    assert trace["user_id"] == test_user.id

    # Internal request fields are expected to be visible to admins.
    returned_payload = trace["request_payload"]
    assert returned_payload["data"] == "月亮森林"
    assert returned_payload["internal_dispatch_token"] == "admin-visible-token"
    assert returned_payload["evaluation_policy"] == {"threshold": 0.9}

    # "request_accepted" is not recorded by this test; presumably it is
    # emitted by create_generation_job itself.
    events = trace["events"]
    assert [entry["event_type"] for entry in events] == [
        "request_accepted",
        "workflow_planned",
        "evaluation_completed",
        "executor_completed",
    ]

    planning_details = events[1]["event_metadata"]
    assert planning_details["plan"]["tasks"][0]["key"] == "generate_narrative"
    assert planning_details["internal_threshold"] == 0.9

    evaluation_details = events[2]["event_metadata"]
    assert evaluation_details["overall_score"] == 0.94
    assert evaluation_details["scores"] == [
        {"dimension": "structure", "score": 1.0}
    ]

    executor_details = events[3]["event_metadata"]
    assert executor_details["executed_task_keys"] == ["complete_image_asset"]
    assert executor_details["result_assets"] == ["cover_image"]

    coverage = trace["executor_coverage"]
    expected_coverage_totals = {
        "scope": "admin_internal_job_executor_coverage",
        "total_runs": 1,
        "total_planned_tasks": 3,
        "total_executed_tasks": 1,
        "total_ignored_tasks": 2,
        "coverage_ratio": 0.3333,
        "job_count": 1,
        "story_count": 1,
        "user_count": 1,
    }
    assert {
        field: coverage[field] for field in expected_coverage_totals
    } == expected_coverage_totals

    assert coverage["by_plan_mode"] == [
        {"plan_mode": "asset_generation", "count": 1}
    ]
    assert coverage["by_output_mode"] == [
        {"output_mode": "story", "count": 1}
    ]
    assert coverage["executed_task_keys"] == [
        {"task_key": "complete_image_asset", "count": 1}
    ]
    assert coverage["ignored_task_keys"] == [
        {"task_key": "complete_asset_generation", "count": 1},
        {"task_key": "start_asset_generation", "count": 1},
    ]
    assert coverage["result_assets"] == [
        {"asset": "cover_image", "count": 1}
    ]
async def test_admin_generation_job_trace_requires_admin_auth(db_session):
    """The job trace endpoint must answer 401 when the app has no auth override."""
    transport = ASGITransport(app=_build_admin_auth_required_test_app(db_session))
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        unauthenticated = await http.get("/admin/generations/jobs/missing-job/trace")

    assert unauthenticated.status_code == 401
async def test_admin_executor_coverage_aggregates_internal_events(
    db_session,
    test_user,
):
    """Check that executor coverage aggregates ``executor_completed`` events.

    Two jobs on the same story (one ``asset_generation``, one ``asset_retry``)
    each record an executor event. The unfiltered endpoint is expected to sum
    both, ``plan_mode=asset_retry`` to return only the retry run, and
    ``plan_mode=story`` to be rejected with 422.
    """
    covered_story = await _create_story(
        db_session, user_id=test_user.id, title="执行器覆盖故事"
    )

    generation_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="asset_generation",
        input_type="audio,image",
        request_payload={"story_id": covered_story.id, "assets": ["audio", "image"]},
        story_id=covered_story.id,
    )
    generation_run = {
        "plan_mode": "asset_generation",
        "planned_task_count": 4,
        "executed_task_count": 2,
        "ignored_task_count": 2,
        "executed_task_keys": ["complete_audio_asset", "complete_image_asset"],
        "ignored_task_keys": [
            "start_asset_generation",
            "complete_asset_generation",
        ],
        "result_assets": ["audio", "cover_image"],
    }
    await record_generation_event(
        db_session,
        job=generation_job,
        story_id=covered_story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata=generation_run,
    )

    retry_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="asset_retry",
        input_type="image",
        request_payload={"story_id": covered_story.id, "assets": ["image"]},
        story_id=covered_story.id,
    )
    retry_run = {
        "plan_mode": "asset_retry",
        "planned_task_count": 3,
        "executed_task_count": 1,
        "ignored_task_count": 2,
        "executed_task_keys": ["complete_image_asset"],
        "ignored_task_keys": ["start_asset_retry", "complete_asset_retry"],
        "result_assets": ["cover_image"],
    }
    await record_generation_event(
        db_session,
        job=retry_job,
        story_id=covered_story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata=retry_run,
    )

    transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        # Unfiltered: both runs are summed (3 executed of 7 planned tasks).
        overall_reply = await http.get("/admin/executors/coverage")
        assert overall_reply.status_code == 200
        overall = overall_reply.json()

        expected_totals = {
            "scope": "admin_internal_executor_coverage",
            "total_runs": 2,
            "total_planned_tasks": 7,
            "total_executed_tasks": 3,
            "total_ignored_tasks": 4,
            "coverage_ratio": 0.4286,
            "job_count": 2,
            "story_count": 1,
            "user_count": 1,
        }
        assert {
            field: overall[field] for field in expected_totals
        } == expected_totals

        assert overall["by_plan_mode"] == [
            {"plan_mode": "asset_generation", "count": 1},
            {"plan_mode": "asset_retry", "count": 1},
        ]
        assert overall["executed_task_keys"] == [
            {"task_key": "complete_image_asset", "count": 2},
            {"task_key": "complete_audio_asset", "count": 1},
        ]
        assert overall["result_assets"] == [
            {"asset": "cover_image", "count": 2},
            {"asset": "audio", "count": 1},
        ]

        # Filtered by plan mode: only the retry run is expected.
        retry_reply = await http.get("/admin/executors/coverage?plan_mode=asset_retry")
        assert retry_reply.status_code == 200
        retry_only = retry_reply.json()
        assert retry_only["plan_mode"] == "asset_retry"
        assert retry_only["total_runs"] == 1
        assert retry_only["total_planned_tasks"] == 3
        assert retry_only["total_executed_tasks"] == 1

        # A plan mode the endpoint does not accept fails validation.
        rejected_reply = await http.get("/admin/executors/coverage?plan_mode=story")
        assert rejected_reply.status_code == 422
async def test_admin_executor_coverage_requires_admin_auth(db_session):
    """Executor coverage must answer 401 when the app has no auth override."""
    transport = ASGITransport(app=_build_admin_auth_required_test_app(db_session))
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        unauthenticated = await http.get("/admin/executors/coverage")

    assert unauthenticated.status_code == 401
async def test_admin_harness_readiness_returns_ready_when_internal_gates_pass(
    db_session,
    test_user,
):
    """Check that harness readiness reports ``ready`` when every check passes.

    One passing evaluation and one executor run are recorded, then
    ``GET /admin/harness/readiness`` is expected to report all five checks as
    ready, echo the thresholds, and embed the analytics summaries. Neither
    the per-dimension reason text nor the story title may appear in the
    response.
    """
    ready_story = await _create_story(
        db_session, user_id=test_user.id, title="readiness 故事"
    )

    passing_metadata = {
        "overall_score": 0.92,
        "passed": True,
        "blocking": False,
        "scores": [
            {"dimension": "structure", "score": 1.0, "reason": "内部 reason"},
            {"dimension": "readability", "score": 0.84, "reason": "内部 reason"},
        ],
        "warnings": [],
    }
    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=ready_story.id,
        output_mode="story",
        artifact="story_text",
        status="succeeded",
        metadata=passing_metadata,
    )

    image_job = await create_generation_job(
        db_session,
        user_id=test_user.id,
        output_mode="asset_generation",
        input_type="image",
        request_payload={"story_id": ready_story.id, "assets": ["image"]},
        story_id=ready_story.id,
    )
    executor_run = {
        "plan_mode": "asset_generation",
        "planned_task_count": 3,
        "executed_task_count": 1,
        "ignored_task_count": 2,
        "executed_task_keys": ["complete_image_asset"],
        "ignored_task_keys": [
            "start_asset_generation",
            "complete_asset_generation",
        ],
        "result_assets": ["cover_image"],
    }
    await record_generation_event(
        db_session,
        job=image_job,
        story_id=ready_story.id,
        event_type="executor_completed",
        status="succeeded",
        metadata=executor_run,
    )

    transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        reply = await http.get("/admin/harness/readiness")

    assert reply.status_code == 200
    readiness = reply.json()
    assert readiness["scope"] == "admin_internal_harness_readiness"
    assert readiness["status"] == "ready"
    assert readiness["thresholds"] == {
        "min_runtime_evaluations": 1,
        "min_executor_runs": 1,
        "min_evaluation_pass_rate": 0.7,
        "min_evaluation_average_score": 0.7,
        "min_executor_coverage_ratio": 0.2,
    }

    status_by_code = {
        check["code"]: check["status"] for check in readiness["checks"]
    }
    assert status_by_code == {
        "golden_replay": "ready",
        "runtime_evaluation_samples": "ready",
        "runtime_evaluation_quality": "ready",
        "executor_coverage_samples": "ready",
        "executor_coverage_ratio": "ready",
    }

    golden_replay = readiness["golden_replay"]
    assert golden_replay["passed"] is True
    assert golden_replay["total_cases"] == 11

    evaluation_summary = readiness["evaluation_analytics"]
    assert evaluation_summary["total_evaluations"] == 1
    assert evaluation_summary["pass_rate"] == 1.0

    coverage_summary = readiness["executor_coverage"]
    assert coverage_summary["total_runs"] == 1
    assert coverage_summary["coverage_ratio"] == 0.3333

    # Reason text and story title must stay out of the payload.
    rendered = str(readiness)
    for private_text in ("内部 reason", "readiness 故事"):
        assert private_text not in rendered
async def test_admin_harness_readiness_blocks_low_runtime_quality(
    db_session,
    test_user,
):
    """Check that harness readiness reports ``blocked`` for a failing evaluation.

    Only a single blocking evaluation is recorded and no executor event. The
    endpoint is expected to mark the runtime quality check as blocked and
    both executor coverage checks as ``needs_attention``. Neither the
    quality-gate issue message nor the story title may appear in the
    response.
    """
    failing_story = await _create_story(
        db_session, user_id=test_user.id, title="低质量 readiness"
    )

    failing_metadata = {
        "overall_score": 0.0,
        "passed": False,
        "blocking": True,
        "scores": [{"dimension": "structure", "score": 0.0, "reason": "缺失"}],
        "quality_gate": {
            "issues": [
                {
                    "code": "missing_story_text",
                    "message": "正文缺失",
                    "failure_category": "schema_error",
                    "field": "story_text",
                }
            ]
        },
        "warnings": [],
    }
    await _record_evaluation_event(
        db_session,
        user_id=test_user.id,
        story_id=failing_story.id,
        output_mode="story",
        artifact="story_text",
        status="failed",
        metadata=failing_metadata,
    )

    transport = ASGITransport(app=_build_admin_test_app(db_session))
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        reply = await http.get("/admin/harness/readiness")

    assert reply.status_code == 200
    readiness = reply.json()
    assert readiness["status"] == "blocked"

    check_by_code = {check["code"]: check for check in readiness["checks"]}
    assert check_by_code["golden_replay"]["status"] == "ready"
    assert check_by_code["runtime_evaluation_samples"]["status"] == "ready"
    assert check_by_code["runtime_evaluation_quality"]["status"] == "blocked"
    assert check_by_code["executor_coverage_samples"]["status"] == "needs_attention"
    assert check_by_code["executor_coverage_ratio"]["status"] == "needs_attention"

    assert readiness["evaluation_analytics"]["blocked_evaluations"] == 1
    assert readiness["executor_coverage"]["total_runs"] == 0

    # Issue message and story title must stay out of the payload.
    rendered = str(readiness)
    for private_text in ("正文缺失", "低质量 readiness"):
        assert private_text not in rendered
async def test_admin_harness_readiness_requires_admin_auth(db_session):
    """Harness readiness must answer 401 when the app has no auth override."""
    transport = ASGITransport(app=_build_admin_auth_required_test_app(db_session))
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        unauthenticated = await http.get("/admin/harness/readiness")

    assert unauthenticated.status_code == 401
async def test_admin_provider_analytics_support_days_and_capability_filters(
db_session,
test_user,