Expand generation harness observability

This commit is contained in:
2026-06-24 10:48:23 +08:00
parent 459ca9edef
commit 1f34d80083
35 changed files with 8003 additions and 112 deletions

View File

@@ -0,0 +1,262 @@
"""Admin-only readiness audit for harness-driven generation."""
from __future__ import annotations
from pathlib import Path
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics
from app.services.admin_executor_coverage import get_admin_executor_coverage
from app.services.harness.evaluation_replay import replay_evaluation_golden_cases
_GOLDEN_CASES_PATH = (
Path(__file__).resolve().parent
/ "harness"
/ "fixtures"
/ "evaluation_golden_cases.json"
)
_MIN_RUNTIME_EVALUATIONS = 1
_MIN_EXECUTOR_RUNS = 1
_MIN_EVALUATION_PASS_RATE = 0.7
_MIN_EVALUATION_AVERAGE_SCORE = 0.7
_MIN_EXECUTOR_COVERAGE_RATIO = 0.2
def _check(
*,
code: str,
status: str,
message: str,
details: dict[str, Any] | None = None,
) -> dict[str, Any]:
return {
"code": code,
"status": status,
"message": message,
"details": details or {},
}
def _overall_status(checks: list[dict[str, Any]]) -> str:
statuses = {check["status"] for check in checks}
if "blocked" in statuses:
return "blocked"
if "needs_attention" in statuses:
return "needs_attention"
return "ready"
def _run_golden_replay() -> dict[str, Any]:
    """Replay the golden-case fixture and summarise the outcome as a dict.

    Returns:
        A dict with ``passed``, ``total_cases``, ``failed_case_ids`` and
        ``coverage_summary``. When the fixture file does not exist, no
        replay runs and a failing summary with the placeholder id
        ``"fixture_missing"`` is returned instead.
    """
    if _GOLDEN_CASES_PATH.exists():
        replay = replay_evaluation_golden_cases(_GOLDEN_CASES_PATH)
        return {
            "passed": replay.passed,
            "total_cases": len(replay.cases),
            "failed_case_ids": list(replay.failed_case_ids),
            "coverage_summary": replay.coverage_summary(),
        }

    # No fixture on disk: report a failed replay rather than raising.
    return {
        "passed": False,
        "total_cases": 0,
        "failed_case_ids": ["fixture_missing"],
        "coverage_summary": {},
    }
def _golden_replay_check(golden_replay: dict[str, Any]) -> dict[str, Any]:
    """Turn a golden-replay summary into a readiness check.

    The check is ``"ready"`` only when the replay passed and covered at
    least one case; anything else is ``"blocked"``. The blocked record
    additionally lists the failing case ids.
    """
    replay_ok = golden_replay["passed"] and golden_replay["total_cases"] > 0

    # Fields shared by both outcomes.
    details: dict[str, Any] = {
        "total_cases": golden_replay["total_cases"],
        "failed_case_count": len(golden_replay["failed_case_ids"]),
    }

    if not replay_ok:
        details["failed_case_ids"] = golden_replay["failed_case_ids"]
        return _check(
            code="golden_replay",
            status="blocked",
            message="内部 golden replay 未通过,暂停扩大 harness 接管范围。",
            details=details,
        )

    return _check(
        code="golden_replay",
        status="ready",
        message="内部 golden replay 全部通过。",
        details=details,
    )
def _evaluation_sample_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Check that the window holds enough runtime evaluation samples.

    ``"ready"`` when ``total_evaluations`` reaches
    ``_MIN_RUNTIME_EVALUATIONS``, otherwise ``"needs_attention"``. Both
    outcomes report the observed count and the required minimum.
    """
    sample_count = int(evaluation_analytics["total_evaluations"])

    if sample_count >= _MIN_RUNTIME_EVALUATIONS:
        status = "ready"
        message = "当前窗口已有内部 evaluation 运行样本。"
    else:
        status = "needs_attention"
        message = "当前窗口缺少内部 evaluation 运行样本,建议先跑生成烟测。"

    return _check(
        code="runtime_evaluation_samples",
        status=status,
        message=message,
        details={
            "total_evaluations": sample_count,
            "min_required": _MIN_RUNTIME_EVALUATIONS,
        },
    )
def _evaluation_quality_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Check runtime evaluation quality against the readiness thresholds.

    Args:
        evaluation_analytics: Analytics dict. ``total_evaluations`` is always
            read; ``pass_rate``, ``average_score`` and
            ``blocked_evaluations`` are read only when at least one
            evaluation exists.

    Returns:
        A check record with code ``"runtime_evaluation_quality"`` and status:

        * ``"needs_attention"`` when there are no evaluations at all;
        * ``"blocked"`` when ``pass_rate`` is below
          ``_MIN_EVALUATION_PASS_RATE`` or a non-null ``average_score`` is
          below ``_MIN_EVALUATION_AVERAGE_SCORE``;
        * ``"ready"`` otherwise.
    """
    total = int(evaluation_analytics["total_evaluations"])
    if total == 0:
        return _check(
            code="runtime_evaluation_quality",
            status="needs_attention",
            message="暂无运行期 evaluation 质量样本。",
            details={
                "total_evaluations": total,
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )

    # Read the quality metrics only after the empty-window guard: with zero
    # samples they carry no information, and a null pass_rate would make
    # float() raise before the guard could return "needs_attention".
    pass_rate = float(evaluation_analytics["pass_rate"])
    average_score = evaluation_analytics["average_score"]

    # A null average score is not held against the run; only a known score
    # below the floor blocks.
    score_too_low = (
        average_score is not None
        and float(average_score) < _MIN_EVALUATION_AVERAGE_SCORE
    )
    if pass_rate < _MIN_EVALUATION_PASS_RATE or score_too_low:
        return _check(
            code="runtime_evaluation_quality",
            status="blocked",
            message="运行期 evaluation 质量未达到内部 readiness 门槛。",
            details={
                "pass_rate": pass_rate,
                "average_score": average_score,
                "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )

    return _check(
        code="runtime_evaluation_quality",
        status="ready",
        message="运行期 evaluation 通过率和平均分达到内部 readiness 门槛。",
        details={
            "pass_rate": pass_rate,
            "average_score": average_score,
            "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
        },
    )
def _executor_sample_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Check that the window holds enough executor coverage runs.

    ``"ready"`` when ``total_runs`` reaches ``_MIN_EXECUTOR_RUNS``,
    otherwise ``"needs_attention"``. Both outcomes report the observed run
    count and the required minimum.
    """
    run_count = int(executor_coverage["total_runs"])

    if run_count >= _MIN_EXECUTOR_RUNS:
        status = "ready"
        message = "当前窗口已有 executor coverage 运行样本。"
    else:
        status = "needs_attention"
        message = "当前窗口缺少 executor coverage 样本,建议先跑资产生成或重试烟测。"

    return _check(
        code="executor_coverage_samples",
        status=status,
        message=message,
        details={
            "total_runs": run_count,
            "min_required": _MIN_EXECUTOR_RUNS,
        },
    )
def _executor_ratio_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Check the executor coverage ratio against the readiness threshold.

    Args:
        executor_coverage: Coverage dict. ``total_runs`` is always read;
            ``coverage_ratio``, ``total_planned_tasks`` and
            ``total_executed_tasks`` are read only when at least one run
            exists.

    Returns:
        A check record with code ``"executor_coverage_ratio"`` and status:

        * ``"needs_attention"`` when there are no runs at all;
        * ``"blocked"`` when ``coverage_ratio`` is below
          ``_MIN_EXECUTOR_COVERAGE_RATIO``;
        * ``"ready"`` otherwise.
    """
    total_runs = int(executor_coverage["total_runs"])
    if total_runs == 0:
        return _check(
            code="executor_coverage_ratio",
            status="needs_attention",
            message="暂无 executor coverage 运行样本。",
            details={
                "total_runs": total_runs,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
            },
        )

    # Convert the ratio only after the empty-window guard: with zero runs the
    # ratio is meaningless, and a null value would make float() raise before
    # the guard could return "needs_attention".
    coverage_ratio = float(executor_coverage["coverage_ratio"])

    if coverage_ratio < _MIN_EXECUTOR_COVERAGE_RATIO:
        return _check(
            code="executor_coverage_ratio",
            status="blocked",
            message="executor coverage ratio 未达到内部 readiness 门槛。",
            details={
                "coverage_ratio": coverage_ratio,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
                "total_planned_tasks": executor_coverage["total_planned_tasks"],
                "total_executed_tasks": executor_coverage["total_executed_tasks"],
            },
        )

    return _check(
        code="executor_coverage_ratio",
        status="ready",
        message="executor coverage ratio 达到内部 readiness 门槛。",
        details={
            "coverage_ratio": coverage_ratio,
            "total_planned_tasks": executor_coverage["total_planned_tasks"],
            "total_executed_tasks": executor_coverage["total_executed_tasks"],
        },
    )
async def get_admin_harness_readiness(
    db: AsyncSession,
    *,
    days: int | None = None,
) -> dict[str, Any]:
    """Build the admin-only harness readiness report.

    Runs the golden replay, fetches evaluation analytics and executor
    coverage for the requested window, derives the five readiness checks
    and folds them into one overall status.

    Args:
        db: Session passed through to both analytics queries.
        days: Window size passed through to both queries and echoed back as
            ``window_days``; ``None`` is forwarded unchanged.

    Returns:
        A dict with ``scope``, ``window_days``, ``status``, ``thresholds``,
        ``checks`` and the three raw inputs (``golden_replay``,
        ``evaluation_analytics``, ``executor_coverage``).
    """
    replay_summary = _run_golden_replay()
    # The two queries are awaited one after the other on the same session.
    # NOTE(review): SQLAlchemy documents a single AsyncSession as unsafe for
    # concurrent tasks, so confirm before ever parallelising these.
    evaluation_stats = await get_admin_evaluation_analytics(db, days=days)
    coverage_stats = await get_admin_executor_coverage(db, days=days)

    checks = [
        _golden_replay_check(replay_summary),
        _evaluation_sample_check(evaluation_stats),
        _evaluation_quality_check(evaluation_stats),
        _executor_sample_check(coverage_stats),
        _executor_ratio_check(coverage_stats),
    ]

    # Echo the thresholds so the report can be read without this module.
    thresholds = {
        "min_runtime_evaluations": _MIN_RUNTIME_EVALUATIONS,
        "min_executor_runs": _MIN_EXECUTOR_RUNS,
        "min_evaluation_pass_rate": _MIN_EVALUATION_PASS_RATE,
        "min_evaluation_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
        "min_executor_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
    }

    report: dict[str, Any] = {
        "scope": "admin_internal_harness_readiness",
        "window_days": days,
        "status": _overall_status(checks),
        "thresholds": thresholds,
        "checks": checks,
        "golden_replay": replay_summary,
        "evaluation_analytics": evaluation_stats,
        "executor_coverage": coverage_stats,
    }
    return report