Expand generation harness observability
This commit is contained in:
262
backend/app/services/admin_harness_readiness.py
Normal file
262
backend/app/services/admin_harness_readiness.py
Normal file
@@ -0,0 +1,262 @@
|
||||
"""Admin-only readiness audit for harness-driven generation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics
|
||||
from app.services.admin_executor_coverage import get_admin_executor_coverage
|
||||
from app.services.harness.evaluation_replay import replay_evaluation_golden_cases
|
||||
|
||||
# Golden-case fixture replayed by the readiness audit; resolved relative to
# the directory that contains this module.
_GOLDEN_CASES_PATH = Path(__file__).resolve().parent.joinpath(
    "harness",
    "fixtures",
    "evaluation_golden_cases.json",
)

# Readiness thresholds used by the check helpers in this module.
# Minimum sample counts required in the queried window.
_MIN_RUNTIME_EVALUATIONS = 1
_MIN_EXECUTOR_RUNS = 1
# Minimum quality / coverage values; anything lower is reported as "blocked".
_MIN_EVALUATION_PASS_RATE = 0.7
_MIN_EVALUATION_AVERAGE_SCORE = 0.7
_MIN_EXECUTOR_COVERAGE_RATIO = 0.2
|
||||
|
||||
|
||||
def _check(
    *,
    code: str,
    status: str,
    message: str,
    details: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build a single readiness check entry.

    All arguments are keyword-only.

    Args:
        code: Identifier of the check.
        status: Status string stored under ``"status"``.
        message: Human-readable message stored under ``"message"``.
        details: Optional extra data; ``None`` or an empty mapping is
            replaced by a fresh empty dict.

    Returns:
        A dict with the keys ``code``, ``status``, ``message`` and
        ``details``, in that order.
    """
    payload = details if details else {}
    return dict(code=code, status=status, message=message, details=payload)
|
||||
|
||||
|
||||
def _overall_status(checks: list[dict[str, Any]]) -> str:
    """Collapse the per-check statuses into one overall status.

    ``"blocked"`` takes precedence over ``"needs_attention"``. When neither
    appears among the checks (including an empty list) the result is
    ``"ready"``.
    """
    seen = {entry["status"] for entry in checks}
    # Ordered from most to least severe; the first match wins.
    for severity in ("blocked", "needs_attention"):
        if severity in seen:
            return severity
    return "ready"
|
||||
|
||||
|
||||
def _run_golden_replay() -> dict[str, Any]:
    """Replay the golden-case fixture and summarize the outcome.

    Returns:
        A dict with the keys ``passed``, ``total_cases``,
        ``failed_case_ids`` and ``coverage_summary``. If the fixture path
        does not exist, the replay is skipped and a failing summary carrying
        the placeholder id ``"fixture_missing"`` is returned instead.
    """
    if _GOLDEN_CASES_PATH.exists():
        replay = replay_evaluation_golden_cases(_GOLDEN_CASES_PATH)
        return {
            "passed": replay.passed,
            "total_cases": len(replay.cases),
            "failed_case_ids": list(replay.failed_case_ids),
            "coverage_summary": replay.coverage_summary(),
        }

    # No fixture on disk: report a failed replay rather than raising.
    return {
        "passed": False,
        "total_cases": 0,
        "failed_case_ids": ["fixture_missing"],
        "coverage_summary": {},
    }
|
||||
|
||||
|
||||
def _golden_replay_check(golden_replay: dict[str, Any]) -> dict[str, Any]:
    """Turn a golden-replay summary into a readiness check entry.

    The check is ``"ready"`` only when the replay passed and covered at
    least one case; otherwise it is ``"blocked"`` and the failing case ids
    are included in the details.
    """
    passed = golden_replay["passed"]
    total_cases = golden_replay["total_cases"]
    failed_ids = golden_replay["failed_case_ids"]

    details: dict[str, Any] = {
        "total_cases": total_cases,
        "failed_case_count": len(failed_ids),
    }

    if passed and total_cases > 0:
        return _check(
            code="golden_replay",
            status="ready",
            message="内部 golden replay 全部通过。",
            details=details,
        )

    # Only the failing outcome lists the individual case ids.
    details["failed_case_ids"] = failed_ids
    return _check(
        code="golden_replay",
        status="blocked",
        message="内部 golden replay 未通过,暂停扩大 harness 接管范围。",
        details=details,
    )
|
||||
|
||||
|
||||
def _evaluation_sample_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Check that the window holds enough runtime evaluation samples.

    Reads ``total_evaluations`` from the analytics payload and compares it
    with ``_MIN_RUNTIME_EVALUATIONS``. Too few samples yields
    ``"needs_attention"``; otherwise ``"ready"``.
    """
    sample_count = int(evaluation_analytics["total_evaluations"])
    details = {
        "total_evaluations": sample_count,
        "min_required": _MIN_RUNTIME_EVALUATIONS,
    }

    if sample_count < _MIN_RUNTIME_EVALUATIONS:
        return _check(
            code="runtime_evaluation_samples",
            status="needs_attention",
            message="当前窗口缺少内部 evaluation 运行样本,建议先跑生成烟测。",
            details=details,
        )

    return _check(
        code="runtime_evaluation_samples",
        status="ready",
        message="当前窗口已有内部 evaluation 运行样本。",
        details=details,
    )
|
||||
|
||||
|
||||
def _evaluation_quality_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Check runtime evaluation quality against the readiness thresholds.

    Args:
        evaluation_analytics: Analytics payload. ``total_evaluations`` is
            always read; ``pass_rate``, ``average_score`` and
            ``blocked_evaluations`` are read only when there is at least one
            evaluation.

    Returns:
        A check entry with code ``runtime_evaluation_quality``:

        * ``"needs_attention"`` when there are no evaluations;
        * ``"blocked"`` when the pass rate is below
          ``_MIN_EVALUATION_PASS_RATE``, or an average score is present and
          below ``_MIN_EVALUATION_AVERAGE_SCORE``;
        * ``"ready"`` otherwise.
    """
    total = int(evaluation_analytics["total_evaluations"])

    # Handle the empty window before touching the aggregate fields: with no
    # samples an aggregate such as pass_rate may be None (average_score is
    # already treated as optional below), and float(None) would raise
    # instead of reporting "needs_attention".
    if total == 0:
        return _check(
            code="runtime_evaluation_quality",
            status="needs_attention",
            message="暂无运行期 evaluation 质量样本。",
            details={
                "total_evaluations": total,
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )

    pass_rate = float(evaluation_analytics["pass_rate"])
    average_score = evaluation_analytics["average_score"]

    # A missing average score is not counted as a failure; only a present
    # score below the threshold blocks.
    if pass_rate < _MIN_EVALUATION_PASS_RATE or (
        average_score is not None
        and float(average_score) < _MIN_EVALUATION_AVERAGE_SCORE
    ):
        return _check(
            code="runtime_evaluation_quality",
            status="blocked",
            message="运行期 evaluation 质量未达到内部 readiness 门槛。",
            details={
                "pass_rate": pass_rate,
                "average_score": average_score,
                "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )

    return _check(
        code="runtime_evaluation_quality",
        status="ready",
        message="运行期 evaluation 通过率和平均分达到内部 readiness 门槛。",
        details={
            "pass_rate": pass_rate,
            "average_score": average_score,
            "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
        },
    )
|
||||
|
||||
|
||||
def _executor_sample_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Check that the window holds enough executor coverage runs.

    Reads ``total_runs`` from the coverage payload and compares it with
    ``_MIN_EXECUTOR_RUNS``. Too few runs yields ``"needs_attention"``;
    otherwise ``"ready"``.
    """
    run_count = int(executor_coverage["total_runs"])
    details = {
        "total_runs": run_count,
        "min_required": _MIN_EXECUTOR_RUNS,
    }

    if run_count < _MIN_EXECUTOR_RUNS:
        return _check(
            code="executor_coverage_samples",
            status="needs_attention",
            message="当前窗口缺少 executor coverage 样本,建议先跑资产生成或重试烟测。",
            details=details,
        )

    return _check(
        code="executor_coverage_samples",
        status="ready",
        message="当前窗口已有 executor coverage 运行样本。",
        details=details,
    )
|
||||
|
||||
|
||||
def _executor_ratio_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Check the executor coverage ratio against the readiness threshold.

    Args:
        executor_coverage: Coverage payload. ``total_runs`` is always read;
            ``coverage_ratio``, ``total_planned_tasks`` and
            ``total_executed_tasks`` are read only when there is at least
            one run.

    Returns:
        A check entry with code ``executor_coverage_ratio``:

        * ``"needs_attention"`` when there are no runs;
        * ``"blocked"`` when the ratio is below
          ``_MIN_EXECUTOR_COVERAGE_RATIO``;
        * ``"ready"`` otherwise.
    """
    total_runs = int(executor_coverage["total_runs"])

    # Handle the empty window before converting the ratio: with no runs the
    # ratio may be undefined (e.g. None), and float(None) would raise
    # instead of reporting "needs_attention".
    if total_runs == 0:
        return _check(
            code="executor_coverage_ratio",
            status="needs_attention",
            message="暂无 executor coverage 运行样本。",
            details={
                "total_runs": total_runs,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
            },
        )

    coverage_ratio = float(executor_coverage["coverage_ratio"])

    if coverage_ratio < _MIN_EXECUTOR_COVERAGE_RATIO:
        return _check(
            code="executor_coverage_ratio",
            status="blocked",
            message="executor coverage ratio 未达到内部 readiness 门槛。",
            details={
                "coverage_ratio": coverage_ratio,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
                "total_planned_tasks": executor_coverage["total_planned_tasks"],
                "total_executed_tasks": executor_coverage["total_executed_tasks"],
            },
        )

    return _check(
        code="executor_coverage_ratio",
        status="ready",
        message="executor coverage ratio 达到内部 readiness 门槛。",
        details={
            "coverage_ratio": coverage_ratio,
            "total_planned_tasks": executor_coverage["total_planned_tasks"],
            "total_executed_tasks": executor_coverage["total_executed_tasks"],
        },
    )
|
||||
|
||||
|
||||
async def get_admin_harness_readiness(
    db: AsyncSession,
    *,
    days: int | None = None,
) -> dict[str, Any]:
    """Return an admin-only readiness audit for harness release decisions.

    Args:
        db: Async database session forwarded to the analytics and coverage
            services.
        days: Optional window size forwarded unchanged to both services and
            echoed back as ``window_days``.

    Returns:
        A dict containing the overall ``status``, the ``thresholds`` applied,
        the individual ``checks``, and the raw ``golden_replay``,
        ``evaluation_analytics`` and ``executor_coverage`` payloads.
    """
    # Gather the three inputs: golden replay first, then the two DB queries.
    replay_summary = _run_golden_replay()
    evaluation_stats = await get_admin_evaluation_analytics(db, days=days)
    coverage_stats = await get_admin_executor_coverage(db, days=days)

    checks: list[dict[str, Any]] = [_golden_replay_check(replay_summary)]
    checks.append(_evaluation_sample_check(evaluation_stats))
    checks.append(_evaluation_quality_check(evaluation_stats))
    checks.append(_executor_sample_check(coverage_stats))
    checks.append(_executor_ratio_check(coverage_stats))

    thresholds = {
        "min_runtime_evaluations": _MIN_RUNTIME_EVALUATIONS,
        "min_executor_runs": _MIN_EXECUTOR_RUNS,
        "min_evaluation_pass_rate": _MIN_EVALUATION_PASS_RATE,
        "min_evaluation_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
        "min_executor_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
    }

    report: dict[str, Any] = {
        "scope": "admin_internal_harness_readiness",
        "window_days": days,
    }
    report["status"] = _overall_status(checks)
    report["thresholds"] = thresholds
    report["checks"] = checks
    report["golden_replay"] = replay_summary
    report["evaluation_analytics"] = evaluation_stats
    report["executor_coverage"] = coverage_stats
    return report
|
||||
Reference in New Issue
Block a user