263 lines
8.7 KiB
Python
263 lines
8.7 KiB
Python
"""Admin-only readiness audit for harness-driven generation."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics
|
|
from app.services.admin_executor_coverage import get_admin_executor_coverage
|
|
from app.services.harness.evaluation_replay import replay_evaluation_golden_cases
|
|
|
|
# Golden-case fixture replayed by the readiness audit. It is resolved relative
# to this module: <module dir>/harness/fixtures/evaluation_golden_cases.json
_GOLDEN_CASES_PATH = Path(__file__).resolve().parent.joinpath(
    "harness",
    "fixtures",
    "evaluation_golden_cases.json",
)

# Sample-size thresholds: a window with fewer samples than these is reported
# as "needs_attention" by the corresponding sample checks.
_MIN_RUNTIME_EVALUATIONS: int = 1
_MIN_EXECUTOR_RUNS: int = 1

# Quality thresholds: a non-empty window whose metric falls below one of these
# is reported as "blocked" by the quality / ratio checks.
_MIN_EVALUATION_PASS_RATE: float = 0.7
_MIN_EVALUATION_AVERAGE_SCORE: float = 0.7
_MIN_EXECUTOR_COVERAGE_RATIO: float = 0.2
|
|
|
|
|
|
def _check(
|
|
*,
|
|
code: str,
|
|
status: str,
|
|
message: str,
|
|
details: dict[str, Any] | None = None,
|
|
) -> dict[str, Any]:
|
|
return {
|
|
"code": code,
|
|
"status": status,
|
|
"message": message,
|
|
"details": details or {},
|
|
}
|
|
|
|
|
|
def _overall_status(checks: list[dict[str, Any]]) -> str:
    """Collapse individual check statuses into one overall status.

    The most severe status present wins: "blocked" outranks
    "needs_attention", and anything else (including no checks at all)
    yields "ready".
    """
    seen = {entry["status"] for entry in checks}
    # Ordered from most to least severe; the first one present decides.
    for severity in ("blocked", "needs_attention"):
        if severity in seen:
            return severity
    return "ready"
|
|
|
|
|
|
def _run_golden_replay() -> dict[str, Any]:
    """Replay the golden evaluation cases and summarise the outcome.

    Returns:
        A dict with ``passed``, ``total_cases``, ``failed_case_ids`` and
        ``coverage_summary``. When the fixture file does not exist, a failed
        summary with the single pseudo case id ``"fixture_missing"`` is
        returned and no replay is attempted.
    """
    if _GOLDEN_CASES_PATH.exists():
        outcome = replay_evaluation_golden_cases(_GOLDEN_CASES_PATH)
        passed = outcome.passed
        case_count = len(outcome.cases)
        # Copy into a plain list so the summary owns its own sequence.
        failed_ids = list(outcome.failed_case_ids)
        coverage = outcome.coverage_summary()
        return dict(
            passed=passed,
            total_cases=case_count,
            failed_case_ids=failed_ids,
            coverage_summary=coverage,
        )

    # No fixture on disk: report a failed replay rather than raising.
    return dict(
        passed=False,
        total_cases=0,
        failed_case_ids=["fixture_missing"],
        coverage_summary={},
    )
|
|
|
|
|
|
def _golden_replay_check(golden_replay: dict[str, Any]) -> dict[str, Any]:
    """Turn a golden-replay summary into a readiness check.

    The check is "ready" only when the replay passed and covered at least one
    case; otherwise it is "blocked" and the failing case ids are included in
    the details.
    """
    replay_passed = golden_replay["passed"]
    case_total = golden_replay["total_cases"]
    # Evaluated lazily on purpose: the count is only compared when the replay
    # reported success.
    is_ready = bool(replay_passed and case_total > 0)

    failed_ids = golden_replay["failed_case_ids"]
    details: dict[str, Any] = {
        "total_cases": case_total,
        "failed_case_count": len(failed_ids),
    }

    if is_ready:
        status = "ready"
        message = "内部 golden replay 全部通过。"
    else:
        status = "blocked"
        message = "内部 golden replay 未通过,暂停扩大 harness 接管范围。"
        # Only a failing check lists which cases failed.
        details["failed_case_ids"] = failed_ids

    return _check(
        code="golden_replay",
        status=status,
        message=message,
        details=details,
    )
|
|
|
|
|
|
def _evaluation_sample_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Check that the window contains enough runtime evaluation samples.

    Reads ``total_evaluations`` from the analytics payload and reports
    "ready" when it reaches ``_MIN_RUNTIME_EVALUATIONS``, otherwise
    "needs_attention". Both outcomes carry the same details.
    """
    sample_count = int(evaluation_analytics["total_evaluations"])

    if sample_count >= _MIN_RUNTIME_EVALUATIONS:
        status = "ready"
        message = "当前窗口已有内部 evaluation 运行样本。"
    else:
        status = "needs_attention"
        message = "当前窗口缺少内部 evaluation 运行样本,建议先跑生成烟测。"

    return _check(
        code="runtime_evaluation_samples",
        status=status,
        message=message,
        details={
            "total_evaluations": sample_count,
            "min_required": _MIN_RUNTIME_EVALUATIONS,
        },
    )
|
|
|
|
|
|
def _evaluation_quality_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Check runtime evaluation quality against the readiness thresholds.

    Args:
        evaluation_analytics: Analytics payload. ``total_evaluations`` is
            always read; ``pass_rate``, ``average_score`` and
            ``blocked_evaluations`` are only read when the window is
            non-empty.

    Returns:
        A check record with code ``runtime_evaluation_quality`` and status:

        * "needs_attention" when the window holds no evaluations,
        * "blocked" when the pass rate is below
          ``_MIN_EVALUATION_PASS_RATE`` or a non-``None`` average score is
          below ``_MIN_EVALUATION_AVERAGE_SCORE``,
        * "ready" otherwise (a ``None`` average score does not block).
    """
    total = int(evaluation_analytics["total_evaluations"])

    # Handle the empty window before touching the quality metrics: with no
    # samples the metrics may be None or absent, and converting them first
    # would raise instead of reporting "needs_attention".
    if total == 0:
        return _check(
            code="runtime_evaluation_quality",
            status="needs_attention",
            message="暂无运行期 evaluation 质量样本。",
            details={
                "total_evaluations": total,
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )

    pass_rate = float(evaluation_analytics["pass_rate"])
    # Kept unconverted so the details echo exactly what analytics reported.
    average_score = evaluation_analytics["average_score"]

    if pass_rate < _MIN_EVALUATION_PASS_RATE or (
        average_score is not None
        and float(average_score) < _MIN_EVALUATION_AVERAGE_SCORE
    ):
        return _check(
            code="runtime_evaluation_quality",
            status="blocked",
            message="运行期 evaluation 质量未达到内部 readiness 门槛。",
            details={
                "pass_rate": pass_rate,
                "average_score": average_score,
                "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )

    return _check(
        code="runtime_evaluation_quality",
        status="ready",
        message="运行期 evaluation 通过率和平均分达到内部 readiness 门槛。",
        details={
            "pass_rate": pass_rate,
            "average_score": average_score,
            "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
        },
    )
|
|
|
|
|
|
def _executor_sample_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Check that the window contains enough executor coverage runs.

    Reads ``total_runs`` from the coverage payload and reports "ready" when
    it reaches ``_MIN_EXECUTOR_RUNS``, otherwise "needs_attention". Both
    outcomes carry the same details.
    """
    run_count = int(executor_coverage["total_runs"])

    if run_count >= _MIN_EXECUTOR_RUNS:
        status = "ready"
        message = "当前窗口已有 executor coverage 运行样本。"
    else:
        status = "needs_attention"
        message = "当前窗口缺少 executor coverage 样本,建议先跑资产生成或重试烟测。"

    return _check(
        code="executor_coverage_samples",
        status=status,
        message=message,
        details={
            "total_runs": run_count,
            "min_required": _MIN_EXECUTOR_RUNS,
        },
    )
|
|
|
|
|
|
def _executor_ratio_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Check the executor coverage ratio against the readiness threshold.

    Args:
        executor_coverage: Coverage payload. ``total_runs`` is always read;
            ``coverage_ratio``, ``total_planned_tasks`` and
            ``total_executed_tasks`` are only read when the window is
            non-empty.

    Returns:
        A check record with code ``executor_coverage_ratio`` and status:

        * "needs_attention" when the window holds no runs,
        * "blocked" when the ratio is below
          ``_MIN_EXECUTOR_COVERAGE_RATIO``,
        * "ready" otherwise.
    """
    total_runs = int(executor_coverage["total_runs"])

    # Handle the empty window before parsing the ratio: with no runs the
    # ratio may be None or absent, and converting it first would raise
    # instead of reporting "needs_attention".
    if total_runs == 0:
        return _check(
            code="executor_coverage_ratio",
            status="needs_attention",
            message="暂无 executor coverage 运行样本。",
            details={
                "total_runs": total_runs,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
            },
        )

    coverage_ratio = float(executor_coverage["coverage_ratio"])

    if coverage_ratio < _MIN_EXECUTOR_COVERAGE_RATIO:
        return _check(
            code="executor_coverage_ratio",
            status="blocked",
            message="executor coverage ratio 未达到内部 readiness 门槛。",
            details={
                "coverage_ratio": coverage_ratio,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
                "total_planned_tasks": executor_coverage["total_planned_tasks"],
                "total_executed_tasks": executor_coverage["total_executed_tasks"],
            },
        )

    return _check(
        code="executor_coverage_ratio",
        status="ready",
        message="executor coverage ratio 达到内部 readiness 门槛。",
        details={
            "coverage_ratio": coverage_ratio,
            "total_planned_tasks": executor_coverage["total_planned_tasks"],
            "total_executed_tasks": executor_coverage["total_executed_tasks"],
        },
    )
|
|
|
|
|
|
async def get_admin_harness_readiness(
    db: AsyncSession,
    *,
    days: int | None = None,
) -> dict[str, Any]:
    """Return an admin-only readiness audit for harness release decisions.

    Args:
        db: Session handed to the evaluation-analytics and executor-coverage
            services.
        days: Window size forwarded unchanged to both services and echoed
            back as ``window_days``.

    Returns:
        A report containing the overall status, the thresholds in force, the
        individual checks, and the raw golden-replay, evaluation-analytics
        and executor-coverage payloads the checks were derived from.
    """
    # Gather inputs: the golden replay runs first, then the two service
    # calls are awaited one after the other on the same session.
    replay_summary = _run_golden_replay()
    analytics = await get_admin_evaluation_analytics(db, days=days)
    coverage = await get_admin_executor_coverage(db, days=days)

    checks = [
        _golden_replay_check(replay_summary),
        _evaluation_sample_check(analytics),
        _evaluation_quality_check(analytics),
        _executor_sample_check(coverage),
        _executor_ratio_check(coverage),
    ]

    report: dict[str, Any] = {
        "scope": "admin_internal_harness_readiness",
        "window_days": days,
    }
    report["status"] = _overall_status(checks)
    # Echo the thresholds so a reader can interpret the checks without
    # consulting the source.
    report["thresholds"] = {
        "min_runtime_evaluations": _MIN_RUNTIME_EVALUATIONS,
        "min_executor_runs": _MIN_EXECUTOR_RUNS,
        "min_evaluation_pass_rate": _MIN_EVALUATION_PASS_RATE,
        "min_evaluation_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
        "min_executor_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
    }
    report["checks"] = checks
    report["golden_replay"] = replay_summary
    report["evaluation_analytics"] = analytics
    report["executor_coverage"] = coverage
    return report
|