"""Admin-only readiness audit for harness-driven generation."""

from __future__ import annotations

from pathlib import Path
from typing import Any

from sqlalchemy.ext.asyncio import AsyncSession

from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics
from app.services.admin_executor_coverage import get_admin_executor_coverage
from app.services.harness.evaluation_replay import replay_evaluation_golden_cases

# Golden-case fixture, resolved relative to this module's directory.
_GOLDEN_CASES_PATH = (
    Path(__file__).resolve().parent
    / "harness"
    / "fixtures"
    / "evaluation_golden_cases.json"
)

# Readiness thresholds; echoed back to the caller under "thresholds".
_MIN_RUNTIME_EVALUATIONS = 1
_MIN_EXECUTOR_RUNS = 1
_MIN_EVALUATION_PASS_RATE = 0.7
_MIN_EVALUATION_AVERAGE_SCORE = 0.7
_MIN_EXECUTOR_COVERAGE_RATIO = 0.2


def _check(
    *,
    code: str,
    status: str,
    message: str,
    details: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build one check entry with ``code``, ``status``, ``message``, ``details``.

    ``details`` falls back to an empty dict when omitted or empty.
    """
    return {
        "code": code,
        "status": status,
        "message": message,
        "details": details or {},
    }


def _overall_status(checks: list[dict[str, Any]]) -> str:
    """Collapse per-check statuses into one overall status.

    Precedence: any ``"blocked"`` wins, then any ``"needs_attention"``;
    otherwise the result is ``"ready"`` (including for an empty list).
    """
    statuses = {check["status"] for check in checks}
    if "blocked" in statuses:
        return "blocked"
    if "needs_attention" in statuses:
        return "needs_attention"
    return "ready"


def _run_golden_replay() -> dict[str, Any]:
    """Replay the golden-case fixture and summarise the outcome as a dict.

    When the fixture file does not exist, a failed summary is returned with
    the placeholder id ``"fixture_missing"`` instead of raising.
    """
    if not _GOLDEN_CASES_PATH.exists():
        return {
            "passed": False,
            "total_cases": 0,
            "failed_case_ids": ["fixture_missing"],
            "coverage_summary": {},
        }
    result = replay_evaluation_golden_cases(_GOLDEN_CASES_PATH)
    return {
        "passed": result.passed,
        "total_cases": len(result.cases),
        "failed_case_ids": list(result.failed_case_ids),
        "coverage_summary": result.coverage_summary(),
    }


def _golden_replay_check(golden_replay: dict[str, Any]) -> dict[str, Any]:
    """Return the ``golden_replay`` check: ready or blocked.

    Ready requires both ``passed`` and at least one replayed case, so a
    replay with zero cases is reported as blocked.
    """
    if golden_replay["passed"] and golden_replay["total_cases"] > 0:
        return _check(
            code="golden_replay",
            status="ready",
            message="内部 golden replay 全部通过。",
            details={
                "total_cases": golden_replay["total_cases"],
                "failed_case_count": len(golden_replay["failed_case_ids"]),
            },
        )
    return _check(
        code="golden_replay",
        status="blocked",
        message="内部 golden replay 未通过,暂停扩大 harness 接管范围。",
        details={
            "total_cases": golden_replay["total_cases"],
            "failed_case_count": len(golden_replay["failed_case_ids"]),
            "failed_case_ids": golden_replay["failed_case_ids"],
        },
    )


def _evaluation_sample_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Return the ``runtime_evaluation_samples`` check.

    Ready when ``total_evaluations`` reaches ``_MIN_RUNTIME_EVALUATIONS``;
    otherwise needs_attention.
    """
    total = int(evaluation_analytics["total_evaluations"])
    if total >= _MIN_RUNTIME_EVALUATIONS:
        return _check(
            code="runtime_evaluation_samples",
            status="ready",
            message="当前窗口已有内部 evaluation 运行样本。",
            details={
                "total_evaluations": total,
                "min_required": _MIN_RUNTIME_EVALUATIONS,
            },
        )
    return _check(
        code="runtime_evaluation_samples",
        status="needs_attention",
        message="当前窗口缺少内部 evaluation 运行样本,建议先跑生成烟测。",
        details={
            "total_evaluations": total,
            "min_required": _MIN_RUNTIME_EVALUATIONS,
        },
    )


def _evaluation_quality_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
    """Return the ``runtime_evaluation_quality`` check.

    * needs_attention when there are no evaluations at all;
    * blocked when ``pass_rate`` is below ``_MIN_EVALUATION_PASS_RATE`` or a
      non-null ``average_score`` is below ``_MIN_EVALUATION_AVERAGE_SCORE``;
    * ready otherwise (a null ``average_score`` does not block).
    """
    total = int(evaluation_analytics["total_evaluations"])
    if total == 0:
        return _check(
            code="runtime_evaluation_quality",
            status="needs_attention",
            message="暂无运行期 evaluation 质量样本。",
            details={
                "total_evaluations": total,
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )

    # Metrics are read only after the zero-sample guard: the empty-window
    # branch never uses them, and average_score is already treated as
    # nullable, so a null pass_rate for an empty window (assumption - confirm
    # against get_admin_evaluation_analytics) must not crash the audit.
    pass_rate = float(evaluation_analytics["pass_rate"])
    average_score = evaluation_analytics["average_score"]

    if pass_rate < _MIN_EVALUATION_PASS_RATE or (
        average_score is not None
        and float(average_score) < _MIN_EVALUATION_AVERAGE_SCORE
    ):
        return _check(
            code="runtime_evaluation_quality",
            status="blocked",
            message="运行期 evaluation 质量未达到内部 readiness 门槛。",
            details={
                "pass_rate": pass_rate,
                "average_score": average_score,
                "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
                "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
                "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            },
        )
    return _check(
        code="runtime_evaluation_quality",
        status="ready",
        message="运行期 evaluation 通过率和平均分达到内部 readiness 门槛。",
        details={
            "pass_rate": pass_rate,
            "average_score": average_score,
            "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
        },
    )


def _executor_sample_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Return the ``executor_coverage_samples`` check.

    Ready when ``total_runs`` reaches ``_MIN_EXECUTOR_RUNS``; otherwise
    needs_attention.
    """
    total_runs = int(executor_coverage["total_runs"])
    if total_runs >= _MIN_EXECUTOR_RUNS:
        return _check(
            code="executor_coverage_samples",
            status="ready",
            message="当前窗口已有 executor coverage 运行样本。",
            details={
                "total_runs": total_runs,
                "min_required": _MIN_EXECUTOR_RUNS,
            },
        )
    return _check(
        code="executor_coverage_samples",
        status="needs_attention",
        message="当前窗口缺少 executor coverage 样本,建议先跑资产生成或重试烟测。",
        details={
            "total_runs": total_runs,
            "min_required": _MIN_EXECUTOR_RUNS,
        },
    )


def _executor_ratio_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
    """Return the ``executor_coverage_ratio`` check.

    * needs_attention when there are no runs at all;
    * blocked when ``coverage_ratio`` is below ``_MIN_EXECUTOR_COVERAGE_RATIO``;
    * ready otherwise.
    """
    total_runs = int(executor_coverage["total_runs"])
    if total_runs == 0:
        return _check(
            code="executor_coverage_ratio",
            status="needs_attention",
            message="暂无 executor coverage 运行样本。",
            details={
                "total_runs": total_runs,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
            },
        )

    # Converted after the zero-run guard so an undefined (null) ratio for an
    # empty window cannot raise before the needs_attention result is built.
    coverage_ratio = float(executor_coverage["coverage_ratio"])

    if coverage_ratio < _MIN_EXECUTOR_COVERAGE_RATIO:
        return _check(
            code="executor_coverage_ratio",
            status="blocked",
            message="executor coverage ratio 未达到内部 readiness 门槛。",
            details={
                "coverage_ratio": coverage_ratio,
                "min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
                "total_planned_tasks": executor_coverage["total_planned_tasks"],
                "total_executed_tasks": executor_coverage["total_executed_tasks"],
            },
        )
    return _check(
        code="executor_coverage_ratio",
        status="ready",
        message="executor coverage ratio 达到内部 readiness 门槛。",
        details={
            "coverage_ratio": coverage_ratio,
            "total_planned_tasks": executor_coverage["total_planned_tasks"],
            "total_executed_tasks": executor_coverage["total_executed_tasks"],
        },
    )


async def get_admin_harness_readiness(
    db: AsyncSession,
    *,
    days: int | None = None,
) -> dict[str, Any]:
    """Return an admin-only readiness audit for harness release decisions.

    Args:
        db: Session forwarded to the evaluation-analytics and
            executor-coverage services.
        days: Window forwarded unchanged to both services and echoed back
            as ``window_days``.

    Returns:
        A dict with the overall ``status``, the ``thresholds`` in force, the
        five individual ``checks``, and the raw ``golden_replay``,
        ``evaluation_analytics`` and ``executor_coverage`` payloads.
    """
    # NOTE(review): the golden replay is a synchronous call made inside this
    # coroutine; if the replay is slow it may be worth offloading - confirm.
    golden_replay = _run_golden_replay()
    evaluation_analytics = await get_admin_evaluation_analytics(db, days=days)
    executor_coverage = await get_admin_executor_coverage(db, days=days)

    checks = [
        _golden_replay_check(golden_replay),
        _evaluation_sample_check(evaluation_analytics),
        _evaluation_quality_check(evaluation_analytics),
        _executor_sample_check(executor_coverage),
        _executor_ratio_check(executor_coverage),
    ]

    return {
        "scope": "admin_internal_harness_readiness",
        "window_days": days,
        "status": _overall_status(checks),
        "thresholds": {
            "min_runtime_evaluations": _MIN_RUNTIME_EVALUATIONS,
            "min_executor_runs": _MIN_EXECUTOR_RUNS,
            "min_evaluation_pass_rate": _MIN_EVALUATION_PASS_RATE,
            "min_evaluation_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
            "min_executor_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO,
        },
        "checks": checks,
        "golden_replay": golden_replay,
        "evaluation_analytics": evaluation_analytics,
        "executor_coverage": executor_coverage,
    }