Expand generation harness observability

2026-06-24 10:48:23 +08:00
parent 459ca9edef
commit 1f34d80083
35 changed files with 8003 additions and 112 deletions
--- a/backend/app/services/admin_harness_readiness.py
+++ b/backend/app/services/admin_harness_readiness.py
@@ -0,0 +1,262 @@
+"""Admin-only readiness audit for harness-driven generation."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.services.admin_evaluation_analytics import get_admin_evaluation_analytics
+from app.services.admin_executor_coverage import get_admin_executor_coverage
+from app.services.harness.evaluation_replay import replay_evaluation_golden_cases
+
+# Golden-case fixture read by the replay check, resolved relative to this module.
+_GOLDEN_CASES_PATH = Path(__file__).resolve().parent.joinpath(
+    "harness", "fixtures", "evaluation_golden_cases.json"
+)
+
+# Readiness thresholds, echoed under "thresholds" in the audit payload.
+# Counts must reach their minimum; rates and ratios block when below theirs.
+_MIN_RUNTIME_EVALUATIONS = 1
+_MIN_EXECUTOR_RUNS = 1
+_MIN_EVALUATION_PASS_RATE = 0.7
+_MIN_EVALUATION_AVERAGE_SCORE = 0.7
+_MIN_EXECUTOR_COVERAGE_RATIO = 0.2
+
+
+def _check(
+    *,
+    code: str,
+    status: str,
+    message: str,
+    details: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Assemble one readiness check entry from keyword-only fields.
+
+    A missing or empty ``details`` is returned as a fresh empty dict.
+    """
+    payload = details if details else {}
+    return dict(code=code, status=status, message=message, details=payload)
+
+
+def _overall_status(checks: list[dict[str, Any]]) -> str:
+    """Return the most severe status found in *checks*, defaulting to "ready"."""
+    seen = {entry["status"] for entry in checks}
+    # Listed from most to least severe; the first level present wins.
+    severity_order = ("blocked", "needs_attention")
+    worst = next((level for level in severity_order if level in seen), "ready")
+    return worst
+
+
+def _run_golden_replay() -> dict[str, Any]:
+    """Replay the golden-case fixture and condense the outcome into a plain dict.
+
+    A missing fixture file skips the replay and reports the id "fixture_missing".
+    """
+    fixture = _GOLDEN_CASES_PATH
+    if fixture.exists():
+        replay = replay_evaluation_golden_cases(fixture)
+        return dict(
+            passed=replay.passed,
+            total_cases=len(replay.cases),
+            failed_case_ids=list(replay.failed_case_ids),
+            coverage_summary=replay.coverage_summary(),
+        )
+    placeholders = dict(failed_case_ids=["fixture_missing"], coverage_summary={})
+    return dict(passed=False, total_cases=0, **placeholders)
+
+
+def _golden_replay_check(golden_replay: dict[str, Any]) -> dict[str, Any]:
+    """Turn the golden replay summary into a "golden_replay" readiness check.
+
+    The check is "ready" only when the replay passed and covered at least one
+    case; every other outcome is "blocked" and also lists the failed case ids.
+    """
+    succeeded = golden_replay["passed"] and golden_replay["total_cases"] > 0
+    failed_ids = golden_replay["failed_case_ids"]
+    details = {
+        "total_cases": golden_replay["total_cases"],
+        "failed_case_count": len(failed_ids),
+    }
+    if not succeeded:
+        details["failed_case_ids"] = failed_ids
+        return _check(
+            code="golden_replay",
+            status="blocked",
+            message="内部 golden replay 未通过，暂停扩大 harness 接管范围。",
+            details=details,
+        )
+    text = "内部 golden replay 全部通过。"
+    return _check(code="golden_replay", status="ready", message=text, details=details)
+
+
+def _evaluation_sample_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
+    """Check that the window holds enough runtime evaluation samples.
+
+    Yields "ready" once ``total_evaluations`` reaches the configured minimum
+    and "needs_attention" otherwise; both outcomes carry the same details.
+    """
+    sample_count = int(evaluation_analytics["total_evaluations"])
+    if sample_count >= _MIN_RUNTIME_EVALUATIONS:
+        status = "ready"
+        message = "当前窗口已有内部 evaluation 运行样本。"
+    else:
+        status = "needs_attention"
+        message = "当前窗口缺少内部 evaluation 运行样本，建议先跑生成烟测。"
+    return _check(
+        code="runtime_evaluation_samples",
+        status=status,
+        message=message,
+        details={
+            "total_evaluations": sample_count,
+            "min_required": _MIN_RUNTIME_EVALUATIONS,
+        },
+    )
+
+
+def _evaluation_quality_check(evaluation_analytics: dict[str, Any]) -> dict[str, Any]:
+    """Grade runtime evaluation quality against the pass-rate and score minimums.
+
+    No evaluations gives "needs_attention". A pass rate below its minimum, or
+    an average score that is present and below its minimum, gives "blocked".
+    Anything else is "ready"; a missing average score never blocks by itself.
+    """
+    sample_count = int(evaluation_analytics["total_evaluations"])
+    rate = float(evaluation_analytics["pass_rate"])
+    mean_score = evaluation_analytics["average_score"]
+    minimums = {
+        "min_pass_rate": _MIN_EVALUATION_PASS_RATE,
+        "min_average_score": _MIN_EVALUATION_AVERAGE_SCORE,
+    }
+    if sample_count == 0:
+        return _check(
+            code="runtime_evaluation_quality",
+            status="needs_attention",
+            message="暂无运行期 evaluation 质量样本。",
+            details={"total_evaluations": sample_count, **minimums},
+        )
+
+    # The score is only converted when the pass rate alone does not already fail.
+    failing = rate < _MIN_EVALUATION_PASS_RATE or (
+        mean_score is not None and float(mean_score) < _MIN_EVALUATION_AVERAGE_SCORE
+    )
+    measured = {
+        "pass_rate": rate,
+        "average_score": mean_score,
+        "blocked_evaluations": evaluation_analytics["blocked_evaluations"],
+    }
+    if failing:
+        return _check(
+            code="runtime_evaluation_quality",
+            status="blocked",
+            message="运行期 evaluation 质量未达到内部 readiness 门槛。",
+            details={**measured, **minimums},
+        )
+    return _check(
+        code="runtime_evaluation_quality",
+        status="ready",
+        message="运行期 evaluation 通过率和平均分达到内部 readiness 门槛。",
+        details=measured,
+    )
+
+
+def _executor_sample_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
+    """Check that the window holds enough executor coverage runs.
+
+    Yields "ready" once ``total_runs`` reaches the configured minimum and
+    "needs_attention" otherwise; both outcomes carry the same details.
+    """
+    run_count = int(executor_coverage["total_runs"])
+    if run_count >= _MIN_EXECUTOR_RUNS:
+        status = "ready"
+        message = "当前窗口已有 executor coverage 运行样本。"
+    else:
+        status = "needs_attention"
+        message = "当前窗口缺少 executor coverage 样本，建议先跑资产生成或重试烟测。"
+    return _check(
+        code="executor_coverage_samples",
+        status=status,
+        message=message,
+        details={
+            "total_runs": run_count,
+            "min_required": _MIN_EXECUTOR_RUNS,
+        },
+    )
+
+
+def _executor_ratio_check(executor_coverage: dict[str, Any]) -> dict[str, Any]:
+    """Grade the executor coverage ratio against its configured minimum.
+
+    No runs gives "needs_attention", a ratio below the minimum gives
+    "blocked", and anything else is "ready". Task totals are reported only
+    when at least one run exists.
+    """
+    run_count = int(executor_coverage["total_runs"])
+    ratio = float(executor_coverage["coverage_ratio"])
+    floor = {"min_coverage_ratio": _MIN_EXECUTOR_COVERAGE_RATIO}
+
+    if run_count == 0:
+        return _check(
+            code="executor_coverage_ratio",
+            status="needs_attention",
+            message="暂无 executor coverage 运行样本。",
+            details={"total_runs": run_count, **floor},
+        )
+
+    # Both remaining outcomes report the same planned/executed task totals.
+    task_totals = {
+        "total_planned_tasks": executor_coverage["total_planned_tasks"],
+        "total_executed_tasks": executor_coverage["total_executed_tasks"],
+    }
+    if ratio < _MIN_EXECUTOR_COVERAGE_RATIO:
+        return _check(
+            code="executor_coverage_ratio",
+            status="blocked",
+            message="executor coverage ratio 未达到内部 readiness 门槛。",
+            details={"coverage_ratio": ratio, **floor, **task_totals},
+        )
+
+    return _check(
+        code="executor_coverage_ratio",
+        status="ready",
+        message="executor coverage ratio 达到内部 readiness 门槛。",
+        details={"coverage_ratio": ratio, **task_totals},
+    )
+
+
+async def get_admin_harness_readiness(
+    db: AsyncSession,
+    *,
+    days: int | None = None,
+) -> dict[str, Any]:
+    """Build the admin-only readiness audit used for harness release decisions."""
+    # ``days`` is forwarded as-is to both queries and echoed back as "window_days".
+    replay_summary = _run_golden_replay()
+    analytics = await get_admin_evaluation_analytics(db, days=days)
+    coverage = await get_admin_executor_coverage(db, days=days)
+    checks = [
+        _golden_replay_check(replay_summary),
+        _evaluation_sample_check(analytics),
+        _evaluation_quality_check(analytics),
+        _executor_sample_check(coverage),
+        _executor_ratio_check(coverage),
+    ]
+    thresholds = dict(
+        min_runtime_evaluations=_MIN_RUNTIME_EVALUATIONS,
+        min_executor_runs=_MIN_EXECUTOR_RUNS,
+        min_evaluation_pass_rate=_MIN_EVALUATION_PASS_RATE,
+        min_evaluation_average_score=_MIN_EVALUATION_AVERAGE_SCORE,
+        min_executor_coverage_ratio=_MIN_EXECUTOR_COVERAGE_RATIO,
+    )
+    # The raw inputs are returned next to the checks that were derived from them.
+    return dict(
+        scope="admin_internal_harness_readiness",
+        window_days=days,
+        status=_overall_status(checks),
+        thresholds=thresholds,
+        checks=checks,
+        golden_replay=replay_summary,
+        evaluation_analytics=analytics,
+        executor_coverage=coverage,
+    )