# dreamweaver/backend/app/services/admin_evaluation_analytics.py

"""Admin-only analytics for internal generation evaluation events."""

from __future__ import annotations

from datetime import datetime, timedelta, timezone
from typing import Any

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.db.models import GenerationJob, GenerationJobEvent


def _as_float(value: Any) -> float | None:
    if isinstance(value, int | float):
        return float(value)
    return None


def _sorted_count_buckets(counts: dict[str, int], *, key_name: str) -> list[dict[str, Any]]:
    return [
        {key_name: name, "count": count}
        for name, count in sorted(
            counts.items(),
            key=lambda item: (-item[1], item[0]),
        )
    ]


def _average_bucket(
    totals: dict[str, float],
    counts: dict[str, int],
    *,
    key_name: str,
) -> list[dict[str, Any]]:
    rows = [
        {
            key_name: name,
            "average_score": round(totals[name] / counts[name], 4),
            "count": counts[name],
        }
        for name in totals
        if counts.get(name)
    ]
    rows.sort(key=lambda item: (-int(item["count"]), str(item[key_name])))
    return rows


def _score_band(score: float) -> str:
    if score >= 0.9:
        return "excellent"
    if score >= 0.8:
        return "good"
    if score >= 0.7:
        return "pass"
    if score > 0:
        return "blocked_low_score"
    return "blocked_quality_gate"


def _metadata_scores(metadata: dict[str, Any]) -> list[dict[str, Any]]:
    raw_scores = metadata.get("scores")
    if not isinstance(raw_scores, list):
        return []
    return [score for score in raw_scores if isinstance(score, dict)]


def _quality_gate_issues(metadata: dict[str, Any]) -> list[dict[str, Any]]:
    quality_gate = metadata.get("quality_gate")
    if not isinstance(quality_gate, dict):
        return []
    raw_issues = quality_gate.get("issues")
    if not isinstance(raw_issues, list):
        return []
    return [issue for issue in raw_issues if isinstance(issue, dict)]


async def get_admin_evaluation_analytics(
    db: AsyncSession,
    *,
    days: int | None = None,
    artifact: str | None = None,
) -> dict[str, Any]:
    """Aggregate internal evaluation results for the admin control plane.

    Loads every ``evaluation_completed`` job event joined to its job,
    optionally limited to a recent time window, and folds each event's
    metadata into counters and averages.

    Args:
        db: Async session used to execute the query.
        days: When given, only events whose ``created_at`` is at or after
            "now (UTC) minus ``days`` days" are loaded. ``None`` disables
            the window.
        artifact: When given, only events whose metadata ``"artifact"``
            value (stringified, ``"unknown"`` when missing or falsy) equals
            this string are aggregated. The filter is applied in Python
            after the rows are loaded.

    Returns:
        A dict containing the echoed ``window_days`` and ``artifact``
        filters, the total/passed/blocked evaluation counts, ``pass_rate``
        (0.0 when nothing matched), ``average_score`` (``None`` when no
        event carried a numeric overall score), distinct job/story/user
        counts, and sorted bucket lists for artifacts, output modes, score
        bands, per-dimension averages, quality-gate issue codes, failure
        categories, and warnings.
    """

    cutoff = datetime.now(timezone.utc) - timedelta(days=days) if days is not None else None

    query = (
        select(GenerationJobEvent, GenerationJob)
        .join(GenerationJob, GenerationJobEvent.job_id == GenerationJob.id)
        .where(GenerationJobEvent.event_type == "evaluation_completed")
        .order_by(GenerationJobEvent.id)
    )
    if cutoff is not None:
        query = query.where(GenerationJobEvent.created_at >= cutoff)

    rows = (await db.execute(query)).all()

    total_evaluations = 0
    passed_evaluations = 0
    blocked_evaluations = 0
    score_total = 0.0
    score_count = 0
    job_ids: set[str] = set()
    story_ids: set[int] = set()
    user_ids: set[str] = set()
    artifacts: dict[str, int] = {}
    output_modes: dict[str, int] = {}
    score_bands: dict[str, int] = {}
    dimension_totals: dict[str, float] = {}
    dimension_counts: dict[str, int] = {}
    quality_gate_codes: dict[str, int] = {}
    failure_categories: dict[str, int] = {}
    warning_counts: dict[str, int] = {}

    for event, job in rows:
        # Nested metadata values are type-checked before use, so the top-level
        # payload gets the same treatment: a non-dict value (None, list,
        # scalar) is handled as empty metadata instead of failing on ``.get``.
        raw_metadata = event.event_metadata
        metadata = raw_metadata if isinstance(raw_metadata, dict) else {}

        event_artifact = str(metadata.get("artifact") or "unknown")
        if artifact is not None and event_artifact != artifact:
            continue

        total_evaluations += 1
        job_ids.add(job.id)
        user_ids.add(job.user_id)
        # Prefer the story id recorded on the event; fall back to the job's.
        if event.story_id is not None:
            story_ids.add(int(event.story_id))
        elif job.story_id is not None:
            story_ids.add(int(job.story_id))

        artifacts[event_artifact] = artifacts.get(event_artifact, 0) + 1
        output_modes[job.output_mode] = output_modes.get(job.output_mode, 0) + 1

        # Only a literal ``True`` counts; truthy non-boolean values do not.
        passed = metadata.get("passed") is True
        blocking = metadata.get("blocking") is True
        if passed:
            passed_evaluations += 1
        if blocking:
            blocked_evaluations += 1

        overall_score = _as_float(metadata.get("overall_score"))
        if overall_score is not None:
            score_total += overall_score
            score_count += 1
            band = _score_band(overall_score)
            score_bands[band] = score_bands.get(band, 0) + 1

        for score in _metadata_scores(metadata):
            dimension = score.get("dimension")
            dimension_score = _as_float(score.get("score"))
            # Skip entries lacking a string dimension name or a numeric score.
            if not isinstance(dimension, str) or dimension_score is None:
                continue
            dimension_totals[dimension] = dimension_totals.get(dimension, 0.0) + dimension_score
            dimension_counts[dimension] = dimension_counts.get(dimension, 0) + 1

        for issue in _quality_gate_issues(metadata):
            code = issue.get("code")
            if isinstance(code, str) and code:
                quality_gate_codes[code] = quality_gate_codes.get(code, 0) + 1
            failure_category = issue.get("failure_category")
            if isinstance(failure_category, str) and failure_category:
                failure_categories[failure_category] = (
                    failure_categories.get(failure_category, 0) + 1
                )

        warnings = metadata.get("warnings")
        if isinstance(warnings, list):
            for warning in warnings:
                if isinstance(warning, str) and warning:
                    warning_counts[warning] = warning_counts.get(warning, 0) + 1

    return {
        "scope": "admin_internal_evaluations",
        "window_days": days,
        "artifact": artifact,
        "total_evaluations": total_evaluations,
        "passed_evaluations": passed_evaluations,
        "blocked_evaluations": blocked_evaluations,
        "pass_rate": (
            round(passed_evaluations / total_evaluations, 4)
            if total_evaluations
            else 0.0
        ),
        "average_score": round(score_total / score_count, 4) if score_count else None,
        "job_count": len(job_ids),
        "story_count": len(story_ids),
        "user_count": len(user_ids),
        "by_artifact": _sorted_count_buckets(artifacts, key_name="artifact"),
        "by_output_mode": _sorted_count_buckets(output_modes, key_name="output_mode"),
        "score_bands": _sorted_count_buckets(score_bands, key_name="band"),
        "dimension_scores": _average_bucket(
            dimension_totals,
            dimension_counts,
            key_name="dimension",
        ),
        "quality_gate_issues": _sorted_count_buckets(
            quality_gate_codes,
            key_name="code",
        ),
        "failure_categories": _sorted_count_buckets(
            failure_categories,
            key_name="category",
        ),
        "warnings": _sorted_count_buckets(warning_counts, key_name="message"),
    }