feat: improve voice studio alpha recovery flow

2026-04-19 23:25:41 +08:00
parent 46d6201529
commit 4ecf0c09c0
9 changed files with 657 additions and 14 deletions
--- a/backend/app/api/voice_sessions.py
+++ b/backend/app/api/voice_sessions.py
@@ -12,6 +12,7 @@ from fastapi import (
 )
 from sqlalchemy.ext.asyncio import AsyncSession

+from app.core.config import settings
 from app.core.deps import require_user
 from app.core.rate_limiter import check_rate_limit
 from app.db.database import get_db
@@ -34,11 +35,14 @@ from app.services.voice_session_service import (
    create_voice_turn_from_text_service,
    create_voice_turn_from_upload_service,
    finalize_voice_session_service,
+    get_latest_active_voice_session_service,
    get_voice_session_detail_service,
    get_voice_turn_audio_service,
    get_voice_turn_service,
    get_voice_turn_user_audio_service,
    list_voice_sessions_service,
+    retry_voice_turn_audio_service,
+    retry_voice_turn_service,
 )

 router = APIRouter()
@@ -68,8 +72,13 @@ async def create_voice_session(

@router.get("/voice-sessions", response_model=list[VoiceSessionSummaryResponse])
 async def list_voice_sessions(
-    limit: int = Query(default=8, ge=1, le=20),
+    limit: int = Query(
+        default=settings.voice_session_default_list_limit,
+        ge=1,
+        le=settings.voice_session_max_list_limit,
+    ),
    active_only: bool = Query(default=False),
+    active_first: bool = Query(default=True),
    user: User = Depends(require_user),
    db: AsyncSession = Depends(get_db),
 ):
@@ -79,9 +88,19 @@ async def list_voice_sessions(
        db,
        limit=limit,
        active_only=active_only,
+        active_first=active_first,
    )


+@router.get("/voice-sessions/active", response_model=VoiceSessionSummaryResponse | None)
+async def get_latest_active_voice_session(
+    user: User = Depends(require_user),
+    db: AsyncSession = Depends(get_db),
+):
+    """Return the caller's latest active voice session so the client can resume it.
+
+    The lookup is delegated to ``get_latest_active_voice_session_service``;
+    the response body is ``None`` when that service finds no session.
+    """
+    resumable_session = await get_latest_active_voice_session_service(user.id, db)
+    return resumable_session
+
+
@router.get("/voice-sessions/{session_id}", response_model=VoiceSessionDetailResponse)
 async def get_voice_session(
    session_id: str,
@@ -158,6 +177,21 @@ async def get_voice_turn(
    return await get_voice_turn_service(session_id, turn_id, user.id, db)


+@router.post(
+    "/voice-sessions/{session_id}/turns/{turn_id}/retry",
+    status_code=status.HTTP_202_ACCEPTED,
+    response_model=VoiceTurnAcceptedResponse,
+)
+async def retry_voice_turn(
+    session_id: str,
+    turn_id: str,
+    user: User = Depends(require_user),
+    db: AsyncSession = Depends(get_db),
+):
+    """Retry a failed voice turn from the transcript already saved on it.
+
+    Responds with 202 Accepted; validation of the turn is performed by
+    ``retry_voice_turn_service``.
+    """
+    accepted_turn = await retry_voice_turn_service(
+        session_id,
+        turn_id,
+        user.id,
+        db,
+    )
+    return accepted_turn
+
+
@router.get("/voice-sessions/{session_id}/turns/{turn_id}/audio")
 async def get_voice_turn_audio(
    session_id: str,
@@ -170,6 +204,20 @@ async def get_voice_turn_audio(
    return Response(content=audio_bytes, media_type="audio/mpeg")


+@router.post(
+    "/voice-sessions/{session_id}/turns/{turn_id}/retry-audio",
+    response_model=VoiceTurnSummaryResponse,
+)
+async def retry_voice_turn_audio(
+    session_id: str,
+    turn_id: str,
+    user: User = Depends(require_user),
+    db: AsyncSession = Depends(get_db),
+):
+    """Regenerate assistant audio for a turn that currently only has text output.
+
+    The work is delegated to ``retry_voice_turn_audio_service``, whose turn
+    summary is returned unchanged.
+    """
+    refreshed_turn = await retry_voice_turn_audio_service(
+        session_id,
+        turn_id,
+        user.id,
+        db,
+    )
+    return refreshed_turn
+
+
@router.get("/voice-sessions/{session_id}/turns/{turn_id}/user-audio")
 async def get_voice_turn_user_audio(
    session_id: str,
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -82,6 +82,18 @@ class Settings(BaseSettings):
        "zh",
        description="Preferred language hint for voice transcription",
    )
+    voice_turn_max_upload_bytes: int = Field(
+        5 * 1024 * 1024,
+        description="Maximum accepted upload size in bytes for one voice turn audio file",
+    )
+    voice_session_default_list_limit: int = Field(
+        8,
+        description="Default number of recent voice sessions returned to the client",
+    )
+    voice_session_max_list_limit: int = Field(
+        20,
+        description="Maximum number of recent voice sessions returned to the client",
+    )
    story_audio_cache_ttl_days: int = Field(
        30,
        description="TTL in days before cached story audio is pruned",
--- a/backend/app/schemas/voice_session_schemas.py
+++ b/backend/app/schemas/voice_session_schemas.py
@@ -101,6 +101,7 @@ class VoiceSessionSummaryResponse(BaseModel):
    latest_detected_intent: str | None = None
    latest_assistant_audio_ready: bool = False
    last_turn_status: str | None = None
+    transcription_mode_hint: str | None = None
    can_continue: bool = False
    can_finalize: bool = False
    last_error: str | None = None
--- a/backend/app/services/voice_session_service.py
+++ b/backend/app/services/voice_session_service.py
@@ -6,9 +6,10 @@ from datetime import datetime, timezone
 from typing import Any

 from fastapi import HTTPException
-from sqlalchemy import desc, select
+from sqlalchemy import case, desc, select
 from sqlalchemy.ext.asyncio import AsyncSession

+from app.core.config import settings
 from app.core.logging import get_logger
 from app.db.models import VoiceSession, VoiceSessionEvent, VoiceTurn
 from app.schemas.voice_session_schemas import (
@@ -134,6 +135,7 @@ def _session_to_summary(
            session_audio_exists(latest_turn.assistant_audio_path) if latest_turn else False
        ),
        last_turn_status=latest_turn.status if latest_turn else None,
+        transcription_mode_hint=settings.voice_transcription_mode,
        can_continue=_session_can_continue(session),
        can_finalize=_session_can_finalize(session),
        last_error=session.last_error,
@@ -602,17 +604,29 @@ async def list_voice_sessions_service(
    user_id: str,
    db: AsyncSession,
    *,
-    limit: int = 8,
+    limit: int | None = None,
    active_only: bool = False,
+    active_first: bool = False,
 ) -> list[VoiceSessionSummaryResponse]:
-    query = (
-        select(VoiceSession)
-        .where(VoiceSession.user_id == user_id)
-        .order_by(desc(VoiceSession.updated_at), desc(VoiceSession.created_at))
-        .limit(limit)
-    )
+    resolved_limit = limit or settings.voice_session_default_list_limit
+    resolved_limit = max(1, min(resolved_limit, settings.voice_session_max_list_limit))
+    query = select(VoiceSession).where(VoiceSession.user_id == user_id)
    if active_only:
        query = query.where(VoiceSession.status.in_(CONTINUABLE_SESSION_STATUSES))
+    if active_first:
+        query = query.order_by(
+            desc(
+                case(
+                    (VoiceSession.status.in_(CONTINUABLE_SESSION_STATUSES), 1),
+                    else_=0,
+                )
+            ),
+            desc(VoiceSession.updated_at),
+            desc(VoiceSession.created_at),
+        )
+    else:
+        query = query.order_by(desc(VoiceSession.updated_at), desc(VoiceSession.created_at))
+    query = query.limit(resolved_limit)

    sessions = (await db.execute(query)).scalars().all()
    summaries: list[VoiceSessionSummaryResponse] = []
@@ -628,6 +642,30 @@ async def list_voice_sessions_service(
    return summaries


+async def get_latest_active_voice_session_service(
+    user_id: str,
+    db: AsyncSession,
+) -> VoiceSessionSummaryResponse | None:
+    """Summarize the user's most recently updated continuable voice session.
+
+    Returns ``None`` when the user owns no session whose status is in
+    ``CONTINUABLE_SESSION_STATUSES``.
+    """
+    owned_by_user = VoiceSession.user_id == user_id
+    still_continuable = VoiceSession.status.in_(CONTINUABLE_SESSION_STATUSES)
+    # Most recently updated first; creation time breaks ties.
+    newest_first = (desc(VoiceSession.updated_at), desc(VoiceSession.created_at))
+    statement = (
+        select(VoiceSession)
+        .where(owned_by_user, still_continuable)
+        .order_by(*newest_first)
+        .limit(1)
+    )
+
+    result = await db.execute(statement)
+    active_session = result.scalar_one_or_none()
+    if active_session is None:
+        return None
+
+    newest_turn = await _get_latest_turn(db, session_id=active_session.id)
+    return _session_to_summary(
+        active_session,
+        latest_turn=newest_turn,
+        total_turns=active_session.current_turn_index,
+    )
+
+
 async def create_voice_session_service(
    request: VoiceSessionCreateRequest,
    user_id: str,
@@ -766,6 +804,13 @@ async def create_voice_turn_from_upload_service(
            status_code=409,
            detail="Voice session is not ready for another turn.",
        )
+    if not audio_bytes:
+        raise HTTPException(status_code=400, detail="上传音频为空，请重新录音后再试。")
+    if len(audio_bytes) > settings.voice_turn_max_upload_bytes:
+        raise HTTPException(
+            status_code=413,
+            detail="上传音频过大，请缩短单轮录音时长后再试。",
+        )
    next_turn_index = session.current_turn_index + 1
    user_audio_path = write_uploaded_user_audio(
        session_id=session.id,
@@ -805,6 +850,91 @@ async def create_voice_turn_from_upload_service(
    )


+async def retry_voice_turn_service(
+    session_id: str,
+    turn_id: str,
+    user_id: str,
+    db: AsyncSession,
+) -> VoiceTurnAcceptedResponse:
+    """Retry a failed turn by replaying its saved transcript.
+
+    The transcript and recorded duration are handed to
+    ``create_voice_turn_from_text_service`` for the same session.
+
+    Raises:
+        HTTPException: 409 when the turn is not in the ``failed`` status or
+            has no saved transcript.
+    """
+    failed_turn = await _get_owned_turn(
+        db,
+        session_id=session_id,
+        turn_id=turn_id,
+        user_id=user_id,
+    )
+    if failed_turn.status != "failed":
+        raise HTTPException(status_code=409, detail="Only failed turns can be retried.")
+
+    saved_transcript = failed_turn.user_transcript
+    if not saved_transcript:
+        raise HTTPException(status_code=409, detail="This turn has no transcript to retry.")
+
+    replay_request = VoiceTurnCreateFallbackRequest(
+        transcript_text=saved_transcript,
+        duration_ms=failed_turn.user_audio_duration_ms,
+    )
+    return await create_voice_turn_from_text_service(
+        session_id,
+        replay_request,
+        user_id,
+        db,
+    )
+
+
+async def retry_voice_turn_audio_service(
+    session_id: str,
+    turn_id: str,
+    user_id: str,
+    db: AsyncSession,
+) -> VoiceTurnSummaryResponse:
+    """Synthesize assistant audio again for a turn that only has text output.
+
+    On success the audio path is stored on the turn, a ``narrative_ready``
+    turn is promoted to ``audio_ready``, and a success event is recorded.
+
+    Raises:
+        HTTPException: 409 when the turn has no assistant text or its audio
+            file already exists; 503 when synthesis or persisting the result
+            fails (a failure event is recorded first).
+    """
+    turn = await _get_owned_turn(
+        db,
+        session_id=session_id,
+        turn_id=turn_id,
+        user_id=user_id,
+    )
+    if not turn.assistant_text:
+        raise HTTPException(status_code=409, detail="This turn has no assistant text to speak.")
+    if session_audio_exists(turn.assistant_audio_path):
+        raise HTTPException(status_code=409, detail="Assistant audio already exists for this turn.")
+
+    # Read the identifiers up front: a rollback expires the ORM attributes, and
+    # an async session cannot reload them implicitly on attribute access.
+    owning_session_id = turn.session_id
+    retried_turn_id = turn.id
+
+    try:
+        audio_bytes = await text_to_speech(
+            turn.assistant_text,
+            db=db,
+            user_id=user_id,
+        )
+        saved_path = write_session_audio(
+            build_turn_assistant_audio_path(owning_session_id, turn.turn_index),
+            audio_bytes,
+        )
+        turn.assistant_audio_path = saved_path
+        # The duration of the regenerated audio is not measured here.
+        turn.assistant_audio_duration_ms = None
+        if turn.status == "narrative_ready":
+            turn.status = "audio_ready"
+        await db.commit()
+        await db.refresh(turn)
+    except Exception as exc:
+        # Reset the session first: after a failed flush/commit it rejects
+        # further work until rolled back, which would otherwise make the
+        # failure event below raise and mask the original error.
+        await db.rollback()
+        await _record_session_event(
+            db,
+            session_id=owning_session_id,
+            turn_id=retried_turn_id,
+            event_type="assistant_audio_retry_failed",
+            status="failed",
+            message="Assistant audio retry failed.",
+            metadata={"error": str(exc)},
+        )
+        raise HTTPException(status_code=503, detail="语音补发失败，请稍后再试。") from exc
+
+    # Recorded outside the try block so that an error while writing the event
+    # is not misreported as a synthesis failure (503 plus a "failed" event)
+    # after the audio has already been saved and committed.
+    await _record_session_event(
+        db,
+        session_id=owning_session_id,
+        turn_id=retried_turn_id,
+        event_type="assistant_audio_retry_succeeded",
+        status="succeeded",
+        message="Assistant audio regenerated for one voice turn.",
+        metadata={"audio_path": saved_path},
+    )
+    return _turn_to_summary(turn)
+
+
 async def get_voice_turn_service(
    session_id: str,
    turn_id: str,