feat: add voice studio prototype flow

2026-04-19 23:10:16 +08:00
parent f106f740dd
commit 46d6201529
14 changed files with 1745 additions and 212 deletions
--- a/backend/app/services/voice_transcription_service.py
+++ b/backend/app/services/voice_transcription_service.py
@@ -0,0 +1,134 @@
+"""Voice transcription helpers for co-creation sessions."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from io import BytesIO
+
+from fastapi import HTTPException
+from openai import AsyncOpenAI
+
+from app.core.config import settings
+from app.core.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+@dataclass(frozen=True)
+class VoiceTranscriptionResult:
+    """Immutable, provider-independent outcome of transcribing one voice turn.
+
+    Instances are frozen, so fields cannot be reassigned after construction.
+    """
+
+    # Transcribed text; the transcribers in this module strip surrounding
+    # whitespace before storing it here.
+    transcript_text: str
+    # Confidence score for the transcript, or None when none is available.
+    confidence: float | None = None
+    # Identifier of the backend that produced the transcript
+    # (this module uses "demo" and "openai").
+    provider: str = "demo"
+
+
+def _normalize_transcript(transcript_text: str) -> str:
+    """Return *transcript_text* with leading and trailing whitespace removed."""
+    trimmed = transcript_text.strip()
+    return trimmed
+
+
+async def _transcribe_demo(
+    *,
+    audio_bytes: bytes,
+    mime_type: str | None,
+    transcript_hint: str | None,
+) -> VoiceTranscriptionResult:
+    """Produce a transcript without calling any real speech-to-text backend.
+
+    The transcript is taken from the first non-empty source, in this order:
+
+    1. ``transcript_hint`` after whitespace stripping.
+    2. ``audio_bytes`` decoded as UTF-8 (undecodable bytes are dropped), but
+       only when ``mime_type`` starts with ``"text/"``.
+
+    Returns:
+        A result with ``confidence=1.0`` and ``provider="demo"``.
+
+    Raises:
+        HTTPException: 503 when neither source yields any text.
+    """
+    candidate = _normalize_transcript(transcript_hint or "")
+
+    # Only fall back to the payload when the hint gave nothing usable and the
+    # upload is declared as text.
+    if not candidate and mime_type and mime_type.startswith("text/"):
+        decoded = audio_bytes.decode("utf-8", errors="ignore")
+        candidate = _normalize_transcript(decoded)
+
+    if not candidate:
+        raise HTTPException(
+            status_code=503,
+            detail=(
+                "当前环境未配置真实语音转写，请先使用文本共创模式，"
+                "或在开发模式下提供 transcript_hint。"
+            ),
+        )
+
+    return VoiceTranscriptionResult(
+        transcript_text=candidate,
+        confidence=1.0,
+        provider="demo",
+    )
+
+
+async def _transcribe_openai(
+    *,
+    audio_bytes: bytes,
+    file_name: str,
+    mime_type: str | None,
+    transcript_hint: str | None,
+) -> VoiceTranscriptionResult:
+    """Transcribe one uploaded audio turn through the OpenAI transcription API.
+
+    Args:
+        audio_bytes: Raw bytes of the uploaded audio.
+        file_name: Name attached to the in-memory upload handed to the SDK.
+        mime_type: Accepted for signature parity with the other transcribers;
+            not used by this function.
+        transcript_hint: Optional text forwarded as the transcription
+            ``prompt``. Blank or whitespace-only hints are treated as absent.
+
+    Returns:
+        A result with ``provider="openai"`` and ``confidence=None``.
+
+    Raises:
+        HTTPException: 503 when no API key is configured or the API call
+            fails; 502 when the API returns an empty transcript.
+    """
+    if not settings.openai_api_key:
+        raise HTTPException(
+            status_code=503,
+            detail="OPENAI_API_KEY 未配置，无法使用 OpenAI 语音转写。",
+        )
+
+    audio_file = BytesIO(audio_bytes)
+    # The in-memory buffer has no filename of its own; attach the upload's.
+    audio_file.name = file_name
+
+    # Collapse blank hints to None so an empty prompt is never sent.
+    prompt = (transcript_hint or "").strip() or None
+
+    # A client is created per call, so close it deterministically instead of
+    # leaving its HTTP connection pool to be reclaimed by garbage collection.
+    async with AsyncOpenAI(api_key=settings.openai_api_key) as client:
+        try:
+            response = await client.audio.transcriptions.create(
+                model=settings.voice_transcription_model,
+                file=audio_file,
+                language=settings.voice_transcription_language,
+                prompt=prompt,
+            )
+        except Exception as exc:
+            # Boundary handler: any provider/transport failure is reported to
+            # the caller as a temporary outage, with the cause chained.
+            logger.warning("voice_transcription_openai_failed", error=str(exc))
+            raise HTTPException(
+                status_code=503,
+                detail="语音转写服务暂时不可用，请稍后重试。",
+            ) from exc
+
+    # Tolerate a response object without a ``text`` attribute or with None.
+    transcript_text = _normalize_transcript(getattr(response, "text", "") or "")
+    if not transcript_text:
+        raise HTTPException(status_code=502, detail="语音转写结果为空，请重试。")
+
+    return VoiceTranscriptionResult(
+        transcript_text=transcript_text,
+        confidence=None,
+        provider="openai",
+    )
+
+
+async def transcribe_voice_audio(
+    *,
+    audio_bytes: bytes,
+    file_name: str,
+    mime_type: str | None,
+    transcript_hint: str | None = None,
+) -> VoiceTranscriptionResult:
+    """Route one uploaded audio turn to the transcriber selected by settings.
+
+    ``settings.voice_transcription_mode`` is trimmed and lower-cased before
+    matching; an unset or empty value counts as ``"demo"``:
+
+    * ``"openai"``   -> OpenAI-backed transcription.
+    * ``"disabled"`` -> always rejected with HTTP 503.
+    * anything else  -> the demo transcriber.
+
+    Raises:
+        HTTPException: 503 when transcription is disabled, plus whatever the
+            selected transcriber raises.
+    """
+
+    configured = settings.voice_transcription_mode or "demo"
+    active_mode = configured.strip().lower()
+
+    if active_mode == "openai":
+        result = await _transcribe_openai(
+            audio_bytes=audio_bytes,
+            file_name=file_name,
+            mime_type=mime_type,
+            transcript_hint=transcript_hint,
+        )
+        return result
+
+    if active_mode != "disabled":
+        # Every unrecognized mode falls through to the demo transcriber.
+        result = await _transcribe_demo(
+            audio_bytes=audio_bytes,
+            mime_type=mime_type,
+            transcript_hint=transcript_hint,
+        )
+        return result
+
+    raise HTTPException(
+        status_code=503,
+        detail="当前环境已禁用语音转写，请先使用文本共创模式。",
+    )