feat: add ASR provider support for voice co-creation

2026-04-24 17:58:49 +08:00
parent 7e450aa5fc
commit 3805c18622
22 changed files with 471 additions and 126 deletions
--- a/backend/app/services/voice_transcription_service.py
+++ b/backend/app/services/voice_transcription_service.py
@@ -3,15 +3,12 @@
 from __future__ import annotations

 from dataclasses import dataclass
-from io import BytesIO

 from fastapi import HTTPException
-from openai import AsyncOpenAI
+from sqlalchemy.ext.asyncio import AsyncSession

 from app.core.config import settings
-from app.core.logging import get_logger
-
-logger = get_logger(__name__)
+from app.services.provider_router import transcribe_audio


@dataclass(frozen=True)
@@ -23,84 +20,9 @@ class VoiceTranscriptionResult:
    provider: str = "demo"


-def _normalize_transcript(transcript_text: str) -> str:
-    return transcript_text.strip()
-
-
-async def _transcribe_demo(
-    *,
-    audio_bytes: bytes,
-    mime_type: str | None,
-    transcript_hint: str | None,
-) -> VoiceTranscriptionResult:
-    hint = _normalize_transcript(transcript_hint or "")
-    if hint:
-        return VoiceTranscriptionResult(
-            transcript_text=hint,
-            confidence=1.0,
-            provider="demo",
-        )
-
-    if mime_type and mime_type.startswith("text/"):
-        text = _normalize_transcript(audio_bytes.decode("utf-8", errors="ignore"))
-        if text:
-            return VoiceTranscriptionResult(
-                transcript_text=text,
-                confidence=1.0,
-                provider="demo",
-            )
-
-    raise HTTPException(
-        status_code=503,
-        detail=(
-            "当前环境未配置真实语音转写，请先使用文本共创模式，"
-            "或在开发模式下提供 transcript_hint。"
-        ),
-    )
-
-
-async def _transcribe_openai(
-    *,
-    audio_bytes: bytes,
-    file_name: str,
-    mime_type: str | None,
-    transcript_hint: str | None,
-) -> VoiceTranscriptionResult:
-    if not settings.openai_api_key:
-        raise HTTPException(
-            status_code=503,
-            detail="OPENAI_API_KEY 未配置，无法使用 OpenAI 语音转写。",
-        )
-
-    client = AsyncOpenAI(api_key=settings.openai_api_key)
-    audio_file = BytesIO(audio_bytes)
-    audio_file.name = file_name
-
-    prompt = transcript_hint.strip() if transcript_hint else None
-
-    try:
-        response = await client.audio.transcriptions.create(
-            model=settings.voice_transcription_model,
-            file=audio_file,
-            language=settings.voice_transcription_language,
-            prompt=prompt,
-        )
-    except Exception as exc:
-        logger.warning("voice_transcription_openai_failed", error=str(exc))
-        raise HTTPException(
-            status_code=503,
-            detail="语音转写服务暂时不可用，请稍后重试。",
-        ) from exc
-
-    transcript_text = _normalize_transcript(getattr(response, "text", "") or "")
-    if not transcript_text:
-        raise HTTPException(status_code=502, detail="语音转写结果为空，请重试。")
-
-    return VoiceTranscriptionResult(
-        transcript_text=transcript_text,
-        confidence=None,
-        provider="openai",
-    )
+def _resolve_transcript_hint(transcript_hint: str | None) -> str | None:
+    normalized = (transcript_hint or "").strip()
+    return normalized or None


 async def transcribe_voice_audio(
@@ -109,26 +31,35 @@ async def transcribe_voice_audio(
    file_name: str,
    mime_type: str | None,
    transcript_hint: str | None = None,
+    db: AsyncSession | None = None,
+    user_id: str | None = None,
 ) -> VoiceTranscriptionResult:
-    """Transcribe one uploaded audio turn according to the configured mode."""
+    """Transcribe one uploaded audio turn via ASR providers selected by the configured mode."""

-    mode = (settings.voice_transcription_mode or "demo").strip().lower()
+    mode = (settings.voice_transcription_mode or "provider").strip().lower()

    if mode == "disabled":
        raise HTTPException(
            status_code=503,
            detail="当前环境已禁用语音转写，请先使用文本共创模式。",
        )
-    if mode == "openai":
-        return await _transcribe_openai(
-            audio_bytes=audio_bytes,
-            file_name=file_name,
-            mime_type=mime_type,
-            transcript_hint=transcript_hint,
-        )

-    return await _transcribe_demo(
+    hint = _resolve_transcript_hint(transcript_hint)
+    provider_name = "openai_asr" if mode == "openai" else mode
+    strategy_providers = None if mode == "provider" else [provider_name]
+    result = await transcribe_audio(
        audio_bytes=audio_bytes,
+        file_name=file_name,
        mime_type=mime_type,
-        transcript_hint=transcript_hint,
+        transcript_hint=hint,
+        language=settings.voice_transcription_language,
+        provider_names=strategy_providers,
+        db=db,
+        user_id=user_id,
+    )
+
+    return VoiceTranscriptionResult(
+        transcript_text=result.transcript_text,
+        confidence=result.confidence,
+        provider=result.provider,
    )