feat: add ASR provider support for voice co-creation

This commit is contained in:
2026-04-24 17:58:49 +08:00
parent 7e450aa5fc
commit 3805c18622
22 changed files with 471 additions and 126 deletions

View File

@@ -3,15 +3,12 @@
from __future__ import annotations
from dataclasses import dataclass
from io import BytesIO
from fastapi import HTTPException
from openai import AsyncOpenAI
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.config import settings
from app.core.logging import get_logger
logger = get_logger(__name__)
from app.services.provider_router import transcribe_audio
@dataclass(frozen=True)
@@ -23,84 +20,9 @@ class VoiceTranscriptionResult:
provider: str = "demo"
def _normalize_transcript(transcript_text: str) -> str:
return transcript_text.strip()
async def _transcribe_demo(
    *,
    audio_bytes: bytes,
    mime_type: str | None,
    transcript_hint: str | None,
) -> VoiceTranscriptionResult:
    """Produce a transcript without calling a real speech-to-text service.

    The transcript is taken from the first source that yields non-blank text:

    1. ``transcript_hint``, after stripping surrounding whitespace.
    2. ``audio_bytes`` decoded as UTF-8 (undecodable bytes are dropped), but
       only when ``mime_type`` names a ``text/*`` media type.

    Args:
        audio_bytes: Raw uploaded payload.
        mime_type: Media type reported for the upload, if any.
        transcript_hint: Optional text to use directly as the transcript.

    Returns:
        A result with ``confidence=1.0`` and ``provider="demo"``.

    Raises:
        HTTPException: 503 when neither source yields any text.
    """
    hint = _normalize_transcript(transcript_hint or "")
    if hint:
        return VoiceTranscriptionResult(
            transcript_text=hint,
            confidence=1.0,
            provider="demo",
        )

    # Media type names are case-insensitive, and the value comes straight from
    # the client, so normalise it before matching ("Text/Plain" must qualify).
    media_type = (mime_type or "").strip().lower()
    if media_type.startswith("text/"):
        text = _normalize_transcript(audio_bytes.decode("utf-8", errors="ignore"))
        if text:
            return VoiceTranscriptionResult(
                transcript_text=text,
                confidence=1.0,
                provider="demo",
            )

    raise HTTPException(
        status_code=503,
        detail=(
            "当前环境未配置真实语音转写,请先使用文本共创模式,"
            "或在开发模式下提供 transcript_hint。"
        ),
    )
async def _transcribe_openai(
    *,
    audio_bytes: bytes,
    file_name: str,
    mime_type: str | None,
    transcript_hint: str | None,
) -> VoiceTranscriptionResult:
    """Transcribe an uploaded audio payload with the OpenAI transcription API.

    The model and language come from ``settings.voice_transcription_model``
    and ``settings.voice_transcription_language``. A non-empty
    ``transcript_hint`` is stripped and sent as the request ``prompt``.
    ``mime_type`` is accepted but not used here.

    Returns:
        A result with ``confidence=None`` and ``provider="openai"``.

    Raises:
        HTTPException: 503 when no API key is configured or the request
            fails; 502 when the returned transcript is empty.
    """
    api_key = settings.openai_api_key
    if not api_key:
        raise HTTPException(
            status_code=503,
            detail="OPENAI_API_KEY 未配置,无法使用 OpenAI 语音转写。",
        )

    openai_client = AsyncOpenAI(api_key=api_key)

    # Wrap the bytes in an in-memory file and attach the upload's file name.
    upload = BytesIO(audio_bytes)
    upload.name = file_name

    if transcript_hint:
        prompt = transcript_hint.strip()
    else:
        prompt = None

    try:
        response = await openai_client.audio.transcriptions.create(
            model=settings.voice_transcription_model,
            file=upload,
            language=settings.voice_transcription_language,
            prompt=prompt,
        )
    except Exception as exc:
        # Any failure is logged and surfaced to the caller as a 503.
        logger.warning("voice_transcription_openai_failed", error=str(exc))
        raise HTTPException(
            status_code=503,
            detail="语音转写服务暂时不可用,请稍后重试。",
        ) from exc

    # A response without a usable ``text`` attribute counts as empty.
    transcript = _normalize_transcript(getattr(response, "text", "") or "")
    if transcript:
        return VoiceTranscriptionResult(
            transcript_text=transcript,
            confidence=None,
            provider="openai",
        )
    raise HTTPException(status_code=502, detail="语音转写结果为空,请重试。")
def _resolve_transcript_hint(transcript_hint: str | None) -> str | None:
normalized = (transcript_hint or "").strip()
return normalized or None
async def transcribe_voice_audio(
@@ -109,26 +31,35 @@ async def transcribe_voice_audio(
file_name: str,
mime_type: str | None,
transcript_hint: str | None = None,
db: AsyncSession | None = None,
user_id: str | None = None,
) -> VoiceTranscriptionResult:
"""Transcribe one uploaded audio turn according to the configured mode."""
"""Transcribe one uploaded audio turn using configured ASR providers."""
mode = (settings.voice_transcription_mode or "demo").strip().lower()
mode = (settings.voice_transcription_mode or "provider").strip().lower()
if mode == "disabled":
raise HTTPException(
status_code=503,
detail="当前环境已禁用语音转写,请先使用文本共创模式。",
)
if mode == "openai":
return await _transcribe_openai(
audio_bytes=audio_bytes,
file_name=file_name,
mime_type=mime_type,
transcript_hint=transcript_hint,
)
return await _transcribe_demo(
hint = _resolve_transcript_hint(transcript_hint)
provider_name = "openai_asr" if mode == "openai" else mode
strategy_providers = None if mode == "provider" else [provider_name]
result = await transcribe_audio(
audio_bytes=audio_bytes,
file_name=file_name,
mime_type=mime_type,
transcript_hint=transcript_hint,
transcript_hint=hint,
language=settings.voice_transcription_language,
provider_names=strategy_providers,
db=db,
user_id=user_id,
)
return VoiceTranscriptionResult(
transcript_text=result.transcript_text,
confidence=result.confidence,
provider=result.provider,
)