feat: add ASR provider support for voice co-creation

2026-04-24 17:58:49 +08:00
parent 7e450aa5fc
commit 3805c18622
22 changed files with 471 additions and 126 deletions
--- a/backend/app/services/adapters/init.py
+++ b/backend/app/services/adapters/init.py
@@ -4,7 +4,11 @@
 from app.services.adapters import demo as _demo_adapters  # noqa: F401
 from app.services.adapters.base import AdapterConfig, BaseAdapter

-# Image adapters
+# ASR adapters
+from app.services.adapters.asr import demo as _asr_demo_adapter  # noqa: F401
+from app.services.adapters.asr import openai as _asr_openai_adapter  # noqa: F401
+
+# Image adapters
 from app.services.adapters.image import cqtai as _image_cqtai_adapter  # noqa: F401
 from app.services.adapters.registry import AdapterRegistry

--- a/backend/app/services/adapters/asr/init.py
+++ b/backend/app/services/adapters/asr/init.py
@@ -0,0 +1 @@
+"""ASR adapters."""
--- a/backend/app/services/adapters/asr/demo.py
+++ b/backend/app/services/adapters/asr/demo.py
@@ -0,0 +1,57 @@
+"""Demo ASR adapter for local voice co-creation smoke tests."""
+
+from __future__ import annotations
+
+from fastapi import HTTPException
+
+from app.services.adapters.asr.models import TranscriptionOutput
+from app.services.adapters.base import BaseAdapter
+from app.services.adapters.registry import AdapterRegistry
+
+
+@AdapterRegistry.register("asr", "demo")
+class DemoASRAdapter(BaseAdapter[TranscriptionOutput]):
+    """Return transcript hints or text uploads without external ASR services.
+
+    No audio is decoded here. A transcript is produced only from an explicit
+    ``transcript_hint`` or from an upload whose MIME type is ``text/*``; every
+    other request is rejected with HTTP 503.
+    """
+
+    adapter_type = "asr"
+    adapter_name = "demo"
+
+    async def execute(
+        self,
+        audio_bytes: bytes,
+        file_name: str | None = None,
+        mime_type: str | None = None,
+        transcript_hint: str | None = None,
+        **kwargs,
+    ) -> TranscriptionOutput:
+        """Build a transcript from the hint or from a plain-text upload.
+
+        Args:
+            audio_bytes: Raw upload body; read only when ``mime_type`` is text.
+            file_name: Unused by this adapter.
+            mime_type: Upload content type; ``text/*`` (matched
+                case-insensitively) makes the body itself the transcript.
+            transcript_hint: Caller-supplied transcript; wins when non-blank.
+            **kwargs: Ignored.
+
+        Returns:
+            A ``TranscriptionOutput`` with ``confidence=1.0``.
+
+        Raises:
+            HTTPException: 503 when neither a hint nor text content is usable.
+        """
+        hint = (transcript_hint or "").strip()
+        if hint:
+            return TranscriptionOutput(
+                transcript_text=hint,
+                confidence=1.0,
+                provider=self.adapter_name,
+            )
+
+        # Media type names are case-insensitive (RFC 2045), so normalize before
+        # matching; otherwise "Text/Plain" would be rejected below.
+        normalized_mime = (mime_type or "").strip().lower()
+        if normalized_mime.startswith("text/"):
+            # Undecodable bytes are dropped rather than failing the request.
+            text = audio_bytes.decode("utf-8", errors="ignore").strip()
+            if text:
+                return TranscriptionOutput(
+                    transcript_text=text,
+                    confidence=1.0,
+                    provider=self.adapter_name,
+                )
+
+        raise HTTPException(
+            status_code=503,
+            detail=(
+                "当前环境未配置真实语音转写，请先使用文本共创模式，"
+                "或在开发模式下提供 transcript_hint。"
+            ),
+        )
+
+    async def health_check(self) -> bool:
+        """Always report healthy; this adapter has no external dependency."""
+        return True
+
+    @property
+    def estimated_cost(self) -> float:
+        """Return the estimated cost of a call, which is always zero here."""
+        return 0.0
--- a/backend/app/services/adapters/asr/models.py
+++ b/backend/app/services/adapters/asr/models.py
@@ -0,0 +1,11 @@
+"""ASR adapter result models."""
+
+from pydantic import BaseModel
+
+
+class TranscriptionOutput(BaseModel):
+    """Normalized speech-to-text output from one ASR provider."""
+
+    # Transcript text produced for the submitted input (required).
+    transcript_text: str
+    # Optional confidence score; defaults to None when none is supplied.
+    confidence: float | None = None
+    # Name identifying the adapter/provider that produced the transcript.
+    provider: str
--- a/backend/app/services/adapters/asr/openai.py
+++ b/backend/app/services/adapters/asr/openai.py
@@ -0,0 +1,76 @@
+"""OpenAI ASR adapter."""
+
+from __future__ import annotations
+
+from io import BytesIO
+
+from fastapi import HTTPException
+from openai import AsyncOpenAI
+
+from app.core.logging import get_logger
+from app.services.adapters.asr.models import TranscriptionOutput
+from app.services.adapters.base import BaseAdapter
+from app.services.adapters.registry import AdapterRegistry
+
+logger = get_logger(__name__)
+
+
+@AdapterRegistry.register("asr", "openai_asr")
+class OpenAIASRAdapter(BaseAdapter[TranscriptionOutput]):
+    """Transcribe uploaded voice turn audio with OpenAI audio transcription."""
+
+    adapter_type = "asr"
+    adapter_name = "openai_asr"
+
+    async def execute(
+        self,
+        audio_bytes: bytes,
+        file_name: str | None = None,
+        mime_type: str | None = None,
+        transcript_hint: str | None = None,
+        language: str | None = None,
+        **kwargs,
+    ) -> TranscriptionOutput:
+        """Send the audio to OpenAI and return the normalized transcript.
+
+        Args:
+            audio_bytes: Raw audio payload to transcribe.
+            file_name: Upload name; defaults to ``voice-turn.webm``.
+            mime_type: Unused by this adapter.
+            transcript_hint: Optional text forwarded as the transcription
+                ``prompt``; blank hints are treated as absent.
+            language: Optional language code forwarded to the API.
+            **kwargs: Ignored.
+
+        Returns:
+            A ``TranscriptionOutput`` with ``confidence=None``.
+
+        Raises:
+            HTTPException: 503 when no API key is configured or the API call
+                fails; 502 when the API returns an empty transcript.
+        """
+        if not self.config.api_key:
+            raise HTTPException(
+                status_code=503,
+                detail="OPENAI_API_KEY 未配置，无法使用 OpenAI 语音转写。",
+            )
+
+        audio_file = BytesIO(audio_bytes)
+        # NOTE(review): presumably the SDK uses this name (and its extension)
+        # to label the in-memory upload — confirm against the SDK docs.
+        audio_file.name = file_name or "voice-turn.webm"
+
+        # Optional parameters are only included when set, so the SDK omits
+        # them instead of receiving an explicit None or empty string.
+        request: dict[str, object] = {
+            "model": self.config.model or "gpt-4o-mini-transcribe",
+            "file": audio_file,
+        }
+        if language:
+            request["language"] = language
+        prompt = (transcript_hint or "").strip()
+        if prompt:
+            request["prompt"] = prompt
+
+        client = AsyncOpenAI(api_key=self.config.api_key)
+        try:
+            # A client is created per call, so close it when the request is
+            # done; otherwise its HTTP connection pool is never released.
+            async with client:
+                response = await client.audio.transcriptions.create(**request)
+        except Exception as exc:
+            # Boundary catch: any provider failure becomes a retryable 503,
+            # with the original error logged and chained for diagnosis.
+            logger.warning("openai_asr_failed", error=str(exc))
+            raise HTTPException(
+                status_code=503,
+                detail="语音转写服务暂时不可用，请稍后重试。",
+            ) from exc
+
+        transcript_text = (getattr(response, "text", "") or "").strip()
+        if not transcript_text:
+            raise HTTPException(status_code=502, detail="语音转写结果为空，请重试。")
+
+        return TranscriptionOutput(
+            transcript_text=transcript_text,
+            confidence=None,
+            provider=self.adapter_name,
+        )
+
+    async def health_check(self) -> bool:
+        """Report healthy when an API key is configured; makes no API call."""
+        return bool(self.config.api_key)
+
+    @property
+    def estimated_cost(self) -> float:
+        """Return the fixed cost estimate for this adapter.
+
+        NOTE(review): the unit is not stated here (presumably USD per minute
+        of audio) — confirm.
+        """
+        return 0.006