feat: add ASR provider support for voice co-creation
This commit is contained in:
@@ -4,7 +4,11 @@
|
||||
from app.services.adapters import demo as _demo_adapters # noqa: F401
|
||||
from app.services.adapters.base import AdapterConfig, BaseAdapter
|
||||
|
||||
# Image adapters
|
||||
# ASR adapters
|
||||
from app.services.adapters.asr import demo as _asr_demo_adapter # noqa: F401
|
||||
from app.services.adapters.asr import openai as _asr_openai_adapter # noqa: F401
|
||||
|
||||
# Image adapters
|
||||
from app.services.adapters.image import cqtai as _image_cqtai_adapter # noqa: F401
|
||||
from app.services.adapters.registry import AdapterRegistry
|
||||
|
||||
|
||||
1
backend/app/services/adapters/asr/__init__.py
Normal file
1
backend/app/services/adapters/asr/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""ASR adapters."""
|
||||
57
backend/app/services/adapters/asr/demo.py
Normal file
57
backend/app/services/adapters/asr/demo.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Demo ASR adapter for local voice co-creation smoke tests."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import HTTPException
|
||||
|
||||
from app.services.adapters.asr.models import TranscriptionOutput
|
||||
from app.services.adapters.base import BaseAdapter
|
||||
from app.services.adapters.registry import AdapterRegistry
|
||||
|
||||
|
||||
@AdapterRegistry.register("asr", "demo")
class DemoASRAdapter(BaseAdapter[TranscriptionOutput]):
    """Return transcript hints or text uploads without external ASR services.

    No speech recognition is performed. A transcript is resolved from, in
    order: a non-blank ``transcript_hint``, then the decoded upload when its
    MIME type is textual. When neither yields text, a 503 is raised.
    """

    adapter_type = "asr"
    adapter_name = "demo"

    async def execute(
        self,
        audio_bytes: bytes,
        file_name: str | None = None,
        mime_type: str | None = None,
        transcript_hint: str | None = None,
        **kwargs,
    ) -> TranscriptionOutput:
        """Produce a transcript from the hint or from a textual upload.

        Args:
            audio_bytes: Raw uploaded payload; only decoded when ``mime_type``
                is a ``text/*`` type.
            file_name: Accepted for signature parity; not used here.
            mime_type: Declared media type of the upload, if any.
            transcript_hint: Text returned verbatim (stripped) when non-blank.
            **kwargs: Extra options; ignored by this adapter.

        Returns:
            A ``TranscriptionOutput`` with ``confidence`` fixed at 1.0 and
            ``provider`` set to this adapter's name.

        Raises:
            HTTPException: 503 when neither a hint nor textual content is
                available.
        """
        hint = (transcript_hint or "").strip()
        if hint:
            return TranscriptionOutput(
                transcript_text=hint,
                confidence=1.0,
                provider=self.adapter_name,
            )

        # Media type names are case-insensitive, so normalise before matching.
        if mime_type and mime_type.strip().lower().startswith("text/"):
            # "utf-8-sig" drops a leading byte-order mark, which str.strip()
            # would otherwise leave at the front of the transcript.
            text = audio_bytes.decode("utf-8-sig", errors="ignore").strip()
            if text:
                return TranscriptionOutput(
                    transcript_text=text,
                    confidence=1.0,
                    provider=self.adapter_name,
                )

        raise HTTPException(
            status_code=503,
            detail=(
                "当前环境未配置真实语音转写,请先使用文本共创模式,"
                "或在开发模式下提供 transcript_hint。"
            ),
        )

    async def health_check(self) -> bool:
        """Always report healthy; there is no external dependency to probe."""
        return True

    @property
    def estimated_cost(self) -> float:
        """Return 0.0; this adapter calls no external service."""
        return 0.0
|
||||
11
backend/app/services/adapters/asr/models.py
Normal file
11
backend/app/services/adapters/asr/models.py
Normal file
@@ -0,0 +1,11 @@
|
||||
"""ASR adapter result models."""
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class TranscriptionOutput(BaseModel):
    """Provider-independent result of a single speech-to-text call."""

    # Recognised text as returned by the adapter.
    transcript_text: str

    # Optional confidence score; None when the adapter supplies no value.
    confidence: float | None = None

    # Name of the adapter that produced this transcript.
    provider: str
|
||||
76
backend/app/services/adapters/asr/openai.py
Normal file
76
backend/app/services/adapters/asr/openai.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""OpenAI ASR adapter."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
from fastapi import HTTPException
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
from app.core.logging import get_logger
|
||||
from app.services.adapters.asr.models import TranscriptionOutput
|
||||
from app.services.adapters.base import BaseAdapter
|
||||
from app.services.adapters.registry import AdapterRegistry
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@AdapterRegistry.register("asr", "openai_asr")
class OpenAIASRAdapter(BaseAdapter[TranscriptionOutput]):
    """Transcribe uploaded voice turn audio with OpenAI audio transcription.

    Reads ``api_key`` and ``model`` from ``self.config``.
    NOTE(review): ``self.config`` is presumably set up by ``BaseAdapter``;
    confirm against the base class.
    """

    adapter_type = "asr"
    adapter_name = "openai_asr"

    async def execute(
        self,
        audio_bytes: bytes,
        file_name: str | None = None,
        mime_type: str | None = None,
        transcript_hint: str | None = None,
        language: str | None = None,
        **kwargs,
    ) -> TranscriptionOutput:
        """Send the audio to OpenAI and return the normalised transcript.

        Args:
            audio_bytes: Raw audio payload to transcribe.
            file_name: Name attached to the upload; defaults to
                ``"voice-turn.webm"`` when missing.
            mime_type: Accepted for signature parity; not used here.
            transcript_hint: Optional text forwarded as the request prompt.
            language: Optional language value forwarded to the API.
            **kwargs: Extra options; ignored by this adapter.

        Returns:
            A ``TranscriptionOutput`` with ``confidence`` left as ``None``.

        Raises:
            HTTPException: 503 when no API key is configured or the request
                fails; 502 when the response carries no text.
        """
        if not self.config.api_key:
            raise HTTPException(
                status_code=503,
                detail="OPENAI_API_KEY 未配置,无法使用 OpenAI 语音转写。",
            )

        # The upload needs a file name; attach one to the in-memory buffer.
        audio_file = BytesIO(audio_bytes)
        audio_file.name = file_name or "voice-turn.webm"

        # Forward optional fields only when provided, rather than sending
        # explicit None values for parameters the SDK treats as omittable.
        request = {
            "model": self.config.model or "gpt-4o-mini-transcribe",
            "file": audio_file,
        }
        if language:
            request["language"] = language
        prompt = (transcript_hint or "").strip()
        if prompt:
            request["prompt"] = prompt

        client = AsyncOpenAI(api_key=self.config.api_key)
        try:
            # A client is created per call, so close it on exit to release
            # its HTTP connections instead of leaking them.
            async with client:
                response = await client.audio.transcriptions.create(**request)
        except Exception as exc:
            # Boundary handler: log the cause and surface a generic 503.
            logger.warning("openai_asr_failed", error=str(exc))
            raise HTTPException(
                status_code=503,
                detail="语音转写服务暂时不可用,请稍后重试。",
            ) from exc

        transcript_text = (getattr(response, "text", "") or "").strip()
        if not transcript_text:
            raise HTTPException(status_code=502, detail="语音转写结果为空,请重试。")

        return TranscriptionOutput(
            transcript_text=transcript_text,
            confidence=None,
            provider=self.adapter_name,
        )

    async def health_check(self) -> bool:
        """Report whether an API key is configured; makes no network call."""
        return bool(self.config.api_key)

    @property
    def estimated_cost(self) -> float:
        """Return the fixed cost estimate of 0.006.

        NOTE(review): currency and unit (per request or per audio minute)
        are not visible here; confirm against the pricing source.
        """
        return 0.006
|
||||
Reference in New Issue
Block a user