feat: add ASR provider support for voice co-creation

This commit is contained in:
2026-04-24 17:58:49 +08:00
parent 7e450aa5fc
commit 3805c18622
22 changed files with 471 additions and 126 deletions

View File

@@ -63,11 +63,14 @@ docker compose ps
docker compose logs -f backend docker compose logs -f backend
./scripts/demo_smoke.sh ./scripts/demo_smoke.sh
SMOKE_AUDIO=1 ./scripts/demo_smoke.sh SMOKE_AUDIO=1 ./scripts/demo_smoke.sh
SMOKE_VOICE=1 ./scripts/demo_smoke.sh
docker compose down docker compose down
docker compose down -v docker compose down -v
``` ```
`scripts/demo_smoke.sh` 会检查健康状态、本地登录、统一生成后台任务、主记录落库、资产重试、故事列表和 Provider 能力分层。默认跳过 TTS演示前需要验证语音链路时使用 `SMOKE_AUDIO=1` `scripts/demo_smoke.sh` 会检查健康状态、本地登录、统一生成后台任务、主记录落库、资产重试、故事列表和 Provider 能力分层。默认跳过 TTS 和语音共创;演示前需要验证朗读链路时使用 `SMOKE_AUDIO=1`,需要验证 Voice Studio Alpha 时使用 `SMOKE_VOICE=1`
语音共创的 ASR 能力已纳入 Provider 分层。默认 `ASR_PROVIDERS=["demo"]` 会使用 `transcript_hint` 或文本上传作为本地演示转写;需要真实转写时可设置 `ASR_PROVIDERS=["openai_asr", "demo"]` 并配置 `OPENAI_API_KEY`
## 手动开发 ## 手动开发
@@ -142,7 +145,7 @@ npm run build
| GET | `/api/stories/{story_id}` | 故事详情 | | GET | `/api/stories/{story_id}` | 故事详情 |
| DELETE | `/api/stories/{story_id}` | 删除故事 | | DELETE | `/api/stories/{story_id}` | 删除故事 |
| GET/POST/PUT/DELETE | `/admin/providers` | Provider 管理,需开启管理后台 | | GET/POST/PUT/DELETE | `/admin/providers` | Provider 管理,需开启管理后台 |
| GET | `/admin/providers/capabilities` | Provider 能力分层说明,需开启管理后台 | | GET | `/admin/providers/capabilities` | Provider 能力分层说明text/image/tts/storybook/asr,需开启管理后台 |
## 文档入口 ## 文档入口

View File

@@ -109,6 +109,14 @@
> >
绘本 绘本
</button> </button>
<button
type="button"
class="rounded-lg border px-3 py-1.5 text-sm transition-colors"
:class="analyticsCapability === 'asr' ? 'border-indigo-600 bg-indigo-600 text-white' : 'border-gray-200 bg-white text-gray-600 hover:border-gray-400'"
@click="analyticsCapability = 'asr'"
>
语音识别
</button>
</div> </div>
</div> </div>
@@ -316,7 +324,7 @@
<!-- Tabs --> <!-- Tabs -->
<div class="flex space-x-1 bg-gray-100 p-1 rounded-xl w-fit"> <div class="flex space-x-1 bg-gray-100 p-1 rounded-xl w-fit">
<button <button
v-for="tab in ['text', 'image', 'tts', 'storybook']" v-for="tab in ['text', 'image', 'tts', 'storybook', 'asr']"
:key="tab" :key="tab"
@click="activeTab = tab" @click="activeTab = tab"
class="px-6 py-2 rounded-lg text-sm font-medium transition-all duration-200" class="px-6 py-2 rounded-lg text-sm font-medium transition-all duration-200"
@@ -593,7 +601,7 @@ const analytics = ref<ProviderAnalyticsResponse | null>(null)
const analyticsLoading = ref(false) const analyticsLoading = ref(false)
const analyticsError = ref('') const analyticsError = ref('')
const analyticsWindow = ref<'7' | '30' | 'all'>('30') const analyticsWindow = ref<'7' | '30' | 'all'>('30')
const analyticsCapability = ref<'all' | 'text' | 'image' | 'tts' | 'storybook'>('all') const analyticsCapability = ref<'all' | 'text' | 'image' | 'tts' | 'storybook' | 'asr'>('all')
const editing = ref(false) const editing = ref(false)
const form = ref<Partial<Provider> & { api_key?: string; config_json: Record<string, any> }>({ const form = ref<Partial<Provider> & { api_key?: string; config_json: Record<string, any> }>({
type: 'text', type: 'text',
@@ -638,6 +646,8 @@ function formatCapability(value: string) {
return '语音' return '语音'
case 'storybook': case 'storybook':
return '绘本' return '绘本'
case 'asr':
return '语音识别'
default: default:
return value return value
} }

View File

@@ -43,6 +43,8 @@ IMAGE_PROVIDERS=["cqtai"]
TTS_PROVIDERS=["minimax", "elevenlabs", "edge_tts"] TTS_PROVIDERS=["minimax", "elevenlabs", "edge_tts"]
# 绘本结构生成: 默认复用 Gemini Storybook adapter # 绘本结构生成: 默认复用 Gemini Storybook adapter
STORYBOOK_PROVIDERS=["storybook_primary"] STORYBOOK_PROVIDERS=["storybook_primary"]
# 语音识别: 本地演示默认 demo,真实转写可设置为 ["openai_asr", "demo"]
ASR_PROVIDERS=["demo"]
# [模型参数] # [模型参数]
TEXT_MODEL=gemini-2.0-flash TEXT_MODEL=gemini-2.0-flash
@@ -82,6 +84,9 @@ ELEVENLABS_API_KEY=
# OpenAI (如需使用) # OpenAI (如需使用)
OPENAI_API_KEY= OPENAI_API_KEY=
OPENAI_API_BASE= OPENAI_API_BASE=
# OpenAI ASR
VOICE_TRANSCRIPTION_MODEL=gpt-4o-mini-transcribe
VOICE_TRANSCRIPTION_LANGUAGE=zh
# ---------------------------------------------- # ----------------------------------------------
# 3. 第三方登录 (OAuth Config) [可选] # 3. 第三方登录 (OAuth Config) [可选]

View File

@@ -17,7 +17,7 @@ router = APIRouter(dependencies=[Depends(admin_guard)])
class ProviderCreate(BaseModel): class ProviderCreate(BaseModel):
name: str name: str
type: str = Field(..., pattern="^(text|image|tts|storybook)$") type: str = Field(..., pattern="^(text|image|tts|storybook|asr)$")
adapter: str adapter: str
model: str | None = None model: str | None = None
api_base: str | None = None api_base: str | None = None

View File

@@ -58,6 +58,7 @@ class Settings(BaseSettings):
image_providers: list[str] = Field(default_factory=lambda: ["cqtai"]) image_providers: list[str] = Field(default_factory=lambda: ["cqtai"])
tts_providers: list[str] = Field(default_factory=lambda: ["minimax", "elevenlabs", "edge_tts"]) tts_providers: list[str] = Field(default_factory=lambda: ["minimax", "elevenlabs", "edge_tts"])
storybook_providers: list[str] = Field(default_factory=lambda: ["storybook_primary"]) storybook_providers: list[str] = Field(default_factory=lambda: ["storybook_primary"])
asr_providers: list[str] = Field(default_factory=lambda: ["demo"])
enable_demo_providers: bool = Field( enable_demo_providers: bool = Field(
False, False,
description="Enable local deterministic demo providers for portfolio demos", description="Enable local deterministic demo providers for portfolio demos",
@@ -71,8 +72,8 @@ class Settings(BaseSettings):
description="Directory for persisted voice co-creation session assets", description="Directory for persisted voice co-creation session assets",
) )
voice_transcription_mode: str = Field( voice_transcription_mode: str = Field(
"demo", "provider",
description="Voice transcription mode: demo, openai, or disabled", description="Voice transcription mode: provider or disabled; provider order is controlled by ASR_PROVIDERS",
) )
voice_transcription_model: str = Field( voice_transcription_model: str = Field(
"gpt-4o-mini-transcribe", "gpt-4o-mini-transcribe",

View File

@@ -19,7 +19,7 @@ class Provider(Base):
id: Mapped[str] = mapped_column(String(36), primary_key=True, default=_uuid) id: Mapped[str] = mapped_column(String(36), primary_key=True, default=_uuid)
name: Mapped[str] = mapped_column(String(100), nullable=False) name: Mapped[str] = mapped_column(String(100), nullable=False)
type: Mapped[str] = mapped_column(String(50), nullable=False) # text/image/tts/storybook type: Mapped[str] = mapped_column(String(50), nullable=False) # text/image/tts/storybook/asr
adapter: Mapped[str] = mapped_column(String(100), nullable=False) adapter: Mapped[str] = mapped_column(String(100), nullable=False)
model: Mapped[str] = mapped_column(String(200), nullable=True) model: Mapped[str] = mapped_column(String(200), nullable=True)
api_base: Mapped[str] = mapped_column(String(300), nullable=True) api_base: Mapped[str] = mapped_column(String(300), nullable=True)
@@ -97,7 +97,7 @@ class CostRecord(Base):
user_id: Mapped[str] = mapped_column(String(36), nullable=False, index=True) user_id: Mapped[str] = mapped_column(String(36), nullable=False, index=True)
provider_id: Mapped[str] = mapped_column(String(36), nullable=True) # 可能是环境变量配置 provider_id: Mapped[str] = mapped_column(String(36), nullable=True) # 可能是环境变量配置
provider_name: Mapped[str] = mapped_column(String(100), nullable=False) provider_name: Mapped[str] = mapped_column(String(100), nullable=False)
capability: Mapped[str] = mapped_column(String(50), nullable=False) # text/image/tts/storybook capability: Mapped[str] = mapped_column(String(50), nullable=False) # text/image/tts/storybook/asr
estimated_cost: Mapped[Decimal] = mapped_column(Numeric(10, 6), nullable=False) estimated_cost: Mapped[Decimal] = mapped_column(Numeric(10, 6), nullable=False)
timestamp: Mapped[datetime] = mapped_column( timestamp: Mapped[datetime] = mapped_column(
DateTime(timezone=True), default=datetime.utcnow, index=True DateTime(timezone=True), default=datetime.utcnow, index=True

View File

@@ -4,7 +4,11 @@
from app.services.adapters import demo as _demo_adapters # noqa: F401 from app.services.adapters import demo as _demo_adapters # noqa: F401
from app.services.adapters.base import AdapterConfig, BaseAdapter from app.services.adapters.base import AdapterConfig, BaseAdapter
# Image adapters # ASR adapters
from app.services.adapters.asr import demo as _asr_demo_adapter # noqa: F401
from app.services.adapters.asr import openai as _asr_openai_adapter # noqa: F401
# Image adapters
from app.services.adapters.image import cqtai as _image_cqtai_adapter # noqa: F401 from app.services.adapters.image import cqtai as _image_cqtai_adapter # noqa: F401
from app.services.adapters.registry import AdapterRegistry from app.services.adapters.registry import AdapterRegistry

View File

@@ -0,0 +1 @@
"""ASR adapters."""

View File

@@ -0,0 +1,57 @@
"""Demo ASR adapter for local voice co-creation smoke tests."""
from __future__ import annotations
from fastapi import HTTPException
from app.services.adapters.asr.models import TranscriptionOutput
from app.services.adapters.base import BaseAdapter
from app.services.adapters.registry import AdapterRegistry
@AdapterRegistry.register("asr", "demo")
class DemoASRAdapter(BaseAdapter[TranscriptionOutput]):
    """Offline ASR stand-in for local demos and smoke tests.

    No audio is decoded and no external service is called. A transcript is
    taken from, in order of preference:

    1. ``transcript_hint`` when it contains non-whitespace text;
    2. the uploaded payload itself when its MIME type is ``text/*``.

    Any other input is rejected with HTTP 503.
    """

    adapter_type = "asr"
    adapter_name = "demo"

    async def execute(
        self,
        audio_bytes: bytes,
        file_name: str | None = None,
        mime_type: str | None = None,
        transcript_hint: str | None = None,
        **kwargs,
    ) -> TranscriptionOutput:
        """Return a transcript derived from the hint or from a text upload.

        Args:
            audio_bytes: Raw upload; only read when ``mime_type`` is ``text/*``.
            file_name: Accepted for signature parity with other ASR adapters; unused.
            mime_type: MIME type of the upload, compared case-insensitively.
            transcript_hint: Caller-supplied transcript, used verbatim after trimming.
            **kwargs: Extra routing arguments (e.g. ``language``); ignored.

        Returns:
            A ``TranscriptionOutput`` with ``confidence`` fixed at ``1.0``.

        Raises:
            HTTPException: 503 when neither a hint nor a non-empty text upload
                is available.
        """
        hint = (transcript_hint or "").strip()
        if hint:
            return self._as_output(hint)

        # MIME types are case-insensitive, so normalise before matching.
        normalized_mime = (mime_type or "").strip().lower()
        if normalized_mime.startswith("text/"):
            # "utf-8-sig" drops a leading BOM (common in Windows-authored text
            # files); str.strip() would not remove it from the transcript.
            text = audio_bytes.decode("utf-8-sig", errors="ignore").strip()
            if text:
                return self._as_output(text)

        raise HTTPException(
            status_code=503,
            detail=(
                "当前环境未配置真实语音转写,请先使用文本共创模式,"
                "或在开发模式下提供 transcript_hint。"
            ),
        )

    def _as_output(self, transcript_text: str) -> TranscriptionOutput:
        """Wrap already-normalised text as a full-confidence demo result."""
        return TranscriptionOutput(
            transcript_text=transcript_text,
            confidence=1.0,
            provider=self.adapter_name,
        )

    async def health_check(self) -> bool:
        """The demo adapter has no external dependency, so it is always healthy."""
        return True

    @property
    def estimated_cost(self) -> float:
        """Demo transcription is free."""
        return 0.0

View File

@@ -0,0 +1,11 @@
"""ASR adapter result models."""
from pydantic import BaseModel
class TranscriptionOutput(BaseModel):
    """Normalized speech-to-text output from one ASR provider."""

    # Recognised text for the audio turn.
    transcript_text: str
    # Provider-reported confidence; None when the provider does not supply one
    # (the OpenAI adapter returns None, the demo adapter returns 1.0).
    confidence: float | None = None
    # Name of the adapter that produced this transcript.
    provider: str

View File

@@ -0,0 +1,76 @@
"""OpenAI ASR adapter."""
from __future__ import annotations
from io import BytesIO
from fastapi import HTTPException
from openai import AsyncOpenAI
from app.core.logging import get_logger
from app.services.adapters.asr.models import TranscriptionOutput
from app.services.adapters.base import BaseAdapter
from app.services.adapters.registry import AdapterRegistry
logger = get_logger(__name__)
@AdapterRegistry.register("asr", "openai_asr")
class OpenAIASRAdapter(BaseAdapter[TranscriptionOutput]):
    """Transcribe uploaded voice turn audio with OpenAI audio transcription."""

    adapter_type = "asr"
    adapter_name = "openai_asr"

    async def execute(
        self,
        audio_bytes: bytes,
        file_name: str | None = None,
        mime_type: str | None = None,
        transcript_hint: str | None = None,
        language: str | None = None,
        **kwargs,
    ) -> TranscriptionOutput:
        """Send one audio turn to the OpenAI transcription endpoint.

        Args:
            audio_bytes: Raw audio payload of the uploaded turn.
            file_name: Upload name passed to the SDK; defaults to
                ``"voice-turn.webm"`` when missing.
            mime_type: Accepted for signature parity with other ASR adapters; unused.
            transcript_hint: Optional text forwarded as the transcription ``prompt``.
            language: Optional language code forwarded to the API.
            **kwargs: Extra routing arguments; ignored.

        Returns:
            A ``TranscriptionOutput`` whose ``confidence`` is ``None`` (this
            adapter does not derive a confidence score from the response).

        Raises:
            HTTPException: 503 when no API key is configured or the request
                fails; 502 when the returned transcript is empty.
        """
        if not self.config.api_key:
            raise HTTPException(
                status_code=503,
                detail="OPENAI_API_KEY 未配置,无法使用 OpenAI 语音转写。",
            )

        audio_file = BytesIO(audio_bytes)
        # The SDK reads the upload's file name from the ``name`` attribute.
        audio_file.name = file_name or "voice-turn.webm"

        request_args = {
            "model": self.config.model or "gpt-4o-mini-transcribe",
            "file": audio_file,
        }
        # Forward optional fields only when they carry a value, so unset ones
        # fall back to the API defaults instead of being sent as explicit None.
        if language:
            request_args["language"] = language
        prompt = (transcript_hint or "").strip()
        if prompt:
            request_args["prompt"] = prompt

        try:
            # A client is built per call; the context manager closes its HTTP
            # connection pool so repeated turns do not leak connections.
            async with AsyncOpenAI(api_key=self.config.api_key) as client:
                response = await client.audio.transcriptions.create(**request_args)
        except Exception as exc:
            logger.warning("openai_asr_failed", error=str(exc))
            raise HTTPException(
                status_code=503,
                detail="语音转写服务暂时不可用,请稍后重试。",
            ) from exc

        transcript_text = (getattr(response, "text", "") or "").strip()
        if not transcript_text:
            raise HTTPException(status_code=502, detail="语音转写结果为空,请重试。")

        return TranscriptionOutput(
            transcript_text=transcript_text,
            confidence=None,
            provider=self.adapter_name,
        )

    async def health_check(self) -> bool:
        """Report healthy when an API key is configured; no network call is made."""
        return bool(self.config.api_key)

    @property
    def estimated_cost(self) -> float:
        """Flat cost estimate for this adapter.

        NOTE(review): unit and basis (per request vs. per audio minute) are not
        visible here — confirm against the router's cost tracking.
        """
        return 0.006

View File

@@ -4,7 +4,7 @@ from dataclasses import dataclass
from enum import Enum from enum import Enum
from typing import Literal, Protocol, TypeAlias from typing import Literal, Protocol, TypeAlias
ProviderType: TypeAlias = Literal["text", "image", "tts", "storybook"] ProviderType: TypeAlias = Literal["text", "image", "tts", "storybook", "asr"]
class RoutingStrategy(str, Enum): class RoutingStrategy(str, Enum):
@@ -36,6 +36,7 @@ class ProviderSettings(Protocol):
image_providers: list[str] image_providers: list[str]
tts_providers: list[str] tts_providers: list[str]
storybook_providers: list[str] storybook_providers: list[str]
asr_providers: list[str]
enable_demo_providers: bool enable_demo_providers: bool
@@ -71,6 +72,14 @@ CAPABILITY_POLICIES: dict[ProviderType, CapabilityPolicy] = {
default_providers=("storybook_primary",), default_providers=("storybook_primary",),
demo_provider="demo", demo_provider="demo",
), ),
"asr": CapabilityPolicy(
capability="asr",
label="语音识别",
description="将孩子上传的语音回合转写为文本输入。",
settings_attr="asr_providers",
default_providers=("demo",),
demo_provider="demo",
),
} }
@@ -94,6 +103,8 @@ API_KEY_MAP: dict[str, str] = {
"antigravity_api_key": "antigravity_api_key", "antigravity_api_key": "antigravity_api_key",
"image_primary": "image_api_key", "image_primary": "image_api_key",
"image_api_key": "image_api_key", "image_api_key": "image_api_key",
# ASR
"openai_asr": "openai_api_key",
# TTS # TTS
"minimax": "minimax_api_key", "minimax": "minimax_api_key",
"minimax_api_key": "minimax_api_key", "minimax_api_key": "minimax_api_key",

View File

@@ -113,6 +113,14 @@ def _get_default_config(adapter_name: str) -> AdapterConfig | None:
timeout_ms=1000, timeout_ms=1000,
) )
# --- ASR Defaults ---
if adapter_name == "openai_asr":
return AdapterConfig(
api_key=settings.openai_api_key,
model=settings.voice_transcription_model,
timeout_ms=60000,
)
# --- Text Defaults --- # --- Text Defaults ---
if adapter_name in ("gemini", "text_primary"): if adapter_name in ("gemini", "text_primary"):
return AdapterConfig( return AdapterConfig(
@@ -289,7 +297,7 @@ async def _route_with_failover(
"""通用 provider failover 路由。 """通用 provider failover 路由。
Args: Args:
provider_type: 供应商类型 (text/image/tts/storybook) provider_type: 供应商类型 (text/image/tts/storybook/asr)
strategy: 路由策略 strategy: 路由策略
db: 数据库会话(可选,用于指标收集和熔断检查) db: 数据库会话(可选,用于指标收集和熔断检查)
user_id: 用户 ID可选用于成本追踪和预算检查 user_id: 用户 ID可选用于成本追踪和预算检查
@@ -297,7 +305,14 @@ async def _route_with_failover(
story_id: 故事 ID可选用于关联 provider 事件) story_id: 故事 ID可选用于关联 provider 事件)
**kwargs: 传递给适配器的参数 **kwargs: 传递给适配器的参数
""" """
providers = await _get_providers_with_config(provider_type) provider_names = kwargs.pop("provider_names", None)
if provider_names:
providers = [
(name, _get_default_config(name) or AdapterConfig(api_key=""), None)
for name in provider_names
]
else:
providers = await _get_providers_with_config(provider_type)
if not providers: if not providers:
raise ValueError(f"No {provider_type} providers configured.") raise ValueError(f"No {provider_type} providers configured.")
@@ -457,6 +472,35 @@ async def _route_with_failover(
raise ValueError(f"No {provider_type} provider succeeded. Errors: {' | '.join(errors)}") raise ValueError(f"No {provider_type} provider succeeded. Errors: {' | '.join(errors)}")
async def transcribe_audio(
    audio_bytes: bytes,
    file_name: str | None = None,
    mime_type: str | None = None,
    transcript_hint: str | None = None,
    language: str | None = None,
    provider_names: list[str] | None = None,
    strategy: RoutingStrategy = RoutingStrategy.PRIORITY,
    db: AsyncSession | None = None,
    user_id: str | None = None,
):
    """Transcribe one audio payload through the ``asr`` capability with failover.

    Args:
        audio_bytes: Raw audio (or text upload) payload.
        file_name: Original upload name, forwarded to the adapter.
        mime_type: MIME type of the upload, forwarded to the adapter.
        transcript_hint: Optional hint text, forwarded to the adapter.
        language: Optional language code, forwarded to the adapter.
        provider_names: Optional explicit provider list, forwarded as-is to
            ``_route_with_failover``.
        strategy: Routing strategy; priority order by default.
        db: Optional database session passed to the router.
        user_id: Optional user id passed to the router.

    Returns:
        The ``TranscriptionOutput`` of the first provider that succeeds.
    """
    # Imported inside the function, as in the original implementation.
    from app.services.adapters.asr.models import TranscriptionOutput

    # Arguments the router hands through to the selected adapter.
    adapter_kwargs = {
        "audio_bytes": audio_bytes,
        "file_name": file_name,
        "mime_type": mime_type,
        "transcript_hint": transcript_hint,
        "language": language,
        "provider_names": provider_names,
    }
    transcription: TranscriptionOutput = await _route_with_failover(
        "asr",
        strategy=strategy,
        db=db,
        user_id=user_id,
        **adapter_kwargs,
    )
    return transcription
async def generate_story_content( async def generate_story_content(
input_type: Literal["keywords", "full_story"], input_type: Literal["keywords", "full_story"],
data: str, data: str,

View File

@@ -1448,6 +1448,8 @@ async def create_voice_turn_from_upload_service(
file_name=file_name, file_name=file_name,
mime_type=mime_type, mime_type=mime_type,
transcript_hint=transcript_hint, transcript_hint=transcript_hint,
db=db,
user_id=user_id,
) )
except HTTPException as exc: except HTTPException as exc:
session.last_error = str(exc.detail) session.last_error = str(exc.detail)

View File

@@ -3,15 +3,12 @@
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from io import BytesIO
from fastapi import HTTPException from fastapi import HTTPException
from openai import AsyncOpenAI from sqlalchemy.ext.asyncio import AsyncSession
from app.core.config import settings from app.core.config import settings
from app.core.logging import get_logger from app.services.provider_router import transcribe_audio
logger = get_logger(__name__)
@dataclass(frozen=True) @dataclass(frozen=True)
@@ -23,84 +20,9 @@ class VoiceTranscriptionResult:
provider: str = "demo" provider: str = "demo"
def _normalize_transcript(transcript_text: str) -> str: def _resolve_transcript_hint(transcript_hint: str | None) -> str | None:
return transcript_text.strip() normalized = (transcript_hint or "").strip()
return normalized or None
async def _transcribe_demo(
*,
audio_bytes: bytes,
mime_type: str | None,
transcript_hint: str | None,
) -> VoiceTranscriptionResult:
hint = _normalize_transcript(transcript_hint or "")
if hint:
return VoiceTranscriptionResult(
transcript_text=hint,
confidence=1.0,
provider="demo",
)
if mime_type and mime_type.startswith("text/"):
text = _normalize_transcript(audio_bytes.decode("utf-8", errors="ignore"))
if text:
return VoiceTranscriptionResult(
transcript_text=text,
confidence=1.0,
provider="demo",
)
raise HTTPException(
status_code=503,
detail=(
"当前环境未配置真实语音转写,请先使用文本共创模式,"
"或在开发模式下提供 transcript_hint。"
),
)
async def _transcribe_openai(
*,
audio_bytes: bytes,
file_name: str,
mime_type: str | None,
transcript_hint: str | None,
) -> VoiceTranscriptionResult:
if not settings.openai_api_key:
raise HTTPException(
status_code=503,
detail="OPENAI_API_KEY 未配置,无法使用 OpenAI 语音转写。",
)
client = AsyncOpenAI(api_key=settings.openai_api_key)
audio_file = BytesIO(audio_bytes)
audio_file.name = file_name
prompt = transcript_hint.strip() if transcript_hint else None
try:
response = await client.audio.transcriptions.create(
model=settings.voice_transcription_model,
file=audio_file,
language=settings.voice_transcription_language,
prompt=prompt,
)
except Exception as exc:
logger.warning("voice_transcription_openai_failed", error=str(exc))
raise HTTPException(
status_code=503,
detail="语音转写服务暂时不可用,请稍后重试。",
) from exc
transcript_text = _normalize_transcript(getattr(response, "text", "") or "")
if not transcript_text:
raise HTTPException(status_code=502, detail="语音转写结果为空,请重试。")
return VoiceTranscriptionResult(
transcript_text=transcript_text,
confidence=None,
provider="openai",
)
async def transcribe_voice_audio( async def transcribe_voice_audio(
@@ -109,26 +31,35 @@ async def transcribe_voice_audio(
file_name: str, file_name: str,
mime_type: str | None, mime_type: str | None,
transcript_hint: str | None = None, transcript_hint: str | None = None,
db: AsyncSession | None = None,
user_id: str | None = None,
) -> VoiceTranscriptionResult: ) -> VoiceTranscriptionResult:
"""Transcribe one uploaded audio turn according to the configured mode.""" """Transcribe one uploaded audio turn using configured ASR providers."""
mode = (settings.voice_transcription_mode or "demo").strip().lower() mode = (settings.voice_transcription_mode or "provider").strip().lower()
if mode == "disabled": if mode == "disabled":
raise HTTPException( raise HTTPException(
status_code=503, status_code=503,
detail="当前环境已禁用语音转写,请先使用文本共创模式。", detail="当前环境已禁用语音转写,请先使用文本共创模式。",
) )
if mode == "openai":
return await _transcribe_openai(
audio_bytes=audio_bytes,
file_name=file_name,
mime_type=mime_type,
transcript_hint=transcript_hint,
)
return await _transcribe_demo( hint = _resolve_transcript_hint(transcript_hint)
provider_name = "openai_asr" if mode == "openai" else mode
strategy_providers = None if mode == "provider" else [provider_name]
result = await transcribe_audio(
audio_bytes=audio_bytes, audio_bytes=audio_bytes,
file_name=file_name,
mime_type=mime_type, mime_type=mime_type,
transcript_hint=transcript_hint, transcript_hint=hint,
language=settings.voice_transcription_language,
provider_names=strategy_providers,
db=db,
user_id=user_id,
)
return VoiceTranscriptionResult(
transcript_text=result.transcript_text,
confidence=result.confidence,
provider=result.provider,
) )

View File

@@ -244,8 +244,9 @@ class TestProviderPolicy:
policies = list_capability_policies() policies = list_capability_policies()
capabilities = {item["capability"] for item in policies} capabilities = {item["capability"] for item in policies}
assert capabilities == {"text", "image", "tts", "storybook"} assert capabilities == {"text", "image", "tts", "storybook", "asr"}
assert DEFAULT_PROVIDERS["storybook"] == ["storybook_primary"] assert DEFAULT_PROVIDERS["storybook"] == ["storybook_primary"]
assert DEFAULT_PROVIDERS["asr"] == ["demo"]
def test_demo_provider_only_added_to_supported_capabilities(self): def test_demo_provider_only_added_to_supported_capabilities(self):
settings = SimpleNamespace( settings = SimpleNamespace(
@@ -253,6 +254,7 @@ class TestProviderPolicy:
image_providers=["cqtai"], image_providers=["cqtai"],
tts_providers=["edge_tts"], tts_providers=["edge_tts"],
storybook_providers=["storybook_primary"], storybook_providers=["storybook_primary"],
asr_providers=["openai_asr"],
enable_demo_providers=True, enable_demo_providers=True,
) )
@@ -263,6 +265,7 @@ class TestProviderPolicy:
"storybook_primary", "storybook_primary",
] ]
assert get_provider_names_from_settings("tts", settings) == ["edge_tts"] assert get_provider_names_from_settings("tts", settings) == ["edge_tts"]
assert get_provider_names_from_settings("asr", settings) == ["demo", "openai_asr"]
def test_policy_defaults_when_settings_lists_are_empty(self): def test_policy_defaults_when_settings_lists_are_empty(self):
settings = SimpleNamespace( settings = SimpleNamespace(
@@ -270,6 +273,7 @@ class TestProviderPolicy:
image_providers=[], image_providers=[],
tts_providers=[], tts_providers=[],
storybook_providers=[], storybook_providers=[],
asr_providers=[],
enable_demo_providers=False, enable_demo_providers=False,
) )
@@ -279,6 +283,22 @@ class TestProviderPolicy:
"elevenlabs", "elevenlabs",
"edge_tts", "edge_tts",
] ]
assert get_provider_names_from_settings("asr", settings) == ["demo"]
@pytest.mark.asyncio
async def test_asr_demo_provider_uses_transcript_hint(self):
    """Routing an ASR request with a hint returns that hint from the demo provider."""
    from app.services import provider_router

    expected_text = "我想听一个小熊找星星的故事"
    request = {
        "audio_bytes": b"fake-audio",
        "file_name": "turn.webm",
        "mime_type": "audio/webm",
        "transcript_hint": expected_text,
    }

    outcome = await provider_router.transcribe_audio(**request)

    assert outcome.transcript_text == expected_text
    assert outcome.confidence == 1.0
    assert outcome.provider == "demo"
class TestProviderConfigFromDB: class TestProviderConfigFromDB:

View File

@@ -26,6 +26,7 @@ docker compose ps
- 用户端http://localhost:52080 - 用户端http://localhost:52080
- 本地登录http://localhost:52080/auth/dev/signin - 本地登录http://localhost:52080/auth/dev/signin
- 语音共创:http://localhost:52080/voice-studio
- Adminhttp://localhost:52888 - Adminhttp://localhost:52888
- 后端健康http://localhost:52000/health - 后端健康http://localhost:52000/health
- Admin 后端健康http://localhost:52800/health - Admin 后端健康http://localhost:52800/health
@@ -46,6 +47,18 @@ docker compose ps
SMOKE_AUDIO=1 ./scripts/demo_smoke.sh SMOKE_AUDIO=1 ./scripts/demo_smoke.sh
``` ```
需要检查语音共创 Alpha 时:
```bash
SMOKE_VOICE=1 ./scripts/demo_smoke.sh
```
需要同时检查 TTS 和语音共创时:
```bash
SMOKE_AUDIO=1 SMOKE_VOICE=1 ./scripts/demo_smoke.sh
```
通过标准: 通过标准:
- [ ] backend health 返回 `ok` - [ ] backend health 返回 `ok`
@@ -62,9 +75,10 @@ SMOKE_AUDIO=1 ./scripts/demo_smoke.sh
- [ ] 绘本 provider stats 返回成功率、耗时和成本字段 - [ ] 绘本 provider stats 返回成功率、耗时和成本字段
- [ ] 绘本图片 retry 后 `image_status=ready` - [ ] 绘本图片 retry 后 `image_status=ready`
- [ ] 绘本阅读页能看到生成轨迹和资源重试历史 - [ ] 绘本阅读页能看到生成轨迹和资源重试历史
- [ ] `/admin/providers/capabilities` 返回 `text/image/tts/storybook` - [ ] `/admin/providers/capabilities` 返回 `text/image/tts/storybook/asr`
- [ ] `/api/audio/{story_id}/status` 能查询音频缓存状态且不触发生成 - [ ] `/api/audio/{story_id}/status` 能查询音频缓存状态且不触发生成
- [ ] 如果启用 `SMOKE_AUDIO=1`,音频 retry 后 `audio_status=ready` - [ ] 如果启用 `SMOKE_AUDIO=1`,音频 retry 后 `audio_status=ready`
- [ ] 如果启用 `SMOKE_VOICE=1`,语音共创会话可完成文本 fallback、上传回合、analytics 和 finalize 到 Story
- [ ] 验证结果已记录到 `docs/planning/demo-validation-log.md` - [ ] 验证结果已记录到 `docs/planning/demo-validation-log.md`
--- ---
@@ -101,11 +115,26 @@ SMOKE_AUDIO=1 ./scripts/demo_smoke.sh
1. 打开 Admin。 1. 打开 Admin。
2. 说明管理端不是用户主链路,而是产品拥有者维护供应链路的辅助能力。 2. 说明管理端不是用户主链路,而是产品拥有者维护供应链路的辅助能力。
3. 通过接口或页面说明: 3. 通过接口或页面说明:
- Capability: `text/image/tts/storybook` - Capability: `text/image/tts/storybook/asr`
- Provider: 具体供应商配置 - Provider: 具体供应商配置
- Adapter: API 调用实现 - Adapter: API 调用实现
- Routing Policy: 优先级/成本/延迟/轮询 - Routing Policy: 优先级/成本/延迟/轮询
### 路径 D: 语音共创 Alpha
1. 打开用户端并进入“语音共创”。
2. 创建一个新会话,先使用文本 fallback 快速演示:
- 首轮:`我想听一个小熊和星星一起找家的故事`
- 修正:`不要让小熊害怕,让月亮姐姐帮它`
3. 展示每轮内容:
- 用户表达 / 系统理解
- 系统文字回应
- TTS 语音回应状态
- 最近事件和待处理提示
4. 演示低置信度确认:说明系统会提示“本轮系统理解为”,家长可选择继续、重说或改成文本。
5. 点击结束并保存,确认正式 Story 进入故事库。
6. 打开生成轨迹,说明语音共创 finalize 后的封面资产补全已经接回统一 generation job。
--- ---
## 4. 3 分钟讲解结构 ## 4. 3 分钟讲解结构
@@ -137,6 +166,8 @@ DreamWeaver 是面向 3-8 岁亲子场景的个性化 AI 绘本与陪伴式讲
| 网络导致 TTS 失败 | 说明音频是可恢复资产,不阻塞故事阅读;使用已缓存样本或跳过 TTS | | 网络导致 TTS 失败 | 说明音频是可恢复资产,不阻塞故事阅读;使用已缓存样本或跳过 TTS |
| 图片 provider 未补全 | 展示 partial ready说明主内容已可读、资产可稍后补全 | | 图片 provider 未补全 | 展示 partial ready说明主内容已可读、资产可稍后补全 |
| 图片 provider 失败 | 展示 degraded completed 与 retry 机制 | | 图片 provider 失败 | 展示 degraded completed 与 retry 机制 |
| 录音或 ASR 不稳定 | 切到文本 fallback,说明 Alpha 阶段已保留降级路径 |
| 语音共创低置信度卡住 | 使用“按这个理解继续”或“改成文本输入”完成本轮 |
| Docker 冷启动慢 | 演示前提前运行 smoke 脚本并保持容器运行 | | Docker 冷启动慢 | 演示前提前运行 smoke 脚本并保持容器运行 |
| Admin 页面不适合主展示 | 只用 Provider 分层说明辅助讲系统设计 | | Admin 页面不适合主展示 | 只用 Provider 分层说明辅助讲系统设计 |
| 面试官追问生产部署 | 明确当前是求职版 MVP本轮重点是产品闭环和系统边界 | | 面试官追问生产部署 | 明确当前是求职版 MVP本轮重点是产品闭环和系统边界 |
@@ -149,4 +180,5 @@ DreamWeaver 是面向 3-8 岁亲子场景的个性化 AI 绘本与陪伴式讲
- [ ] 能现场看到普通故事和绘本结果。 - [ ] 能现场看到普通故事和绘本结果。
- [ ] 能解释失败降级和资产重试。 - [ ] 能解释失败降级和资产重试。
- [ ] 能解释为什么 Provider 分层是产品设计,而不是单纯技术炫技。 - [ ] 能解释为什么 Provider 分层是产品设计,而不是单纯技术炫技。
- [ ] 能说明语音共创当前是 Phase A Alpha,而不是实时语音最终形态。
- [ ] 能说明下一步计划,而不是让项目停在 demo。 - [ ] 能说明下一步计划,而不是让项目停在 demo。

View File

@@ -2,6 +2,48 @@
这份记录用于演示前快速说明“当前本地 Docker 环境已经验证到什么程度”。新的验证记录按时间倒序追加。 这份记录用于演示前快速说明“当前本地 Docker 环境已经验证到什么程度”。新的验证记录按时间倒序追加。
## 2026-04-24
补充验证:
- 已拉取远端 `main`:`7e450aa fix: stabilize auth and generation workflows`。
- 用户端 `npm run build` 通过,包含最新 Voice Studio、登录态修复和 generation trace 变更。
- 管理端首次 `npm run build` 因 Rollup Linux optional dependency 缺失失败;执行 `npm install` 补齐 `@rollup/rollup-linux-x64-gnu` 后,管理端 `npm run build` 通过。
- 后端当前仓库内 `.venv` 是 Windows 虚拟环境结构,WSL/bash 下无法直接执行 `.venv/bin/python`;系统也没有全局 `pytest`。尝试创建 Linux venv 时发现当前 WSL 缺少 `python3.12-venv`,尝试使用 Docker 时发现当前 WSL 未启用 Docker Desktop 集成。本轮未完成后端 pytest,需要后续在 Linux venv、Docker 或 Windows PowerShell 环境补跑。
- 语音共创 PRD 已从 Discovery Track 更新为 Phase A Alpha,并补充 Alpha 验收矩阵、退出标准和未完成项。
- 演示 checklist 已新增 Voice Studio 入口、语音共创 Alpha 手动演示路径和风险预案。
- `scripts/demo_smoke.sh` 已新增可选 `SMOKE_VOICE=1` 分支,覆盖 Voice Session 创建、文本 fallback、上传回合 demo transcript hint、会话 detail/events、voice analytics、finalize 到 Story 和故事可读性断言。
- ASR 已纳入 Provider 能力分层:默认 `ASR_PROVIDERS=["demo"]`,真实转写可配置 `ASR_PROVIDERS=["openai_asr", "demo"]` 和 `OPENAI_API_KEY`。
- 管理端 Provider UI 已补 `asr`:运营摘要支持按语音识别筛选,Provider tab 可创建/查看 ASR provider;用户端嵌入的 Provider 管理页同步新增 `asr` tab。
- `bash -n scripts/demo_smoke.sh` 通过。
执行命令:
```bash
cd frontend && npm run build
cd admin-frontend && npm run build
cd admin-frontend && npm install
cd admin-frontend && npm run build
cd backend && pytest -q
cd backend && ./.venv/bin/python --version
cd backend && python3 -m venv .venv-linux
docker compose ps
bash -n scripts/demo_smoke.sh
```
结果:
- 用户端 `vue-tsc && vite build` 通过。
- 管理端 `vue-tsc && vite build` 在补依赖后通过。
- `scripts/demo_smoke.sh` shell 语法检查通过;受当前 WSL 未启用 Docker 影响,未执行完整接口 smoke。
- 后端测试未运行成功,原因是当前执行环境缺少 Linux 可用的 Python dev venv / pytest,且 WSL 未启用 Docker。
后续补验建议:
- 在 WSL 下先安装 `python3.12-venv`,再执行 `cd backend && python3 -m venv .venv-linux && .venv-linux/bin/pip install -e ".[dev]" && .venv-linux/bin/python -m pytest -q`
- 或在 Windows PowerShell 下执行 `cd backend; .\.venv\Scripts\python.exe -m pytest -q`
- 后端通过后,再运行 `docker compose up -d --build` 和 `SMOKE_VOICE=1 ./scripts/demo_smoke.sh`,并手动走一遍 Voice Studio Alpha 路径。
## 2026-04-18 ## 2026-04-18
补充验证: 补充验证:

View File

@@ -1,9 +1,9 @@
# Product Requirements Document: 语音共创模式增量方案 # Product Requirements Document: 语音共创模式增量方案
**Version**: 0.1 **Version**: 0.2
**Date**: 2026-04-19 **Date**: 2026-04-24
**Author**: Codex (based on founder direction) **Author**: Codex (based on founder direction)
**Status**: Discovery Track / 不插队当前主开发线 **Status**: Phase A Alpha / 已进入可演示收束
--- ---
@@ -13,7 +13,7 @@ DreamWeaver 当前已经具备“输入主题 -> 生成故事/绘本 -> 补全
这个方向的价值不在于再加一个输入方式,而在于把 DreamWeaver 从“生成结果”推进到“陪伴式创作过程”。孩子不是先写清楚需求再等待结果,而是可以像和讲故事的人对话一样,说出自己想要的角色、情节和变化,系统实时或准实时地接住这些表达,再继续讲下去。 这个方向的价值不在于再加一个输入方式,而在于把 DreamWeaver 从“生成结果”推进到“陪伴式创作过程”。孩子不是先写清楚需求再等待结果,而是可以像和讲故事的人对话一样,说出自己想要的角色、情节和变化,系统实时或准实时地接住这些表达,再继续讲下去。
本增量 PRD 的目标不是立刻把语音共创插入当前主开发线,而是先把它定义为一条独立、可评估、可拆阶段落地的产品路线。当前主线仍应继续沿着统一生成工作流、跨环境观测、断点续跑与稳定性治理推进;语音共创作为下一波产品升级方向,先完成需求定义、架构判断和分阶段实施策略 本增量 PRD 最初用于把语音共创定义为一条独立、可评估、可拆阶段落地的产品路线。2026-04-24 更新后,远端 `main` 已经提前跑通 Phase A Alpha独立 Voice Studio、语音/文本回合、低置信度确认、安全改写、TTS 回复、会话恢复、finalize 保存为 Story以及接回统一 generation job 的资产补全与 trace。下一步不应继续扩大到 Phase B 实时化,而应先完成 Alpha 验收、真实 ASR Provider 接入、成本/观测补齐,并回到原主线的跨环境 Provider 汇聚、监控告警和断点续跑
--- ---
@@ -21,19 +21,20 @@ DreamWeaver 当前已经具备“输入主题 -> 生成故事/绘本 -> 补全
### Decision ### Decision
语音共创模式 **现在进入产品发现与方案设计阶段**,但 **不插队当前主开发线** 语音共创模式已经从 **产品发现与方案设计阶段** 进入 **Phase A Alpha 可演示收束阶段**
### Why ### Why
- 当前主线已经明确:统一生成工作流、任务控制、Provider 运营分析、监控与恢复能力。 - 当前主线已经完成统一生成工作流、任务控制、Provider 运营分析、资产补全 trace 和基本恢复能力。
- 语音共创会引入新的交互模式、新的数据模型和新的低延迟系统要求,如果直接插入,会同时打断稳定性主线和架构收束节奏 - Phase A 的数据模型、API、Voice Studio 和 finalize 链路已经落地,但仍处于 Alpha它需要验收、真实 ASR 接入和观测补齐,而不是继续扩大范围
- 先写清楚增量 PRD可以避免后续“想到什么做什么”也能帮助后面的技术选型、原型验证和资源预估 - Phase B/Phase C 会引入流式 ASR、WebSocket、barge-in 和更高实时性要求,应等 Phase A 的产品价值和稳定性被验证后再启动
### Proposed Sequencing ### Proposed Sequencing
1. 继续推进当前主线:跨环境 Provider 汇聚、监控告警、断点续跑与更细粒度任务控制 1. 先完成 Phase A Alpha 收束:回归验证、演示清单、验收矩阵和已知限制记录
2. 并行完成语音共创模式的交互原型、增量 PRD 和技术预研 2. 补齐真实 ASR Provider、turn 级成本/指标归因、Voice Studio smoke 路径和失败降级验收
3. 等当前主线进入相对稳定阶段后,再按分阶段方案启动语音共创 MVP 3. 回到生产化主线:跨环境 Provider 汇聚、监控告警、断点续跑与更细粒度任务控制
4. Phase A 稳定并验证产品价值后,再评估 Phase B 准实时共创。
--- ---
@@ -498,6 +499,32 @@ DreamWeaver 的语音共创模式应当成为一种“孩子可以开口参与
## MoSCoW Prioritization ## MoSCoW Prioritization
## Phase A Alpha Acceptance Snapshot (2026-04-24)
| Requirement | Status | Evidence | Next Action |
| --- | --- | --- | --- |
| FR-001 语音发起故事共创会话 | Alpha Done | `VoiceStudio` 已提供独立入口,支持录音上传回合和文本 fallback,后端有 `POST /api/voice-sessions/{id}/turns` | 用真实儿童表达样本补演示 smoke |
| FR-002 区分开始、继续、修正 | Alpha Done | turn service 已按 `start/continue/correct` 更新会话状态,修正不会清空整段故事 | 增加更多真实儿童表达样本验收 |
| FR-003 系统语音回应并继续讲述 | Alpha Done | 每轮生成 assistant 文本后调用 TTS,TTS 失败保留文本响应 | 记录 TTS 延迟与失败率到更细指标 |
| FR-004 保存为正式故事资产 | Alpha Done | `finalize` 已持久化 Story,并返回 `generation_job_id` 接回封面资产补全 trace | 补 finalize 后故事库/详情页端到端 smoke |
| FR-005 记录语音会话状态 | Alpha Done | 已有 `voice_sessions / voice_turns / voice_session_events`,前端展示最近 turn 与事件 | 补 turn 级成本与 Provider 归因 |
| FR-006 家长确认关键改写 | Alpha Done | 低 `transcript_confidence` 或 `intent_confidence` 会触发确认,支持继续、重说、改文本 | 打磨确认文案和移动端操作密度 |
| FR-007 分段插图节点 | Partial | 当前支持结束后统一封面补全,并为 asset job 接回统一 trace | 后续评估关键段落插图,不进当前 P0 |
| FR-008 分支剧情 | Deferred | 当前状态模型不阻断未来扩展,但未实现分叉体验 | 保持 P2,Phase A 不做 |
| NFR-001 响应可接受 | Needs Measurement | 回合式体验已实现,但尚无 p95 指标采集 | 加入 ASR/TTS/turn 编排耗时埋点 |
| NFR-002 儿童内容安全 | Alpha Done | 已新增用户转写安全检查、assistant 柔性改写和 `safety_flags` 事件 | 扩充安全样本和误伤回归 |
| NFR-003 成本可观测 | Partial | generation job/provider analytics 已覆盖资产补全,voice turn 级 ASR/TTS 成本仍需细化 | 把 ASR/Dialogue/TTS 成本写入 turn/event metadata |
| NFR-004 会话可恢复 | Alpha Done | Voice Studio 支持最近会话恢复和 active session 查询 | 补刷新/切页手动验收记录 |
| NFR-005 架构可插拔 | Alpha Done | ASR 已纳入 `asr` Provider capability,默认 demo fallback,可配置 `openai_asr` | 后续补更多 ASR provider 与管理端体验 |
### Alpha Exit Criteria
- 后端测试、前端构建、管理端构建和 Docker smoke 在当前环境可重复通过。
- Voice Studio 手动路径覆盖:创建会话、文本 fallback、录音回合、低置信度确认、重说/改文本、finalize、故事库回看、资产 trace。
- 真实 ASR Provider 至少完成一个可配置适配器,并保留 demo fallback。(已接入 `openai_asr`,待真实 Key 环境验收)
- turn 级事件至少能区分 ASR、Dialogue、TTS、Safety、Confirmation、Finalize 和 Asset Generation。
- PRD、技术方案、演示 checklist 与当前实现保持一致。
### Must Have ### Must Have
- 语音发起故事 - 语音发起故事

View File

@@ -446,8 +446,8 @@ Phase A 明确不做以下内容:
理由是: 理由是:
- 当前 admin Provider 只有 `text/image/tts/storybook` - 当前 admin Provider 已扩展到 `text/image/tts/storybook/asr`
- 如果一开始把 `asr` 也并进全套管理能力,改动面会大很多 - Phase A Alpha 已把 ASR 纳入最小 Provider 能力,但仍保留 demo fallback,避免真实转写不可用时阻塞演示
### B. Dialogue Orchestrator ### B. Dialogue Orchestrator

View File

@@ -68,7 +68,7 @@
<!-- Tabs --> <!-- Tabs -->
<div class="flex space-x-1 bg-gray-100 p-1 rounded-xl w-fit"> <div class="flex space-x-1 bg-gray-100 p-1 rounded-xl w-fit">
<button <button
v-for="tab in ['text', 'image', 'tts', 'storybook']" v-for="tab in ['text', 'image', 'tts', 'storybook', 'asr']"
:key="tab" :key="tab"
@click="activeTab = tab" @click="activeTab = tab"
class="px-6 py-2 rounded-lg text-sm font-medium transition-all duration-200" class="px-6 py-2 rounded-lg text-sm font-medium transition-all duration-200"

View File

@@ -6,6 +6,7 @@ BACKEND_URL="${BACKEND_URL:-http://localhost:52000}"
ADMIN_BACKEND_URL="${ADMIN_BACKEND_URL:-http://localhost:52800}" ADMIN_BACKEND_URL="${ADMIN_BACKEND_URL:-http://localhost:52800}"
ADMIN_AUTH="${ADMIN_AUTH:-admin:admin}" ADMIN_AUTH="${ADMIN_AUTH:-admin:admin}"
SMOKE_AUDIO="${SMOKE_AUDIO:-0}" SMOKE_AUDIO="${SMOKE_AUDIO:-0}"
SMOKE_VOICE="${SMOKE_VOICE:-0}"
COOKIE_JAR="$(mktemp "${TMPDIR:-/tmp}/dreamweaver-cookie.XXXXXX")" COOKIE_JAR="$(mktemp "${TMPDIR:-/tmp}/dreamweaver-cookie.XXXXXX")"
cleanup() { cleanup() {
@@ -33,6 +34,12 @@ post_json() {
curl -fsS -b "$COOKIE_JAR" -H 'Content-Type: application/json' -d "$payload" "$url" curl -fsS -b "$COOKIE_JAR" -H 'Content-Type: application/json' -d "$payload" "$url"
} }
# Issue a cookie-authenticated curl request whose body is described by the
# caller-supplied curl options (for example one or more `-F name=value`
# multipart fields).
#
# Arguments:
#   $1    target URL
#   $2..  extra curl arguments, forwarded verbatim ahead of the URL
#
# Output: the response body on stdout. `-f` makes curl exit non-zero on HTTP
# errors, and `-sS` hides the progress meter while still printing errors.
post_form() {
  local endpoint="$1"
  # Drop the URL so that "$@" holds only the pass-through curl options.
  shift
  curl -fsS -b "$COOKIE_JAR" "$@" "$endpoint"
}
get_json() { get_json() {
local url="$1" local url="$1"
curl -fsS -b "$COOKIE_JAR" "$url" curl -fsS -b "$COOKIE_JAR" "$url"
@@ -86,7 +93,7 @@ assert_jq "$session_json" '.user.id == "github:dev_user_001"' "dev session shoul
say "Checking provider capability policy" say "Checking provider capability policy"
capabilities_json="$(curl -fsS -u "$ADMIN_AUTH" "$ADMIN_BACKEND_URL/admin/providers/capabilities")" capabilities_json="$(curl -fsS -u "$ADMIN_AUTH" "$ADMIN_BACKEND_URL/admin/providers/capabilities")"
assert_jq "$capabilities_json" 'map(.capability) | sort == ["image","storybook","text","tts"]' "capabilities should include text/image/tts/storybook" assert_jq "$capabilities_json" 'map(.capability) | sort == ["asr","image","storybook","text","tts"]' "capabilities should include text/image/tts/storybook/asr"
say "Generating text story without assets" say "Generating text story without assets"
story_json="$(post_json "$APP_URL/api/generations" '{ story_json="$(post_json "$APP_URL/api/generations" '{
@@ -149,6 +156,67 @@ else
say "Skipping audio smoke; set SMOKE_AUDIO=1 to include TTS" say "Skipping audio smoke; set SMOKE_AUDIO=1 to include TTS"
fi fi
# Opt-in Voice Studio smoke path: runs only when SMOKE_VOICE is exactly "1".
# Steps, in order: create session -> text fallback turn -> uploaded turn with
# transcript hint -> session detail + analytics -> finalize into a story.
# Each step reuses IDs captured from the previous response, so the order matters.
# NOTE(review): say / assert_jq / post_json / get_json are helpers defined
# elsewhere in this script; assert_jq presumably evaluates the jq filter against
# the JSON and aborts with the given message when it is false — confirm there.
if [[ "$SMOKE_VOICE" == "1" ]]; then
say "Creating voice co-creation session"
# An empty JSON object is posted; the assertion below expects the server
# defaults to yield a draft "story" session that can be continued.
voice_session_json="$(post_json "$APP_URL/api/voice-sessions" '{}')"
# Session id is needed to build every subsequent voice-session URL.
voice_session_id="$(jq -r '.id' <<<"$voice_session_json")"
assert_jq "$voice_session_json" '.status == "draft" and .target_mode == "story" and .can_continue == true' "voice session should be created as a resumable draft"
# Print a trimmed view of the session for the smoke log.
echo "$voice_session_json" | jq '{id,status,target_mode,current_turn_index,can_continue,can_finalize,transcription_mode_hint}'
say "Submitting voice text fallback turn"
# Text-only turn: the transcript is supplied directly as JSON, no audio upload.
voice_turn_json="$(post_json "$APP_URL/api/voice-sessions/$voice_session_id/turns/fallback" '{
"transcript_text": "我想听一个小熊和星星一起找家的故事",
"duration_ms": 1200
}')"
voice_turn_id="$(jq -r '.turn_id' <<<"$voice_turn_json")"
# Accept any non-failed status, but require a usable turn_id for the lookup below.
assert_jq "$voice_turn_json" '.status != "failed" and .turn_id != null and .turn_id != ""' "voice fallback turn should be accepted"
# Re-fetch the turn by id and check transcript, assistant reply and intent.
voice_turn_detail_json="$(get_json "$APP_URL/api/voice-sessions/$voice_session_id/turns/$voice_turn_id")"
assert_jq "$voice_turn_detail_json" '.user_transcript | contains("小熊")' "voice fallback turn should keep user transcript"
assert_jq "$voice_turn_detail_json" '.assistant_text != null and .assistant_text != ""' "voice fallback turn should return assistant text"
# The first turn of a fresh session is expected to be classified as "start".
assert_jq "$voice_turn_detail_json" '.detected_intent == "start" and .requires_confirmation == false' "first voice turn should start the story without confirmation"
echo "$voice_turn_detail_json" | jq '{id,status,detected_intent,requires_confirmation,assistant_audio_ready,assistant_text}'
say "Submitting voice uploaded turn with demo transcript hint"
# Multipart upload: /dev/null is sent as an empty "turn.webm" audio part, and
# transcript_hint carries the text that the assertions below expect the "demo"
# transcription provider to surface as the user transcript.
voice_upload_json="$(post_form "$APP_URL/api/voice-sessions/$voice_session_id/turns" \
-F 'audio_file=@/dev/null;filename=turn.webm;type=audio/webm' \
-F 'duration_ms=900' \
-F 'transcript_hint=不要让小熊害怕,让月亮姐姐帮它')"
voice_upload_turn_id="$(jq -r '.turn_id' <<<"$voice_upload_json")"
assert_jq "$voice_upload_json" '.status != "failed" and .transcription_provider == "demo"' "voice upload turn should use demo transcript hint"
voice_upload_detail_json="$(get_json "$APP_URL/api/voice-sessions/$voice_session_id/turns/$voice_upload_turn_id")"
assert_jq "$voice_upload_detail_json" '.user_transcript | contains("月亮姐姐")' "voice upload turn should expose hinted transcript"
# The hinted sentence is phrased as a correction, so intent must be "correct".
assert_jq "$voice_upload_detail_json" '.detected_intent == "correct" and .assistant_text != null' "voice upload correction should continue the narrative"
echo "$voice_upload_detail_json" | jq '{id,status,transcription_provider,detected_intent,requires_confirmation,assistant_audio_ready,assistant_text}'
say "Checking voice session detail and analytics"
# After two turns the session must report at least two turns and two events,
# and must be finalizable.
voice_detail_json="$(get_json "$APP_URL/api/voice-sessions/$voice_session_id")"
assert_jq "$voice_detail_json" '.current_turn_index >= 2 and (.recent_turns | length) >= 2 and (.events | length) >= 2 and .can_finalize == true' "voice session should include turns/events and be finalizable"
# jq's index() returns null when the value is absent, hence the != null checks.
assert_jq "$voice_detail_json" '([.events[].event_type] | index("turn_transcribed")) != null and ([.events[].event_type] | index("turn_narrative_ready")) != null' "voice session should record key turn events"
echo "$voice_detail_json" | jq '{id,status,current_turn_index,can_finalize,latest_detected_intent,events:([.events[].event_type] | unique)}'
# Analytics over a 7-day window; lower bounds only, since other sessions may
# already exist in the same environment.
voice_analytics_json="$(get_json "$APP_URL/api/voice-sessions/analytics?days=7")"
assert_jq "$voice_analytics_json" '.window_days == 7 and .total_sessions >= 1 and .total_turns >= 2 and .successful_turns >= 2' "voice analytics should include the smoke session"
echo "$voice_analytics_json" | jq '{window_days,total_sessions,total_turns,successful_turns,failed_turns,turn_success_rate,finalize_conversion_rate}'
say "Finalizing voice session into story"
# Request story persistence plus cover generation, but skip final audio.
voice_finalize_json="$(post_json "$APP_URL/api/voice-sessions/$voice_session_id/finalize" '{
"save_story": true,
"generate_cover": true,
"generate_final_audio": false
}')"
voice_story_id="$(jq -r '.story_id' <<<"$voice_finalize_json")"
assert_jq "$voice_finalize_json" '.status == "completed" and .story_id != null' "voice session should finalize into a story"
echo "$voice_finalize_json" | jq '{session_id,status,story_id,generation_job_id}'
# The finalized story is read back through the generations endpoint to confirm
# that its text is ready and generation has not failed.
voice_story_json="$(get_json "$APP_URL/api/generations/$voice_story_id")"
assert_jq "$voice_story_json" '.mode == "generated" and .generation_status != "failed" and .text_status == "ready"' "voice finalized story should be readable"
echo "$voice_story_json" | jq '{id,title,mode,generation_status,text_status,image_status,audio_status,retryable_assets}'
else
# Default path: voice smoke is skipped unless explicitly enabled.
say "Skipping voice co-creation smoke; set SMOKE_VOICE=1 to include Voice Studio Alpha"
fi
say "Generating storybook without images" say "Generating storybook without images"
storybook_json="$(post_json "$APP_URL/api/generations" '{ storybook_json="$(post_json "$APP_URL/api/generations" '{
"output_mode": "storybook", "output_mode": "storybook",