From 7c85d7a2e767b5906e9456a6a20dd5bd243f12e5 Mon Sep 17 00:00:00 2001 From: Yuyan Date: Sun, 19 Apr 2026 22:41:23 +0800 Subject: [PATCH] docs: add voice co-creation migration api draft --- docs/README.md | 3 + ...co-creation-phase-a-migration-api-draft.md | 859 ++++++++++++++++++ 2 files changed, 862 insertions(+) create mode 100644 docs/technical/voice-co-creation-phase-a-migration-api-draft.md diff --git a/docs/README.md b/docs/README.md index d726a3e..a33fc56 100644 --- a/docs/README.md +++ b/docs/README.md @@ -62,6 +62,9 @@ - `technical/voice-co-creation-phase-a-tech-spec.md` 语音共创 Phase A 技术方案。用于把增量 PRD 收敛成最小可实现的会话模型、API 草图、状态机与实现顺序。 +- `technical/voice-co-creation-phase-a-migration-api-draft.md` + 语音共创 Phase A 迁移与 API 草案。用于把 `voice_sessions` 数据迁移、Pydantic schema、路由签名和实现落点细化到可直接开工的程度。 + ## 维护规则 - 新 PRD 放到 `docs/product/` diff --git a/docs/technical/voice-co-creation-phase-a-migration-api-draft.md b/docs/technical/voice-co-creation-phase-a-migration-api-draft.md new file mode 100644 index 0000000..7e3d2f9 --- /dev/null +++ b/docs/technical/voice-co-creation-phase-a-migration-api-draft.md @@ -0,0 +1,859 @@ +# 实现草案:语音共创 Phase A 数据迁移与 API Schema + +**Version**: 0.1 +**Date**: 2026-04-19 +**Status**: Draft / Ready for implementation handoff + +--- + +## 1. 目的 + +这份文档是 [语音共创 Phase A 技术方案](./voice-co-creation-phase-a-tech-spec.md) 的下一层实现草案。 + +它的目标很明确: + +- 把数据库迁移命名、表结构和索引钉住 +- 把后端文件落点钉住 +- 把 Pydantic schema 草图钉住 +- 把 API request / response 和错误语义钉住 + +这样下一步真正写代码时,可以直接从这份草案拆成: + +1. Alembic migration +2. SQLAlchemy models +3. Pydantic schemas +4. API routes +5. Service implementation + +--- + +## 2. 
建议变更清单 + +### 2.1 新增 Alembic revision + +建议 revision 文件名: + +`backend/alembic/versions/0013_add_voice_sessions_phase_a.py` + +建议 revision metadata: + +```python +revision = "0013_add_voice_sessions_phase_a" +down_revision = "0012_story_text_status" +branch_labels = None +depends_on = None +``` + +### 2.2 建议新增后端文件 + +- `backend/app/api/voice_sessions.py` +- `backend/app/schemas/voice_session_schemas.py` +- `backend/app/services/voice_session_service.py` +- `backend/app/services/voice_session_storage.py` +- `backend/tests/test_voice_sessions.py` + +### 2.3 建议改动现有文件 + +- `backend/app/db/models.py` + 增加 `VoiceSession` / `VoiceTurn` / `VoiceSessionEvent` +- `backend/app/main.py` + 注册 voice session 路由 +- `docs/README.md` + 文档索引 + +--- + +## 3. 数据库迁移草案 + +## 3.1 新表:`voice_sessions` + +### 设计目标 + +- 承载一个语音共创会话 +- 与正式 `stories` 解耦 +- 可恢复、可收束、可排障 + +### 建议字段 + +| Column | Type | Nullable | Default | Notes | +| --- | --- | --- | --- | --- | +| `id` | `String(36)` | No | uuid | 主键 | +| `user_id` | `String(255)` | No | - | FK -> `users.id` | +| `child_profile_id` | `String(36)` | Yes | - | FK -> `child_profiles.id` | +| `universe_id` | `String(36)` | Yes | - | FK -> `story_universes.id` | +| `final_story_id` | `Integer` | Yes | - | FK -> `stories.id` | +| `target_mode` | `String(32)` | No | `"story"` | Phase A 固定 story | +| `status` | `String(32)` | No | `"draft"` | `draft/active/processing_turn/waiting_user/finalizing_story/completed/abandoned/failed` | +| `current_turn_index` | `Integer` | No | `0` | 当前轮次 | +| `working_title` | `String(255)` | Yes | - | 会话中临时标题 | +| `story_state` | `JSON` | No | `"{}"` | 中间故事状态 | +| `latest_user_transcript` | `Text` | Yes | - | 最近一轮用户转写 | +| `latest_assistant_text` | `Text` | Yes | - | 最近一轮系统文本 | +| `last_error` | `Text` | Yes | - | 最近错误 | +| `created_at` | `DateTime(timezone=True)` | No | `now()` | 创建时间 | +| `updated_at` | `DateTime(timezone=True)` | No | `now()` | 更新时间 | + +### 建议索引 + +- `ix_voice_sessions_user_id` +- 
`ix_voice_sessions_child_profile_id` +- `ix_voice_sessions_universe_id` +- `ix_voice_sessions_final_story_id` +- `ix_voice_sessions_status` +- `ix_voice_sessions_created_at` + +## 3.2 新表:`voice_turns` + +### 设计目标 + +- 记录每一轮语音输入与系统响应 +- 既能支持恢复,也能支持调试 + +### 建议字段 + +| Column | Type | Nullable | Default | Notes | +| --- | --- | --- | --- | --- | +| `id` | `String(36)` | No | uuid | 主键 | +| `session_id` | `String(36)` | No | - | FK -> `voice_sessions.id` | +| `turn_index` | `Integer` | No | - | 从 1 开始 | +| `status` | `String(32)` | No | `"received"` | `received/transcribing/intent_resolved/narrative_ready/audio_ready/failed` | +| `user_audio_path` | `String(500)` | Yes | - | 原始录音路径 | +| `user_audio_mime_type` | `String(100)` | Yes | - | 例如 `audio/webm` | +| `user_audio_duration_ms` | `Integer` | Yes | - | 客户端上报或服务端探测 | +| `user_transcript` | `Text` | Yes | - | 转写文本 | +| `transcript_confidence` | `Float` | Yes | - | ASR 置信度 | +| `detected_intent` | `String(32)` | No | `"unknown"` | `start_story/continue_story/correct_story/end_story/save_story/unknown` | +| `intent_confidence` | `Float` | Yes | - | 意图识别置信度 | +| `story_patch` | `JSON` | No | `"{}"` | 本轮对故事状态的 patch | +| `assistant_text` | `Text` | Yes | - | 系统文本回应 | +| `assistant_audio_path` | `String(500)` | Yes | - | 系统音频路径 | +| `assistant_audio_duration_ms` | `Integer` | Yes | - | 系统音频长度 | +| `error_message` | `Text` | Yes | - | 本轮错误 | +| `created_at` | `DateTime(timezone=True)` | No | `now()` | 创建时间 | +| `updated_at` | `DateTime(timezone=True)` | No | `now()` | 更新时间 | + +### 约束与索引建议 + +- Unique constraint: + - `uq_voice_turn_session_turn_index` on `("session_id", "turn_index")` +- Indexes: + - `ix_voice_turns_session_id` + - `ix_voice_turns_status` + - `ix_voice_turns_created_at` + +## 3.3 新表:`voice_session_events` + +### 设计目标 + +- 追加式记录会话层事件 +- 不和 `generation_job_events` 混表 + +### 建议字段 + +| Column | Type | Nullable | Default | Notes | +| --- | --- | --- | --- | --- | +| `id` | `Integer` | No | autoincrement | 主键 | 
+| `session_id` | `String(36)` | No | - | FK -> `voice_sessions.id` | +| `turn_id` | `String(36)` | Yes | - | FK -> `voice_turns.id` | +| `event_type` | `String(64)` | No | - | 见后文事件建议 | +| `status` | `String(32)` | No | - | `received/succeeded/failed/info` 等 | +| `message` | `Text` | Yes | - | 用户可读或日志信息 | +| `event_metadata` | `JSON` | No | `"{}"` | 附加信息 | +| `created_at` | `DateTime(timezone=True)` | No | `now()` | 创建时间 | + +### 建议索引 + +- `ix_voice_session_events_session_id` +- `ix_voice_session_events_turn_id` +- `ix_voice_session_events_created_at` + +--- + +## 4. Alembic 迁移草案 + +下面这段不是最终可直接执行的生产代码,但已经足够接近真实 migration: + +```python +"""add voice co-creation phase a tables + +Revision ID: 0013_add_voice_sessions_phase_a +Revises: 0012_story_text_status +Create Date: 2026-04-19 + +""" + +import sqlalchemy as sa +from alembic import op + +revision = "0013_add_voice_sessions_phase_a" +down_revision = "0012_story_text_status" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "voice_sessions", + sa.Column("id", sa.String(length=36), nullable=False), + sa.Column("user_id", sa.String(length=255), nullable=False), + sa.Column("child_profile_id", sa.String(length=36), nullable=True), + sa.Column("universe_id", sa.String(length=36), nullable=True), + sa.Column("final_story_id", sa.Integer(), nullable=True), + sa.Column("target_mode", sa.String(length=32), nullable=False, server_default="story"), + sa.Column("status", sa.String(length=32), nullable=False, server_default="draft"), + sa.Column("current_turn_index", sa.Integer(), nullable=False, server_default="0"), + sa.Column("working_title", sa.String(length=255), nullable=True), + sa.Column("story_state", sa.JSON(), nullable=False, server_default="{}"), + sa.Column("latest_user_transcript", sa.Text(), nullable=True), + sa.Column("latest_assistant_text", sa.Text(), nullable=True), + sa.Column("last_error", sa.Text(), nullable=True), + sa.Column("created_at", 
sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.ForeignKeyConstraint(["user_id"], ["users.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["child_profile_id"], ["child_profiles.id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint(["universe_id"], ["story_universes.id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint(["final_story_id"], ["stories.id"], ondelete="SET NULL"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_voice_sessions_user_id", "voice_sessions", ["user_id"]) + op.create_index("ix_voice_sessions_child_profile_id", "voice_sessions", ["child_profile_id"]) + op.create_index("ix_voice_sessions_universe_id", "voice_sessions", ["universe_id"]) + op.create_index("ix_voice_sessions_final_story_id", "voice_sessions", ["final_story_id"]) + op.create_index("ix_voice_sessions_status", "voice_sessions", ["status"]) + op.create_index("ix_voice_sessions_created_at", "voice_sessions", ["created_at"]) + + op.create_table( + "voice_turns", + sa.Column("id", sa.String(length=36), nullable=False), + sa.Column("session_id", sa.String(length=36), nullable=False), + sa.Column("turn_index", sa.Integer(), nullable=False), + sa.Column("status", sa.String(length=32), nullable=False, server_default="received"), + sa.Column("user_audio_path", sa.String(length=500), nullable=True), + sa.Column("user_audio_mime_type", sa.String(length=100), nullable=True), + sa.Column("user_audio_duration_ms", sa.Integer(), nullable=True), + sa.Column("user_transcript", sa.Text(), nullable=True), + sa.Column("transcript_confidence", sa.Float(), nullable=True), + sa.Column("detected_intent", sa.String(length=32), nullable=False, server_default="unknown"), + sa.Column("intent_confidence", sa.Float(), nullable=True), + sa.Column("story_patch", sa.JSON(), nullable=False, server_default="{}"), + sa.Column("assistant_text", sa.Text(), nullable=True), + sa.Column("assistant_audio_path", 
sa.String(length=500), nullable=True), + sa.Column("assistant_audio_duration_ms", sa.Integer(), nullable=True), + sa.Column("error_message", sa.Text(), nullable=True), + sa.Column("created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.ForeignKeyConstraint(["session_id"], ["voice_sessions.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("session_id", "turn_index", name="uq_voice_turn_session_turn_index"), + ) + op.create_index("ix_voice_turns_session_id", "voice_turns", ["session_id"]) + op.create_index("ix_voice_turns_status", "voice_turns", ["status"]) + op.create_index("ix_voice_turns_created_at", "voice_turns", ["created_at"]) + + op.create_table( + "voice_session_events", + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column("session_id", sa.String(length=36), nullable=False), + sa.Column("turn_id", sa.String(length=36), nullable=True), + sa.Column("event_type", sa.String(length=64), nullable=False), + sa.Column("status", sa.String(length=32), nullable=False), + sa.Column("message", sa.Text(), nullable=True), + sa.Column("event_metadata", sa.JSON(), nullable=False, server_default="{}"), + sa.Column("created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.ForeignKeyConstraint(["session_id"], ["voice_sessions.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["turn_id"], ["voice_turns.id"], ondelete="SET NULL"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + "ix_voice_session_events_session_id", + "voice_session_events", + ["session_id"], + ) + op.create_index( + "ix_voice_session_events_turn_id", + "voice_session_events", + ["turn_id"], + ) + op.create_index( + "ix_voice_session_events_created_at", + "voice_session_events", + ["created_at"], + ) + + +def downgrade() -> None: + op.drop_index("ix_voice_session_events_created_at", table_name="voice_session_events") 
+ op.drop_index("ix_voice_session_events_turn_id", table_name="voice_session_events") + op.drop_index("ix_voice_session_events_session_id", table_name="voice_session_events") + op.drop_table("voice_session_events") + + op.drop_index("ix_voice_turns_created_at", table_name="voice_turns") + op.drop_index("ix_voice_turns_status", table_name="voice_turns") + op.drop_index("ix_voice_turns_session_id", table_name="voice_turns") + op.drop_table("voice_turns") + + op.drop_index("ix_voice_sessions_created_at", table_name="voice_sessions") + op.drop_index("ix_voice_sessions_status", table_name="voice_sessions") + op.drop_index("ix_voice_sessions_final_story_id", table_name="voice_sessions") + op.drop_index("ix_voice_sessions_universe_id", table_name="voice_sessions") + op.drop_index("ix_voice_sessions_child_profile_id", table_name="voice_sessions") + op.drop_index("ix_voice_sessions_user_id", table_name="voice_sessions") + op.drop_table("voice_sessions") +``` + +--- + +## 5. SQLAlchemy Model 草图 + +建议落在 `backend/app/db/models.py`,风格对齐现有 `GenerationJob`: + +```python +class VoiceSession(Base): + __tablename__ = "voice_sessions" + + id: Mapped[str] = mapped_column(String(36), primary_key=True, default=_uuid) + user_id: Mapped[str] = mapped_column( + String(255), ForeignKey("users.id", ondelete="CASCADE"), nullable=False, index=True + ) + child_profile_id: Mapped[str | None] = mapped_column( + String(36), ForeignKey("child_profiles.id", ondelete="SET NULL"), nullable=True, index=True + ) + universe_id: Mapped[str | None] = mapped_column( + String(36), ForeignKey("story_universes.id", ondelete="SET NULL"), nullable=True, index=True + ) + final_story_id: Mapped[int | None] = mapped_column( + Integer, ForeignKey("stories.id", ondelete="SET NULL"), nullable=True, index=True + ) + target_mode: Mapped[str] = mapped_column(String(32), nullable=False, default="story") + status: Mapped[str] = mapped_column(String(32), nullable=False, default="draft", index=True) + current_turn_index: 
Mapped[int] = mapped_column(Integer, nullable=False, default=0) + working_title: Mapped[str | None] = mapped_column(String(255), nullable=True) + story_state: Mapped[dict] = mapped_column(JSON, default=dict) + latest_user_transcript: Mapped[str | None] = mapped_column(Text, nullable=True) + latest_assistant_text: Mapped[str | None] = mapped_column(Text, nullable=True) + last_error: Mapped[str | None] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), index=True + ) + updated_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), onupdate=func.now() + ) + + +class VoiceTurn(Base): + __tablename__ = "voice_turns" + __table_args__ = ( + UniqueConstraint("session_id", "turn_index", name="uq_voice_turn_session_turn_index"), + ) + + id: Mapped[str] = mapped_column(String(36), primary_key=True, default=_uuid) + session_id: Mapped[str] = mapped_column( + String(36), ForeignKey("voice_sessions.id", ondelete="CASCADE"), nullable=False, index=True + ) + turn_index: Mapped[int] = mapped_column(Integer, nullable=False) + status: Mapped[str] = mapped_column(String(32), nullable=False, default="received", index=True) + user_audio_path: Mapped[str | None] = mapped_column(String(500), nullable=True) + user_audio_mime_type: Mapped[str | None] = mapped_column(String(100), nullable=True) + user_audio_duration_ms: Mapped[int | None] = mapped_column(Integer, nullable=True) + user_transcript: Mapped[str | None] = mapped_column(Text, nullable=True) + transcript_confidence: Mapped[float | None] = mapped_column(Float, nullable=True) + detected_intent: Mapped[str] = mapped_column(String(32), nullable=False, default="unknown") + intent_confidence: Mapped[float | None] = mapped_column(Float, nullable=True) + story_patch: Mapped[dict] = mapped_column(JSON, default=dict) + assistant_text: Mapped[str | None] = mapped_column(Text, nullable=True) + assistant_audio_path: 
Mapped[str | None] = mapped_column(String(500), nullable=True) + assistant_audio_duration_ms: Mapped[int | None] = mapped_column(Integer, nullable=True) + error_message: Mapped[str | None] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), index=True + ) + updated_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), onupdate=func.now() + ) + + +class VoiceSessionEvent(Base): + __tablename__ = "voice_session_events" + + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) + session_id: Mapped[str] = mapped_column( + String(36), ForeignKey("voice_sessions.id", ondelete="CASCADE"), nullable=False, index=True + ) + turn_id: Mapped[str | None] = mapped_column( + String(36), ForeignKey("voice_turns.id", ondelete="SET NULL"), nullable=True, index=True + ) + event_type: Mapped[str] = mapped_column(String(64), nullable=False) + status: Mapped[str] = mapped_column(String(32), nullable=False) + message: Mapped[str | None] = mapped_column(Text, nullable=True) + event_metadata: Mapped[dict] = mapped_column(JSON, default=dict) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), index=True + ) +``` + +--- + +## 6. 
Pydantic Schema 草图 + +建议新文件:`backend/app/schemas/voice_session_schemas.py` + +## 6.1 常量建议 + +```python +MAX_VOICE_TRANSCRIPT_LENGTH = 1000 +SUPPORTED_VOICE_TARGET_MODES = ("story",) +MAX_TURN_DURATION_MS = 90_000 +``` + +## 6.2 Request Schemas + +```python +from datetime import datetime +from typing import Any, Literal + +from pydantic import BaseModel, Field + + +class VoiceSessionCreateRequest(BaseModel): + child_profile_id: str | None = None + universe_id: str | None = None + target_mode: Literal["story"] = Field(default="story") + + +class VoiceTurnCreateFallbackRequest(BaseModel): + transcript_text: str = Field(..., min_length=1, max_length=MAX_VOICE_TRANSCRIPT_LENGTH) + duration_ms: int | None = Field(default=None, ge=1, le=MAX_TURN_DURATION_MS) + + +class VoiceSessionFinalizeRequest(BaseModel): + save_story: bool = True + generate_cover: bool = True + generate_final_audio: bool = False + + +class VoiceSessionAbandonRequest(BaseModel): + reason: str | None = Field(default=None, max_length=200) +``` + +## 6.3 Response Schemas + +```python +class VoiceSessionEventResponse(BaseModel): + id: int + session_id: str + turn_id: str | None = None + event_type: str + status: str + message: str | None = None + event_metadata: dict[str, Any] = Field(default_factory=dict) + created_at: datetime + + +class VoiceTurnSummaryResponse(BaseModel): + id: str + session_id: str + turn_index: int + status: str + user_transcript: str | None = None + transcript_confidence: float | None = None + detected_intent: str + intent_confidence: float | None = None + assistant_text: str | None = None + assistant_audio_ready: bool = False + assistant_audio_url: str | None = None + error_message: str | None = None + created_at: datetime + updated_at: datetime + + +class VoiceSessionSummaryResponse(BaseModel): + id: str + child_profile_id: str | None = None + universe_id: str | None = None + final_story_id: int | None = None + target_mode: str + status: str + current_turn_index: int + working_title: str | None = None + story_state: 
dict[str, Any] = Field(default_factory=dict) + latest_user_transcript: str | None = None + latest_assistant_text: str | None = None + can_continue: bool = False + can_finalize: bool = False + last_error: str | None = None + created_at: datetime + updated_at: datetime + + +class VoiceSessionDetailResponse(VoiceSessionSummaryResponse): + recent_turns: list[VoiceTurnSummaryResponse] = Field(default_factory=list) + events: list[VoiceSessionEventResponse] = Field(default_factory=list) + + +class VoiceTurnAcceptedResponse(BaseModel): + turn_id: str + session_id: str + status: str + + +class VoiceSessionFinalizeResponse(BaseModel): + session_id: str + status: str + story_id: int | None = None + generation_job_id: str | None = None +``` + +--- + +## 7. 路由草图 + +建议新文件:`backend/app/api/voice_sessions.py` + +## 7.1 路由清单 + +### 创建会话 + +```python +@router.post("/voice-sessions", response_model=VoiceSessionSummaryResponse, status_code=201) +async def create_voice_session(...) +``` + +### 获取会话详情 + +```python +@router.get("/voice-sessions/{session_id}", response_model=VoiceSessionDetailResponse) +async def get_voice_session(...) +``` + +### 提交一轮语音 + +首版建议主接口使用 `multipart/form-data`: + +```python +@router.post( + "/voice-sessions/{session_id}/turns", + response_model=VoiceTurnAcceptedResponse, + status_code=202, +) +async def create_voice_turn( + session_id: str, + audio_file: UploadFile = File(...), + duration_ms: int | None = Form(default=None), + user: User = Depends(require_user), + db: AsyncSession = Depends(get_db), +): + ... +``` + +### 提交一轮文本 fallback + +为了开发期调试、桌面浏览器兼容和测试稳定性,建议同步提供: + +```python +@router.post( + "/voice-sessions/{session_id}/turns/fallback", + response_model=VoiceTurnAcceptedResponse, + status_code=202, +) +async def create_voice_turn_from_text(...) +``` + +### 获取一轮结果 + +```python +@router.get( + "/voice-sessions/{session_id}/turns/{turn_id}", + response_model=VoiceTurnSummaryResponse, +) +async def get_voice_turn(...) 
+``` + +### 结束并保存 + +```python +@router.post( + "/voice-sessions/{session_id}/finalize", + response_model=VoiceSessionFinalizeResponse, +) +async def finalize_voice_session(...) +``` + +### 放弃会话 + +```python +@router.post("/voice-sessions/{session_id}/abandon", response_model=VoiceSessionSummaryResponse) +async def abandon_voice_session(...) +``` + +--- + +## 8. API 行为语义 + +## 8.1 `POST /api/voice-sessions` + +### Request + +```json +{ + "child_profile_id": "profile-id", + "universe_id": "universe-id", + "target_mode": "story" +} +``` + +### Response + +```json +{ + "id": "session-id", + "child_profile_id": "profile-id", + "universe_id": "universe-id", + "final_story_id": null, + "target_mode": "story", + "status": "draft", + "current_turn_index": 0, + "working_title": null, + "story_state": {}, + "latest_user_transcript": null, + "latest_assistant_text": null, + "can_continue": true, + "can_finalize": false, + "last_error": null, + "created_at": "2026-04-19T12:00:00Z", + "updated_at": "2026-04-19T12:00:00Z" +} +``` + +## 8.2 `POST /api/voice-sessions/{session_id}/turns` + +### Request + +`multipart/form-data` + +- `audio_file` +- `duration_ms`(可选) + +### Response + +```json +{ + "turn_id": "turn-id", + "session_id": "session-id", + "status": "received" +} +``` + +说明: + +- 这一步只表示本轮已被接收 +- 前端需继续轮询 `GET /api/voice-sessions/{session_id}/turns/{turn_id}` + +## 8.3 `GET /api/voice-sessions/{session_id}/turns/{turn_id}` + +### Response + +```json +{ + "id": "turn-id", + "session_id": "session-id", + "turn_index": 2, + "status": "audio_ready", + "user_transcript": "不要让它哭了,给它一个朋友", + "transcript_confidence": 0.91, + "detected_intent": "correct_story", + "intent_confidence": 0.87, + "assistant_text": "小猫擦了擦眼泪,这时月亮后面飞来了一位会发光的小伙伴。", + "assistant_audio_ready": true, + "assistant_audio_url": "/static/voice-sessions/session-id/turn-002-assistant.mp3", + "error_message": null, + "created_at": "2026-04-19T12:01:00Z", + "updated_at": "2026-04-19T12:01:04Z" +} +``` + +## 8.4 `POST 
/api/voice-sessions/{session_id}/finalize` + +### Request + +```json +{ + "save_story": true, + "generate_cover": true, + "generate_final_audio": false +} +``` + +### Response + +```json +{ + "session_id": "session-id", + "status": "completed", + "story_id": 123, + "generation_job_id": "optional-asset-job-id" +} +``` + +说明: + +- `story_id` 是正式沉淀结果 +- 如果 finalize 后还触发了封面等资产补全,可返回 `generation_job_id` + +--- + +## 9. Service 方法草图 + +建议新文件:`backend/app/services/voice_session_service.py` + +建议至少包含这些入口: + +```python +async def create_voice_session_service(...) +async def get_voice_session_detail_service(...) +async def create_voice_turn_service(...) +async def create_voice_turn_from_text_service(...) +async def get_voice_turn_service(...) +async def finalize_voice_session_service(...) +async def abandon_voice_session_service(...) +``` + +### 推荐内部 helper + +```python +async def _store_user_audio(...) +async def _transcribe_voice_turn(...) +async def _resolve_turn_intent(...) +async def _apply_story_patch(...) +async def _generate_assistant_turn(...) +async def _synthesize_assistant_audio(...) +async def _persist_session_event(...) +async def _finalize_session_to_story(...) +``` + +--- + +## 10. 错误语义建议 + +### 404 + +- session 不存在 +- turn 不存在 + +### 409 + +- session 当前状态不允许继续提交 turn +- session 已经 completed / abandoned +- finalize 重复提交 + +### 422 + +- 音频文件缺失 +- transcript fallback 为空 +- `target_mode` 非 Phase A 支持值 + +### 503 + +- ASR provider 临时不可用 +- TTS provider 临时不可用且无法降级 + +### 降级语义 + +- ASR 失败:本轮失败,可重试 +- 意图解析失败:本轮标记 `unknown`,前端提示重说 +- TTS 失败但文本成功:turn 状态停在 `narrative_ready`,不让整个 session 失败 + +--- + +## 11. 
事件建议 + +建议 `voice_session_events.event_type` 首版支持: + +- `session_created` +- `turn_received` +- `turn_transcribing` +- `turn_transcribed` +- `intent_resolved` +- `story_patch_applied` +- `assistant_text_ready` +- `assistant_audio_ready` +- `assistant_audio_failed` +- `session_finalizing` +- `session_saved_as_story` +- `session_abandoned` +- `session_failed` + +--- + +## 12. 文件存储草案 + +建议目录: + +`storage/voice_sessions/{session_id}/` + +### 文件命名 + +- `turn-001-user.webm` +- `turn-001-assistant.mp3` +- `turn-002-user.webm` +- `turn-002-assistant.mp3` + +### 建议单独封装 + +`backend/app/services/voice_session_storage.py` + +建议方法: + +```python +def session_storage_dir(session_id: str) -> Path +def build_turn_user_audio_path(session_id: str, turn_index: int, suffix: str) -> Path +def build_turn_assistant_audio_path(session_id: str, turn_index: int) -> Path +``` + +--- + +## 13. 最小实现顺序 + +### 第 1 步 + +- Alembic migration +- SQLAlchemy models + +### 第 2 步 + +- `voice_session_schemas.py` +- `voice_sessions.py` 路由骨架 + +### 第 3 步 + +- 文本 fallback 路由先通 +- 不依赖真实音频,也能先走完整 session 流程 + +### 第 4 步 + +- 接入真实音频上传 +- 接入 ASR +- 接入 TTS + +### 第 5 步 + +- finalize -> Story +- 复用现有故事库链路 + +这个顺序的好处是: + +- 先打通状态流 +- 再接真实语音 +- 风险分层最清楚 + +--- + +## 14. 当前最值得继续的下一步 + +如果我们要把这份草案继续往前推成真正可编码状态,最合理的下一步不是直接铺开所有实现,而是: + +1. 先把 migration 和 SQLAlchemy model skeleton 真正写出来 +2. 再把 `voice_session_schemas.py` 和 `voice_sessions.py` 的空实现搭起来 +3. 先用文本 fallback 跑通整条链路 +4. 最后再接真实录音和 ASR + +这会比“先做浏览器录音再补后端状态”稳得多。