feat: add voice session confirmation guardrails

2026-04-20 12:29:14 +08:00
parent 4ecf0c09c0
commit dbb512719d
8 changed files with 406 additions and 50 deletions
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -82,6 +82,14 @@ class Settings(BaseSettings):
        "zh",
        description="Preferred language hint for voice transcription",
    )
    voice_turn_low_transcript_confidence: float = Field(
        0.65,
        description="Prompt for confirmation when transcript confidence falls below this threshold",
    )
    voice_turn_low_intent_confidence: float = Field(
        0.70,
        description="Prompt for confirmation when intent confidence falls below this threshold",
    )
    voice_turn_max_upload_bytes: int = Field(
        5 * 1024 * 1024,
        description="Maximum accepted upload size in bytes for one voice turn audio file",
--- a/backend/app/schemas/voice_session_schemas.py
+++ b/backend/app/schemas/voice_session_schemas.py
@@ -73,6 +73,10 @@ class VoiceTurnSummaryResponse(BaseModel):
    transcription_provider: str | None = None
    detected_intent: str
    intent_confidence: float | None = None
    understanding_summary: str | None = None
    requires_confirmation: bool = False
    confirmation_reason: str | None = None
    confirmation_message: str | None = None
    assistant_text: str | None = None
    assistant_audio_ready: bool = False
    assistant_audio_url: str | None = None
@@ -99,6 +103,9 @@ class VoiceSessionSummaryResponse(BaseModel):
    latest_user_transcript: str | None = None
    latest_assistant_text: str | None = None
    latest_detected_intent: str | None = None
    latest_understanding_summary: str | None = None
    latest_requires_confirmation: bool = False
    latest_confirmation_message: str | None = None
    latest_assistant_audio_ready: bool = False
    last_turn_status: str | None = None
    transcription_mode_hint: str | None = None
--- a/backend/app/services/voice_session_service.py
+++ b/backend/app/services/voice_session_service.py
@@ -79,8 +79,129 @@ def _user_audio_url(session_id: str, turn_id: str, audio_path: str | None) -> st
    return f"/api/voice-sessions/{session_id}/turns/{turn_id}/user-audio"
 def _format_intent_label(intent: str | None) -> str:
    labels = {
        "start_story": "开启故事",
        "continue_story": "继续讲述",
        "correct_story": "修正走向",
        "end_story": "先停在这里",
        "save_story": "保存当前故事",
        "unknown": "待确认",
    }
    return labels.get(intent or "", "待确认")
 def _build_understanding_summary(
    *,
    transcript_text: str | None,
    detected_intent: str,
 ) -> str | None:
    normalized_transcript = (transcript_text or "").strip()
    if detected_intent == "unknown":
        if normalized_transcript:
            return f"本轮系统暂时还没完全理解：{normalized_transcript}"
        return "本轮系统暂时还没完全理解孩子刚才的表达。"
    if normalized_transcript:
        return f"本轮系统理解为「{_format_intent_label(detected_intent)}」：{normalized_transcript}"
    return f"本轮系统理解为「{_format_intent_label(detected_intent)}」"
 def _build_confirmation_message(
    *,
    transcript_text: str | None,
    detected_intent: str,
    reasons: list[str],
 ) -> str:
    natural_understanding = ""
    normalized_transcript = (transcript_text or "").strip()
    if detected_intent != "unknown":
        if normalized_transcript:
            natural_understanding = (
                f"我现在先理解成你想「{_format_intent_label(detected_intent)}」："
                f"{normalized_transcript}。"
            )
        else:
            natural_understanding = f"我现在先理解成你想「{_format_intent_label(detected_intent)}」。"
    if "intent_unknown" in reasons:
        prefix = "我这一次还没有完全听懂。"
    elif {
        "low_transcript_confidence",
        "low_intent_confidence",
    }.issubset(set(reasons)):
        prefix = "我这一次听得还不够清楚，也不太确定该怎么接剧情。"
    elif "low_transcript_confidence" in reasons:
        prefix = "我这一次可能没有完全听清。"
    else:
        prefix = "我这一次还不太确定你是想继续讲，还是想改一下剧情。"
    return (
        f"{prefix}{natural_understanding}"
        "请家长帮忙确认一下；如果不对，可以换一种说法再说一遍，我们再继续编下去。"
    )
 def _resolve_turn_confirmation_state(
    *,
    transcript_text: str | None,
    transcript_confidence: float | None,
    detected_intent: str,
    intent_confidence: float | None,
    story_patch: dict[str, Any] | None = None,
 ) -> dict[str, Any]:
    patch = story_patch or {}
    requires_confirmation = patch.get("requires_confirmation")
    confirmation_reason = patch.get("confirmation_reason")
    confirmation_message = patch.get("confirmation_message")
    understanding_summary = patch.get("understanding_summary")
    reasons: list[str] = []
    if detected_intent == "unknown":
        reasons.append("intent_unknown")
    if (
        transcript_confidence is not None
        and transcript_confidence < settings.voice_turn_low_transcript_confidence
    ):
        reasons.append("low_transcript_confidence")
    if (
        intent_confidence is not None
        and intent_confidence < settings.voice_turn_low_intent_confidence
    ):
        reasons.append("low_intent_confidence")
    if requires_confirmation is None:
        requires_confirmation = bool(reasons)
    if confirmation_reason is None and reasons:
        confirmation_reason = ",".join(reasons)
    if understanding_summary is None:
        understanding_summary = _build_understanding_summary(
            transcript_text=transcript_text,
            detected_intent=detected_intent,
        )
    if confirmation_message is None and requires_confirmation:
        confirmation_message = _build_confirmation_message(
            transcript_text=transcript_text,
            detected_intent=detected_intent,
            reasons=reasons or ["intent_unknown"],
        )
    return {
        "understanding_summary": understanding_summary,
        "requires_confirmation": bool(requires_confirmation),
        "confirmation_reason": confirmation_reason,
        "confirmation_message": confirmation_message,
    }
 def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
    turn_patch = turn.story_patch or {}
    confirmation_state = _resolve_turn_confirmation_state(
        transcript_text=turn.user_transcript,
        transcript_confidence=turn.transcript_confidence,
        detected_intent=turn.detected_intent,
        intent_confidence=turn.intent_confidence,
        story_patch=turn_patch,
    )
    return VoiceTurnSummaryResponse(
        id=turn.id,
        session_id=turn.session_id,
@@ -91,6 +212,10 @@ def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
        transcription_provider=turn_patch.get("transcription_provider"),
        detected_intent=turn.detected_intent,
        intent_confidence=turn.intent_confidence,
        understanding_summary=confirmation_state["understanding_summary"],
        requires_confirmation=confirmation_state["requires_confirmation"],
        confirmation_reason=confirmation_state["confirmation_reason"],
        confirmation_message=confirmation_state["confirmation_message"],
        assistant_text=turn.assistant_text,
        assistant_audio_ready=session_audio_exists(turn.assistant_audio_path),
        assistant_audio_url=_assistant_audio_url(
@@ -114,8 +239,20 @@ def _session_to_summary(
 ) -> VoiceSessionSummaryResponse:
    if latest_turn is None:
        total_turns = total_turns if total_turns is not None else session.current_turn_index
        latest_confirmation_state = {
            "understanding_summary": None,
            "requires_confirmation": False,
            "confirmation_message": None,
        }
    else:
        total_turns = total_turns if total_turns is not None else latest_turn.turn_index
        latest_confirmation_state = _resolve_turn_confirmation_state(
            transcript_text=latest_turn.user_transcript,
            transcript_confidence=latest_turn.transcript_confidence,
            detected_intent=latest_turn.detected_intent,
            intent_confidence=latest_turn.intent_confidence,
            story_patch=latest_turn.story_patch or {},
        )
    return VoiceSessionSummaryResponse(
        id=session.id,
@@ -131,6 +268,9 @@ def _session_to_summary(
        latest_user_transcript=session.latest_user_transcript,
        latest_assistant_text=session.latest_assistant_text,
        latest_detected_intent=latest_turn.detected_intent if latest_turn else None,
        latest_understanding_summary=latest_confirmation_state["understanding_summary"],
        latest_requires_confirmation=latest_confirmation_state["requires_confirmation"],
        latest_confirmation_message=latest_confirmation_state["confirmation_message"],
        latest_assistant_audio_ready=(
            session_audio_exists(latest_turn.assistant_audio_path) if latest_turn else False
        ),
@@ -230,11 +370,13 @@ def _detect_intent(
    normalized = transcript_text.replace(" ", "")
    if any(keyword in normalized for keyword in ("保存故事", "存起来", "保存吧", "保存到故事库")):
-        return "save_story", 0.95
+        return "save_story", 0.96
    if any(keyword in normalized for keyword in ("先到这里", "讲完了", "结束吧", "停在这里")):
-        return "end_story", 0.88
+        return "end_story", 0.90
    if len(normalized) <= 1 or normalized in {"嗯", "啊", "呃", "额", "这个", "那个", "不知道"}:
        return "unknown", 0.25
    if current_turn_index == 0:
-        return "start_story", 0.82
+        return "start_story", 0.84
    if any(
        keyword in normalized
        for keyword in (
@@ -248,8 +390,8 @@ def _detect_intent(
            "其实",
        )
    ):
-        return "correct_story", 0.76
+        return "correct_story", 0.78
-    return "continue_story", 0.68
+    return "continue_story", 0.74
 def _recent_story_text(session: VoiceSession) -> str:
@@ -451,6 +593,13 @@ async def _process_pending_turn(
    assistant_result: StoryOutput | None = None
    detected_intent = turn.detected_intent
    intent_confidence = turn.intent_confidence
    confirmation_state = _resolve_turn_confirmation_state(
        transcript_text=transcript_text,
        transcript_confidence=turn.transcript_confidence,
        detected_intent=detected_intent,
        intent_confidence=intent_confidence,
        story_patch=turn.story_patch or {},
    )
    try:
        await _record_session_event(
@@ -466,7 +615,61 @@ async def _process_pending_turn(
            },
        )
-        if detected_intent == "save_story":
+        if confirmation_state["requires_confirmation"]:
            current_state = _default_story_state() | (session.story_state or {})
            assistant_text = confirmation_state["confirmation_message"]
            turn.story_patch = {
                **(turn.story_patch or {}),
                "intent": detected_intent,
                "transcript_text": transcript_text,
                "segment_added": False,
                "working_title": session.working_title,
                "cover_prompt": current_state.get("cover_prompt"),
                "narrative_segments_count": len(
                    list(current_state.get("narrative_segments") or [])
                ),
                "requires_confirmation": True,
                "confirmation_reason": confirmation_state["confirmation_reason"],
                "confirmation_message": confirmation_state["confirmation_message"],
                "understanding_summary": confirmation_state["understanding_summary"],
            }
            turn.assistant_text = assistant_text
            turn.status = "narrative_ready"
            session.latest_assistant_text = assistant_text
            session.status = "waiting_user"
            session.updated_at = _utcnow()
            await db.commit()
            await db.refresh(session)
            await db.refresh(turn)
            await _record_session_event(
                db,
                session_id=session.id,
                turn_id=turn.id,
                event_type="turn_confirmation_requested",
                status="needs_confirmation",
                message="Voice turn needs parent confirmation before the story continues.",
                metadata={
                    "detected_intent": detected_intent,
                    "transcript_confidence": turn.transcript_confidence,
                    "intent_confidence": intent_confidence,
                    "confirmation_reason": confirmation_state["confirmation_reason"],
                },
            )
            await _record_session_event(
                db,
                session_id=session.id,
                turn_id=turn.id,
                event_type="assistant_text_ready",
                status="succeeded",
                message="Assistant clarification prompt generated.",
                metadata={
                    "assistant_text_length": len(assistant_text or ""),
                    "working_title": session.working_title,
                    "requires_confirmation": True,
                },
            )
        elif detected_intent == "save_story":
            assistant_text = "好的，这个故事已经准备好保存到故事库了。"
        elif detected_intent == "end_story":
            assistant_text = "好的，我们先把故事停在这里。想保存的话，现在就可以保存到故事库。"
@@ -479,6 +682,7 @@ async def _process_pending_turn(
            )
            assistant_text = assistant_result.story_text.strip()
        if not confirmation_state["requires_confirmation"]:
            merged_state, story_patch = _merge_story_state(
                session,
                transcript_text=transcript_text,
@@ -520,6 +724,7 @@ async def _process_pending_turn(
                metadata={
                    "assistant_text_length": len(assistant_text or ""),
                    "working_title": session.working_title,
                    "requires_confirmation": False,
                },
            )
    except Exception as exc:
--- a/backend/tests/test_voice_sessions.py
+++ b/backend/tests/test_voice_sessions.py
@@ -6,6 +6,7 @@ from app.core.config import settings
 from app.db.database import get_db
 from app.main import app
 from app.services.adapters.text.models import StoryOutput
 from app.services.voice_transcription_service import VoiceTranscriptionResult
 async def test_voice_session_create_and_fallback_turn_returns_audio(
@@ -272,6 +273,82 @@ async def test_voice_session_uploaded_audio_turn_uses_demo_transcript_hint(
            app.dependency_overrides.clear()
 async def test_voice_session_low_confidence_turn_requests_confirmation(
    db_session,
    auth_token,
 ):
    """An uploaded turn with a low-confidence transcript asks for confirmation.

    Transcription is patched to return confidence 0.41, which is below the
    0.65 default of ``voice_turn_low_transcript_confidence``. The test then
    checks that the turn and session summaries carry the confirmation
    fields, that no narrative segment was recorded, and that story
    generation was never awaited.
    """
    async def override_get_db():
        yield db_session
    # Route the app's DB dependency to the test session for this test only.
    app.dependency_overrides[get_db] = override_get_db
    with (
        patch(
            "app.services.voice_session_service.generate_story_content",
            new_callable=AsyncMock,
        ) as mock_generate,
        patch(
            "app.services.voice_session_service.text_to_speech",
            new_callable=AsyncMock,
        ) as mock_tts,
        patch(
            "app.services.voice_session_service.transcribe_voice_audio",
            new_callable=AsyncMock,
        ) as mock_transcribe,
    ):
        mock_tts.return_value = b"confirmation-audio"
        # Low transcript confidence is the trigger under test.
        mock_transcribe.return_value = VoiceTranscriptionResult(
            transcript_text="我想听一个会发光的小恐龙故事",
            confidence=0.41,
            provider="openai",
        )
        transport = ASGITransport(app=app)
        try:
            async with AsyncClient(transport=transport, base_url="http://test") as client:
                client.cookies.set("access_token", auth_token)
                # Create a session, then submit one audio turn to it.
                response = await client.post("/api/voice-sessions", json={})
                assert response.status_code == 201
                session_id = response.json()["id"]
                response = await client.post(
                    f"/api/voice-sessions/{session_id}/turns",
                    files={
                        "audio_file": ("turn.webm", b"fake-webm-audio", "audio/webm"),
                    },
                )
                assert response.status_code == 202
                turn_id = response.json()["turn_id"]
                # Turn-level view: confirmation fields are populated and the
                # assistant reply is the confirmation prompt itself.
                response = await client.get(
                    f"/api/voice-sessions/{session_id}/turns/{turn_id}"
                )
                assert response.status_code == 200
                turn_data = response.json()
                assert turn_data["status"] == "audio_ready"
                assert turn_data["requires_confirmation"] is True
                assert turn_data["understanding_summary"].startswith("本轮系统理解为")
                assert "请家长帮忙确认" in turn_data["confirmation_message"]
                assert turn_data["assistant_text"] == turn_data["confirmation_message"]
                # Session-level view: the latest_* fields mirror the turn and
                # the story state has gained no narrative segment.
                response = await client.get(f"/api/voice-sessions/{session_id}")
                assert response.status_code == 200
                session_data = response.json()
                assert session_data["latest_requires_confirmation"] is True
                assert "请家长帮忙确认" in session_data["latest_confirmation_message"]
                assert session_data["can_finalize"] is False
                assert session_data["story_state"]["narrative_segments"] == []
                assert any(
                    event["event_type"] == "turn_confirmation_requested"
                    for event in session_data["events"]
                )
                # The confirmation path must not invoke story generation.
                mock_generate.assert_not_awaited()
        finally:
            app.dependency_overrides.clear()
 async def test_voice_session_list_orders_recent_sessions_first(
    db_session,
    auth_token,
--- a/docs/technical/voice-co-creation-phase-a-migration-api-draft.md
+++ b/docs/technical/voice-co-creation-phase-a-migration-api-draft.md
@@ -8,7 +8,7 @@
 ## 1. 目的
-这份文档是 [语音共创 Phase A 技术方案](/Users/zt/Code/dreamweaver/docs/technical/voice-co-creation-phase-a-tech-spec.md) 的下一层实现草案。
+这份文档是 [语音共创 Phase A 技术方案](./voice-co-creation-phase-a-tech-spec.md) 的下一层实现草案。
 它的目标很明确：
--- a/docs/technical/voice-co-creation-phase-a-tech-spec.md
+++ b/docs/technical/voice-co-creation-phase-a-tech-spec.md
@@ -8,7 +8,23 @@
 ## 1. 目标
-这份技术方案用于把 [语音共创模式增量 PRD](/Users/zt/Code/dreamweaver/docs/product/voice-co-creation-mode-incremental-prd.md) 收敛成一个可实现的 Phase A MVP。
+这份技术方案用于把 [语音共创模式增量 PRD](../product/voice-co-creation-mode-incremental-prd.md) 收敛成一个可实现的 Phase A MVP。
 ## 0. 当前实现快照（2026-04-20）
 远端 `main` 已经跑通以下 Phase A 主链路：
 - 独立 Voice Studio 入口页与最近会话恢复
 - `voice_sessions / voice_turns / voice_session_events` 数据模型与 API
 - 文本 fallback 回合、录音上传回合、turn 轮询结果查询
 - turn 级语音补发、失败重试、会话 abandon、finalize -> Story 持久化
 - 会话事件记录与最近 turn / 最近事件展示
 本轮新增收束：
 - 当 `transcript_confidence` 或 `intent_confidence` 偏低时，后端优先返回确认提示，而不是直接把这一轮写进故事正文
 - 前端明确展示“本轮系统理解为”与“建议家长确认后再继续”提示
 - 低置信度确认链路已有后端测试覆盖，可作为下一阶段继续接 ASR 与更细确认交互的基础
 Phase A 的核心目标不是做“完全实时的语音陪伴”，而是验证以下最小闭环：
--- a/frontend/src/types/voiceSession.ts
+++ b/frontend/src/types/voiceSession.ts
@@ -8,6 +8,10 @@ export interface VoiceTurnSummary {
  transcription_provider: string | null
  detected_intent: string
  intent_confidence: number | null
  understanding_summary: string | null
  requires_confirmation: boolean
  confirmation_reason: string | null
  confirmation_message: string | null
  assistant_text: string | null
  assistant_audio_ready: boolean
  assistant_audio_url: string | null
@@ -43,6 +47,9 @@ export interface VoiceSessionSummary {
  latest_user_transcript: string | null
  latest_assistant_text: string | null
  latest_detected_intent: string | null
  latest_understanding_summary: string | null
  latest_requires_confirmation: boolean
  latest_confirmation_message: string | null
  latest_assistant_audio_ready: boolean
  last_turn_status: string | null
  transcription_mode_hint: string | null
--- a/frontend/src/views/VoiceStudio.vue
+++ b/frontend/src/views/VoiceStudio.vue
@@ -162,6 +162,13 @@ function formatDate(dateStr: string) {
  })
 }
 function formatConfidence(value: number | null | undefined) {
  // Render a 0..1 confidence score as a whole-number percentage;
  // anything that is not a number (null / undefined) is shown as 'n/a'.
  return typeof value === 'number' ? `${Math.round(value * 100)}%` : 'n/a'
 }
 function revokeRecordedAudioUrl() {
  if (recordedAudioUrl.value) {
    URL.revokeObjectURL(recordedAudioUrl.value)
@@ -718,6 +725,19 @@ onBeforeUnmount(() => {
            <div class="mt-6 grid grid-cols-1 gap-6 xl:grid-cols-[minmax(0,1.2fr)_minmax(0,0.8fr)]">
              <div class="space-y-6">
                <div
                  v-if="activeSession.latest_requires_confirmation"
                  class="rounded-2xl border border-amber-200 bg-amber-50 p-4 text-amber-800"
                >
                  <div class="text-sm font-semibold">建议先确认这一轮理解</div>
                  <p class="mt-2 text-sm">
                    {{ activeSession.latest_confirmation_message || '系统对这一轮的理解还不够确定，建议家长先确认后再继续。' }}
                  </p>
                  <p v-if="activeSession.latest_understanding_summary" class="mt-2 text-xs text-amber-700">
                    {{ activeSession.latest_understanding_summary }}
                  </p>
                </div>
                <div class="rounded-2xl border border-gray-100 bg-white p-4">
                  <div class="flex items-center justify-between">
                    <h3 class="font-semibold text-gray-900">文本共创回合</h3>
@@ -839,6 +859,22 @@ onBeforeUnmount(() => {
                        <span class="font-medium text-gray-900">孩子：</span>
                        {{ turn.user_transcript || '暂无转写内容' }}
                      </div>
                      <div v-if="turn.understanding_summary" class="mt-3 text-sm text-slate-600">
                        {{ turn.understanding_summary }}
                      </div>
                      <div
                        v-if="turn.requires_confirmation"
                        class="mt-3 rounded-2xl border border-amber-200 bg-amber-50 px-3 py-3 text-sm text-amber-800"
                      >
                        <div class="font-medium">建议家长确认后再继续</div>
                        <p class="mt-1">
                          {{ turn.confirmation_message || '系统对这一轮的理解还不够确定，建议换一种说法再试一次。' }}
                        </p>
                        <p class="mt-2 text-xs text-amber-700">
                          转写置信度：{{ formatConfidence(turn.transcript_confidence) }} ·
                          意图置信度：{{ formatConfidence(turn.intent_confidence) }}
                        </p>
                      </div>
                      <div v-if="turn.user_audio_url" class="mt-3">
                        <audio class="w-full" :src="turn.user_audio_url" controls></audio>
                      </div>