feat: add voice session confirmation guardrails

2026-04-20 12:29:14 +08:00
parent 4ecf0c09c0
commit dbb512719d
8 changed files with 406 additions and 50 deletions
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -82,6 +82,14 @@ class Settings(BaseSettings):
        "zh",
        description="Preferred language hint for voice transcription",
    )
+    voice_turn_low_transcript_confidence: float = Field(
+        0.65,
+        description="Prompt for confirmation when transcript confidence falls below this threshold",
+    )
+    voice_turn_low_intent_confidence: float = Field(
+        0.70,
+        description="Prompt for confirmation when intent confidence falls below this threshold",
+    )
    voice_turn_max_upload_bytes: int = Field(
        5 * 1024 * 1024,
        description="Maximum accepted upload size in bytes for one voice turn audio file",
--- a/backend/app/schemas/voice_session_schemas.py
+++ b/backend/app/schemas/voice_session_schemas.py
@@ -73,6 +73,10 @@ class VoiceTurnSummaryResponse(BaseModel):
    transcription_provider: str | None = None
    detected_intent: str
    intent_confidence: float | None = None
+    understanding_summary: str | None = None
+    requires_confirmation: bool = False
+    confirmation_reason: str | None = None
+    confirmation_message: str | None = None
    assistant_text: str | None = None
    assistant_audio_ready: bool = False
    assistant_audio_url: str | None = None
@@ -99,6 +103,9 @@ class VoiceSessionSummaryResponse(BaseModel):
    latest_user_transcript: str | None = None
    latest_assistant_text: str | None = None
    latest_detected_intent: str | None = None
+    latest_understanding_summary: str | None = None
+    latest_requires_confirmation: bool = False
+    latest_confirmation_message: str | None = None
    latest_assistant_audio_ready: bool = False
    last_turn_status: str | None = None
    transcription_mode_hint: str | None = None
--- a/backend/app/services/voice_session_service.py
+++ b/backend/app/services/voice_session_service.py
@@ -79,8 +79,129 @@ def _user_audio_url(session_id: str, turn_id: str, audio_path: str | None) -> st
    return f"/api/voice-sessions/{session_id}/turns/{turn_id}/user-audio"


+def _format_intent_label(intent: str | None) -> str:
+    """Return the Chinese display label for a detected intent code.
+
+    ``None``, an empty string, and any code missing from the table all map to
+    the same "待确认" placeholder that the ``unknown`` intent uses.
+    """
+    fallback_label = "待确认"
+    if not intent:
+        return fallback_label
+    label_by_intent = dict(
+        start_story="开启故事",
+        continue_story="继续讲述",
+        correct_story="修正走向",
+        end_story="先停在这里",
+        save_story="保存当前故事",
+        unknown=fallback_label,
+    )
+    return label_by_intent.get(intent, fallback_label)
+
+
+def _build_understanding_summary(
+    *,
+    transcript_text: str | None,
+    detected_intent: str,
+) -> str | None:
+    """Compose the one-line "what the system understood" note for a turn.
+
+    The transcript is whitespace-stripped first. An ``unknown`` intent yields a
+    "not fully understood yet" sentence; any other intent yields its display
+    label. In both cases the transcript is appended when it is non-empty.
+    """
+    spoken = transcript_text.strip() if transcript_text else ""
+
+    if detected_intent == "unknown":
+        return (
+            f"本轮系统暂时还没完全理解：{spoken}"
+            if spoken
+            else "本轮系统暂时还没完全理解孩子刚才的表达。"
+        )
+
+    headline = f"本轮系统理解为「{_format_intent_label(detected_intent)}」"
+    return f"{headline}：{spoken}" if spoken else headline
+
+
+def _build_confirmation_message(
+    *,
+    transcript_text: str | None,
+    detected_intent: str,
+    reasons: list[str],
+) -> str:
+    """Build the clarification prompt shown (and spoken) when a turn needs confirmation.
+
+    The message is ``prefix + tentative understanding + fixed closing request``.
+
+    Args:
+        transcript_text: Raw transcript of the turn; stripped before use.
+        detected_intent: Intent code. For ``"unknown"`` no tentative
+            understanding sentence is included.
+        reasons: Reason codes that triggered the confirmation. They select the
+            prefix: ``intent_unknown`` wins, then both low-confidence codes
+            together, then ``low_transcript_confidence`` alone; anything else
+            falls back to the "not sure whether to continue or correct" prefix.
+
+    Returns:
+        The full confirmation message.
+    """
+    normalized_transcript = (transcript_text or "").strip()
+
+    natural_understanding = ""
+    if detected_intent != "unknown":
+        natural_understanding = (
+            f"我现在先理解成你想「{_format_intent_label(detected_intent)}」"
+        )
+        if normalized_transcript:
+            natural_understanding += f"：{normalized_transcript}"
+        # Transcripts may already end with sentence punctuation; only close the
+        # sentence when needed so the prompt never contains "。。" or "？。".
+        if not natural_understanding.endswith(("。", "！", "？", "…", ".", "!", "?")):
+            natural_understanding += "。"
+
+    reason_set = set(reasons)
+    if "intent_unknown" in reason_set:
+        prefix = "我这一次还没有完全听懂。"
+    elif {"low_transcript_confidence", "low_intent_confidence"} <= reason_set:
+        prefix = "我这一次听得还不够清楚，也不太确定该怎么接剧情。"
+    elif "low_transcript_confidence" in reason_set:
+        prefix = "我这一次可能没有完全听清。"
+    else:
+        prefix = "我这一次还不太确定你是想继续讲，还是想改一下剧情。"
+
+    return (
+        f"{prefix}{natural_understanding}"
+        "请家长帮忙确认一下；如果不对，可以换一种说法再说一遍，我们再继续编下去。"
+    )
+
+
+def _resolve_turn_confirmation_state(
+    *,
+    transcript_text: str | None,
+    transcript_confidence: float | None,
+    detected_intent: str,
+    intent_confidence: float | None,
+    story_patch: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Work out whether a turn needs confirmation and what to show for it.
+
+    Any of the four fields already present (non-``None``) in ``story_patch`` is
+    used as-is; missing ones are derived from the transcript, the intent and the
+    two confidence thresholds in ``settings``. A confidence of ``None`` never
+    triggers a low-confidence reason.
+
+    Returns:
+        A dict with ``understanding_summary``, ``requires_confirmation`` (always
+        a ``bool``), ``confirmation_reason`` (comma-joined reason codes, or
+        ``None``) and ``confirmation_message``.
+    """
+    stored = story_patch if story_patch else {}
+
+    transcript_is_weak = (
+        transcript_confidence is not None
+        and transcript_confidence < settings.voice_turn_low_transcript_confidence
+    )
+    intent_is_weak = (
+        intent_confidence is not None
+        and intent_confidence < settings.voice_turn_low_intent_confidence
+    )
+    # Order matters: it fixes the order of codes in the joined reason string.
+    triggers: list[str] = [
+        code
+        for code, hit in (
+            ("intent_unknown", detected_intent == "unknown"),
+            ("low_transcript_confidence", transcript_is_weak),
+            ("low_intent_confidence", intent_is_weak),
+        )
+        if hit
+    ]
+
+    needs_confirmation = stored.get("requires_confirmation")
+    if needs_confirmation is None:
+        needs_confirmation = bool(triggers)
+
+    reason = stored.get("confirmation_reason")
+    if reason is None and triggers:
+        reason = ",".join(triggers)
+
+    summary = stored.get("understanding_summary")
+    if summary is None:
+        summary = _build_understanding_summary(
+            transcript_text=transcript_text,
+            detected_intent=detected_intent,
+        )
+
+    message = stored.get("confirmation_message")
+    if message is None and needs_confirmation:
+        # A stored "requires confirmation" flag can be set without any freshly
+        # computed trigger; fall back to the generic "did not understand" wording.
+        message = _build_confirmation_message(
+            transcript_text=transcript_text,
+            detected_intent=detected_intent,
+            reasons=triggers or ["intent_unknown"],
+        )
+
+    return dict(
+        understanding_summary=summary,
+        requires_confirmation=bool(needs_confirmation),
+        confirmation_reason=reason,
+        confirmation_message=message,
+    )
+
+
 def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
    turn_patch = turn.story_patch or {}
+    confirmation_state = _resolve_turn_confirmation_state(
+        transcript_text=turn.user_transcript,
+        transcript_confidence=turn.transcript_confidence,
+        detected_intent=turn.detected_intent,
+        intent_confidence=turn.intent_confidence,
+        story_patch=turn_patch,
+    )
    return VoiceTurnSummaryResponse(
        id=turn.id,
        session_id=turn.session_id,
@@ -91,6 +212,10 @@ def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
        transcription_provider=turn_patch.get("transcription_provider"),
        detected_intent=turn.detected_intent,
        intent_confidence=turn.intent_confidence,
+        understanding_summary=confirmation_state["understanding_summary"],
+        requires_confirmation=confirmation_state["requires_confirmation"],
+        confirmation_reason=confirmation_state["confirmation_reason"],
+        confirmation_message=confirmation_state["confirmation_message"],
        assistant_text=turn.assistant_text,
        assistant_audio_ready=session_audio_exists(turn.assistant_audio_path),
        assistant_audio_url=_assistant_audio_url(
@@ -114,8 +239,20 @@ def _session_to_summary(
 ) -> VoiceSessionSummaryResponse:
    if latest_turn is None:
        total_turns = total_turns if total_turns is not None else session.current_turn_index
+        latest_confirmation_state = {
+            "understanding_summary": None,
+            "requires_confirmation": False,
+            "confirmation_message": None,
+        }
    else:
        total_turns = total_turns if total_turns is not None else latest_turn.turn_index
+        latest_confirmation_state = _resolve_turn_confirmation_state(
+            transcript_text=latest_turn.user_transcript,
+            transcript_confidence=latest_turn.transcript_confidence,
+            detected_intent=latest_turn.detected_intent,
+            intent_confidence=latest_turn.intent_confidence,
+            story_patch=latest_turn.story_patch or {},
+        )

    return VoiceSessionSummaryResponse(
        id=session.id,
@@ -131,6 +268,9 @@ def _session_to_summary(
        latest_user_transcript=session.latest_user_transcript,
        latest_assistant_text=session.latest_assistant_text,
        latest_detected_intent=latest_turn.detected_intent if latest_turn else None,
+        latest_understanding_summary=latest_confirmation_state["understanding_summary"],
+        latest_requires_confirmation=latest_confirmation_state["requires_confirmation"],
+        latest_confirmation_message=latest_confirmation_state["confirmation_message"],
        latest_assistant_audio_ready=(
            session_audio_exists(latest_turn.assistant_audio_path) if latest_turn else False
        ),
@@ -230,11 +370,13 @@ def _detect_intent(
    normalized = transcript_text.replace(" ", "")

    if any(keyword in normalized for keyword in ("保存故事", "存起来", "保存吧", "保存到故事库")):
-        return "save_story", 0.95
+        return "save_story", 0.96
    if any(keyword in normalized for keyword in ("先到这里", "讲完了", "结束吧", "停在这里")):
-        return "end_story", 0.88
+        return "end_story", 0.90
+    if len(normalized) <= 1 or normalized in {"嗯", "啊", "呃", "额", "这个", "那个", "不知道"}:
+        return "unknown", 0.25
    if current_turn_index == 0:
-        return "start_story", 0.82
+        return "start_story", 0.84
    if any(
        keyword in normalized
        for keyword in (
@@ -248,8 +390,8 @@ def _detect_intent(
            "其实",
        )
    ):
-        return "correct_story", 0.76
-    return "continue_story", 0.68
+        return "correct_story", 0.78
+    return "continue_story", 0.74


 def _recent_story_text(session: VoiceSession) -> str:
@@ -451,6 +593,13 @@ async def _process_pending_turn(
    assistant_result: StoryOutput | None = None
    detected_intent = turn.detected_intent
    intent_confidence = turn.intent_confidence
+    confirmation_state = _resolve_turn_confirmation_state(
+        transcript_text=transcript_text,
+        transcript_confidence=turn.transcript_confidence,
+        detected_intent=detected_intent,
+        intent_confidence=intent_confidence,
+        story_patch=turn.story_patch or {},
+    )

    try:
        await _record_session_event(
@@ -466,7 +615,61 @@ async def _process_pending_turn(
            },
        )

-        if detected_intent == "save_story":
+        if confirmation_state["requires_confirmation"]:
+            current_state = _default_story_state() | (session.story_state or {})
+            assistant_text = confirmation_state["confirmation_message"]
+            turn.story_patch = {
+                **(turn.story_patch or {}),
+                "intent": detected_intent,
+                "transcript_text": transcript_text,
+                "segment_added": False,
+                "working_title": session.working_title,
+                "cover_prompt": current_state.get("cover_prompt"),
+                "narrative_segments_count": len(
+                    list(current_state.get("narrative_segments") or [])
+                ),
+                "requires_confirmation": True,
+                "confirmation_reason": confirmation_state["confirmation_reason"],
+                "confirmation_message": confirmation_state["confirmation_message"],
+                "understanding_summary": confirmation_state["understanding_summary"],
+            }
+            turn.assistant_text = assistant_text
+            turn.status = "narrative_ready"
+            session.latest_assistant_text = assistant_text
+            session.status = "waiting_user"
+            session.updated_at = _utcnow()
+            await db.commit()
+            await db.refresh(session)
+            await db.refresh(turn)
+
+            await _record_session_event(
+                db,
+                session_id=session.id,
+                turn_id=turn.id,
+                event_type="turn_confirmation_requested",
+                status="needs_confirmation",
+                message="Voice turn needs parent confirmation before the story continues.",
+                metadata={
+                    "detected_intent": detected_intent,
+                    "transcript_confidence": turn.transcript_confidence,
+                    "intent_confidence": intent_confidence,
+                    "confirmation_reason": confirmation_state["confirmation_reason"],
+                },
+            )
+            await _record_session_event(
+                db,
+                session_id=session.id,
+                turn_id=turn.id,
+                event_type="assistant_text_ready",
+                status="succeeded",
+                message="Assistant clarification prompt generated.",
+                metadata={
+                    "assistant_text_length": len(assistant_text or ""),
+                    "working_title": session.working_title,
+                    "requires_confirmation": True,
+                },
+            )
+        elif detected_intent == "save_story":
            assistant_text = "好的，这个故事已经准备好保存到故事库了。"
        elif detected_intent == "end_story":
            assistant_text = "好的，我们先把故事停在这里。想保存的话，现在就可以保存到故事库。"
@@ -479,6 +682,7 @@ async def _process_pending_turn(
            )
            assistant_text = assistant_result.story_text.strip()

+        if not confirmation_state["requires_confirmation"]:
            merged_state, story_patch = _merge_story_state(
                session,
                transcript_text=transcript_text,
@@ -520,6 +724,7 @@ async def _process_pending_turn(
                metadata={
                    "assistant_text_length": len(assistant_text or ""),
                    "working_title": session.working_title,
+                    "requires_confirmation": False,
                },
            )
    except Exception as exc:
--- a/backend/tests/test_voice_sessions.py
+++ b/backend/tests/test_voice_sessions.py
@@ -6,6 +6,7 @@ from app.core.config import settings
 from app.db.database import get_db
 from app.main import app
 from app.services.adapters.text.models import StoryOutput
+from app.services.voice_transcription_service import VoiceTranscriptionResult


 async def test_voice_session_create_and_fallback_turn_returns_audio(
@@ -272,6 +273,82 @@ async def test_voice_session_uploaded_audio_turn_uses_demo_transcript_hint(
            app.dependency_overrides.clear()


+async def test_voice_session_low_confidence_turn_requests_confirmation(
+    db_session,
+    auth_token,
+):
+    """An uploaded audio turn whose transcript confidence is 0.41 asks for confirmation.
+
+    Checks the turn and session responses expose the confirmation fields, that a
+    ``turn_confirmation_requested`` event is recorded, that no narrative segment
+    is added, and that story generation is never awaited.
+    """
+    async def override_get_db():
+        yield db_session

+    app.dependency_overrides[get_db] = override_get_db

+    # Story generation, TTS and transcription are all replaced with AsyncMocks.
+    with (
+        patch(
+            "app.services.voice_session_service.generate_story_content",
+            new_callable=AsyncMock,
+        ) as mock_generate,
+        patch(
+            "app.services.voice_session_service.text_to_speech",
+            new_callable=AsyncMock,
+        ) as mock_tts,
+        patch(
+            "app.services.voice_session_service.transcribe_voice_audio",
+            new_callable=AsyncMock,
+        ) as mock_transcribe,
+    ):
+        mock_tts.return_value = b"confirmation-audio"
+        # confidence=0.41 is below the 0.65 default of
+        # voice_turn_low_transcript_confidence (assumes the default is in effect).
+        mock_transcribe.return_value = VoiceTranscriptionResult(
+            transcript_text="我想听一个会发光的小恐龙故事",
+            confidence=0.41,
+            provider="openai",
+        )

+        transport = ASGITransport(app=app)
+        try:
+            async with AsyncClient(transport=transport, base_url="http://test") as client:
+                client.cookies.set("access_token", auth_token)

+                response = await client.post("/api/voice-sessions", json={})
+                assert response.status_code == 201
+                session_id = response.json()["id"]

+                response = await client.post(
+                    f"/api/voice-sessions/{session_id}/turns",
+                    files={
+                        "audio_file": ("turn.webm", b"fake-webm-audio", "audio/webm"),
+                    },
+                )
+                assert response.status_code == 202
+                turn_id = response.json()["turn_id"]

+                # Turn view: the assistant text must be the confirmation prompt itself.
+                response = await client.get(
+                    f"/api/voice-sessions/{session_id}/turns/{turn_id}"
+                )
+                assert response.status_code == 200
+                turn_data = response.json()
+                assert turn_data["status"] == "audio_ready"
+                assert turn_data["requires_confirmation"] is True
+                assert turn_data["understanding_summary"].startswith("本轮系统理解为")
+                assert "请家长帮忙确认" in turn_data["confirmation_message"]
+                assert turn_data["assistant_text"] == turn_data["confirmation_message"]

+                # Session view: confirmation is surfaced and the story is left untouched.
+                response = await client.get(f"/api/voice-sessions/{session_id}")
+                assert response.status_code == 200
+                session_data = response.json()
+                assert session_data["latest_requires_confirmation"] is True
+                assert "请家长帮忙确认" in session_data["latest_confirmation_message"]
+                assert session_data["can_finalize"] is False
+                assert session_data["story_state"]["narrative_segments"] == []
+                assert any(
+                    event["event_type"] == "turn_confirmation_requested"
+                    for event in session_data["events"]
+                )

+                mock_generate.assert_not_awaited()
+        finally:
+            # Always drop the DB override so it cannot leak into other tests.
+            app.dependency_overrides.clear()
+
+
 async def test_voice_session_list_orders_recent_sessions_first(
    db_session,
    auth_token,
--- a/docs/technical/voice-co-creation-phase-a-migration-api-draft.md
+++ b/docs/technical/voice-co-creation-phase-a-migration-api-draft.md
@@ -8,7 +8,7 @@

 ## 1. 目的

-这份文档是 [语音共创 Phase A 技术方案](/Users/zt/Code/dreamweaver/docs/technical/voice-co-creation-phase-a-tech-spec.md) 的下一层实现草案。
+这份文档是 [语音共创 Phase A 技术方案](./voice-co-creation-phase-a-tech-spec.md) 的下一层实现草案。

 它的目标很明确：

--- a/docs/technical/voice-co-creation-phase-a-tech-spec.md
+++ b/docs/technical/voice-co-creation-phase-a-tech-spec.md
@@ -8,7 +8,23 @@

 ## 1. 目标

-这份技术方案用于把 [语音共创模式增量 PRD](/Users/zt/Code/dreamweaver/docs/product/voice-co-creation-mode-incremental-prd.md) 收敛成一个可实现的 Phase A MVP。
+这份技术方案用于把 [语音共创模式增量 PRD](../product/voice-co-creation-mode-incremental-prd.md) 收敛成一个可实现的 Phase A MVP。
+
+## 0. 当前实现快照（2026-04-20）
+
+远端 `main` 已经跑通以下 Phase A 主链路：
+
+- 独立 Voice Studio 入口页与最近会话恢复
+- `voice_sessions / voice_turns / voice_session_events` 数据模型与 API
+- 文本 fallback 回合、录音上传回合、turn 轮询结果查询
+- turn 级语音补发、失败重试、会话 abandon、finalize -> Story 持久化
+- 会话事件记录与最近 turn / 最近事件展示
+
+本轮新增收束：
+
+- 当 `transcript_confidence` 或 `intent_confidence` 偏低时，后端优先返回确认提示，而不是直接把这一轮写进故事正文
+- 前端明确展示“本轮系统理解为”与“建议家长确认后再继续”提示
+- 低置信度确认链路已有后端测试覆盖，可作为下一阶段继续接 ASR 与更细确认交互的基础

 Phase A 的核心目标不是做“完全实时的语音陪伴”，而是验证以下最小闭环：

--- a/frontend/src/types/voiceSession.ts
+++ b/frontend/src/types/voiceSession.ts
@@ -8,6 +8,10 @@ export interface VoiceTurnSummary {
  transcription_provider: string | null
  detected_intent: string
  intent_confidence: number | null
+  understanding_summary: string | null
+  requires_confirmation: boolean
+  confirmation_reason: string | null
+  confirmation_message: string | null
  assistant_text: string | null
  assistant_audio_ready: boolean
  assistant_audio_url: string | null
@@ -43,6 +47,9 @@ export interface VoiceSessionSummary {
  latest_user_transcript: string | null
  latest_assistant_text: string | null
  latest_detected_intent: string | null
+  latest_understanding_summary: string | null
+  latest_requires_confirmation: boolean
+  latest_confirmation_message: string | null
  latest_assistant_audio_ready: boolean
  last_turn_status: string | null
  transcription_mode_hint: string | null
--- a/frontend/src/views/VoiceStudio.vue
+++ b/frontend/src/views/VoiceStudio.vue
@@ -162,6 +162,13 @@ function formatDate(dateStr: string) {
  })
 }

+function formatConfidence(value: number | null | undefined) {
+  // Render a confidence score as a whole-number percentage (value * 100,
+  // rounded), or 'n/a' when no usable score is available. NaN and Infinity
+  // also satisfy `typeof === 'number'`, so they are rejected explicitly
+  // instead of being displayed as "NaN%" / "Infinity%".
+  if (typeof value !== 'number' || !Number.isFinite(value)) {
+    return 'n/a'
+  }
+  return `${Math.round(value * 100)}%`
+}
+
 function revokeRecordedAudioUrl() {
  if (recordedAudioUrl.value) {
    URL.revokeObjectURL(recordedAudioUrl.value)
@@ -718,6 +725,19 @@ onBeforeUnmount(() => {

            <div class="mt-6 grid grid-cols-1 gap-6 xl:grid-cols-[minmax(0,1.2fr)_minmax(0,0.8fr)]">
              <div class="space-y-6">
+                <div
+                  v-if="activeSession.latest_requires_confirmation"
+                  class="rounded-2xl border border-amber-200 bg-amber-50 p-4 text-amber-800"
+                >
+                  <div class="text-sm font-semibold">建议先确认这一轮理解</div>
+                  <p class="mt-2 text-sm">
+                    {{ activeSession.latest_confirmation_message || '系统对这一轮的理解还不够确定，建议家长先确认后再继续。' }}
+                  </p>
+                  <p v-if="activeSession.latest_understanding_summary" class="mt-2 text-xs text-amber-700">
+                    {{ activeSession.latest_understanding_summary }}
+                  </p>
+                </div>
+
                <div class="rounded-2xl border border-gray-100 bg-white p-4">
                  <div class="flex items-center justify-between">
                    <h3 class="font-semibold text-gray-900">文本共创回合</h3>
@@ -839,6 +859,22 @@ onBeforeUnmount(() => {
                        <span class="font-medium text-gray-900">孩子：</span>
                        {{ turn.user_transcript || '暂无转写内容' }}
                      </div>
+                      <div v-if="turn.understanding_summary" class="mt-3 text-sm text-slate-600">
+                        {{ turn.understanding_summary }}
+                      </div>
+                      <div
+                        v-if="turn.requires_confirmation"
+                        class="mt-3 rounded-2xl border border-amber-200 bg-amber-50 px-3 py-3 text-sm text-amber-800"
+                      >
+                        <div class="font-medium">建议家长确认后再继续</div>
+                        <p class="mt-1">
+                          {{ turn.confirmation_message || '系统对这一轮的理解还不够确定，建议换一种说法再试一次。' }}
+                        </p>
+                        <p class="mt-2 text-xs text-amber-700">
+                          转写置信度：{{ formatConfidence(turn.transcript_confidence) }} ·
+                          意图置信度：{{ formatConfidence(turn.intent_confidence) }}
+                        </p>
+                      </div>
                      <div v-if="turn.user_audio_url" class="mt-3">
                        <audio class="w-full" :src="turn.user_audio_url" controls></audio>
                      </div>