feat: add voice session confirmation guardrails

2026-04-20 12:29:14 +08:00
parent 4ecf0c09c0
commit dbb512719d
8 changed files with 406 additions and 50 deletions
--- a/backend/app/services/voice_session_service.py
+++ b/backend/app/services/voice_session_service.py
@@ -79,8 +79,129 @@ def _user_audio_url(session_id: str, turn_id: str, audio_path: str | None) -> st
    return f"/api/voice-sessions/{session_id}/turns/{turn_id}/user-audio"


+def _format_intent_label(intent: str | None) -> str:
+    """Translate a detected intent key into its user-facing label.
+
+    Args:
+        intent: Intent key such as ``"start_story"``; may be ``None`` or empty.
+
+    Returns:
+        The label registered for ``intent``. Missing, empty, and unrecognised
+        keys all fall back to the same placeholder label used for ``"unknown"``.
+    """
+    fallback_label = "待确认"
+    if not intent:
+        return fallback_label
+
+    intent_labels = {
+        "start_story": "开启故事",
+        "continue_story": "继续讲述",
+        "correct_story": "修正走向",
+        "end_story": "先停在这里",
+        "save_story": "保存当前故事",
+        "unknown": fallback_label,
+    }
+    return intent_labels.get(intent, fallback_label)
+
+
+def _build_understanding_summary(
+    *,
+    transcript_text: str | None,
+    detected_intent: str,
+) -> str | None:
+    """Describe, in one sentence, how the system interpreted the current turn.
+
+    Args:
+        transcript_text: Raw transcript of the turn; ``None`` and
+            whitespace-only values are treated as "no transcript".
+        detected_intent: Intent key detected for the turn.
+
+    Returns:
+        For the ``"unknown"`` intent, a "not understood yet" sentence that
+        quotes the transcript when one exists. For any other intent, a sentence
+        naming the intent label, followed by the transcript when one exists.
+    """
+    spoken = transcript_text.strip() if transcript_text else ""
+
+    if detected_intent == "unknown":
+        return (
+            f"本轮系统暂时还没完全理解：{spoken}"
+            if spoken
+            else "本轮系统暂时还没完全理解孩子刚才的表达。"
+        )
+
+    # Both remaining variants share the same headline; only the quoted
+    # transcript suffix is optional.
+    headline = f"本轮系统理解为「{_format_intent_label(detected_intent)}」"
+    return f"{headline}：{spoken}" if spoken else headline
+
+
+def _build_confirmation_message(
+    *,
+    transcript_text: str | None,
+    detected_intent: str,
+    reasons: list[str],
+) -> str:
+    """Compose the prompt asking a parent to confirm an uncertain turn.
+
+    The message has three parts: an opening sentence chosen from ``reasons``,
+    an optional restatement of the current interpretation (omitted when the
+    intent is ``"unknown"``), and a fixed closing request.
+
+    Args:
+        transcript_text: Raw transcript of the turn; ``None`` and
+            whitespace-only values are treated as "no transcript".
+        detected_intent: Intent key detected for the turn.
+        reasons: Reason codes that triggered the confirmation. Recognised
+            codes are ``"intent_unknown"``, ``"low_transcript_confidence"``
+            and ``"low_intent_confidence"``; anything else selects the
+            generic opening sentence.
+
+    Returns:
+        The full confirmation message.
+    """
+    # A transcript may already end with sentence punctuation; appending another
+    # full stop in that case would render as "。。" or "？。" in the prompt.
+    terminal_punctuation = ("。", "！", "？", "!", "?", ".", "…")
+
+    natural_understanding = ""
+    normalized_transcript = (transcript_text or "").strip()
+    if detected_intent != "unknown":
+        if normalized_transcript:
+            closing_stop = (
+                "" if normalized_transcript.endswith(terminal_punctuation) else "。"
+            )
+            natural_understanding = (
+                f"我现在先理解成你想「{_format_intent_label(detected_intent)}」："
+                f"{normalized_transcript}{closing_stop}"
+            )
+        else:
+            natural_understanding = f"我现在先理解成你想「{_format_intent_label(detected_intent)}」。"
+
+    # Opening sentence: an unknown intent wins, then "both confidences low",
+    # then "transcript confidence low"; everything else gets the generic line.
+    if "intent_unknown" in reasons:
+        prefix = "我这一次还没有完全听懂。"
+    elif {
+        "low_transcript_confidence",
+        "low_intent_confidence",
+    }.issubset(set(reasons)):
+        prefix = "我这一次听得还不够清楚，也不太确定该怎么接剧情。"
+    elif "low_transcript_confidence" in reasons:
+        prefix = "我这一次可能没有完全听清。"
+    else:
+        prefix = "我这一次还不太确定你是想继续讲，还是想改一下剧情。"
+
+    return (
+        f"{prefix}{natural_understanding}"
+        "请家长帮忙确认一下；如果不对，可以换一种说法再说一遍，我们再继续编下去。"
+    )
+
+
+def _resolve_turn_confirmation_state(
+    *,
+    transcript_text: str | None,
+    transcript_confidence: float | None,
+    detected_intent: str,
+    intent_confidence: float | None,
+    story_patch: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Work out whether a turn needs confirmation and what to tell the user.
+
+    Values already present (non-``None``) in ``story_patch`` take precedence;
+    anything missing is derived from the intent and the confidence scores.
+
+    Args:
+        transcript_text: Raw transcript of the turn, if any.
+        transcript_confidence: Transcript confidence; ``None`` skips the
+            low-transcript-confidence check.
+        detected_intent: Intent key detected for the turn.
+        intent_confidence: Intent confidence; ``None`` skips the
+            low-intent-confidence check.
+        story_patch: Previously stored patch for the turn; may be ``None``.
+
+    Returns:
+        A dict with the keys ``understanding_summary``,
+        ``requires_confirmation`` (always a ``bool``), ``confirmation_reason``
+        (comma-joined reason codes, or ``None``) and ``confirmation_message``
+        (``None`` when no confirmation is required and none was stored).
+    """
+    stored = story_patch or {}
+
+    # Collect every trigger that applies, in a fixed order. The thresholds on
+    # ``settings`` are only read when the matching confidence is available.
+    triggers: list[str] = []
+    if detected_intent == "unknown":
+        triggers.append("intent_unknown")
+    if transcript_confidence is not None:
+        if transcript_confidence < settings.voice_turn_low_transcript_confidence:
+            triggers.append("low_transcript_confidence")
+    if intent_confidence is not None:
+        if intent_confidence < settings.voice_turn_low_intent_confidence:
+            triggers.append("low_intent_confidence")
+
+    needs_confirmation = stored.get("requires_confirmation")
+    if needs_confirmation is None:
+        needs_confirmation = bool(triggers)
+
+    reason = stored.get("confirmation_reason")
+    if reason is None and triggers:
+        reason = ",".join(triggers)
+
+    summary = stored.get("understanding_summary")
+    if summary is None:
+        summary = _build_understanding_summary(
+            transcript_text=transcript_text,
+            detected_intent=detected_intent,
+        )
+
+    message = stored.get("confirmation_message")
+    if message is None and needs_confirmation:
+        # A stored patch can demand confirmation even when no trigger fires
+        # now; fall back to the "intent_unknown" wording in that case.
+        message = _build_confirmation_message(
+            transcript_text=transcript_text,
+            detected_intent=detected_intent,
+            reasons=triggers or ["intent_unknown"],
+        )
+
+    return {
+        "understanding_summary": summary,
+        "requires_confirmation": bool(needs_confirmation),
+        "confirmation_reason": reason,
+        "confirmation_message": message,
+    }
+
+
 def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
    turn_patch = turn.story_patch or {}
+    confirmation_state = _resolve_turn_confirmation_state(
+        transcript_text=turn.user_transcript,
+        transcript_confidence=turn.transcript_confidence,
+        detected_intent=turn.detected_intent,
+        intent_confidence=turn.intent_confidence,
+        story_patch=turn_patch,
+    )
    return VoiceTurnSummaryResponse(
        id=turn.id,
        session_id=turn.session_id,
@@ -91,6 +212,10 @@ def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
        transcription_provider=turn_patch.get("transcription_provider"),
        detected_intent=turn.detected_intent,
        intent_confidence=turn.intent_confidence,
+        understanding_summary=confirmation_state["understanding_summary"],
+        requires_confirmation=confirmation_state["requires_confirmation"],
+        confirmation_reason=confirmation_state["confirmation_reason"],
+        confirmation_message=confirmation_state["confirmation_message"],
        assistant_text=turn.assistant_text,
        assistant_audio_ready=session_audio_exists(turn.assistant_audio_path),
        assistant_audio_url=_assistant_audio_url(
@@ -114,8 +239,20 @@ def _session_to_summary(
 ) -> VoiceSessionSummaryResponse:
    if latest_turn is None:
        total_turns = total_turns if total_turns is not None else session.current_turn_index
+        latest_confirmation_state = {
+            "understanding_summary": None,
+            "requires_confirmation": False,
+            "confirmation_message": None,
+        }
    else:
        total_turns = total_turns if total_turns is not None else latest_turn.turn_index
+        latest_confirmation_state = _resolve_turn_confirmation_state(
+            transcript_text=latest_turn.user_transcript,
+            transcript_confidence=latest_turn.transcript_confidence,
+            detected_intent=latest_turn.detected_intent,
+            intent_confidence=latest_turn.intent_confidence,
+            story_patch=latest_turn.story_patch or {},
+        )

    return VoiceSessionSummaryResponse(
        id=session.id,
@@ -131,6 +268,9 @@ def _session_to_summary(
        latest_user_transcript=session.latest_user_transcript,
        latest_assistant_text=session.latest_assistant_text,
        latest_detected_intent=latest_turn.detected_intent if latest_turn else None,
+        latest_understanding_summary=latest_confirmation_state["understanding_summary"],
+        latest_requires_confirmation=latest_confirmation_state["requires_confirmation"],
+        latest_confirmation_message=latest_confirmation_state["confirmation_message"],
        latest_assistant_audio_ready=(
            session_audio_exists(latest_turn.assistant_audio_path) if latest_turn else False
        ),
@@ -230,11 +370,13 @@ def _detect_intent(
    normalized = transcript_text.replace(" ", "")

    if any(keyword in normalized for keyword in ("保存故事", "存起来", "保存吧", "保存到故事库")):
-        return "save_story", 0.95
+        return "save_story", 0.96
    if any(keyword in normalized for keyword in ("先到这里", "讲完了", "结束吧", "停在这里")):
-        return "end_story", 0.88
+        return "end_story", 0.90
+    if len(normalized) <= 1 or normalized in {"嗯", "啊", "呃", "额", "这个", "那个", "不知道"}:
+        return "unknown", 0.25
    if current_turn_index == 0:
-        return "start_story", 0.82
+        return "start_story", 0.84
    if any(
        keyword in normalized
        for keyword in (
@@ -248,8 +390,8 @@ def _detect_intent(
            "其实",
        )
    ):
-        return "correct_story", 0.76
-    return "continue_story", 0.68
+        return "correct_story", 0.78
+    return "continue_story", 0.74


 def _recent_story_text(session: VoiceSession) -> str:
@@ -451,6 +593,13 @@ async def _process_pending_turn(
    assistant_result: StoryOutput | None = None
    detected_intent = turn.detected_intent
    intent_confidence = turn.intent_confidence
+    confirmation_state = _resolve_turn_confirmation_state(
+        transcript_text=transcript_text,
+        transcript_confidence=turn.transcript_confidence,
+        detected_intent=detected_intent,
+        intent_confidence=intent_confidence,
+        story_patch=turn.story_patch or {},
+    )

    try:
        await _record_session_event(
@@ -466,7 +615,61 @@ async def _process_pending_turn(
            },
        )

-        if detected_intent == "save_story":
+        if confirmation_state["requires_confirmation"]:
+            current_state = _default_story_state() | (session.story_state or {})
+            assistant_text = confirmation_state["confirmation_message"]
+            turn.story_patch = {
+                **(turn.story_patch or {}),
+                "intent": detected_intent,
+                "transcript_text": transcript_text,
+                "segment_added": False,
+                "working_title": session.working_title,
+                "cover_prompt": current_state.get("cover_prompt"),
+                "narrative_segments_count": len(
+                    list(current_state.get("narrative_segments") or [])
+                ),
+                "requires_confirmation": True,
+                "confirmation_reason": confirmation_state["confirmation_reason"],
+                "confirmation_message": confirmation_state["confirmation_message"],
+                "understanding_summary": confirmation_state["understanding_summary"],
+            }
+            turn.assistant_text = assistant_text
+            turn.status = "narrative_ready"
+            session.latest_assistant_text = assistant_text
+            session.status = "waiting_user"
+            session.updated_at = _utcnow()
+            await db.commit()
+            await db.refresh(session)
+            await db.refresh(turn)
+
+            await _record_session_event(
+                db,
+                session_id=session.id,
+                turn_id=turn.id,
+                event_type="turn_confirmation_requested",
+                status="needs_confirmation",
+                message="Voice turn needs parent confirmation before the story continues.",
+                metadata={
+                    "detected_intent": detected_intent,
+                    "transcript_confidence": turn.transcript_confidence,
+                    "intent_confidence": intent_confidence,
+                    "confirmation_reason": confirmation_state["confirmation_reason"],
+                },
+            )
+            await _record_session_event(
+                db,
+                session_id=session.id,
+                turn_id=turn.id,
+                event_type="assistant_text_ready",
+                status="succeeded",
+                message="Assistant clarification prompt generated.",
+                metadata={
+                    "assistant_text_length": len(assistant_text or ""),
+                    "working_title": session.working_title,
+                    "requires_confirmation": True,
+                },
+            )
+        elif detected_intent == "save_story":
            assistant_text = "好的，这个故事已经准备好保存到故事库了。"
        elif detected_intent == "end_story":
            assistant_text = "好的，我们先把故事停在这里。想保存的话，现在就可以保存到故事库。"
@@ -479,49 +682,51 @@ async def _process_pending_turn(
            )
            assistant_text = assistant_result.story_text.strip()

-        merged_state, story_patch = _merge_story_state(
-            session,
-            transcript_text=transcript_text,
-            intent=detected_intent,
-            assistant_result=assistant_result,
-        )
-        story_patch["transcription_provider"] = (
-            (turn.story_patch or {}).get("transcription_provider")
-        )
-        turn.story_patch = story_patch
-        turn.assistant_text = assistant_text
-        turn.status = "narrative_ready"
-        session.story_state = merged_state
-        session.latest_assistant_text = assistant_text
-        session.status = "waiting_user"
-        session.updated_at = _utcnow()
-        if assistant_result and assistant_result.title and not session.working_title:
-            session.working_title = assistant_result.title
-        await db.commit()
-        await db.refresh(session)
-        await db.refresh(turn)
+        if not confirmation_state["requires_confirmation"]:
+            merged_state, story_patch = _merge_story_state(
+                session,
+                transcript_text=transcript_text,
+                intent=detected_intent,
+                assistant_result=assistant_result,
+            )
+            story_patch["transcription_provider"] = (
+                (turn.story_patch or {}).get("transcription_provider")
+            )
+            turn.story_patch = story_patch
+            turn.assistant_text = assistant_text
+            turn.status = "narrative_ready"
+            session.story_state = merged_state
+            session.latest_assistant_text = assistant_text
+            session.status = "waiting_user"
+            session.updated_at = _utcnow()
+            if assistant_result and assistant_result.title and not session.working_title:
+                session.working_title = assistant_result.title
+            await db.commit()
+            await db.refresh(session)
+            await db.refresh(turn)

-        await _record_session_event(
-            db,
-            session_id=session.id,
-            turn_id=turn.id,
-            event_type="story_patch_applied",
-            status="succeeded",
-            message="Story state updated after one turn.",
-            metadata=story_patch,
-        )
-        await _record_session_event(
-            db,
-            session_id=session.id,
-            turn_id=turn.id,
-            event_type="assistant_text_ready",
-            status="succeeded",
-            message="Assistant text response generated.",
-            metadata={
-                "assistant_text_length": len(assistant_text or ""),
-                "working_title": session.working_title,
-            },
-        )
+            await _record_session_event(
+                db,
+                session_id=session.id,
+                turn_id=turn.id,
+                event_type="story_patch_applied",
+                status="succeeded",
+                message="Story state updated after one turn.",
+                metadata=story_patch,
+            )
+            await _record_session_event(
+                db,
+                session_id=session.id,
+                turn_id=turn.id,
+                event_type="assistant_text_ready",
+                status="succeeded",
+                message="Assistant text response generated.",
+                metadata={
+                    "assistant_text_length": len(assistant_text or ""),
+                    "working_title": session.working_title,
+                    "requires_confirmation": False,
+                },
+            )
    except Exception as exc:
        turn.status = "failed"
        turn.error_message = str(exc)