feat: add voice session confirmation guardrails

This commit is contained in:
2026-04-20 12:29:14 +08:00
parent 4ecf0c09c0
commit dbb512719d
8 changed files with 406 additions and 50 deletions

View File

@@ -79,8 +79,129 @@ def _user_audio_url(session_id: str, turn_id: str, audio_path: str | None) -> st
return f"/api/voice-sessions/{session_id}/turns/{turn_id}/user-audio"
def _format_intent_label(intent: str | None) -> str:
labels = {
"start_story": "开启故事",
"continue_story": "继续讲述",
"correct_story": "修正走向",
"end_story": "先停在这里",
"save_story": "保存当前故事",
"unknown": "待确认",
}
return labels.get(intent or "", "待确认")
def _build_understanding_summary(
*,
transcript_text: str | None,
detected_intent: str,
) -> str | None:
normalized_transcript = (transcript_text or "").strip()
if detected_intent == "unknown":
if normalized_transcript:
return f"本轮系统暂时还没完全理解:{normalized_transcript}"
return "本轮系统暂时还没完全理解孩子刚才的表达。"
if normalized_transcript:
return f"本轮系统理解为「{_format_intent_label(detected_intent)}」:{normalized_transcript}"
return f"本轮系统理解为「{_format_intent_label(detected_intent)}"
def _build_confirmation_message(
*,
transcript_text: str | None,
detected_intent: str,
reasons: list[str],
) -> str:
natural_understanding = ""
normalized_transcript = (transcript_text or "").strip()
if detected_intent != "unknown":
if normalized_transcript:
natural_understanding = (
f"我现在先理解成你想「{_format_intent_label(detected_intent)}」:"
f"{normalized_transcript}"
)
else:
natural_understanding = f"我现在先理解成你想「{_format_intent_label(detected_intent)}」。"
if "intent_unknown" in reasons:
prefix = "我这一次还没有完全听懂。"
elif {
"low_transcript_confidence",
"low_intent_confidence",
}.issubset(set(reasons)):
prefix = "我这一次听得还不够清楚,也不太确定该怎么接剧情。"
elif "low_transcript_confidence" in reasons:
prefix = "我这一次可能没有完全听清。"
else:
prefix = "我这一次还不太确定你是想继续讲,还是想改一下剧情。"
return (
f"{prefix}{natural_understanding}"
"请家长帮忙确认一下;如果不对,可以换一种说法再说一遍,我们再继续编下去。"
)
def _resolve_turn_confirmation_state(
*,
transcript_text: str | None,
transcript_confidence: float | None,
detected_intent: str,
intent_confidence: float | None,
story_patch: dict[str, Any] | None = None,
) -> dict[str, Any]:
patch = story_patch or {}
requires_confirmation = patch.get("requires_confirmation")
confirmation_reason = patch.get("confirmation_reason")
confirmation_message = patch.get("confirmation_message")
understanding_summary = patch.get("understanding_summary")
reasons: list[str] = []
if detected_intent == "unknown":
reasons.append("intent_unknown")
if (
transcript_confidence is not None
and transcript_confidence < settings.voice_turn_low_transcript_confidence
):
reasons.append("low_transcript_confidence")
if (
intent_confidence is not None
and intent_confidence < settings.voice_turn_low_intent_confidence
):
reasons.append("low_intent_confidence")
if requires_confirmation is None:
requires_confirmation = bool(reasons)
if confirmation_reason is None and reasons:
confirmation_reason = ",".join(reasons)
if understanding_summary is None:
understanding_summary = _build_understanding_summary(
transcript_text=transcript_text,
detected_intent=detected_intent,
)
if confirmation_message is None and requires_confirmation:
confirmation_message = _build_confirmation_message(
transcript_text=transcript_text,
detected_intent=detected_intent,
reasons=reasons or ["intent_unknown"],
)
return {
"understanding_summary": understanding_summary,
"requires_confirmation": bool(requires_confirmation),
"confirmation_reason": confirmation_reason,
"confirmation_message": confirmation_message,
}
def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
turn_patch = turn.story_patch or {}
confirmation_state = _resolve_turn_confirmation_state(
transcript_text=turn.user_transcript,
transcript_confidence=turn.transcript_confidence,
detected_intent=turn.detected_intent,
intent_confidence=turn.intent_confidence,
story_patch=turn_patch,
)
return VoiceTurnSummaryResponse(
id=turn.id,
session_id=turn.session_id,
@@ -91,6 +212,10 @@ def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
transcription_provider=turn_patch.get("transcription_provider"),
detected_intent=turn.detected_intent,
intent_confidence=turn.intent_confidence,
understanding_summary=confirmation_state["understanding_summary"],
requires_confirmation=confirmation_state["requires_confirmation"],
confirmation_reason=confirmation_state["confirmation_reason"],
confirmation_message=confirmation_state["confirmation_message"],
assistant_text=turn.assistant_text,
assistant_audio_ready=session_audio_exists(turn.assistant_audio_path),
assistant_audio_url=_assistant_audio_url(
@@ -114,8 +239,20 @@ def _session_to_summary(
) -> VoiceSessionSummaryResponse:
if latest_turn is None:
total_turns = total_turns if total_turns is not None else session.current_turn_index
latest_confirmation_state = {
"understanding_summary": None,
"requires_confirmation": False,
"confirmation_message": None,
}
else:
total_turns = total_turns if total_turns is not None else latest_turn.turn_index
latest_confirmation_state = _resolve_turn_confirmation_state(
transcript_text=latest_turn.user_transcript,
transcript_confidence=latest_turn.transcript_confidence,
detected_intent=latest_turn.detected_intent,
intent_confidence=latest_turn.intent_confidence,
story_patch=latest_turn.story_patch or {},
)
return VoiceSessionSummaryResponse(
id=session.id,
@@ -131,6 +268,9 @@ def _session_to_summary(
latest_user_transcript=session.latest_user_transcript,
latest_assistant_text=session.latest_assistant_text,
latest_detected_intent=latest_turn.detected_intent if latest_turn else None,
latest_understanding_summary=latest_confirmation_state["understanding_summary"],
latest_requires_confirmation=latest_confirmation_state["requires_confirmation"],
latest_confirmation_message=latest_confirmation_state["confirmation_message"],
latest_assistant_audio_ready=(
session_audio_exists(latest_turn.assistant_audio_path) if latest_turn else False
),
@@ -230,11 +370,13 @@ def _detect_intent(
normalized = transcript_text.replace(" ", "")
if any(keyword in normalized for keyword in ("保存故事", "存起来", "保存吧", "保存到故事库")):
return "save_story", 0.95
return "save_story", 0.96
if any(keyword in normalized for keyword in ("先到这里", "讲完了", "结束吧", "停在这里")):
return "end_story", 0.88
return "end_story", 0.90
if len(normalized) <= 1 or normalized in {"", "", "", "", "这个", "那个", "不知道"}:
return "unknown", 0.25
if current_turn_index == 0:
return "start_story", 0.82
return "start_story", 0.84
if any(
keyword in normalized
for keyword in (
@@ -248,8 +390,8 @@ def _detect_intent(
"其实",
)
):
return "correct_story", 0.76
return "continue_story", 0.68
return "correct_story", 0.78
return "continue_story", 0.74
def _recent_story_text(session: VoiceSession) -> str:
@@ -451,6 +593,13 @@ async def _process_pending_turn(
assistant_result: StoryOutput | None = None
detected_intent = turn.detected_intent
intent_confidence = turn.intent_confidence
confirmation_state = _resolve_turn_confirmation_state(
transcript_text=transcript_text,
transcript_confidence=turn.transcript_confidence,
detected_intent=detected_intent,
intent_confidence=intent_confidence,
story_patch=turn.story_patch or {},
)
try:
await _record_session_event(
@@ -466,7 +615,61 @@ async def _process_pending_turn(
},
)
if detected_intent == "save_story":
if confirmation_state["requires_confirmation"]:
current_state = _default_story_state() | (session.story_state or {})
assistant_text = confirmation_state["confirmation_message"]
turn.story_patch = {
**(turn.story_patch or {}),
"intent": detected_intent,
"transcript_text": transcript_text,
"segment_added": False,
"working_title": session.working_title,
"cover_prompt": current_state.get("cover_prompt"),
"narrative_segments_count": len(
list(current_state.get("narrative_segments") or [])
),
"requires_confirmation": True,
"confirmation_reason": confirmation_state["confirmation_reason"],
"confirmation_message": confirmation_state["confirmation_message"],
"understanding_summary": confirmation_state["understanding_summary"],
}
turn.assistant_text = assistant_text
turn.status = "narrative_ready"
session.latest_assistant_text = assistant_text
session.status = "waiting_user"
session.updated_at = _utcnow()
await db.commit()
await db.refresh(session)
await db.refresh(turn)
await _record_session_event(
db,
session_id=session.id,
turn_id=turn.id,
event_type="turn_confirmation_requested",
status="needs_confirmation",
message="Voice turn needs parent confirmation before the story continues.",
metadata={
"detected_intent": detected_intent,
"transcript_confidence": turn.transcript_confidence,
"intent_confidence": intent_confidence,
"confirmation_reason": confirmation_state["confirmation_reason"],
},
)
await _record_session_event(
db,
session_id=session.id,
turn_id=turn.id,
event_type="assistant_text_ready",
status="succeeded",
message="Assistant clarification prompt generated.",
metadata={
"assistant_text_length": len(assistant_text or ""),
"working_title": session.working_title,
"requires_confirmation": True,
},
)
elif detected_intent == "save_story":
assistant_text = "好的,这个故事已经准备好保存到故事库了。"
elif detected_intent == "end_story":
assistant_text = "好的,我们先把故事停在这里。想保存的话,现在就可以保存到故事库。"
@@ -479,49 +682,51 @@ async def _process_pending_turn(
)
assistant_text = assistant_result.story_text.strip()
merged_state, story_patch = _merge_story_state(
session,
transcript_text=transcript_text,
intent=detected_intent,
assistant_result=assistant_result,
)
story_patch["transcription_provider"] = (
(turn.story_patch or {}).get("transcription_provider")
)
turn.story_patch = story_patch
turn.assistant_text = assistant_text
turn.status = "narrative_ready"
session.story_state = merged_state
session.latest_assistant_text = assistant_text
session.status = "waiting_user"
session.updated_at = _utcnow()
if assistant_result and assistant_result.title and not session.working_title:
session.working_title = assistant_result.title
await db.commit()
await db.refresh(session)
await db.refresh(turn)
if not confirmation_state["requires_confirmation"]:
merged_state, story_patch = _merge_story_state(
session,
transcript_text=transcript_text,
intent=detected_intent,
assistant_result=assistant_result,
)
story_patch["transcription_provider"] = (
(turn.story_patch or {}).get("transcription_provider")
)
turn.story_patch = story_patch
turn.assistant_text = assistant_text
turn.status = "narrative_ready"
session.story_state = merged_state
session.latest_assistant_text = assistant_text
session.status = "waiting_user"
session.updated_at = _utcnow()
if assistant_result and assistant_result.title and not session.working_title:
session.working_title = assistant_result.title
await db.commit()
await db.refresh(session)
await db.refresh(turn)
await _record_session_event(
db,
session_id=session.id,
turn_id=turn.id,
event_type="story_patch_applied",
status="succeeded",
message="Story state updated after one turn.",
metadata=story_patch,
)
await _record_session_event(
db,
session_id=session.id,
turn_id=turn.id,
event_type="assistant_text_ready",
status="succeeded",
message="Assistant text response generated.",
metadata={
"assistant_text_length": len(assistant_text or ""),
"working_title": session.working_title,
},
)
await _record_session_event(
db,
session_id=session.id,
turn_id=turn.id,
event_type="story_patch_applied",
status="succeeded",
message="Story state updated after one turn.",
metadata=story_patch,
)
await _record_session_event(
db,
session_id=session.id,
turn_id=turn.id,
event_type="assistant_text_ready",
status="succeeded",
message="Assistant text response generated.",
metadata={
"assistant_text_length": len(assistant_text or ""),
"working_title": session.working_title,
"requires_confirmation": False,
},
)
except Exception as exc:
turn.status = "failed"
turn.error_message = str(exc)