feat: add voice session confirmation guardrails

This commit is contained in:
2026-04-20 12:29:14 +08:00
parent 4ecf0c09c0
commit dbb512719d
8 changed files with 406 additions and 50 deletions

View File

@@ -82,6 +82,14 @@ class Settings(BaseSettings):
"zh",
description="Preferred language hint for voice transcription",
)
voice_turn_low_transcript_confidence: float = Field(
0.65,
description="Prompt for confirmation when transcript confidence falls below this threshold",
)
voice_turn_low_intent_confidence: float = Field(
0.70,
description="Prompt for confirmation when intent confidence falls below this threshold",
)
voice_turn_max_upload_bytes: int = Field(
5 * 1024 * 1024,
description="Maximum accepted upload size in bytes for one voice turn audio file",

View File

@@ -73,6 +73,10 @@ class VoiceTurnSummaryResponse(BaseModel):
transcription_provider: str | None = None
detected_intent: str
intent_confidence: float | None = None
understanding_summary: str | None = None
requires_confirmation: bool = False
confirmation_reason: str | None = None
confirmation_message: str | None = None
assistant_text: str | None = None
assistant_audio_ready: bool = False
assistant_audio_url: str | None = None
@@ -99,6 +103,9 @@ class VoiceSessionSummaryResponse(BaseModel):
latest_user_transcript: str | None = None
latest_assistant_text: str | None = None
latest_detected_intent: str | None = None
latest_understanding_summary: str | None = None
latest_requires_confirmation: bool = False
latest_confirmation_message: str | None = None
latest_assistant_audio_ready: bool = False
last_turn_status: str | None = None
transcription_mode_hint: str | None = None

View File

@@ -79,8 +79,129 @@ def _user_audio_url(session_id: str, turn_id: str, audio_path: str | None) -> st
return f"/api/voice-sessions/{session_id}/turns/{turn_id}/user-audio"
def _format_intent_label(intent: str | None) -> str:
labels = {
"start_story": "开启故事",
"continue_story": "继续讲述",
"correct_story": "修正走向",
"end_story": "先停在这里",
"save_story": "保存当前故事",
"unknown": "待确认",
}
return labels.get(intent or "", "待确认")
def _build_understanding_summary(
*,
transcript_text: str | None,
detected_intent: str,
) -> str | None:
normalized_transcript = (transcript_text or "").strip()
if detected_intent == "unknown":
if normalized_transcript:
return f"本轮系统暂时还没完全理解:{normalized_transcript}"
return "本轮系统暂时还没完全理解孩子刚才的表达。"
if normalized_transcript:
return f"本轮系统理解为「{_format_intent_label(detected_intent)}」:{normalized_transcript}"
return f"本轮系统理解为「{_format_intent_label(detected_intent)}"
def _build_confirmation_message(
*,
transcript_text: str | None,
detected_intent: str,
reasons: list[str],
) -> str:
natural_understanding = ""
normalized_transcript = (transcript_text or "").strip()
if detected_intent != "unknown":
if normalized_transcript:
natural_understanding = (
f"我现在先理解成你想「{_format_intent_label(detected_intent)}」:"
f"{normalized_transcript}"
)
else:
natural_understanding = f"我现在先理解成你想「{_format_intent_label(detected_intent)}」。"
if "intent_unknown" in reasons:
prefix = "我这一次还没有完全听懂。"
elif {
"low_transcript_confidence",
"low_intent_confidence",
}.issubset(set(reasons)):
prefix = "我这一次听得还不够清楚,也不太确定该怎么接剧情。"
elif "low_transcript_confidence" in reasons:
prefix = "我这一次可能没有完全听清。"
else:
prefix = "我这一次还不太确定你是想继续讲,还是想改一下剧情。"
return (
f"{prefix}{natural_understanding}"
"请家长帮忙确认一下;如果不对,可以换一种说法再说一遍,我们再继续编下去。"
)
def _resolve_turn_confirmation_state(
*,
transcript_text: str | None,
transcript_confidence: float | None,
detected_intent: str,
intent_confidence: float | None,
story_patch: dict[str, Any] | None = None,
) -> dict[str, Any]:
patch = story_patch or {}
requires_confirmation = patch.get("requires_confirmation")
confirmation_reason = patch.get("confirmation_reason")
confirmation_message = patch.get("confirmation_message")
understanding_summary = patch.get("understanding_summary")
reasons: list[str] = []
if detected_intent == "unknown":
reasons.append("intent_unknown")
if (
transcript_confidence is not None
and transcript_confidence < settings.voice_turn_low_transcript_confidence
):
reasons.append("low_transcript_confidence")
if (
intent_confidence is not None
and intent_confidence < settings.voice_turn_low_intent_confidence
):
reasons.append("low_intent_confidence")
if requires_confirmation is None:
requires_confirmation = bool(reasons)
if confirmation_reason is None and reasons:
confirmation_reason = ",".join(reasons)
if understanding_summary is None:
understanding_summary = _build_understanding_summary(
transcript_text=transcript_text,
detected_intent=detected_intent,
)
if confirmation_message is None and requires_confirmation:
confirmation_message = _build_confirmation_message(
transcript_text=transcript_text,
detected_intent=detected_intent,
reasons=reasons or ["intent_unknown"],
)
return {
"understanding_summary": understanding_summary,
"requires_confirmation": bool(requires_confirmation),
"confirmation_reason": confirmation_reason,
"confirmation_message": confirmation_message,
}
def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
turn_patch = turn.story_patch or {}
confirmation_state = _resolve_turn_confirmation_state(
transcript_text=turn.user_transcript,
transcript_confidence=turn.transcript_confidence,
detected_intent=turn.detected_intent,
intent_confidence=turn.intent_confidence,
story_patch=turn_patch,
)
return VoiceTurnSummaryResponse(
id=turn.id,
session_id=turn.session_id,
@@ -91,6 +212,10 @@ def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
transcription_provider=turn_patch.get("transcription_provider"),
detected_intent=turn.detected_intent,
intent_confidence=turn.intent_confidence,
understanding_summary=confirmation_state["understanding_summary"],
requires_confirmation=confirmation_state["requires_confirmation"],
confirmation_reason=confirmation_state["confirmation_reason"],
confirmation_message=confirmation_state["confirmation_message"],
assistant_text=turn.assistant_text,
assistant_audio_ready=session_audio_exists(turn.assistant_audio_path),
assistant_audio_url=_assistant_audio_url(
@@ -114,8 +239,20 @@ def _session_to_summary(
) -> VoiceSessionSummaryResponse:
if latest_turn is None:
total_turns = total_turns if total_turns is not None else session.current_turn_index
latest_confirmation_state = {
"understanding_summary": None,
"requires_confirmation": False,
"confirmation_message": None,
}
else:
total_turns = total_turns if total_turns is not None else latest_turn.turn_index
latest_confirmation_state = _resolve_turn_confirmation_state(
transcript_text=latest_turn.user_transcript,
transcript_confidence=latest_turn.transcript_confidence,
detected_intent=latest_turn.detected_intent,
intent_confidence=latest_turn.intent_confidence,
story_patch=latest_turn.story_patch or {},
)
return VoiceSessionSummaryResponse(
id=session.id,
@@ -131,6 +268,9 @@ def _session_to_summary(
latest_user_transcript=session.latest_user_transcript,
latest_assistant_text=session.latest_assistant_text,
latest_detected_intent=latest_turn.detected_intent if latest_turn else None,
latest_understanding_summary=latest_confirmation_state["understanding_summary"],
latest_requires_confirmation=latest_confirmation_state["requires_confirmation"],
latest_confirmation_message=latest_confirmation_state["confirmation_message"],
latest_assistant_audio_ready=(
session_audio_exists(latest_turn.assistant_audio_path) if latest_turn else False
),
@@ -230,11 +370,13 @@ def _detect_intent(
normalized = transcript_text.replace(" ", "")
if any(keyword in normalized for keyword in ("保存故事", "存起来", "保存吧", "保存到故事库")):
return "save_story", 0.95
return "save_story", 0.96
if any(keyword in normalized for keyword in ("先到这里", "讲完了", "结束吧", "停在这里")):
return "end_story", 0.88
return "end_story", 0.90
if len(normalized) <= 1 or normalized in {"", "", "", "", "这个", "那个", "不知道"}:
return "unknown", 0.25
if current_turn_index == 0:
return "start_story", 0.82
return "start_story", 0.84
if any(
keyword in normalized
for keyword in (
@@ -248,8 +390,8 @@ def _detect_intent(
"其实",
)
):
return "correct_story", 0.76
return "continue_story", 0.68
return "correct_story", 0.78
return "continue_story", 0.74
def _recent_story_text(session: VoiceSession) -> str:
@@ -451,6 +593,13 @@ async def _process_pending_turn(
assistant_result: StoryOutput | None = None
detected_intent = turn.detected_intent
intent_confidence = turn.intent_confidence
confirmation_state = _resolve_turn_confirmation_state(
transcript_text=transcript_text,
transcript_confidence=turn.transcript_confidence,
detected_intent=detected_intent,
intent_confidence=intent_confidence,
story_patch=turn.story_patch or {},
)
try:
await _record_session_event(
@@ -466,7 +615,61 @@ async def _process_pending_turn(
},
)
if detected_intent == "save_story":
if confirmation_state["requires_confirmation"]:
current_state = _default_story_state() | (session.story_state or {})
assistant_text = confirmation_state["confirmation_message"]
turn.story_patch = {
**(turn.story_patch or {}),
"intent": detected_intent,
"transcript_text": transcript_text,
"segment_added": False,
"working_title": session.working_title,
"cover_prompt": current_state.get("cover_prompt"),
"narrative_segments_count": len(
list(current_state.get("narrative_segments") or [])
),
"requires_confirmation": True,
"confirmation_reason": confirmation_state["confirmation_reason"],
"confirmation_message": confirmation_state["confirmation_message"],
"understanding_summary": confirmation_state["understanding_summary"],
}
turn.assistant_text = assistant_text
turn.status = "narrative_ready"
session.latest_assistant_text = assistant_text
session.status = "waiting_user"
session.updated_at = _utcnow()
await db.commit()
await db.refresh(session)
await db.refresh(turn)
await _record_session_event(
db,
session_id=session.id,
turn_id=turn.id,
event_type="turn_confirmation_requested",
status="needs_confirmation",
message="Voice turn needs parent confirmation before the story continues.",
metadata={
"detected_intent": detected_intent,
"transcript_confidence": turn.transcript_confidence,
"intent_confidence": intent_confidence,
"confirmation_reason": confirmation_state["confirmation_reason"],
},
)
await _record_session_event(
db,
session_id=session.id,
turn_id=turn.id,
event_type="assistant_text_ready",
status="succeeded",
message="Assistant clarification prompt generated.",
metadata={
"assistant_text_length": len(assistant_text or ""),
"working_title": session.working_title,
"requires_confirmation": True,
},
)
elif detected_intent == "save_story":
assistant_text = "好的,这个故事已经准备好保存到故事库了。"
elif detected_intent == "end_story":
assistant_text = "好的,我们先把故事停在这里。想保存的话,现在就可以保存到故事库。"
@@ -479,6 +682,7 @@ async def _process_pending_turn(
)
assistant_text = assistant_result.story_text.strip()
if not confirmation_state["requires_confirmation"]:
merged_state, story_patch = _merge_story_state(
session,
transcript_text=transcript_text,
@@ -520,6 +724,7 @@ async def _process_pending_turn(
metadata={
"assistant_text_length": len(assistant_text or ""),
"working_title": session.working_title,
"requires_confirmation": False,
},
)
except Exception as exc:

View File

@@ -6,6 +6,7 @@ from app.core.config import settings
from app.db.database import get_db
from app.main import app
from app.services.adapters.text.models import StoryOutput
from app.services.voice_transcription_service import VoiceTranscriptionResult
async def test_voice_session_create_and_fallback_turn_returns_audio(
@@ -272,6 +273,82 @@ async def test_voice_session_uploaded_audio_turn_uses_demo_transcript_hint(
app.dependency_overrides.clear()
async def test_voice_session_low_confidence_turn_requests_confirmation(
    db_session,
    auth_token,
):
    """An uploaded turn with a low transcript confidence asks for confirmation.

    The transcription call is patched to report a confidence of 0.41. The
    test then checks that the turn and the session both expose the
    confirmation fields, that no narrative segment was stored, and that the
    story generator was never awaited.
    """

    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db

    with (
        patch(
            "app.services.voice_session_service.generate_story_content",
            new_callable=AsyncMock,
        ) as mock_generate,
        patch(
            "app.services.voice_session_service.text_to_speech",
            new_callable=AsyncMock,
        ) as mock_tts,
        patch(
            "app.services.voice_session_service.transcribe_voice_audio",
            new_callable=AsyncMock,
        ) as mock_transcribe,
    ):
        mock_tts.return_value = b"confirmation-audio"
        # Readable transcript but a low confidence score, so any confirmation
        # here is driven by confidence rather than by missing text.
        mock_transcribe.return_value = VoiceTranscriptionResult(
            transcript_text="我想听一个会发光的小恐龙故事",
            confidence=0.41,
            provider="openai",
        )

        transport = ASGITransport(app=app)
        try:
            async with AsyncClient(transport=transport, base_url="http://test") as client:
                client.cookies.set("access_token", auth_token)

                # Create a fresh voice session.
                response = await client.post("/api/voice-sessions", json={})
                assert response.status_code == 201
                session_id = response.json()["id"]

                # Upload one audio turn; the endpoint answers 202 with a turn id.
                response = await client.post(
                    f"/api/voice-sessions/{session_id}/turns",
                    files={
                        "audio_file": ("turn.webm", b"fake-webm-audio", "audio/webm"),
                    },
                )
                assert response.status_code == 202
                turn_id = response.json()["turn_id"]

                # Turn-level view: confirmation flags are set and the assistant
                # text is exactly the confirmation prompt.
                response = await client.get(
                    f"/api/voice-sessions/{session_id}/turns/{turn_id}"
                )
                assert response.status_code == 200
                turn_data = response.json()
                assert turn_data["status"] == "audio_ready"
                assert turn_data["requires_confirmation"] is True
                assert turn_data["understanding_summary"].startswith("本轮系统理解为")
                assert "请家长帮忙确认" in turn_data["confirmation_message"]
                assert turn_data["assistant_text"] == turn_data["confirmation_message"]

                # Session-level view: the latest-turn confirmation fields mirror
                # the turn, and nothing was appended to the story.
                response = await client.get(f"/api/voice-sessions/{session_id}")
                assert response.status_code == 200
                session_data = response.json()
                assert session_data["latest_requires_confirmation"] is True
                assert "请家长帮忙确认" in session_data["latest_confirmation_message"]
                assert session_data["can_finalize"] is False
                assert session_data["story_state"]["narrative_segments"] == []
                assert any(
                    event["event_type"] == "turn_confirmation_requested"
                    for event in session_data["events"]
                )
                # The story generator must not have been called for this turn.
                mock_generate.assert_not_awaited()
        finally:
            # Always drop the DB override so later tests get the real dependency.
            app.dependency_overrides.clear()
async def test_voice_session_list_orders_recent_sessions_first(
db_session,
auth_token,

View File

@@ -8,7 +8,7 @@
## 1. 目的
这份文档是 [语音共创 Phase A 技术方案](/Users/zt/Code/dreamweaver/docs/technical/voice-co-creation-phase-a-tech-spec.md) 的下一层实现草案。
这份文档是 [语音共创 Phase A 技术方案](./voice-co-creation-phase-a-tech-spec.md) 的下一层实现草案。
它的目标很明确:

View File

@@ -8,7 +8,23 @@
## 1. 目标
这份技术方案用于把 [语音共创模式增量 PRD](/Users/zt/Code/dreamweaver/docs/product/voice-co-creation-mode-incremental-prd.md) 收敛成一个可实现的 Phase A MVP。
这份技术方案用于把 [语音共创模式增量 PRD](../product/voice-co-creation-mode-incremental-prd.md) 收敛成一个可实现的 Phase A MVP。
## 0. 当前实现快照(2026-04-20)
远端 `main` 已经跑通以下 Phase A 主链路:
- 独立 Voice Studio 入口页与最近会话恢复
- `voice_sessions / voice_turns / voice_session_events` 数据模型与 API
- 文本 fallback 回合、录音上传回合、turn 轮询结果查询
- turn 级语音补发、失败重试、会话 abandon、finalize -> Story 持久化
- 会话事件记录与最近 turn / 最近事件展示
本轮新增收束:
- 当 `transcript_confidence` 或 `intent_confidence` 偏低时,后端优先返回确认提示,而不是直接把这一轮写进故事正文
- 前端明确展示“本轮系统理解为”与“建议家长确认后再继续”提示
- 低置信度确认链路已有后端测试覆盖,可作为下一阶段继续接 ASR 与更细确认交互的基础
Phase A 的核心目标不是做“完全实时的语音陪伴”,而是验证以下最小闭环:

View File

@@ -8,6 +8,10 @@ export interface VoiceTurnSummary {
transcription_provider: string | null
detected_intent: string
intent_confidence: number | null
understanding_summary: string | null
requires_confirmation: boolean
confirmation_reason: string | null
confirmation_message: string | null
assistant_text: string | null
assistant_audio_ready: boolean
assistant_audio_url: string | null
@@ -43,6 +47,9 @@ export interface VoiceSessionSummary {
latest_user_transcript: string | null
latest_assistant_text: string | null
latest_detected_intent: string | null
latest_understanding_summary: string | null
latest_requires_confirmation: boolean
latest_confirmation_message: string | null
latest_assistant_audio_ready: boolean
last_turn_status: string | null
transcription_mode_hint: string | null

View File

@@ -162,6 +162,13 @@ function formatDate(dateStr: string) {
})
}
/**
 * Format a confidence value as a rounded percentage string (value × 100).
 * Anything that is not a number (null / undefined) is shown as 'n/a'.
 */
function formatConfidence(value: number | null | undefined) {
  return typeof value === 'number' ? `${Math.round(value * 100)}%` : 'n/a'
}
function revokeRecordedAudioUrl() {
if (recordedAudioUrl.value) {
URL.revokeObjectURL(recordedAudioUrl.value)
@@ -718,6 +725,19 @@ onBeforeUnmount(() => {
<div class="mt-6 grid grid-cols-1 gap-6 xl:grid-cols-[minmax(0,1.2fr)_minmax(0,0.8fr)]">
<div class="space-y-6">
<div
v-if="activeSession.latest_requires_confirmation"
class="rounded-2xl border border-amber-200 bg-amber-50 p-4 text-amber-800"
>
<div class="text-sm font-semibold">建议先确认这一轮理解</div>
<p class="mt-2 text-sm">
{{ activeSession.latest_confirmation_message || '系统对这一轮的理解还不够确定,建议家长先确认后再继续。' }}
</p>
<p v-if="activeSession.latest_understanding_summary" class="mt-2 text-xs text-amber-700">
{{ activeSession.latest_understanding_summary }}
</p>
</div>
<div class="rounded-2xl border border-gray-100 bg-white p-4">
<div class="flex items-center justify-between">
<h3 class="font-semibold text-gray-900">文本共创回合</h3>
@@ -839,6 +859,22 @@ onBeforeUnmount(() => {
<span class="font-medium text-gray-900">孩子</span>
{{ turn.user_transcript || '暂无转写内容' }}
</div>
<div v-if="turn.understanding_summary" class="mt-3 text-sm text-slate-600">
{{ turn.understanding_summary }}
</div>
<div
v-if="turn.requires_confirmation"
class="mt-3 rounded-2xl border border-amber-200 bg-amber-50 px-3 py-3 text-sm text-amber-800"
>
<div class="font-medium">建议家长确认后再继续</div>
<p class="mt-1">
{{ turn.confirmation_message || '系统对这一轮的理解还不够确定,建议换一种说法再试一次。' }}
</p>
<p class="mt-2 text-xs text-amber-700">
转写置信度{{ formatConfidence(turn.transcript_confidence) }} ·
意图置信度{{ formatConfidence(turn.intent_confidence) }}
</p>
</div>
<div v-if="turn.user_audio_url" class="mt-3">
<audio class="w-full" :src="turn.user_audio_url" controls></audio>
</div>