diff --git a/backend/app/core/config.py b/backend/app/core/config.py index b78b7ce..a8cd562 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -82,6 +82,14 @@ class Settings(BaseSettings): "zh", description="Preferred language hint for voice transcription", ) + voice_turn_low_transcript_confidence: float = Field( + 0.65, + description="Prompt for confirmation when transcript confidence falls below this threshold", + ) + voice_turn_low_intent_confidence: float = Field( + 0.70, + description="Prompt for confirmation when intent confidence falls below this threshold", + ) voice_turn_max_upload_bytes: int = Field( 5 * 1024 * 1024, description="Maximum accepted upload size in bytes for one voice turn audio file", diff --git a/backend/app/schemas/voice_session_schemas.py b/backend/app/schemas/voice_session_schemas.py index 78434d4..54e4ef3 100644 --- a/backend/app/schemas/voice_session_schemas.py +++ b/backend/app/schemas/voice_session_schemas.py @@ -73,6 +73,10 @@ class VoiceTurnSummaryResponse(BaseModel): transcription_provider: str | None = None detected_intent: str intent_confidence: float | None = None + understanding_summary: str | None = None + requires_confirmation: bool = False + confirmation_reason: str | None = None + confirmation_message: str | None = None assistant_text: str | None = None assistant_audio_ready: bool = False assistant_audio_url: str | None = None @@ -99,6 +103,9 @@ class VoiceSessionSummaryResponse(BaseModel): latest_user_transcript: str | None = None latest_assistant_text: str | None = None latest_detected_intent: str | None = None + latest_understanding_summary: str | None = None + latest_requires_confirmation: bool = False + latest_confirmation_message: str | None = None latest_assistant_audio_ready: bool = False last_turn_status: str | None = None transcription_mode_hint: str | None = None diff --git a/backend/app/services/voice_session_service.py b/backend/app/services/voice_session_service.py index 
151a5db..14ad513 100644 --- a/backend/app/services/voice_session_service.py +++ b/backend/app/services/voice_session_service.py @@ -79,8 +79,129 @@ def _user_audio_url(session_id: str, turn_id: str, audio_path: str | None) -> st return f"/api/voice-sessions/{session_id}/turns/{turn_id}/user-audio" +def _format_intent_label(intent: str | None) -> str: + labels = { + "start_story": "开启故事", + "continue_story": "继续讲述", + "correct_story": "修正走向", + "end_story": "先停在这里", + "save_story": "保存当前故事", + "unknown": "待确认", + } + return labels.get(intent or "", "待确认") + + +def _build_understanding_summary( + *, + transcript_text: str | None, + detected_intent: str, +) -> str | None: + normalized_transcript = (transcript_text or "").strip() + if detected_intent == "unknown": + if normalized_transcript: + return f"本轮系统暂时还没完全理解:{normalized_transcript}" + return "本轮系统暂时还没完全理解孩子刚才的表达。" + if normalized_transcript: + return f"本轮系统理解为「{_format_intent_label(detected_intent)}」:{normalized_transcript}" + return f"本轮系统理解为「{_format_intent_label(detected_intent)}」" + + +def _build_confirmation_message( + *, + transcript_text: str | None, + detected_intent: str, + reasons: list[str], +) -> str: + natural_understanding = "" + normalized_transcript = (transcript_text or "").strip() + if detected_intent != "unknown": + if normalized_transcript: + natural_understanding = ( + f"我现在先理解成你想「{_format_intent_label(detected_intent)}」:" + f"{normalized_transcript}。" + ) + else: + natural_understanding = f"我现在先理解成你想「{_format_intent_label(detected_intent)}」。" + + if "intent_unknown" in reasons: + prefix = "我这一次还没有完全听懂。" + elif { + "low_transcript_confidence", + "low_intent_confidence", + }.issubset(set(reasons)): + prefix = "我这一次听得还不够清楚,也不太确定该怎么接剧情。" + elif "low_transcript_confidence" in reasons: + prefix = "我这一次可能没有完全听清。" + else: + prefix = "我这一次还不太确定你是想继续讲,还是想改一下剧情。" + + return ( + f"{prefix}{natural_understanding}" + "请家长帮忙确认一下;如果不对,可以换一种说法再说一遍,我们再继续编下去。" + ) + + +def _resolve_turn_confirmation_state( + 
*, + transcript_text: str | None, + transcript_confidence: float | None, + detected_intent: str, + intent_confidence: float | None, + story_patch: dict[str, Any] | None = None, +) -> dict[str, Any]: + patch = story_patch or {} + requires_confirmation = patch.get("requires_confirmation") + confirmation_reason = patch.get("confirmation_reason") + confirmation_message = patch.get("confirmation_message") + understanding_summary = patch.get("understanding_summary") + + reasons: list[str] = [] + if detected_intent == "unknown": + reasons.append("intent_unknown") + if ( + transcript_confidence is not None + and transcript_confidence < settings.voice_turn_low_transcript_confidence + ): + reasons.append("low_transcript_confidence") + if ( + intent_confidence is not None + and intent_confidence < settings.voice_turn_low_intent_confidence + ): + reasons.append("low_intent_confidence") + + if requires_confirmation is None: + requires_confirmation = bool(reasons) + if confirmation_reason is None and reasons: + confirmation_reason = ",".join(reasons) + if understanding_summary is None: + understanding_summary = _build_understanding_summary( + transcript_text=transcript_text, + detected_intent=detected_intent, + ) + if confirmation_message is None and requires_confirmation: + confirmation_message = _build_confirmation_message( + transcript_text=transcript_text, + detected_intent=detected_intent, + reasons=reasons or ["intent_unknown"], + ) + + return { + "understanding_summary": understanding_summary, + "requires_confirmation": bool(requires_confirmation), + "confirmation_reason": confirmation_reason, + "confirmation_message": confirmation_message, + } + + def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse: turn_patch = turn.story_patch or {} + confirmation_state = _resolve_turn_confirmation_state( + transcript_text=turn.user_transcript, + transcript_confidence=turn.transcript_confidence, + detected_intent=turn.detected_intent, + 
intent_confidence=turn.intent_confidence, + story_patch=turn_patch, + ) return VoiceTurnSummaryResponse( id=turn.id, session_id=turn.session_id, @@ -91,6 +212,10 @@ def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse: transcription_provider=turn_patch.get("transcription_provider"), detected_intent=turn.detected_intent, intent_confidence=turn.intent_confidence, + understanding_summary=confirmation_state["understanding_summary"], + requires_confirmation=confirmation_state["requires_confirmation"], + confirmation_reason=confirmation_state["confirmation_reason"], + confirmation_message=confirmation_state["confirmation_message"], assistant_text=turn.assistant_text, assistant_audio_ready=session_audio_exists(turn.assistant_audio_path), assistant_audio_url=_assistant_audio_url( @@ -114,8 +239,20 @@ def _session_to_summary( ) -> VoiceSessionSummaryResponse: if latest_turn is None: total_turns = total_turns if total_turns is not None else session.current_turn_index + latest_confirmation_state = { + "understanding_summary": None, + "requires_confirmation": False, + "confirmation_message": None, + } else: total_turns = total_turns if total_turns is not None else latest_turn.turn_index + latest_confirmation_state = _resolve_turn_confirmation_state( + transcript_text=latest_turn.user_transcript, + transcript_confidence=latest_turn.transcript_confidence, + detected_intent=latest_turn.detected_intent, + intent_confidence=latest_turn.intent_confidence, + story_patch=latest_turn.story_patch or {}, + ) return VoiceSessionSummaryResponse( id=session.id, @@ -131,6 +268,9 @@ def _session_to_summary( latest_user_transcript=session.latest_user_transcript, latest_assistant_text=session.latest_assistant_text, latest_detected_intent=latest_turn.detected_intent if latest_turn else None, + latest_understanding_summary=latest_confirmation_state["understanding_summary"], + latest_requires_confirmation=latest_confirmation_state["requires_confirmation"], + 
latest_confirmation_message=latest_confirmation_state["confirmation_message"], latest_assistant_audio_ready=( session_audio_exists(latest_turn.assistant_audio_path) if latest_turn else False ), @@ -230,11 +370,13 @@ def _detect_intent( normalized = transcript_text.replace(" ", "") if any(keyword in normalized for keyword in ("保存故事", "存起来", "保存吧", "保存到故事库")): - return "save_story", 0.95 + return "save_story", 0.96 if any(keyword in normalized for keyword in ("先到这里", "讲完了", "结束吧", "停在这里")): - return "end_story", 0.88 + return "end_story", 0.90 + if len(normalized) <= 1 or normalized in {"嗯", "啊", "呃", "额", "这个", "那个", "不知道"}: + return "unknown", 0.25 if current_turn_index == 0: - return "start_story", 0.82 + return "start_story", 0.84 if any( keyword in normalized for keyword in ( @@ -248,8 +390,8 @@ def _detect_intent( "其实", ) ): - return "correct_story", 0.76 - return "continue_story", 0.68 + return "correct_story", 0.78 + return "continue_story", 0.74 def _recent_story_text(session: VoiceSession) -> str: @@ -451,6 +593,13 @@ async def _process_pending_turn( assistant_result: StoryOutput | None = None detected_intent = turn.detected_intent intent_confidence = turn.intent_confidence + confirmation_state = _resolve_turn_confirmation_state( + transcript_text=transcript_text, + transcript_confidence=turn.transcript_confidence, + detected_intent=detected_intent, + intent_confidence=intent_confidence, + story_patch=turn.story_patch or {}, + ) try: await _record_session_event( @@ -466,7 +615,61 @@ async def _process_pending_turn( }, ) - if detected_intent == "save_story": + if confirmation_state["requires_confirmation"]: + current_state = _default_story_state() | (session.story_state or {}) + assistant_text = confirmation_state["confirmation_message"] + turn.story_patch = { + **(turn.story_patch or {}), + "intent": detected_intent, + "transcript_text": transcript_text, + "segment_added": False, + "working_title": session.working_title, + "cover_prompt": 
current_state.get("cover_prompt"), + "narrative_segments_count": len( + list(current_state.get("narrative_segments") or []) + ), + "requires_confirmation": True, + "confirmation_reason": confirmation_state["confirmation_reason"], + "confirmation_message": confirmation_state["confirmation_message"], + "understanding_summary": confirmation_state["understanding_summary"], + } + turn.assistant_text = assistant_text + turn.status = "narrative_ready" + session.latest_assistant_text = assistant_text + session.status = "waiting_user" + session.updated_at = _utcnow() + await db.commit() + await db.refresh(session) + await db.refresh(turn) + + await _record_session_event( + db, + session_id=session.id, + turn_id=turn.id, + event_type="turn_confirmation_requested", + status="needs_confirmation", + message="Voice turn needs parent confirmation before the story continues.", + metadata={ + "detected_intent": detected_intent, + "transcript_confidence": turn.transcript_confidence, + "intent_confidence": intent_confidence, + "confirmation_reason": confirmation_state["confirmation_reason"], + }, + ) + await _record_session_event( + db, + session_id=session.id, + turn_id=turn.id, + event_type="assistant_text_ready", + status="succeeded", + message="Assistant clarification prompt generated.", + metadata={ + "assistant_text_length": len(assistant_text or ""), + "working_title": session.working_title, + "requires_confirmation": True, + }, + ) + elif detected_intent == "save_story": assistant_text = "好的,这个故事已经准备好保存到故事库了。" elif detected_intent == "end_story": assistant_text = "好的,我们先把故事停在这里。想保存的话,现在就可以保存到故事库。" @@ -479,49 +682,51 @@ async def _process_pending_turn( ) assistant_text = assistant_result.story_text.strip() - merged_state, story_patch = _merge_story_state( - session, - transcript_text=transcript_text, - intent=detected_intent, - assistant_result=assistant_result, - ) - story_patch["transcription_provider"] = ( - (turn.story_patch or {}).get("transcription_provider") - ) - 
turn.story_patch = story_patch - turn.assistant_text = assistant_text - turn.status = "narrative_ready" - session.story_state = merged_state - session.latest_assistant_text = assistant_text - session.status = "waiting_user" - session.updated_at = _utcnow() - if assistant_result and assistant_result.title and not session.working_title: - session.working_title = assistant_result.title - await db.commit() - await db.refresh(session) - await db.refresh(turn) + if not confirmation_state["requires_confirmation"]: + merged_state, story_patch = _merge_story_state( + session, + transcript_text=transcript_text, + intent=detected_intent, + assistant_result=assistant_result, + ) + story_patch["transcription_provider"] = ( + (turn.story_patch or {}).get("transcription_provider") + ) + turn.story_patch = story_patch + turn.assistant_text = assistant_text + turn.status = "narrative_ready" + session.story_state = merged_state + session.latest_assistant_text = assistant_text + session.status = "waiting_user" + session.updated_at = _utcnow() + if assistant_result and assistant_result.title and not session.working_title: + session.working_title = assistant_result.title + await db.commit() + await db.refresh(session) + await db.refresh(turn) - await _record_session_event( - db, - session_id=session.id, - turn_id=turn.id, - event_type="story_patch_applied", - status="succeeded", - message="Story state updated after one turn.", - metadata=story_patch, - ) - await _record_session_event( - db, - session_id=session.id, - turn_id=turn.id, - event_type="assistant_text_ready", - status="succeeded", - message="Assistant text response generated.", - metadata={ - "assistant_text_length": len(assistant_text or ""), - "working_title": session.working_title, - }, - ) + await _record_session_event( + db, + session_id=session.id, + turn_id=turn.id, + event_type="story_patch_applied", + status="succeeded", + message="Story state updated after one turn.", + metadata=story_patch, + ) + await 
async def test_voice_session_low_confidence_turn_requests_confirmation(
    db_session,
    auth_token,
):
    """A low-confidence transcript yields a confirmation prompt, not a story segment.

    Transcription is mocked to return confidence 0.41. The turn must come back
    flagged for confirmation with the prompt as its assistant text, the session
    summary must mirror that, and story generation must never be awaited.
    """

    async def _provide_test_db():
        yield db_session

    app.dependency_overrides[get_db] = _provide_test_db

    with (
        patch(
            "app.services.voice_session_service.generate_story_content",
            new_callable=AsyncMock,
        ) as story_generation_mock,
        patch(
            "app.services.voice_session_service.text_to_speech",
            new_callable=AsyncMock,
        ) as speech_mock,
        patch(
            "app.services.voice_session_service.transcribe_voice_audio",
            new_callable=AsyncMock,
        ) as transcription_mock,
    ):
        speech_mock.return_value = b"confirmation-audio"
        transcription_mock.return_value = VoiceTranscriptionResult(
            transcript_text="我想听一个会发光的小恐龙故事",
            confidence=0.41,
            provider="openai",
        )

        asgi_transport = ASGITransport(app=app)
        try:
            async with AsyncClient(
                transport=asgi_transport, base_url="http://test"
            ) as http:
                http.cookies.set("access_token", auth_token)

                # Create the session, then upload one audio turn into it.
                created = await http.post("/api/voice-sessions", json={})
                assert created.status_code == 201
                session_id = created.json()["id"]

                uploaded = await http.post(
                    f"/api/voice-sessions/{session_id}/turns",
                    files={
                        "audio_file": ("turn.webm", b"fake-webm-audio", "audio/webm"),
                    },
                )
                assert uploaded.status_code == 202
                turn_id = uploaded.json()["turn_id"]

                # Turn-level view: flagged, summarized, prompt used as the reply.
                turn_response = await http.get(
                    f"/api/voice-sessions/{session_id}/turns/{turn_id}"
                )
                assert turn_response.status_code == 200
                turn_payload = turn_response.json()
                assert turn_payload["status"] == "audio_ready"
                assert turn_payload["requires_confirmation"] is True
                assert turn_payload["understanding_summary"].startswith("本轮系统理解为")
                assert "请家长帮忙确认" in turn_payload["confirmation_message"]
                assert (
                    turn_payload["assistant_text"]
                    == turn_payload["confirmation_message"]
                )

                # Session-level view: same flag, nothing written to the story.
                session_response = await http.get(f"/api/voice-sessions/{session_id}")
                assert session_response.status_code == 200
                session_payload = session_response.json()
                assert session_payload["latest_requires_confirmation"] is True
                assert "请家长帮忙确认" in session_payload["latest_confirmation_message"]
                assert session_payload["can_finalize"] is False
                assert session_payload["story_state"]["narrative_segments"] == []
                assert any(
                    logged["event_type"] == "turn_confirmation_requested"
                    for logged in session_payload["events"]
                )

            story_generation_mock.assert_not_awaited()
        finally:
            app.dependency_overrides.clear()
目的 -这份文档是 [语音共创 Phase A 技术方案](/Users/zt/Code/dreamweaver/docs/technical/voice-co-creation-phase-a-tech-spec.md) 的下一层实现草案。 +这份文档是 [语音共创 Phase A 技术方案](./voice-co-creation-phase-a-tech-spec.md) 的下一层实现草案。 它的目标很明确: diff --git a/docs/technical/voice-co-creation-phase-a-tech-spec.md b/docs/technical/voice-co-creation-phase-a-tech-spec.md index c6dd64d..9968881 100644 --- a/docs/technical/voice-co-creation-phase-a-tech-spec.md +++ b/docs/technical/voice-co-creation-phase-a-tech-spec.md @@ -8,7 +8,23 @@ ## 1. 目标 -这份技术方案用于把 [语音共创模式增量 PRD](/Users/zt/Code/dreamweaver/docs/product/voice-co-creation-mode-incremental-prd.md) 收敛成一个可实现的 Phase A MVP。 +这份技术方案用于把 [语音共创模式增量 PRD](../product/voice-co-creation-mode-incremental-prd.md) 收敛成一个可实现的 Phase A MVP。 + +## 0. 当前实现快照(2026-04-20) + +远端 `main` 已经跑通以下 Phase A 主链路: + +- 独立 Voice Studio 入口页与最近会话恢复 +- `voice_sessions / voice_turns / voice_session_events` 数据模型与 API +- 文本 fallback 回合、录音上传回合、turn 轮询结果查询 +- turn 级语音补发、失败重试、会话 abandon、finalize -> Story 持久化 +- 会话事件记录与最近 turn / 最近事件展示 + +本轮新增收束: + +- 当 `transcript_confidence` 或 `intent_confidence` 偏低时,后端优先返回确认提示,而不是直接把这一轮写进故事正文 +- 前端明确展示“本轮系统理解为”与“建议家长确认后再继续”提示 +- 低置信度确认链路已有后端测试覆盖,可作为下一阶段继续接 ASR 与更细确认交互的基础 Phase A 的核心目标不是做“完全实时的语音陪伴”,而是验证以下最小闭环: diff --git a/frontend/src/types/voiceSession.ts b/frontend/src/types/voiceSession.ts index d18513b..d6fd3e8 100644 --- a/frontend/src/types/voiceSession.ts +++ b/frontend/src/types/voiceSession.ts @@ -8,6 +8,10 @@ export interface VoiceTurnSummary { transcription_provider: string | null detected_intent: string intent_confidence: number | null + understanding_summary: string | null + requires_confirmation: boolean + confirmation_reason: string | null + confirmation_message: string | null assistant_text: string | null assistant_audio_ready: boolean assistant_audio_url: string | null @@ -43,6 +47,9 @@ export interface VoiceSessionSummary { latest_user_transcript: string | null latest_assistant_text: string | null latest_detected_intent: string | null + 
/**
 * Render a confidence score as a whole-number percentage.
 *
 * Returns 'n/a' when the value is missing or is not a finite number, so the
 * UI never shows "NaN%" or "Infinity%".
 */
function formatConfidence(value: number | null | undefined) {
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    return 'n/a'
  }
  return `${Math.round(value * 100)}%`
}
+
+
建议先确认这一轮理解
+

+ {{ activeSession.latest_confirmation_message || '系统对这一轮的理解还不够确定,建议家长先确认后再继续。' }} +

+

+ {{ activeSession.latest_understanding_summary }} +

+
+

文本共创回合

@@ -839,6 +859,22 @@ onBeforeUnmount(() => { 孩子: {{ turn.user_transcript || '暂无转写内容' }}
+
+ {{ turn.understanding_summary }} +
+
+
建议家长确认后再继续
+

+ {{ turn.confirmation_message || '系统对这一轮的理解还不够确定,建议换一种说法再试一次。' }} +

+

+ 转写置信度:{{ formatConfidence(turn.transcript_confidence) }} · + 意图置信度:{{ formatConfidence(turn.intent_confidence) }} +

+