feat: add voice session confirmation guardrails
This commit is contained in:
@@ -82,6 +82,14 @@ class Settings(BaseSettings):
|
|||||||
"zh",
|
"zh",
|
||||||
description="Preferred language hint for voice transcription",
|
description="Preferred language hint for voice transcription",
|
||||||
)
|
)
|
||||||
|
voice_turn_low_transcript_confidence: float = Field(
|
||||||
|
0.65,
|
||||||
|
description="Prompt for confirmation when transcript confidence falls below this threshold",
|
||||||
|
)
|
||||||
|
voice_turn_low_intent_confidence: float = Field(
|
||||||
|
0.70,
|
||||||
|
description="Prompt for confirmation when intent confidence falls below this threshold",
|
||||||
|
)
|
||||||
voice_turn_max_upload_bytes: int = Field(
|
voice_turn_max_upload_bytes: int = Field(
|
||||||
5 * 1024 * 1024,
|
5 * 1024 * 1024,
|
||||||
description="Maximum accepted upload size in bytes for one voice turn audio file",
|
description="Maximum accepted upload size in bytes for one voice turn audio file",
|
||||||
|
|||||||
@@ -73,6 +73,10 @@ class VoiceTurnSummaryResponse(BaseModel):
|
|||||||
transcription_provider: str | None = None
|
transcription_provider: str | None = None
|
||||||
detected_intent: str
|
detected_intent: str
|
||||||
intent_confidence: float | None = None
|
intent_confidence: float | None = None
|
||||||
|
understanding_summary: str | None = None
|
||||||
|
requires_confirmation: bool = False
|
||||||
|
confirmation_reason: str | None = None
|
||||||
|
confirmation_message: str | None = None
|
||||||
assistant_text: str | None = None
|
assistant_text: str | None = None
|
||||||
assistant_audio_ready: bool = False
|
assistant_audio_ready: bool = False
|
||||||
assistant_audio_url: str | None = None
|
assistant_audio_url: str | None = None
|
||||||
@@ -99,6 +103,9 @@ class VoiceSessionSummaryResponse(BaseModel):
|
|||||||
latest_user_transcript: str | None = None
|
latest_user_transcript: str | None = None
|
||||||
latest_assistant_text: str | None = None
|
latest_assistant_text: str | None = None
|
||||||
latest_detected_intent: str | None = None
|
latest_detected_intent: str | None = None
|
||||||
|
latest_understanding_summary: str | None = None
|
||||||
|
latest_requires_confirmation: bool = False
|
||||||
|
latest_confirmation_message: str | None = None
|
||||||
latest_assistant_audio_ready: bool = False
|
latest_assistant_audio_ready: bool = False
|
||||||
last_turn_status: str | None = None
|
last_turn_status: str | None = None
|
||||||
transcription_mode_hint: str | None = None
|
transcription_mode_hint: str | None = None
|
||||||
|
|||||||
@@ -79,8 +79,129 @@ def _user_audio_url(session_id: str, turn_id: str, audio_path: str | None) -> st
|
|||||||
return f"/api/voice-sessions/{session_id}/turns/{turn_id}/user-audio"
|
return f"/api/voice-sessions/{session_id}/turns/{turn_id}/user-audio"
|
||||||
|
|
||||||
|
|
||||||
|
def _format_intent_label(intent: str | None) -> str:
|
||||||
|
labels = {
|
||||||
|
"start_story": "开启故事",
|
||||||
|
"continue_story": "继续讲述",
|
||||||
|
"correct_story": "修正走向",
|
||||||
|
"end_story": "先停在这里",
|
||||||
|
"save_story": "保存当前故事",
|
||||||
|
"unknown": "待确认",
|
||||||
|
}
|
||||||
|
return labels.get(intent or "", "待确认")
|
||||||
|
|
||||||
|
|
||||||
|
def _build_understanding_summary(
|
||||||
|
*,
|
||||||
|
transcript_text: str | None,
|
||||||
|
detected_intent: str,
|
||||||
|
) -> str | None:
|
||||||
|
normalized_transcript = (transcript_text or "").strip()
|
||||||
|
if detected_intent == "unknown":
|
||||||
|
if normalized_transcript:
|
||||||
|
return f"本轮系统暂时还没完全理解:{normalized_transcript}"
|
||||||
|
return "本轮系统暂时还没完全理解孩子刚才的表达。"
|
||||||
|
if normalized_transcript:
|
||||||
|
return f"本轮系统理解为「{_format_intent_label(detected_intent)}」:{normalized_transcript}"
|
||||||
|
return f"本轮系统理解为「{_format_intent_label(detected_intent)}」"
|
||||||
|
|
||||||
|
|
||||||
|
def _build_confirmation_message(
|
||||||
|
*,
|
||||||
|
transcript_text: str | None,
|
||||||
|
detected_intent: str,
|
||||||
|
reasons: list[str],
|
||||||
|
) -> str:
|
||||||
|
natural_understanding = ""
|
||||||
|
normalized_transcript = (transcript_text or "").strip()
|
||||||
|
if detected_intent != "unknown":
|
||||||
|
if normalized_transcript:
|
||||||
|
natural_understanding = (
|
||||||
|
f"我现在先理解成你想「{_format_intent_label(detected_intent)}」:"
|
||||||
|
f"{normalized_transcript}。"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
natural_understanding = f"我现在先理解成你想「{_format_intent_label(detected_intent)}」。"
|
||||||
|
|
||||||
|
if "intent_unknown" in reasons:
|
||||||
|
prefix = "我这一次还没有完全听懂。"
|
||||||
|
elif {
|
||||||
|
"low_transcript_confidence",
|
||||||
|
"low_intent_confidence",
|
||||||
|
}.issubset(set(reasons)):
|
||||||
|
prefix = "我这一次听得还不够清楚,也不太确定该怎么接剧情。"
|
||||||
|
elif "low_transcript_confidence" in reasons:
|
||||||
|
prefix = "我这一次可能没有完全听清。"
|
||||||
|
else:
|
||||||
|
prefix = "我这一次还不太确定你是想继续讲,还是想改一下剧情。"
|
||||||
|
|
||||||
|
return (
|
||||||
|
f"{prefix}{natural_understanding}"
|
||||||
|
"请家长帮忙确认一下;如果不对,可以换一种说法再说一遍,我们再继续编下去。"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_turn_confirmation_state(
|
||||||
|
*,
|
||||||
|
transcript_text: str | None,
|
||||||
|
transcript_confidence: float | None,
|
||||||
|
detected_intent: str,
|
||||||
|
intent_confidence: float | None,
|
||||||
|
story_patch: dict[str, Any] | None = None,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
patch = story_patch or {}
|
||||||
|
requires_confirmation = patch.get("requires_confirmation")
|
||||||
|
confirmation_reason = patch.get("confirmation_reason")
|
||||||
|
confirmation_message = patch.get("confirmation_message")
|
||||||
|
understanding_summary = patch.get("understanding_summary")
|
||||||
|
|
||||||
|
reasons: list[str] = []
|
||||||
|
if detected_intent == "unknown":
|
||||||
|
reasons.append("intent_unknown")
|
||||||
|
if (
|
||||||
|
transcript_confidence is not None
|
||||||
|
and transcript_confidence < settings.voice_turn_low_transcript_confidence
|
||||||
|
):
|
||||||
|
reasons.append("low_transcript_confidence")
|
||||||
|
if (
|
||||||
|
intent_confidence is not None
|
||||||
|
and intent_confidence < settings.voice_turn_low_intent_confidence
|
||||||
|
):
|
||||||
|
reasons.append("low_intent_confidence")
|
||||||
|
|
||||||
|
if requires_confirmation is None:
|
||||||
|
requires_confirmation = bool(reasons)
|
||||||
|
if confirmation_reason is None and reasons:
|
||||||
|
confirmation_reason = ",".join(reasons)
|
||||||
|
if understanding_summary is None:
|
||||||
|
understanding_summary = _build_understanding_summary(
|
||||||
|
transcript_text=transcript_text,
|
||||||
|
detected_intent=detected_intent,
|
||||||
|
)
|
||||||
|
if confirmation_message is None and requires_confirmation:
|
||||||
|
confirmation_message = _build_confirmation_message(
|
||||||
|
transcript_text=transcript_text,
|
||||||
|
detected_intent=detected_intent,
|
||||||
|
reasons=reasons or ["intent_unknown"],
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"understanding_summary": understanding_summary,
|
||||||
|
"requires_confirmation": bool(requires_confirmation),
|
||||||
|
"confirmation_reason": confirmation_reason,
|
||||||
|
"confirmation_message": confirmation_message,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
|
def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
|
||||||
turn_patch = turn.story_patch or {}
|
turn_patch = turn.story_patch or {}
|
||||||
|
confirmation_state = _resolve_turn_confirmation_state(
|
||||||
|
transcript_text=turn.user_transcript,
|
||||||
|
transcript_confidence=turn.transcript_confidence,
|
||||||
|
detected_intent=turn.detected_intent,
|
||||||
|
intent_confidence=turn.intent_confidence,
|
||||||
|
story_patch=turn_patch,
|
||||||
|
)
|
||||||
return VoiceTurnSummaryResponse(
|
return VoiceTurnSummaryResponse(
|
||||||
id=turn.id,
|
id=turn.id,
|
||||||
session_id=turn.session_id,
|
session_id=turn.session_id,
|
||||||
@@ -91,6 +212,10 @@ def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
|
|||||||
transcription_provider=turn_patch.get("transcription_provider"),
|
transcription_provider=turn_patch.get("transcription_provider"),
|
||||||
detected_intent=turn.detected_intent,
|
detected_intent=turn.detected_intent,
|
||||||
intent_confidence=turn.intent_confidence,
|
intent_confidence=turn.intent_confidence,
|
||||||
|
understanding_summary=confirmation_state["understanding_summary"],
|
||||||
|
requires_confirmation=confirmation_state["requires_confirmation"],
|
||||||
|
confirmation_reason=confirmation_state["confirmation_reason"],
|
||||||
|
confirmation_message=confirmation_state["confirmation_message"],
|
||||||
assistant_text=turn.assistant_text,
|
assistant_text=turn.assistant_text,
|
||||||
assistant_audio_ready=session_audio_exists(turn.assistant_audio_path),
|
assistant_audio_ready=session_audio_exists(turn.assistant_audio_path),
|
||||||
assistant_audio_url=_assistant_audio_url(
|
assistant_audio_url=_assistant_audio_url(
|
||||||
@@ -114,8 +239,20 @@ def _session_to_summary(
|
|||||||
) -> VoiceSessionSummaryResponse:
|
) -> VoiceSessionSummaryResponse:
|
||||||
if latest_turn is None:
|
if latest_turn is None:
|
||||||
total_turns = total_turns if total_turns is not None else session.current_turn_index
|
total_turns = total_turns if total_turns is not None else session.current_turn_index
|
||||||
|
latest_confirmation_state = {
|
||||||
|
"understanding_summary": None,
|
||||||
|
"requires_confirmation": False,
|
||||||
|
"confirmation_message": None,
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
total_turns = total_turns if total_turns is not None else latest_turn.turn_index
|
total_turns = total_turns if total_turns is not None else latest_turn.turn_index
|
||||||
|
latest_confirmation_state = _resolve_turn_confirmation_state(
|
||||||
|
transcript_text=latest_turn.user_transcript,
|
||||||
|
transcript_confidence=latest_turn.transcript_confidence,
|
||||||
|
detected_intent=latest_turn.detected_intent,
|
||||||
|
intent_confidence=latest_turn.intent_confidence,
|
||||||
|
story_patch=latest_turn.story_patch or {},
|
||||||
|
)
|
||||||
|
|
||||||
return VoiceSessionSummaryResponse(
|
return VoiceSessionSummaryResponse(
|
||||||
id=session.id,
|
id=session.id,
|
||||||
@@ -131,6 +268,9 @@ def _session_to_summary(
|
|||||||
latest_user_transcript=session.latest_user_transcript,
|
latest_user_transcript=session.latest_user_transcript,
|
||||||
latest_assistant_text=session.latest_assistant_text,
|
latest_assistant_text=session.latest_assistant_text,
|
||||||
latest_detected_intent=latest_turn.detected_intent if latest_turn else None,
|
latest_detected_intent=latest_turn.detected_intent if latest_turn else None,
|
||||||
|
latest_understanding_summary=latest_confirmation_state["understanding_summary"],
|
||||||
|
latest_requires_confirmation=latest_confirmation_state["requires_confirmation"],
|
||||||
|
latest_confirmation_message=latest_confirmation_state["confirmation_message"],
|
||||||
latest_assistant_audio_ready=(
|
latest_assistant_audio_ready=(
|
||||||
session_audio_exists(latest_turn.assistant_audio_path) if latest_turn else False
|
session_audio_exists(latest_turn.assistant_audio_path) if latest_turn else False
|
||||||
),
|
),
|
||||||
@@ -230,11 +370,13 @@ def _detect_intent(
|
|||||||
normalized = transcript_text.replace(" ", "")
|
normalized = transcript_text.replace(" ", "")
|
||||||
|
|
||||||
if any(keyword in normalized for keyword in ("保存故事", "存起来", "保存吧", "保存到故事库")):
|
if any(keyword in normalized for keyword in ("保存故事", "存起来", "保存吧", "保存到故事库")):
|
||||||
return "save_story", 0.95
|
return "save_story", 0.96
|
||||||
if any(keyword in normalized for keyword in ("先到这里", "讲完了", "结束吧", "停在这里")):
|
if any(keyword in normalized for keyword in ("先到这里", "讲完了", "结束吧", "停在这里")):
|
||||||
return "end_story", 0.88
|
return "end_story", 0.90
|
||||||
|
if len(normalized) <= 1 or normalized in {"嗯", "啊", "呃", "额", "这个", "那个", "不知道"}:
|
||||||
|
return "unknown", 0.25
|
||||||
if current_turn_index == 0:
|
if current_turn_index == 0:
|
||||||
return "start_story", 0.82
|
return "start_story", 0.84
|
||||||
if any(
|
if any(
|
||||||
keyword in normalized
|
keyword in normalized
|
||||||
for keyword in (
|
for keyword in (
|
||||||
@@ -248,8 +390,8 @@ def _detect_intent(
|
|||||||
"其实",
|
"其实",
|
||||||
)
|
)
|
||||||
):
|
):
|
||||||
return "correct_story", 0.76
|
return "correct_story", 0.78
|
||||||
return "continue_story", 0.68
|
return "continue_story", 0.74
|
||||||
|
|
||||||
|
|
||||||
def _recent_story_text(session: VoiceSession) -> str:
|
def _recent_story_text(session: VoiceSession) -> str:
|
||||||
@@ -451,6 +593,13 @@ async def _process_pending_turn(
|
|||||||
assistant_result: StoryOutput | None = None
|
assistant_result: StoryOutput | None = None
|
||||||
detected_intent = turn.detected_intent
|
detected_intent = turn.detected_intent
|
||||||
intent_confidence = turn.intent_confidence
|
intent_confidence = turn.intent_confidence
|
||||||
|
confirmation_state = _resolve_turn_confirmation_state(
|
||||||
|
transcript_text=transcript_text,
|
||||||
|
transcript_confidence=turn.transcript_confidence,
|
||||||
|
detected_intent=detected_intent,
|
||||||
|
intent_confidence=intent_confidence,
|
||||||
|
story_patch=turn.story_patch or {},
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await _record_session_event(
|
await _record_session_event(
|
||||||
@@ -466,7 +615,61 @@ async def _process_pending_turn(
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
if detected_intent == "save_story":
|
if confirmation_state["requires_confirmation"]:
|
||||||
|
current_state = _default_story_state() | (session.story_state or {})
|
||||||
|
assistant_text = confirmation_state["confirmation_message"]
|
||||||
|
turn.story_patch = {
|
||||||
|
**(turn.story_patch or {}),
|
||||||
|
"intent": detected_intent,
|
||||||
|
"transcript_text": transcript_text,
|
||||||
|
"segment_added": False,
|
||||||
|
"working_title": session.working_title,
|
||||||
|
"cover_prompt": current_state.get("cover_prompt"),
|
||||||
|
"narrative_segments_count": len(
|
||||||
|
list(current_state.get("narrative_segments") or [])
|
||||||
|
),
|
||||||
|
"requires_confirmation": True,
|
||||||
|
"confirmation_reason": confirmation_state["confirmation_reason"],
|
||||||
|
"confirmation_message": confirmation_state["confirmation_message"],
|
||||||
|
"understanding_summary": confirmation_state["understanding_summary"],
|
||||||
|
}
|
||||||
|
turn.assistant_text = assistant_text
|
||||||
|
turn.status = "narrative_ready"
|
||||||
|
session.latest_assistant_text = assistant_text
|
||||||
|
session.status = "waiting_user"
|
||||||
|
session.updated_at = _utcnow()
|
||||||
|
await db.commit()
|
||||||
|
await db.refresh(session)
|
||||||
|
await db.refresh(turn)
|
||||||
|
|
||||||
|
await _record_session_event(
|
||||||
|
db,
|
||||||
|
session_id=session.id,
|
||||||
|
turn_id=turn.id,
|
||||||
|
event_type="turn_confirmation_requested",
|
||||||
|
status="needs_confirmation",
|
||||||
|
message="Voice turn needs parent confirmation before the story continues.",
|
||||||
|
metadata={
|
||||||
|
"detected_intent": detected_intent,
|
||||||
|
"transcript_confidence": turn.transcript_confidence,
|
||||||
|
"intent_confidence": intent_confidence,
|
||||||
|
"confirmation_reason": confirmation_state["confirmation_reason"],
|
||||||
|
},
|
||||||
|
)
|
||||||
|
await _record_session_event(
|
||||||
|
db,
|
||||||
|
session_id=session.id,
|
||||||
|
turn_id=turn.id,
|
||||||
|
event_type="assistant_text_ready",
|
||||||
|
status="succeeded",
|
||||||
|
message="Assistant clarification prompt generated.",
|
||||||
|
metadata={
|
||||||
|
"assistant_text_length": len(assistant_text or ""),
|
||||||
|
"working_title": session.working_title,
|
||||||
|
"requires_confirmation": True,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
elif detected_intent == "save_story":
|
||||||
assistant_text = "好的,这个故事已经准备好保存到故事库了。"
|
assistant_text = "好的,这个故事已经准备好保存到故事库了。"
|
||||||
elif detected_intent == "end_story":
|
elif detected_intent == "end_story":
|
||||||
assistant_text = "好的,我们先把故事停在这里。想保存的话,现在就可以保存到故事库。"
|
assistant_text = "好的,我们先把故事停在这里。想保存的话,现在就可以保存到故事库。"
|
||||||
@@ -479,6 +682,7 @@ async def _process_pending_turn(
|
|||||||
)
|
)
|
||||||
assistant_text = assistant_result.story_text.strip()
|
assistant_text = assistant_result.story_text.strip()
|
||||||
|
|
||||||
|
if not confirmation_state["requires_confirmation"]:
|
||||||
merged_state, story_patch = _merge_story_state(
|
merged_state, story_patch = _merge_story_state(
|
||||||
session,
|
session,
|
||||||
transcript_text=transcript_text,
|
transcript_text=transcript_text,
|
||||||
@@ -520,6 +724,7 @@ async def _process_pending_turn(
|
|||||||
metadata={
|
metadata={
|
||||||
"assistant_text_length": len(assistant_text or ""),
|
"assistant_text_length": len(assistant_text or ""),
|
||||||
"working_title": session.working_title,
|
"working_title": session.working_title,
|
||||||
|
"requires_confirmation": False,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from app.core.config import settings
|
|||||||
from app.db.database import get_db
|
from app.db.database import get_db
|
||||||
from app.main import app
|
from app.main import app
|
||||||
from app.services.adapters.text.models import StoryOutput
|
from app.services.adapters.text.models import StoryOutput
|
||||||
|
from app.services.voice_transcription_service import VoiceTranscriptionResult
|
||||||
|
|
||||||
|
|
||||||
async def test_voice_session_create_and_fallback_turn_returns_audio(
|
async def test_voice_session_create_and_fallback_turn_returns_audio(
|
||||||
@@ -272,6 +273,82 @@ async def test_voice_session_uploaded_audio_turn_uses_demo_transcript_hint(
|
|||||||
app.dependency_overrides.clear()
|
app.dependency_overrides.clear()
|
||||||
|
|
||||||
|
|
||||||
|
async def test_voice_session_low_confidence_turn_requests_confirmation(
    db_session,
    auth_token,
):
    """An uploaded turn with a low transcript confidence asks for confirmation.

    The transcription is mocked with confidence 0.41, which is below the 0.65
    default of ``voice_turn_low_transcript_confidence``. The turn is expected
    to come back with a confirmation prompt as its assistant text, and the
    story generator must not be awaited.
    """
    async def override_get_db():
        yield db_session

    app.dependency_overrides[get_db] = override_get_db

    with (
        patch(
            "app.services.voice_session_service.generate_story_content",
            new_callable=AsyncMock,
        ) as mock_generate,
        patch(
            "app.services.voice_session_service.text_to_speech",
            new_callable=AsyncMock,
        ) as mock_tts,
        patch(
            "app.services.voice_session_service.transcribe_voice_audio",
            new_callable=AsyncMock,
        ) as mock_transcribe,
    ):
        mock_tts.return_value = b"confirmation-audio"
        # Readable transcript, but with a low confidence score.
        mock_transcribe.return_value = VoiceTranscriptionResult(
            transcript_text="我想听一个会发光的小恐龙故事",
            confidence=0.41,
            provider="openai",
        )

        transport = ASGITransport(app=app)
        try:
            async with AsyncClient(transport=transport, base_url="http://test") as client:
                client.cookies.set("access_token", auth_token)

                # Create a fresh voice session.
                response = await client.post("/api/voice-sessions", json={})
                assert response.status_code == 201
                session_id = response.json()["id"]

                # Submit one recorded turn; the endpoint accepts it with 202.
                response = await client.post(
                    f"/api/voice-sessions/{session_id}/turns",
                    files={
                        "audio_file": ("turn.webm", b"fake-webm-audio", "audio/webm"),
                    },
                )
                assert response.status_code == 202
                turn_id = response.json()["turn_id"]

                # The turn summary must carry the confirmation fields, and the
                # assistant text must be the confirmation prompt itself.
                response = await client.get(
                    f"/api/voice-sessions/{session_id}/turns/{turn_id}"
                )
                assert response.status_code == 200
                turn_data = response.json()
                assert turn_data["status"] == "audio_ready"
                assert turn_data["requires_confirmation"] is True
                assert turn_data["understanding_summary"].startswith("本轮系统理解为")
                assert "请家长帮忙确认" in turn_data["confirmation_message"]
                assert turn_data["assistant_text"] == turn_data["confirmation_message"]

                # The session summary mirrors the latest turn's confirmation
                # state, and no narrative segment has been added to the story.
                response = await client.get(f"/api/voice-sessions/{session_id}")
                assert response.status_code == 200
                session_data = response.json()
                assert session_data["latest_requires_confirmation"] is True
                assert "请家长帮忙确认" in session_data["latest_confirmation_message"]
                assert session_data["can_finalize"] is False
                assert session_data["story_state"]["narrative_segments"] == []
                assert any(
                    event["event_type"] == "turn_confirmation_requested"
                    for event in session_data["events"]
                )

            # Story generation must be skipped while confirmation is pending.
            mock_generate.assert_not_awaited()
        finally:
            # Always remove the DB override so later tests are unaffected.
            app.dependency_overrides.clear()
|
||||||
|
|
||||||
|
|
||||||
async def test_voice_session_list_orders_recent_sessions_first(
|
async def test_voice_session_list_orders_recent_sessions_first(
|
||||||
db_session,
|
db_session,
|
||||||
auth_token,
|
auth_token,
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
|
|
||||||
## 1. 目的
|
## 1. 目的
|
||||||
|
|
||||||
这份文档是 [语音共创 Phase A 技术方案](/Users/zt/Code/dreamweaver/docs/technical/voice-co-creation-phase-a-tech-spec.md) 的下一层实现草案。
|
这份文档是 [语音共创 Phase A 技术方案](./voice-co-creation-phase-a-tech-spec.md) 的下一层实现草案。
|
||||||
|
|
||||||
它的目标很明确:
|
它的目标很明确:
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,23 @@
|
|||||||
|
|
||||||
## 1. 目标
|
## 1. 目标
|
||||||
|
|
||||||
这份技术方案用于把 [语音共创模式增量 PRD](/Users/zt/Code/dreamweaver/docs/product/voice-co-creation-mode-incremental-prd.md) 收敛成一个可实现的 Phase A MVP。
|
这份技术方案用于把 [语音共创模式增量 PRD](../product/voice-co-creation-mode-incremental-prd.md) 收敛成一个可实现的 Phase A MVP。
|
||||||
|
|
||||||
|
## 0. 当前实现快照(2026-04-20)
|
||||||
|
|
||||||
|
远端 `main` 已经跑通以下 Phase A 主链路:
|
||||||
|
|
||||||
|
- 独立 Voice Studio 入口页与最近会话恢复
|
||||||
|
- `voice_sessions / voice_turns / voice_session_events` 数据模型与 API
|
||||||
|
- 文本 fallback 回合、录音上传回合、turn 轮询结果查询
|
||||||
|
- turn 级语音补发、失败重试、会话 abandon、finalize -> Story 持久化
|
||||||
|
- 会话事件记录与最近 turn / 最近事件展示
|
||||||
|
|
||||||
|
本轮新增收束:
|
||||||
|
|
||||||
|
- 当 `transcript_confidence` 或 `intent_confidence` 偏低时,后端优先返回确认提示,而不是直接把这一轮写进故事正文
|
||||||
|
- 前端明确展示“本轮系统理解为”与“建议家长确认后再继续”提示
|
||||||
|
- 低置信度确认链路已有后端测试覆盖,可作为下一阶段继续接 ASR 与更细确认交互的基础
|
||||||
|
|
||||||
Phase A 的核心目标不是做“完全实时的语音陪伴”,而是验证以下最小闭环:
|
Phase A 的核心目标不是做“完全实时的语音陪伴”,而是验证以下最小闭环:
|
||||||
|
|
||||||
|
|||||||
@@ -8,6 +8,10 @@ export interface VoiceTurnSummary {
|
|||||||
transcription_provider: string | null
|
transcription_provider: string | null
|
||||||
detected_intent: string
|
detected_intent: string
|
||||||
intent_confidence: number | null
|
intent_confidence: number | null
|
||||||
|
understanding_summary: string | null
|
||||||
|
requires_confirmation: boolean
|
||||||
|
confirmation_reason: string | null
|
||||||
|
confirmation_message: string | null
|
||||||
assistant_text: string | null
|
assistant_text: string | null
|
||||||
assistant_audio_ready: boolean
|
assistant_audio_ready: boolean
|
||||||
assistant_audio_url: string | null
|
assistant_audio_url: string | null
|
||||||
@@ -43,6 +47,9 @@ export interface VoiceSessionSummary {
|
|||||||
latest_user_transcript: string | null
|
latest_user_transcript: string | null
|
||||||
latest_assistant_text: string | null
|
latest_assistant_text: string | null
|
||||||
latest_detected_intent: string | null
|
latest_detected_intent: string | null
|
||||||
|
latest_understanding_summary: string | null
|
||||||
|
latest_requires_confirmation: boolean
|
||||||
|
latest_confirmation_message: string | null
|
||||||
latest_assistant_audio_ready: boolean
|
latest_assistant_audio_ready: boolean
|
||||||
last_turn_status: string | null
|
last_turn_status: string | null
|
||||||
transcription_mode_hint: string | null
|
transcription_mode_hint: string | null
|
||||||
|
|||||||
@@ -162,6 +162,13 @@ function formatDate(dateStr: string) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Format a confidence score as a whole-number percentage string.
 *
 * The score is multiplied by 100 and rounded, e.g. 0.41 -> "41%".
 * Missing or non-finite values (null, undefined, NaN, +/-Infinity)
 * are shown as "n/a" instead of "NaN%" or "Infinity%".
 */
function formatConfidence(value: number | null | undefined) {
  // typeof NaN is 'number', so the finite check is needed in addition.
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    return 'n/a'
  }
  return `${Math.round(value * 100)}%`
}
|
||||||
|
|
||||||
function revokeRecordedAudioUrl() {
|
function revokeRecordedAudioUrl() {
|
||||||
if (recordedAudioUrl.value) {
|
if (recordedAudioUrl.value) {
|
||||||
URL.revokeObjectURL(recordedAudioUrl.value)
|
URL.revokeObjectURL(recordedAudioUrl.value)
|
||||||
@@ -718,6 +725,19 @@ onBeforeUnmount(() => {
|
|||||||
|
|
||||||
<div class="mt-6 grid grid-cols-1 gap-6 xl:grid-cols-[minmax(0,1.2fr)_minmax(0,0.8fr)]">
|
<div class="mt-6 grid grid-cols-1 gap-6 xl:grid-cols-[minmax(0,1.2fr)_minmax(0,0.8fr)]">
|
||||||
<div class="space-y-6">
|
<div class="space-y-6">
|
||||||
|
<div
|
||||||
|
v-if="activeSession.latest_requires_confirmation"
|
||||||
|
class="rounded-2xl border border-amber-200 bg-amber-50 p-4 text-amber-800"
|
||||||
|
>
|
||||||
|
<div class="text-sm font-semibold">建议先确认这一轮理解</div>
|
||||||
|
<p class="mt-2 text-sm">
|
||||||
|
{{ activeSession.latest_confirmation_message || '系统对这一轮的理解还不够确定,建议家长先确认后再继续。' }}
|
||||||
|
</p>
|
||||||
|
<p v-if="activeSession.latest_understanding_summary" class="mt-2 text-xs text-amber-700">
|
||||||
|
{{ activeSession.latest_understanding_summary }}
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="rounded-2xl border border-gray-100 bg-white p-4">
|
<div class="rounded-2xl border border-gray-100 bg-white p-4">
|
||||||
<div class="flex items-center justify-between">
|
<div class="flex items-center justify-between">
|
||||||
<h3 class="font-semibold text-gray-900">文本共创回合</h3>
|
<h3 class="font-semibold text-gray-900">文本共创回合</h3>
|
||||||
@@ -839,6 +859,22 @@ onBeforeUnmount(() => {
|
|||||||
<span class="font-medium text-gray-900">孩子:</span>
|
<span class="font-medium text-gray-900">孩子:</span>
|
||||||
{{ turn.user_transcript || '暂无转写内容' }}
|
{{ turn.user_transcript || '暂无转写内容' }}
|
||||||
</div>
|
</div>
|
||||||
|
<div v-if="turn.understanding_summary" class="mt-3 text-sm text-slate-600">
|
||||||
|
{{ turn.understanding_summary }}
|
||||||
|
</div>
|
||||||
|
<div
|
||||||
|
v-if="turn.requires_confirmation"
|
||||||
|
class="mt-3 rounded-2xl border border-amber-200 bg-amber-50 px-3 py-3 text-sm text-amber-800"
|
||||||
|
>
|
||||||
|
<div class="font-medium">建议家长确认后再继续</div>
|
||||||
|
<p class="mt-1">
|
||||||
|
{{ turn.confirmation_message || '系统对这一轮的理解还不够确定,建议换一种说法再试一次。' }}
|
||||||
|
</p>
|
||||||
|
<p class="mt-2 text-xs text-amber-700">
|
||||||
|
转写置信度:{{ formatConfidence(turn.transcript_confidence) }} ·
|
||||||
|
意图置信度:{{ formatConfidence(turn.intent_confidence) }}
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
<div v-if="turn.user_audio_url" class="mt-3">
|
<div v-if="turn.user_audio_url" class="mt-3">
|
||||||
<audio class="w-full" :src="turn.user_audio_url" controls></audio>
|
<audio class="w-full" :src="turn.user_audio_url" controls></audio>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
Reference in New Issue
Block a user