feat: add voice studio prototype flow

This commit is contained in:
2026-04-19 23:10:16 +08:00
parent f106f740dd
commit 46d6201529
14 changed files with 1745 additions and 212 deletions

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from fastapi import HTTPException
@@ -20,24 +21,23 @@ from app.schemas.voice_session_schemas import (
VoiceTurnAcceptedResponse,
VoiceTurnCreateFallbackRequest,
VoiceTurnSummaryResponse,
VoiceTurnUploadAcceptedResponse,
)
from app.services.adapters.text.models import StoryOutput
from app.services.memory_service import build_enhanced_memory_context
from app.services.provider_router import generate_story_content, text_to_speech
from app.services.story_service import (
create_story_from_result,
validate_profile_and_universe,
)
from app.services.story_service import create_story_from_result, validate_profile_and_universe
from app.services.voice_session_storage import (
build_turn_assistant_audio_path,
read_session_audio,
session_audio_exists,
write_session_audio,
write_uploaded_user_audio,
)
from app.services.voice_transcription_service import transcribe_voice_audio
logger = get_logger(__name__)
ACTIVE_SESSION_STATUSES = {"draft", "active", "processing_turn", "waiting_user"}
CONTINUABLE_SESSION_STATUSES = {"draft", "active", "waiting_user"}
FINAL_SESSION_STATUSES = {"completed", "abandoned"}
@@ -62,13 +62,24 @@ def _session_can_finalize(session: VoiceSession) -> bool:
return bool(segments) and session.status in {"active", "waiting_user"}
def _utcnow() -> datetime:
    """Return the current moment as a timezone-aware ``datetime`` in UTC."""
    return datetime.now(tz=timezone.utc)
def _assistant_audio_url(session_id: str, turn_id: str, audio_path: str | None) -> str | None:
    """Build the API path that serves a turn's assistant audio.

    Returns ``None`` when ``session_audio_exists`` reports nothing stored at
    ``audio_path``; otherwise returns the turn's ``/audio`` endpoint path.
    """
    if session_audio_exists(audio_path):
        return f"/api/voice-sessions/{session_id}/turns/{turn_id}/audio"
    return None
def _user_audio_url(session_id: str, turn_id: str, audio_path: str | None) -> str | None:
    """Build the API path that serves the audio a user uploaded for a turn.

    Returns ``None`` when ``session_audio_exists`` reports nothing stored at
    ``audio_path``; otherwise returns the turn's ``/user-audio`` endpoint path.
    """
    has_audio = session_audio_exists(audio_path)
    return (
        f"/api/voice-sessions/{session_id}/turns/{turn_id}/user-audio"
        if has_audio
        else None
    )
def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
turn_patch = turn.story_patch or {}
return VoiceTurnSummaryResponse(
id=turn.id,
session_id=turn.session_id,
@@ -76,6 +87,7 @@ def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
status=turn.status,
user_transcript=turn.user_transcript,
transcript_confidence=turn.transcript_confidence,
transcription_provider=turn_patch.get("transcription_provider"),
detected_intent=turn.detected_intent,
intent_confidence=turn.intent_confidence,
assistant_text=turn.assistant_text,
@@ -85,13 +97,25 @@ def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse:
turn.id,
turn.assistant_audio_path,
),
user_audio_ready=session_audio_exists(turn.user_audio_path),
user_audio_url=_user_audio_url(turn.session_id, turn.id, turn.user_audio_path),
error_message=turn.error_message,
created_at=turn.created_at,
updated_at=turn.updated_at,
)
def _session_to_summary(session: VoiceSession) -> VoiceSessionSummaryResponse:
def _session_to_summary(
session: VoiceSession,
*,
latest_turn: VoiceTurn | None = None,
total_turns: int | None = None,
) -> VoiceSessionSummaryResponse:
if latest_turn is None:
total_turns = total_turns if total_turns is not None else session.current_turn_index
else:
total_turns = total_turns if total_turns is not None else latest_turn.turn_index
return VoiceSessionSummaryResponse(
id=session.id,
child_profile_id=session.child_profile_id,
@@ -100,10 +124,16 @@ def _session_to_summary(session: VoiceSession) -> VoiceSessionSummaryResponse:
target_mode=session.target_mode,
status=session.status,
current_turn_index=session.current_turn_index,
total_turns=total_turns or 0,
working_title=session.working_title,
story_state=session.story_state or {},
latest_user_transcript=session.latest_user_transcript,
latest_assistant_text=session.latest_assistant_text,
latest_detected_intent=latest_turn.detected_intent if latest_turn else None,
latest_assistant_audio_ready=(
session_audio_exists(latest_turn.assistant_audio_path) if latest_turn else False
),
last_turn_status=latest_turn.status if latest_turn else None,
can_continue=_session_can_continue(session),
can_finalize=_session_can_finalize(session),
last_error=session.last_error,
@@ -154,6 +184,20 @@ async def _get_owned_session(
return session
async def _get_latest_turn(
    db: AsyncSession,
    *,
    session_id: str,
) -> VoiceTurn | None:
    """Fetch the turn with the highest ``turn_index`` for one session.

    Args:
        db: Async database session used to run the query.
        session_id: Identifier of the voice session whose turns are searched.

    Returns:
        The matching ``VoiceTurn``, or ``None`` when the session has no turns.
    """
    newest_first = (
        select(VoiceTurn)
        .where(VoiceTurn.session_id == session_id)
        .order_by(desc(VoiceTurn.turn_index))
        .limit(1)
    )
    rows = await db.execute(newest_first)
    return rows.scalar_one_or_none()
async def _get_owned_turn(
db: AsyncSession,
*,
@@ -307,6 +351,283 @@ def _merge_story_state(
return current_state, patch
async def _create_pending_turn(
    db: AsyncSession,
    *,
    session: VoiceSession,
    transcript_text: str,
    transcript_confidence: float | None,
    transcription_provider: str | None,
    user_audio_path: str | None = None,
    user_audio_mime_type: str | None = None,
    user_audio_duration_ms: int | None = None,
) -> tuple[VoiceSession, VoiceTurn]:
    """Persist a new turn for ``session`` and record its intake events.

    The session is moved to ``"processing_turn"`` and its turn counter is
    advanced; the new turn is stored with status ``"transcribing"`` and the
    transcription provider is kept under ``story_patch``.

    Args:
        db: Async database session.
        session: Voice session that receives the turn.
        transcript_text: Transcript of what the user said.
        transcript_confidence: Confidence reported for the transcript, if any.
        transcription_provider: Name of the provider that produced the transcript.
        user_audio_path: Storage path of the uploaded user audio, if any.
        user_audio_mime_type: MIME type of the uploaded user audio, if any.
        user_audio_duration_ms: Duration of the user audio in milliseconds, if known.

    Returns:
        The refreshed ``(session, turn)`` pair.

    Raises:
        HTTPException: 409 when the session status is not in
            ``CONTINUABLE_SESSION_STATUSES``.
    """
    if session.status not in CONTINUABLE_SESSION_STATUSES:
        raise HTTPException(
            status_code=409,
            detail="Voice session is not ready for another turn.",
        )

    previous_index = session.current_turn_index
    upcoming_index = previous_index + 1
    # Intent detection is given the index from before this turn is counted.
    intent, intent_score = _detect_intent(
        transcript_text,
        current_turn_index=previous_index,
    )

    pending_turn = VoiceTurn(
        session_id=session.id,
        turn_index=upcoming_index,
        status="transcribing",
        user_audio_path=user_audio_path,
        user_audio_mime_type=user_audio_mime_type,
        user_audio_duration_ms=user_audio_duration_ms,
        user_transcript=transcript_text,
        transcript_confidence=transcript_confidence,
        detected_intent=intent,
        intent_confidence=intent_score,
        story_patch={"transcription_provider": transcription_provider},
    )
    db.add(pending_turn)

    session.status = "processing_turn"
    session.current_turn_index = upcoming_index
    session.latest_user_transcript = transcript_text
    session.last_error = None
    session.updated_at = _utcnow()

    await db.commit()
    await db.refresh(session)
    await db.refresh(pending_turn)

    # (event_type, status, message, metadata) in the order they are recorded.
    intake_events: list[tuple[str, str, str, dict[str, Any]]] = [
        (
            "turn_received",
            "received",
            "Voice turn received.",
            {
                "turn_index": pending_turn.turn_index,
                "has_user_audio": bool(user_audio_path),
                "transcription_provider": transcription_provider,
            },
        ),
    ]
    if user_audio_path:
        intake_events.append(
            (
                "turn_audio_uploaded",
                "succeeded",
                "User audio uploaded for one voice turn.",
                {
                    "mime_type": user_audio_mime_type,
                    "audio_path": user_audio_path,
                },
            )
        )
    intake_events.append(
        (
            "turn_transcribed",
            "succeeded",
            "Voice turn transcript is available.",
            {
                "transcript_confidence": transcript_confidence,
                "transcription_provider": transcription_provider,
            },
        )
    )

    for event_type, event_status, event_message, event_metadata in intake_events:
        await _record_session_event(
            db,
            session_id=session.id,
            turn_id=pending_turn.id,
            event_type=event_type,
            status=event_status,
            message=event_message,
            metadata=event_metadata,
        )

    return session, pending_turn
async def _process_pending_turn(
    db: AsyncSession,
    *,
    session: VoiceSession,
    turn: VoiceTurn,
    transcript_text: str,
    user_id: str,
) -> str:
    """Produce the assistant reply (text, then audio) for a pending turn.

    Phase 1 resolves the reply text: a canned sentence for the ``save_story``
    and ``end_story`` intents, otherwise the output of
    ``_generate_assistant_turn``. The merged story state is committed and the
    session returns to ``"waiting_user"``. Any exception in this phase marks
    the turn ``"failed"``, stores the error on turn and session, and returns.

    Phase 2 runs only when there is reply text: it is sent to
    ``text_to_speech`` and the audio is written to session storage. A failure
    here is logged and recorded as an event, but the turn stays
    ``"narrative_ready"`` so the text reply is kept.

    Args:
        db: Async database session.
        session: Voice session that owns the turn.
        turn: The pending turn to process.
        transcript_text: Transcript of the user's utterance.
        user_id: Passed through to ``text_to_speech``.

    Returns:
        The turn's status after processing.
    """
    intent = turn.detected_intent
    intent_score = turn.intent_confidence
    reply_text: str | None = None
    generated: StoryOutput | None = None

    # Intents that are answered with a fixed sentence instead of generation.
    canned_replies = {
        "save_story": "好的,这个故事已经准备好保存到故事库了。",
        "end_story": "好的,我们先把故事停在这里。想保存的话,现在就可以保存到故事库。",
    }

    try:
        await _record_session_event(
            db,
            session_id=session.id,
            turn_id=turn.id,
            event_type="intent_resolved",
            status="succeeded",
            message="Turn intent resolved.",
            metadata={
                "detected_intent": intent,
                "intent_confidence": intent_score,
            },
        )

        if intent in canned_replies:
            reply_text = canned_replies[intent]
        else:
            generated = await _generate_assistant_turn(
                db,
                session=session,
                transcript_text=transcript_text,
                intent=intent,
            )
            reply_text = generated.story_text.strip()

        merged_state, story_patch = _merge_story_state(
            session,
            transcript_text=transcript_text,
            intent=intent,
            assistant_result=generated,
        )
        # Carry the provider over from the patch stored when the turn was created.
        earlier_patch = turn.story_patch or {}
        story_patch["transcription_provider"] = earlier_patch.get("transcription_provider")

        turn.story_patch = story_patch
        turn.assistant_text = reply_text
        turn.status = "narrative_ready"

        session.story_state = merged_state
        session.latest_assistant_text = reply_text
        session.status = "waiting_user"
        session.updated_at = _utcnow()
        # Only the first generated title becomes the working title.
        if generated and generated.title and not session.working_title:
            session.working_title = generated.title

        await db.commit()
        await db.refresh(session)
        await db.refresh(turn)

        await _record_session_event(
            db,
            session_id=session.id,
            turn_id=turn.id,
            event_type="story_patch_applied",
            status="succeeded",
            message="Story state updated after one turn.",
            metadata=story_patch,
        )
        await _record_session_event(
            db,
            session_id=session.id,
            turn_id=turn.id,
            event_type="assistant_text_ready",
            status="succeeded",
            message="Assistant text response generated.",
            metadata={
                "assistant_text_length": len(reply_text or ""),
                "working_title": session.working_title,
            },
        )
    except Exception as exc:
        failure = str(exc)
        turn.status = "failed"
        turn.error_message = failure
        session.status = "waiting_user"
        session.last_error = failure
        session.updated_at = _utcnow()
        await db.commit()
        await db.refresh(session)
        await db.refresh(turn)
        await _record_session_event(
            db,
            session_id=session.id,
            turn_id=turn.id,
            event_type="session_failed",
            status="failed",
            message="Assistant narrative generation failed for one voice turn.",
            metadata={"error": failure, "turn_index": turn.turn_index},
        )
        logger.warning(
            "voice_turn_generation_failed",
            session_id=session.id,
            turn_id=turn.id,
            error=failure,
        )
        return turn.status

    # No reply text means there is nothing to synthesize.
    if not reply_text:
        return turn.status

    try:
        speech = await text_to_speech(
            reply_text,
            db=db,
            user_id=user_id,
        )
        target_path = build_turn_assistant_audio_path(session.id, turn.turn_index)
        saved_path = write_session_audio(target_path, speech)
        turn.assistant_audio_path = saved_path
        turn.assistant_audio_duration_ms = None
        turn.status = "audio_ready"
        await db.commit()
        await db.refresh(turn)
        await _record_session_event(
            db,
            session_id=session.id,
            turn_id=turn.id,
            event_type="assistant_audio_ready",
            status="succeeded",
            message="Assistant audio response generated.",
            metadata={"audio_path": saved_path},
        )
    except Exception as exc:
        # Audio is best-effort: keep the text reply and clear error fields.
        audio_failure = str(exc)
        turn.status = "narrative_ready"
        turn.error_message = None
        session.last_error = None
        session.updated_at = _utcnow()
        await db.commit()
        await db.refresh(turn)
        await db.refresh(session)
        await _record_session_event(
            db,
            session_id=session.id,
            turn_id=turn.id,
            event_type="assistant_audio_failed",
            status="failed",
            message="Assistant audio generation failed, text response kept.",
            metadata={"error": audio_failure},
        )
        logger.warning(
            "voice_turn_audio_failed",
            session_id=session.id,
            turn_id=turn.id,
            error=audio_failure,
        )

    return turn.status
async def list_voice_sessions_service(
    user_id: str,
    db: AsyncSession,
    *,
    limit: int = 8,
    active_only: bool = False,
) -> list[VoiceSessionSummaryResponse]:
    """List a user's voice sessions, most recently updated first.

    Args:
        user_id: Owner whose sessions are listed.
        db: Async database session.
        limit: Maximum number of sessions returned.
        active_only: When true, keep only sessions whose status is in
            ``CONTINUABLE_SESSION_STATUSES``.

    Returns:
        One summary per session, each enriched with that session's latest turn.
    """
    # NOTE(review): ``active_only`` filters on CONTINUABLE_SESSION_STATUSES, so
    # sessions in "processing_turn" are excluded even though that status is in
    # ACTIVE_SESSION_STATUSES — confirm this is intended.
    stmt = select(VoiceSession).where(VoiceSession.user_id == user_id)
    if active_only:
        stmt = stmt.where(VoiceSession.status.in_(CONTINUABLE_SESSION_STATUSES))
    stmt = stmt.order_by(
        desc(VoiceSession.updated_at),
        desc(VoiceSession.created_at),
    ).limit(limit)

    rows = await db.execute(stmt)
    found = rows.scalars().all()

    # One latest-turn lookup per session, awaited in listing order.
    return [
        _session_to_summary(
            item,
            latest_turn=await _get_latest_turn(db, session_id=item.id),
            total_turns=item.current_turn_index,
        )
        for item in found
    ]
async def create_voice_session_service(
request: VoiceSessionCreateRequest,
user_id: str,
@@ -373,7 +694,12 @@ async def get_voice_session_detail_service(
).scalars().all()
events = list(reversed(events))
summary = _session_to_summary(session)
latest_turn = turns[-1] if turns else None
summary = _session_to_summary(
session,
latest_turn=latest_turn,
total_turns=session.current_turn_index,
)
return VoiceSessionDetailResponse(
**summary.model_dump(),
recent_turns=[_turn_to_summary(turn) for turn in turns],
@@ -399,208 +725,83 @@ async def create_voice_turn_from_text_service(
user_id: str,
db: AsyncSession,
) -> VoiceTurnAcceptedResponse:
session = await _get_owned_session(db, session_id=session_id, user_id=user_id)
transcript_text = request.transcript_text.strip()
session, turn = await _create_pending_turn(
db,
session=session,
transcript_text=transcript_text,
transcript_confidence=1.0,
transcription_provider="fallback",
user_audio_duration_ms=request.duration_ms,
)
status = await _process_pending_turn(
db,
session=session,
turn=turn,
transcript_text=transcript_text,
user_id=user_id,
)
return VoiceTurnAcceptedResponse(
turn_id=turn.id,
session_id=session.id,
status=status,
)
async def create_voice_turn_from_upload_service(
    *,
    session_id: str,
    user_id: str,
    audio_bytes: bytes,
    file_name: str,
    mime_type: str | None,
    duration_ms: int | None,
    transcript_hint: str | None,
    db: AsyncSession,
) -> VoiceTurnUploadAcceptedResponse:
    """Accept an uploaded audio clip as the next turn of a voice session.

    The clip is stored, transcribed, turned into a pending turn through
    ``_create_pending_turn`` and then processed by ``_process_pending_turn``.

    Args:
        session_id: Identifier of the target voice session.
        user_id: Owner of the session; also passed on to turn processing.
        audio_bytes: Raw bytes of the uploaded recording.
        file_name: Name of the uploaded file.
        mime_type: MIME type of the upload, if known.
        duration_ms: Duration of the recording in milliseconds, if known.
        transcript_hint: Optional hint passed to the transcription service.
        db: Async database session.

    Returns:
        The turn and session identifiers, the turn status after processing,
        and the provider that produced the transcript.

    Raises:
        HTTPException: 409 when the session status is not in
            ``CONTINUABLE_SESSION_STATUSES``.
    """
    session = await _get_owned_session(db, session_id=session_id, user_id=user_id)
    # Checked here, ahead of the same check in ``_create_pending_turn``, so a
    # rejected request does not write audio or call the transcription service.
    if session.status not in CONTINUABLE_SESSION_STATUSES:
        raise HTTPException(
            status_code=409,
            detail="Voice session is not ready for another turn.",
        )

    next_turn_index = session.current_turn_index + 1
    user_audio_path = write_uploaded_user_audio(
        session_id=session.id,
        turn_index=next_turn_index,
        file_name=file_name,
        mime_type=mime_type,
        audio_data=audio_bytes,
    )

    transcription = await transcribe_voice_audio(
        audio_bytes=audio_bytes,
        file_name=file_name,
        mime_type=mime_type,
        transcript_hint=transcript_hint,
    )

    session, turn = await _create_pending_turn(
        db,
        session=session,
        transcript_text=transcription.transcript_text,
        transcript_confidence=transcription.confidence,
        transcription_provider=transcription.provider,
        user_audio_path=user_audio_path,
        user_audio_mime_type=mime_type,
        user_audio_duration_ms=duration_ms,
    )

    status = await _process_pending_turn(
        db,
        session=session,
        turn=turn,
        transcript_text=transcription.transcript_text,
        user_id=user_id,
    )

    return VoiceTurnUploadAcceptedResponse(
        turn_id=turn.id,
        session_id=session.id,
        status=status,
        transcription_provider=transcription.provider,
    )
@@ -636,6 +837,23 @@ async def get_voice_turn_audio_service(
return read_session_audio(turn.assistant_audio_path)
async def get_voice_turn_user_audio_service(
    session_id: str,
    turn_id: str,
    user_id: str,
    db: AsyncSession,
) -> tuple[bytes, str]:
    """Load the audio a user uploaded for one turn.

    Args:
        session_id: Identifier of the voice session.
        turn_id: Identifier of the turn inside that session.
        user_id: Passed to ``_get_owned_turn`` to resolve the turn.
        db: Async database session.

    Returns:
        ``(audio, mime_type)``, where ``mime_type`` falls back to
        ``"audio/webm"`` when the turn has none stored.

    Raises:
        HTTPException: 404 when no audio exists at the turn's stored path.
    """
    owned_turn = await _get_owned_turn(
        db,
        session_id=session_id,
        turn_id=turn_id,
        user_id=user_id,
    )
    stored_path = owned_turn.user_audio_path
    if not session_audio_exists(stored_path):
        raise HTTPException(status_code=404, detail="Uploaded user audio not found")

    payload = read_session_audio(stored_path)
    media_type = owned_turn.user_audio_mime_type or "audio/webm"
    return payload, media_type
async def finalize_voice_session_service(
session_id: str,
request: VoiceSessionFinalizeRequest,
@@ -655,6 +873,7 @@ async def finalize_voice_session_service(
raise HTTPException(status_code=409, detail="Voice session is not ready to finalize.")
session.status = "finalizing_story"
session.updated_at = _utcnow()
await db.commit()
await db.refresh(session)
await _record_session_event(
@@ -695,6 +914,7 @@ async def finalize_voice_session_service(
session.final_story_id = story.id
session.status = "completed"
session.last_error = None
session.updated_at = _utcnow()
await db.commit()
await db.refresh(session)
@@ -728,6 +948,7 @@ async def abandon_voice_session_service(
session.status = "abandoned"
session.last_error = request.reason
session.updated_at = _utcnow()
await db.commit()
await db.refresh(session)
@@ -741,4 +962,9 @@ async def abandon_voice_session_service(
metadata={"reason": request.reason},
)
await db.refresh(session)
return _session_to_summary(session)
latest_turn = await _get_latest_turn(db, session_id=session.id)
return _session_to_summary(
session,
latest_turn=latest_turn,
total_turns=session.current_turn_index,
)