From 46d6201529c2b22b6592b6a518a1881e34aec0cc Mon Sep 17 00:00:00 2001 From: Yuyan Date: Sun, 19 Apr 2026 23:10:16 +0800 Subject: [PATCH] feat: add voice studio prototype flow --- backend/app/api/voice_sessions.py | 80 +- backend/app/core/config.py | 12 + backend/app/schemas/voice_session_schemas.py | 16 + backend/app/services/voice_session_service.py | 614 +++++++++----- backend/app/services/voice_session_storage.py | 32 + .../services/voice_transcription_service.py | 134 +++ backend/pyproject.toml | 1 + backend/tests/test_voice_sessions.py | 144 ++++ frontend/src/api/client.ts | 42 +- frontend/src/components/NavBar.vue | 16 + frontend/src/router.ts | 5 + frontend/src/types/voiceSession.ts | 75 ++ frontend/src/views/Home.vue | 15 +- frontend/src/views/VoiceStudio.vue | 771 ++++++++++++++++++ 14 files changed, 1745 insertions(+), 212 deletions(-) create mode 100644 backend/app/services/voice_transcription_service.py create mode 100644 frontend/src/types/voiceSession.ts create mode 100644 frontend/src/views/VoiceStudio.vue diff --git a/backend/app/api/voice_sessions.py b/backend/app/api/voice_sessions.py index f0d0604..badcb99 100644 --- a/backend/app/api/voice_sessions.py +++ b/backend/app/api/voice_sessions.py @@ -1,6 +1,15 @@ """Voice co-creation session APIs.""" -from fastapi import APIRouter, Depends, Response, status +from fastapi import ( + APIRouter, + Depends, + File, + Form, + Query, + Response, + UploadFile, + status, +) from sqlalchemy.ext.asyncio import AsyncSession from app.core.deps import require_user @@ -17,15 +26,19 @@ from app.schemas.voice_session_schemas import ( VoiceTurnAcceptedResponse, VoiceTurnCreateFallbackRequest, VoiceTurnSummaryResponse, + VoiceTurnUploadAcceptedResponse, ) from app.services.voice_session_service import ( abandon_voice_session_service, create_voice_session_service, create_voice_turn_from_text_service, + create_voice_turn_from_upload_service, finalize_voice_session_service, get_voice_session_detail_service, 
get_voice_turn_audio_service, get_voice_turn_service, + get_voice_turn_user_audio_service, + list_voice_sessions_service, ) router = APIRouter() @@ -53,6 +66,22 @@ async def create_voice_session( return await create_voice_session_service(request, user.id, db) +@router.get("/voice-sessions", response_model=list[VoiceSessionSummaryResponse]) +async def list_voice_sessions( + limit: int = Query(default=8, ge=1, le=20), + active_only: bool = Query(default=False), + user: User = Depends(require_user), + db: AsyncSession = Depends(get_db), +): + """List recent voice co-creation sessions for restore/resume behavior.""" + return await list_voice_sessions_service( + user.id, + db, + limit=limit, + active_only=active_only, + ) + + @router.get("/voice-sessions/{session_id}", response_model=VoiceSessionDetailResponse) async def get_voice_session( session_id: str, @@ -83,6 +112,38 @@ async def create_voice_turn_from_text( return await create_voice_turn_from_text_service(session_id, request, user.id, db) +@router.post( + "/voice-sessions/{session_id}/turns", + response_model=VoiceTurnUploadAcceptedResponse, + status_code=status.HTTP_202_ACCEPTED, +) +async def create_voice_turn_from_upload( + session_id: str, + audio_file: UploadFile = File(...), + duration_ms: int | None = Form(default=None), + transcript_hint: str | None = Form(default=None), + user: User = Depends(require_user), + db: AsyncSession = Depends(get_db), +): + """Create one turn from uploaded audio and configured ASR behavior.""" + await check_rate_limit( + f"voice-turn:{user.id}", + VOICE_SESSION_RATE_LIMIT_REQUESTS, + VOICE_SESSION_RATE_LIMIT_WINDOW, + ) + audio_bytes = await audio_file.read() + return await create_voice_turn_from_upload_service( + session_id=session_id, + user_id=user.id, + audio_bytes=audio_bytes, + file_name=audio_file.filename or "voice-turn.webm", + mime_type=audio_file.content_type, + duration_ms=duration_ms, + transcript_hint=transcript_hint, + db=db, + ) + + @router.get( 
"/voice-sessions/{session_id}/turns/{turn_id}", response_model=VoiceTurnSummaryResponse, @@ -109,6 +170,23 @@ async def get_voice_turn_audio( return Response(content=audio_bytes, media_type="audio/mpeg") +@router.get("/voice-sessions/{session_id}/turns/{turn_id}/user-audio") +async def get_voice_turn_user_audio( + session_id: str, + turn_id: str, + user: User = Depends(require_user), + db: AsyncSession = Depends(get_db), +): + """Get uploaded user audio for one voice turn.""" + audio_bytes, mime_type = await get_voice_turn_user_audio_service( + session_id, + turn_id, + user.id, + db, + ) + return Response(content=audio_bytes, media_type=mime_type) + + @router.post( "/voice-sessions/{session_id}/finalize", response_model=VoiceSessionFinalizeResponse, diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 9038aaf..ec406ab 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -70,6 +70,18 @@ class Settings(BaseSettings): "storage/voice_sessions", description="Directory for persisted voice co-creation session assets", ) + voice_transcription_mode: str = Field( + "demo", + description="Voice transcription mode: demo, openai, or disabled", + ) + voice_transcription_model: str = Field( + "gpt-4o-mini-transcribe", + description="Model used when voice transcription mode is OpenAI-backed", + ) + voice_transcription_language: str = Field( + "zh", + description="Preferred language hint for voice transcription", + ) story_audio_cache_ttl_days: int = Field( 30, description="TTL in days before cached story audio is pruned", diff --git a/backend/app/schemas/voice_session_schemas.py b/backend/app/schemas/voice_session_schemas.py index 10d06dd..ab1f997 100644 --- a/backend/app/schemas/voice_session_schemas.py +++ b/backend/app/schemas/voice_session_schemas.py @@ -25,6 +25,15 @@ class VoiceTurnCreateFallbackRequest(BaseModel): duration_ms: int | None = Field(default=None, ge=1, le=MAX_VOICE_TURN_DURATION_MS) +class 
VoiceTurnUploadAcceptedResponse(BaseModel): + """Accepted response for one uploaded-audio voice turn.""" + + turn_id: str + session_id: str + status: str + transcription_provider: str | None = None + + class VoiceSessionFinalizeRequest(BaseModel): """Finalize one voice session into a persisted story.""" @@ -61,11 +70,14 @@ class VoiceTurnSummaryResponse(BaseModel): status: str user_transcript: str | None = None transcript_confidence: float | None = None + transcription_provider: str | None = None detected_intent: str intent_confidence: float | None = None assistant_text: str | None = None assistant_audio_ready: bool = False assistant_audio_url: str | None = None + user_audio_ready: bool = False + user_audio_url: str | None = None error_message: str | None = None created_at: datetime updated_at: datetime @@ -81,10 +93,14 @@ class VoiceSessionSummaryResponse(BaseModel): target_mode: str status: str current_turn_index: int + total_turns: int = 0 working_title: str | None = None story_state: dict[str, Any] = Field(default_factory=dict) latest_user_transcript: str | None = None latest_assistant_text: str | None = None + latest_detected_intent: str | None = None + latest_assistant_audio_ready: bool = False + last_turn_status: str | None = None can_continue: bool = False can_finalize: bool = False last_error: str | None = None diff --git a/backend/app/services/voice_session_service.py b/backend/app/services/voice_session_service.py index c01a63a..c3018cd 100644 --- a/backend/app/services/voice_session_service.py +++ b/backend/app/services/voice_session_service.py @@ -2,6 +2,7 @@ from __future__ import annotations +from datetime import datetime, timezone from typing import Any from fastapi import HTTPException @@ -20,24 +21,23 @@ from app.schemas.voice_session_schemas import ( VoiceTurnAcceptedResponse, VoiceTurnCreateFallbackRequest, VoiceTurnSummaryResponse, + VoiceTurnUploadAcceptedResponse, ) from app.services.adapters.text.models import StoryOutput from 
app.services.memory_service import build_enhanced_memory_context from app.services.provider_router import generate_story_content, text_to_speech -from app.services.story_service import ( - create_story_from_result, - validate_profile_and_universe, -) +from app.services.story_service import create_story_from_result, validate_profile_and_universe from app.services.voice_session_storage import ( build_turn_assistant_audio_path, read_session_audio, session_audio_exists, write_session_audio, + write_uploaded_user_audio, ) +from app.services.voice_transcription_service import transcribe_voice_audio logger = get_logger(__name__) -ACTIVE_SESSION_STATUSES = {"draft", "active", "processing_turn", "waiting_user"} CONTINUABLE_SESSION_STATUSES = {"draft", "active", "waiting_user"} FINAL_SESSION_STATUSES = {"completed", "abandoned"} @@ -62,13 +62,24 @@ def _session_can_finalize(session: VoiceSession) -> bool: return bool(segments) and session.status in {"active", "waiting_user"} +def _utcnow() -> datetime: + return datetime.now(timezone.utc) + + def _assistant_audio_url(session_id: str, turn_id: str, audio_path: str | None) -> str | None: if not session_audio_exists(audio_path): return None return f"/api/voice-sessions/{session_id}/turns/{turn_id}/audio" +def _user_audio_url(session_id: str, turn_id: str, audio_path: str | None) -> str | None: + if not session_audio_exists(audio_path): + return None + return f"/api/voice-sessions/{session_id}/turns/{turn_id}/user-audio" + + def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse: + turn_patch = turn.story_patch or {} return VoiceTurnSummaryResponse( id=turn.id, session_id=turn.session_id, @@ -76,6 +87,7 @@ def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse: status=turn.status, user_transcript=turn.user_transcript, transcript_confidence=turn.transcript_confidence, + transcription_provider=turn_patch.get("transcription_provider"), detected_intent=turn.detected_intent, 
intent_confidence=turn.intent_confidence, assistant_text=turn.assistant_text, @@ -85,13 +97,25 @@ def _turn_to_summary(turn: VoiceTurn) -> VoiceTurnSummaryResponse: turn.id, turn.assistant_audio_path, ), + user_audio_ready=session_audio_exists(turn.user_audio_path), + user_audio_url=_user_audio_url(turn.session_id, turn.id, turn.user_audio_path), error_message=turn.error_message, created_at=turn.created_at, updated_at=turn.updated_at, ) -def _session_to_summary(session: VoiceSession) -> VoiceSessionSummaryResponse: +def _session_to_summary( + session: VoiceSession, + *, + latest_turn: VoiceTurn | None = None, + total_turns: int | None = None, +) -> VoiceSessionSummaryResponse: + if latest_turn is None: + total_turns = total_turns if total_turns is not None else session.current_turn_index + else: + total_turns = total_turns if total_turns is not None else latest_turn.turn_index + return VoiceSessionSummaryResponse( id=session.id, child_profile_id=session.child_profile_id, @@ -100,10 +124,16 @@ def _session_to_summary(session: VoiceSession) -> VoiceSessionSummaryResponse: target_mode=session.target_mode, status=session.status, current_turn_index=session.current_turn_index, + total_turns=total_turns or 0, working_title=session.working_title, story_state=session.story_state or {}, latest_user_transcript=session.latest_user_transcript, latest_assistant_text=session.latest_assistant_text, + latest_detected_intent=latest_turn.detected_intent if latest_turn else None, + latest_assistant_audio_ready=( + session_audio_exists(latest_turn.assistant_audio_path) if latest_turn else False + ), + last_turn_status=latest_turn.status if latest_turn else None, can_continue=_session_can_continue(session), can_finalize=_session_can_finalize(session), last_error=session.last_error, @@ -154,6 +184,20 @@ async def _get_owned_session( return session +async def _get_latest_turn( + db: AsyncSession, + *, + session_id: str, +) -> VoiceTurn | None: + result = await db.execute( + 
select(VoiceTurn) + .where(VoiceTurn.session_id == session_id) + .order_by(desc(VoiceTurn.turn_index)) + .limit(1) + ) + return result.scalar_one_or_none() + + async def _get_owned_turn( db: AsyncSession, *, @@ -307,6 +351,283 @@ def _merge_story_state( return current_state, patch +async def _create_pending_turn( + db: AsyncSession, + *, + session: VoiceSession, + transcript_text: str, + transcript_confidence: float | None, + transcription_provider: str | None, + user_audio_path: str | None = None, + user_audio_mime_type: str | None = None, + user_audio_duration_ms: int | None = None, +) -> tuple[VoiceSession, VoiceTurn]: + if session.status not in CONTINUABLE_SESSION_STATUSES: + raise HTTPException( + status_code=409, + detail="Voice session is not ready for another turn.", + ) + + next_turn_index = session.current_turn_index + 1 + detected_intent, intent_confidence = _detect_intent( + transcript_text, + current_turn_index=session.current_turn_index, + ) + turn = VoiceTurn( + session_id=session.id, + turn_index=next_turn_index, + status="transcribing", + user_audio_path=user_audio_path, + user_audio_mime_type=user_audio_mime_type, + user_audio_duration_ms=user_audio_duration_ms, + user_transcript=transcript_text, + transcript_confidence=transcript_confidence, + detected_intent=detected_intent, + intent_confidence=intent_confidence, + story_patch={"transcription_provider": transcription_provider}, + ) + session.status = "processing_turn" + session.current_turn_index = next_turn_index + session.latest_user_transcript = transcript_text + session.last_error = None + session.updated_at = _utcnow() + db.add(turn) + await db.commit() + await db.refresh(session) + await db.refresh(turn) + + await _record_session_event( + db, + session_id=session.id, + turn_id=turn.id, + event_type="turn_received", + status="received", + message="Voice turn received.", + metadata={ + "turn_index": turn.turn_index, + "has_user_audio": bool(user_audio_path), + "transcription_provider": 
transcription_provider, + }, + ) + if user_audio_path: + await _record_session_event( + db, + session_id=session.id, + turn_id=turn.id, + event_type="turn_audio_uploaded", + status="succeeded", + message="User audio uploaded for one voice turn.", + metadata={ + "mime_type": user_audio_mime_type, + "audio_path": user_audio_path, + }, + ) + await _record_session_event( + db, + session_id=session.id, + turn_id=turn.id, + event_type="turn_transcribed", + status="succeeded", + message="Voice turn transcript is available.", + metadata={ + "transcript_confidence": transcript_confidence, + "transcription_provider": transcription_provider, + }, + ) + return session, turn + + +async def _process_pending_turn( + db: AsyncSession, + *, + session: VoiceSession, + turn: VoiceTurn, + transcript_text: str, + user_id: str, +) -> str: + assistant_text: str | None = None + assistant_result: StoryOutput | None = None + detected_intent = turn.detected_intent + intent_confidence = turn.intent_confidence + + try: + await _record_session_event( + db, + session_id=session.id, + turn_id=turn.id, + event_type="intent_resolved", + status="succeeded", + message="Turn intent resolved.", + metadata={ + "detected_intent": detected_intent, + "intent_confidence": intent_confidence, + }, + ) + + if detected_intent == "save_story": + assistant_text = "好的,这个故事已经准备好保存到故事库了。" + elif detected_intent == "end_story": + assistant_text = "好的,我们先把故事停在这里。想保存的话,现在就可以保存到故事库。" + else: + assistant_result = await _generate_assistant_turn( + db, + session=session, + transcript_text=transcript_text, + intent=detected_intent, + ) + assistant_text = assistant_result.story_text.strip() + + merged_state, story_patch = _merge_story_state( + session, + transcript_text=transcript_text, + intent=detected_intent, + assistant_result=assistant_result, + ) + story_patch["transcription_provider"] = ( + (turn.story_patch or {}).get("transcription_provider") + ) + turn.story_patch = story_patch + turn.assistant_text = 
assistant_text + turn.status = "narrative_ready" + session.story_state = merged_state + session.latest_assistant_text = assistant_text + session.status = "waiting_user" + session.updated_at = _utcnow() + if assistant_result and assistant_result.title and not session.working_title: + session.working_title = assistant_result.title + await db.commit() + await db.refresh(session) + await db.refresh(turn) + + await _record_session_event( + db, + session_id=session.id, + turn_id=turn.id, + event_type="story_patch_applied", + status="succeeded", + message="Story state updated after one turn.", + metadata=story_patch, + ) + await _record_session_event( + db, + session_id=session.id, + turn_id=turn.id, + event_type="assistant_text_ready", + status="succeeded", + message="Assistant text response generated.", + metadata={ + "assistant_text_length": len(assistant_text or ""), + "working_title": session.working_title, + }, + ) + except Exception as exc: + turn.status = "failed" + turn.error_message = str(exc) + session.status = "waiting_user" + session.last_error = str(exc) + session.updated_at = _utcnow() + await db.commit() + await db.refresh(session) + await db.refresh(turn) + await _record_session_event( + db, + session_id=session.id, + turn_id=turn.id, + event_type="session_failed", + status="failed", + message="Assistant narrative generation failed for one voice turn.", + metadata={"error": str(exc), "turn_index": turn.turn_index}, + ) + logger.warning( + "voice_turn_generation_failed", + session_id=session.id, + turn_id=turn.id, + error=str(exc), + ) + return turn.status + + if assistant_text: + try: + audio_bytes = await text_to_speech( + assistant_text, + db=db, + user_id=user_id, + ) + saved_path = write_session_audio( + build_turn_assistant_audio_path(session.id, turn.turn_index), + audio_bytes, + ) + turn.assistant_audio_path = saved_path + turn.assistant_audio_duration_ms = None + turn.status = "audio_ready" + await db.commit() + await db.refresh(turn) + await 
_record_session_event( + db, + session_id=session.id, + turn_id=turn.id, + event_type="assistant_audio_ready", + status="succeeded", + message="Assistant audio response generated.", + metadata={"audio_path": saved_path}, + ) + except Exception as exc: + turn.status = "narrative_ready" + turn.error_message = None + session.last_error = None + session.updated_at = _utcnow() + await db.commit() + await db.refresh(turn) + await db.refresh(session) + await _record_session_event( + db, + session_id=session.id, + turn_id=turn.id, + event_type="assistant_audio_failed", + status="failed", + message="Assistant audio generation failed, text response kept.", + metadata={"error": str(exc)}, + ) + logger.warning( + "voice_turn_audio_failed", + session_id=session.id, + turn_id=turn.id, + error=str(exc), + ) + + return turn.status + + +async def list_voice_sessions_service( + user_id: str, + db: AsyncSession, + *, + limit: int = 8, + active_only: bool = False, +) -> list[VoiceSessionSummaryResponse]: + query = ( + select(VoiceSession) + .where(VoiceSession.user_id == user_id) + .order_by(desc(VoiceSession.updated_at), desc(VoiceSession.created_at)) + .limit(limit) + ) + if active_only: + query = query.where(VoiceSession.status.in_(CONTINUABLE_SESSION_STATUSES)) + + sessions = (await db.execute(query)).scalars().all() + summaries: list[VoiceSessionSummaryResponse] = [] + for session in sessions: + latest_turn = await _get_latest_turn(db, session_id=session.id) + summaries.append( + _session_to_summary( + session, + latest_turn=latest_turn, + total_turns=session.current_turn_index, + ) + ) + return summaries + + async def create_voice_session_service( request: VoiceSessionCreateRequest, user_id: str, @@ -373,7 +694,12 @@ async def get_voice_session_detail_service( ).scalars().all() events = list(reversed(events)) - summary = _session_to_summary(session) + latest_turn = turns[-1] if turns else None + summary = _session_to_summary( + session, + latest_turn=latest_turn, + 
total_turns=session.current_turn_index, + ) return VoiceSessionDetailResponse( **summary.model_dump(), recent_turns=[_turn_to_summary(turn) for turn in turns], @@ -399,208 +725,83 @@ async def create_voice_turn_from_text_service( user_id: str, db: AsyncSession, ) -> VoiceTurnAcceptedResponse: + session = await _get_owned_session(db, session_id=session_id, user_id=user_id) + transcript_text = request.transcript_text.strip() + session, turn = await _create_pending_turn( + db, + session=session, + transcript_text=transcript_text, + transcript_confidence=1.0, + transcription_provider="fallback", + user_audio_duration_ms=request.duration_ms, + ) + status = await _process_pending_turn( + db, + session=session, + turn=turn, + transcript_text=transcript_text, + user_id=user_id, + ) + return VoiceTurnAcceptedResponse( + turn_id=turn.id, + session_id=session.id, + status=status, + ) + + +async def create_voice_turn_from_upload_service( + *, + session_id: str, + user_id: str, + audio_bytes: bytes, + file_name: str, + mime_type: str | None, + duration_ms: int | None, + transcript_hint: str | None, + db: AsyncSession, +) -> VoiceTurnUploadAcceptedResponse: session = await _get_owned_session(db, session_id=session_id, user_id=user_id) if session.status not in CONTINUABLE_SESSION_STATUSES: raise HTTPException( status_code=409, detail="Voice session is not ready for another turn.", ) - - transcript_text = request.transcript_text.strip() next_turn_index = session.current_turn_index + 1 - detected_intent, intent_confidence = _detect_intent( - transcript_text, - current_turn_index=session.current_turn_index, - ) - - turn = VoiceTurn( + user_audio_path = write_uploaded_user_audio( session_id=session.id, turn_index=next_turn_index, - status="transcribing", - user_audio_duration_ms=request.duration_ms, - user_transcript=transcript_text, - transcript_confidence=1.0, - detected_intent=detected_intent, - intent_confidence=intent_confidence, + file_name=file_name, + mime_type=mime_type, + 
audio_data=audio_bytes, ) - session.status = "processing_turn" - session.current_turn_index = next_turn_index - session.latest_user_transcript = transcript_text - session.last_error = None - db.add(turn) - await db.commit() - await db.refresh(session) - await db.refresh(turn) - - await _record_session_event( + transcription = await transcribe_voice_audio( + audio_bytes=audio_bytes, + file_name=file_name, + mime_type=mime_type, + transcript_hint=transcript_hint, + ) + session, turn = await _create_pending_turn( db, - session_id=session.id, - turn_id=turn.id, - event_type="turn_received", - status="received", - message="Voice turn fallback text received.", - metadata={"turn_index": turn.turn_index}, + session=session, + transcript_text=transcription.transcript_text, + transcript_confidence=transcription.confidence, + transcription_provider=transcription.provider, + user_audio_path=user_audio_path, + user_audio_mime_type=mime_type, + user_audio_duration_ms=duration_ms, ) - await _record_session_event( + status = await _process_pending_turn( db, - session_id=session.id, - turn_id=turn.id, - event_type="turn_transcribed", - status="succeeded", - message="Fallback transcript accepted.", - metadata={"transcript_confidence": turn.transcript_confidence}, + session=session, + turn=turn, + transcript_text=transcription.transcript_text, + user_id=user_id, ) - - assistant_text: str | None = None - assistant_result: StoryOutput | None = None - - try: - await _record_session_event( - db, - session_id=session.id, - turn_id=turn.id, - event_type="intent_resolved", - status="succeeded", - message="Turn intent resolved.", - metadata={ - "detected_intent": detected_intent, - "intent_confidence": intent_confidence, - }, - ) - - if detected_intent == "save_story": - assistant_text = "好的,这个故事已经准备好保存到故事库了。" - elif detected_intent == "end_story": - assistant_text = "好的,我们先把故事停在这里。想保存的话,现在就可以保存到故事库。" - else: - assistant_result = await _generate_assistant_turn( - db, - session=session, - 
transcript_text=transcript_text, - intent=detected_intent, - ) - assistant_text = assistant_result.story_text.strip() - - merged_state, story_patch = _merge_story_state( - session, - transcript_text=transcript_text, - intent=detected_intent, - assistant_result=assistant_result, - ) - turn.story_patch = story_patch - turn.assistant_text = assistant_text - turn.status = "narrative_ready" - session.story_state = merged_state - session.latest_assistant_text = assistant_text - session.status = "waiting_user" - if assistant_result and assistant_result.title and not session.working_title: - session.working_title = assistant_result.title - await db.commit() - await db.refresh(session) - await db.refresh(turn) - - await _record_session_event( - db, - session_id=session.id, - turn_id=turn.id, - event_type="story_patch_applied", - status="succeeded", - message="Story state updated after one turn.", - metadata=story_patch, - ) - await _record_session_event( - db, - session_id=session.id, - turn_id=turn.id, - event_type="assistant_text_ready", - status="succeeded", - message="Assistant text response generated.", - metadata={ - "assistant_text_length": len(assistant_text or ""), - "working_title": session.working_title, - }, - ) - except Exception as exc: - turn.status = "failed" - turn.error_message = str(exc) - session.status = "waiting_user" - session.last_error = str(exc) - await db.commit() - await db.refresh(session) - await db.refresh(turn) - await _record_session_event( - db, - session_id=session.id, - turn_id=turn.id, - event_type="session_failed", - status="failed", - message="Assistant narrative generation failed for one voice turn.", - metadata={"error": str(exc), "turn_index": turn.turn_index}, - ) - logger.warning( - "voice_turn_generation_failed", - session_id=session.id, - turn_id=turn.id, - error=str(exc), - ) - return VoiceTurnAcceptedResponse( - turn_id=turn.id, - session_id=session.id, - status=turn.status, - ) - - if assistant_text: - try: - audio_bytes = 
await text_to_speech( - assistant_text, - db=db, - user_id=user_id, - ) - saved_path = write_session_audio( - build_turn_assistant_audio_path(session.id, turn.turn_index), - audio_bytes, - ) - turn.assistant_audio_path = saved_path - turn.assistant_audio_duration_ms = None - turn.status = "audio_ready" - await db.commit() - await db.refresh(turn) - await _record_session_event( - db, - session_id=session.id, - turn_id=turn.id, - event_type="assistant_audio_ready", - status="succeeded", - message="Assistant audio response generated.", - metadata={"audio_path": saved_path}, - ) - except Exception as exc: - turn.status = "narrative_ready" - turn.error_message = None - session.last_error = None - await db.commit() - await db.refresh(turn) - await db.refresh(session) - await _record_session_event( - db, - session_id=session.id, - turn_id=turn.id, - event_type="assistant_audio_failed", - status="failed", - message="Assistant audio generation failed, text response kept.", - metadata={"error": str(exc)}, - ) - logger.warning( - "voice_turn_audio_failed", - session_id=session.id, - turn_id=turn.id, - error=str(exc), - ) - - return VoiceTurnAcceptedResponse( + return VoiceTurnUploadAcceptedResponse( turn_id=turn.id, session_id=session.id, - status=turn.status, + status=status, + transcription_provider=transcription.provider, ) @@ -636,6 +837,23 @@ async def get_voice_turn_audio_service( return read_session_audio(turn.assistant_audio_path) +async def get_voice_turn_user_audio_service( + session_id: str, + turn_id: str, + user_id: str, + db: AsyncSession, +) -> tuple[bytes, str]: + turn = await _get_owned_turn( + db, + session_id=session_id, + turn_id=turn_id, + user_id=user_id, + ) + if not session_audio_exists(turn.user_audio_path): + raise HTTPException(status_code=404, detail="Uploaded user audio not found") + return read_session_audio(turn.user_audio_path), (turn.user_audio_mime_type or "audio/webm") + + async def finalize_voice_session_service( session_id: str, request: 
VoiceSessionFinalizeRequest, @@ -655,6 +873,7 @@ async def finalize_voice_session_service( raise HTTPException(status_code=409, detail="Voice session is not ready to finalize.") session.status = "finalizing_story" + session.updated_at = _utcnow() await db.commit() await db.refresh(session) await _record_session_event( @@ -695,6 +914,7 @@ async def finalize_voice_session_service( session.final_story_id = story.id session.status = "completed" session.last_error = None + session.updated_at = _utcnow() await db.commit() await db.refresh(session) @@ -728,6 +948,7 @@ async def abandon_voice_session_service( session.status = "abandoned" session.last_error = request.reason + session.updated_at = _utcnow() await db.commit() await db.refresh(session) @@ -741,4 +962,9 @@ async def abandon_voice_session_service( metadata={"reason": request.reason}, ) await db.refresh(session) - return _session_to_summary(session) + latest_turn = await _get_latest_turn(db, session_id=session.id) + return _session_to_summary( + session, + latest_turn=latest_turn, + total_turns=session.current_turn_index, + ) diff --git a/backend/app/services/voice_session_storage.py b/backend/app/services/voice_session_storage.py index 5aa3e03..253efad 100644 --- a/backend/app/services/voice_session_storage.py +++ b/backend/app/services/voice_session_storage.py @@ -26,6 +26,38 @@ def build_turn_assistant_audio_path(session_id: str, turn_index: int) -> Path: return session_storage_dir(session_id) / f"turn-{turn_index:03d}-assistant.mp3" +def _normalize_audio_suffix(file_name: str | None, mime_type: str | None) -> str: + if file_name and "." 
@dataclass(frozen=True)
class VoiceTranscriptionResult:
    """Normalized transcription result for one uploaded voice turn."""

    # Transcript text with surrounding whitespace already stripped.
    transcript_text: str
    # Confidence score; None when the backend does not report one.
    confidence: float | None = None
    # Name of the backend that produced the transcript ("demo" or "openai").
    provider: str = "demo"


def _normalize_transcript(transcript_text: str) -> str:
    """Return *transcript_text* with leading and trailing whitespace removed."""
    return transcript_text.strip()


async def _transcribe_demo(
    *,
    audio_bytes: bytes,
    mime_type: str | None,
    transcript_hint: str | None,
) -> VoiceTranscriptionResult:
    """Produce a transcript without calling any speech-recognition backend.

    Resolution order:
      1. a non-blank ``transcript_hint`` is returned as the transcript;
      2. a ``text/*`` upload is decoded as UTF-8 and returned if non-blank;
      3. otherwise the request is rejected.

    Raises:
        HTTPException: 503 when neither a hint nor textual content is usable.
    """
    hint = _normalize_transcript(transcript_hint or "")
    if hint:
        return VoiceTranscriptionResult(
            transcript_text=hint,
            confidence=1.0,
            provider="demo",
        )

    if mime_type and mime_type.startswith("text/"):
        # Undecodable bytes are dropped rather than failing the whole turn.
        text = _normalize_transcript(audio_bytes.decode("utf-8", errors="ignore"))
        if text:
            return VoiceTranscriptionResult(
                transcript_text=text,
                confidence=1.0,
                provider="demo",
            )

    raise HTTPException(
        status_code=503,
        detail=(
            "当前环境未配置真实语音转写,请先使用文本共创模式,"
            "或在开发模式下提供 transcript_hint。"
        ),
    )


async def _transcribe_openai(
    *,
    audio_bytes: bytes,
    file_name: str,
    mime_type: str | None,
    transcript_hint: str | None,
) -> VoiceTranscriptionResult:
    """Transcribe one upload through the OpenAI audio transcription endpoint.

    ``mime_type`` is accepted for signature parity with ``_transcribe_demo``
    and is not forwarded; the upload is labelled by ``file_name`` only.

    Raises:
        HTTPException: 503 when no API key is configured or the provider call
            fails, 400 when the upload is empty, 502 when the provider returns
            an empty transcript.
    """
    if not settings.openai_api_key:
        raise HTTPException(
            status_code=503,
            detail="OPENAI_API_KEY 未配置,无法使用 OpenAI 语音转写。",
        )

    if not audio_bytes:
        # Reject locally: forwarding an empty file would only come back as a
        # provider error and be reported as a misleading 503.
        raise HTTPException(status_code=400, detail="上传的音频为空,请重新录制。")

    audio_file = BytesIO(audio_bytes)
    audio_file.name = file_name

    # Only send optional fields that actually carry a value; unset options are
    # omitted instead of being passed as None/blank.
    request_kwargs: dict[str, object] = {
        "model": settings.voice_transcription_model,
        "file": audio_file,
    }
    language = settings.voice_transcription_language
    if language:
        request_kwargs["language"] = language
    prompt = _normalize_transcript(transcript_hint or "")
    if prompt:
        request_kwargs["prompt"] = prompt

    try:
        # The client owns an HTTP connection pool; the context manager closes
        # it once this request is done instead of leaking it per call.
        async with AsyncOpenAI(api_key=settings.openai_api_key) as client:
            response = await client.audio.transcriptions.create(**request_kwargs)
    except Exception as exc:
        # Service boundary: any provider/transport failure becomes a 503.
        logger.warning("voice_transcription_openai_failed", error=str(exc))
        raise HTTPException(
            status_code=503,
            detail="语音转写服务暂时不可用,请稍后重试。",
        ) from exc

    transcript_text = _normalize_transcript(getattr(response, "text", "") or "")
    if not transcript_text:
        raise HTTPException(status_code=502, detail="语音转写结果为空,请重试。")

    return VoiceTranscriptionResult(
        transcript_text=transcript_text,
        confidence=None,
        provider="openai",
    )


async def transcribe_voice_audio(
    *,
    audio_bytes: bytes,
    file_name: str,
    mime_type: str | None,
    transcript_hint: str | None = None,
) -> VoiceTranscriptionResult:
    """Transcribe one uploaded audio turn according to the configured mode.

    ``settings.voice_transcription_mode`` selects the backend: ``"disabled"``
    rejects the request, ``"openai"`` calls OpenAI, and any other value
    (including an unset one) falls back to the demo behavior.

    Raises:
        HTTPException: 503 when transcription is disabled, plus whatever the
            selected backend raises.
    """

    mode = (settings.voice_transcription_mode or "demo").strip().lower()

    if mode == "disabled":
        raise HTTPException(
            status_code=503,
            detail="当前环境已禁用语音转写,请先使用文本共创模式。",
        )

    if mode == "openai":
        return await _transcribe_openai(
            audio_bytes=audio_bytes,
            file_name=file_name,
            mime_type=mime_type,
            transcript_hint=transcript_hint,
        )

    return await _transcribe_demo(
        audio_bytes=audio_bytes,
        mime_type=mime_type,
        transcript_hint=transcript_hint,
    )
f"/api/voice-sessions/{session_id}/turns", + files={ + "audio_file": ("turn.webm", b"fake-webm-audio", "audio/webm"), + }, + data={ + "duration_ms": "3200", + "transcript_hint": "我想听一个小鲸鱼找朋友的故事", + }, + ) + assert response.status_code == 202 + turn_data = response.json() + assert turn_data["status"] == "audio_ready" + assert turn_data["transcription_provider"] == "demo" + + turn_id = turn_data["turn_id"] + response = await client.get( + f"/api/voice-sessions/{session_id}/turns/{turn_id}" + ) + assert response.status_code == 200 + detail = response.json() + assert detail["user_audio_ready"] is True + assert detail["user_audio_url"].endswith("/user-audio") + assert detail["transcription_provider"] == "demo" + assert detail["assistant_audio_ready"] is True + + response = await client.get(detail["user_audio_url"]) + assert response.status_code == 200 + assert response.content == b"fake-webm-audio" + assert response.headers["content-type"] == "audio/webm" + finally: + app.dependency_overrides.clear() + + +async def test_voice_session_list_orders_recent_sessions_first( + db_session, + auth_token, +): + async def override_get_db(): + yield db_session + + app.dependency_overrides[get_db] = override_get_db + + with ( + patch( + "app.services.voice_session_service.generate_story_content", + new_callable=AsyncMock, + ) as mock_generate, + patch( + "app.services.voice_session_service.text_to_speech", + new_callable=AsyncMock, + ) as mock_tts, + ): + mock_generate.side_effect = [ + StoryOutput( + mode="generated", + title="第一场冒险", + story_text="第一段故事。", + cover_prompt_suggestion="封面一", + ), + StoryOutput( + mode="generated", + title="第二场冒险", + story_text="第二段故事。", + cover_prompt_suggestion="封面二", + ), + ] + mock_tts.side_effect = [b"audio-1", b"audio-2"] + + transport = ASGITransport(app=app) + try: + async with AsyncClient(transport=transport, base_url="http://test") as client: + client.cookies.set("access_token", auth_token) + + response = await 
client.post("/api/voice-sessions", json={}) + first_session_id = response.json()["id"] + await client.post( + f"/api/voice-sessions/{first_session_id}/turns/fallback", + json={"transcript_text": "第一个故事"}, + ) + + response = await client.post("/api/voice-sessions", json={}) + second_session_id = response.json()["id"] + await client.post( + f"/api/voice-sessions/{second_session_id}/turns/fallback", + json={"transcript_text": "第二个故事"}, + ) + + response = await client.get("/api/voice-sessions?limit=8") + assert response.status_code == 200 + sessions = response.json() + assert len(sessions) >= 2 + assert sessions[0]["id"] == second_session_id + assert sessions[1]["id"] == first_session_id + assert sessions[0]["total_turns"] == 1 + assert sessions[0]["last_turn_status"] == "audio_ready" + + response = await client.get("/api/voice-sessions?active_only=true") + assert response.status_code == 200 + active_sessions = response.json() + assert {item["id"] for item in active_sessions} >= { + first_session_id, + second_session_id, + } + finally: + app.dependency_overrides.clear() diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts index 918f353..8efa6d7 100644 --- a/frontend/src/api/client.ts +++ b/frontend/src/api/client.ts @@ -1,19 +1,22 @@ -const BASE_URL = '' - -class ApiClient { - async request(url: string, options: RequestInit = {}): Promise { - const response = await fetch(`${BASE_URL}${url}`, { - ...options, - credentials: 'include', - headers: { - 'Content-Type': 'application/json', - ...options.headers, - }, - }) - - if (!response.ok) { - const error = await response.json().catch(() => ({ detail: '请求失败' })) - throw new Error(error.detail || '请求失败') +const BASE_URL = '' + +class ApiClient { + async request(url: string, options: RequestInit = {}): Promise { + const headers = new Headers(options.headers || {}) + const isFormData = options.body instanceof FormData + if (!isFormData && !headers.has('Content-Type')) { + headers.set('Content-Type', 
/** One turn of a voice co-creation session. */
export interface VoiceTurnSummary {
  id: string
  session_id: string
  turn_index: number
  status: string
  user_transcript: string | null
  transcript_confidence: number | null
  transcription_provider: string | null
  detected_intent: string
  intent_confidence: number | null
  assistant_text: string | null
  assistant_audio_ready: boolean
  assistant_audio_url: string | null
  user_audio_ready: boolean
  user_audio_url: string | null
  error_message: string | null
  created_at: string
  updated_at: string
}

/** One entry of a session's event log. */
export interface VoiceSessionEvent {
  id: number
  session_id: string
  turn_id: string | null
  event_type: string
  status: string
  message: string | null
  // Free-form payload; `unknown` forces callers to narrow before use.
  event_metadata: Record<string, unknown>
  created_at: string
}

/** Summary of one voice session, as used in session lists. */
export interface VoiceSessionSummary {
  id: string
  child_profile_id: string | null
  universe_id: string | null
  final_story_id: number | null
  target_mode: string
  status: string
  current_turn_index: number
  total_turns: number
  working_title: string | null
  // Free-form payload; `unknown` forces callers to narrow before use.
  story_state: Record<string, unknown>
  latest_user_transcript: string | null
  latest_assistant_text: string | null
  latest_detected_intent: string | null
  latest_assistant_audio_ready: boolean
  last_turn_status: string | null
  can_continue: boolean
  can_finalize: boolean
  last_error: string | null
  created_at: string
  updated_at: string
}

/** Session summary extended with its recent turns and events. */
export interface VoiceSessionDetail extends VoiceSessionSummary {
  recent_turns: VoiceTurnSummary[]
  events: VoiceSessionEvent[]
}

/** Acknowledgement returned when a turn is accepted. */
export interface VoiceTurnAcceptedResponse {
  turn_id: string
  session_id: string
  status: string
}

/** Acknowledgement for an uploaded-audio turn; adds the transcription backend. */
export interface VoiceTurnUploadAcceptedResponse extends VoiceTurnAcceptedResponse {
  transcription_provider: string | null
}

/** Result of finalizing a session. */
export interface VoiceSessionFinalizeResponse {
  session_id: string
  status: string
  story_id: number | null
  generation_job_id: string | null
}
document.getElementById('features')?.scrollIntoView({ behavior: 'smooth' }) } @@ -139,6 +148,10 @@ function scrollToFeatures() { 开始创作故事 + + + 进入语音共创 + diff --git a/frontend/src/views/VoiceStudio.vue b/frontend/src/views/VoiceStudio.vue new file mode 100644 index 0000000..7b757e5 --- /dev/null +++ b/frontend/src/views/VoiceStudio.vue @@ -0,0 +1,771 @@ + + +