From 29643f6683a4e88192437248efb1335b9d558599 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 22 Dec 2025 14:41:57 -0600 Subject: [PATCH] upgrade TTS to Gemini TTS with voice selection and preview MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Gemini TTS service with 30 voices and 24 languages - Add TTS API endpoints for voice listing and preview - Add per-language voice selection in job creation form - Add voice override at QC approval stage - Add VoiceSelector and VoicePreviewButton components - Update TTSPreferences model with provider and voice mapping 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- backend/app/api/v1/routes_jobs.py | 20 +- backend/app/api/v1/routes_tts.py | 104 +++++++ backend/app/core/config.py | 95 ++++++- backend/app/main.py | 2 + backend/app/models/job.py | 8 + backend/app/schemas/job.py | 5 +- backend/app/services/gemini_tts.py | 256 ++++++++++++++++++ backend/app/services/tts.py | 69 +++-- backend/app/tasks/translate_and_synthesize.py | 38 ++- .../src/components/VoicePreviewButton.tsx | 138 ++++++++++ frontend/src/components/VoiceSelector.tsx | 216 +++++++++++++++ frontend/src/hooks/useJob.ts | 15 +- frontend/src/lib/api.ts | 30 +- frontend/src/routes/admin/QCDetail.tsx | 68 ++++- frontend/src/routes/jobs/NewJob.tsx | 47 +++- frontend/src/types/api.ts | 19 ++ 16 files changed, 1075 insertions(+), 55 deletions(-) create mode 100644 backend/app/api/v1/routes_tts.py create mode 100644 backend/app/services/gemini_tts.py create mode 100644 frontend/src/components/VoicePreviewButton.tsx create mode 100644 frontend/src/components/VoiceSelector.tsx diff --git a/backend/app/api/v1/routes_jobs.py b/backend/app/api/v1/routes_jobs.py index b53122b..db01ba0 100644 --- a/backend/app/api/v1/routes_jobs.py +++ b/backend/app/api/v1/routes_jobs.py @@ -369,15 +369,23 @@ async def approve_source( source_language = job_doc["source"].get("language", "en") 
new_status = JobStatus.APPROVED_ENGLISH if source_language == "en" else JobStatus.APPROVED_SOURCE + # Build update operations + update_set = { + "status": new_status.value, + "review.notes": request.notes or "", + "review.reviewer_id": str(current_user.id), + "updated_at": datetime.utcnow() + } + + # If TTS preferences override provided, update requested_outputs.tts_preferences + if request.tts_preferences: + update_set["requested_outputs.tts_preferences"] = request.tts_preferences.model_dump() + logger.info(f"Updating TTS preferences for job {job_id}: {request.tts_preferences}") + result = await db.jobs.find_one_and_update( {"_id": job_id, "status": JobStatus.PENDING_QC.value}, { - "$set": { - "status": new_status.value, - "review.notes": request.notes or "", - "review.reviewer_id": str(current_user.id), - "updated_at": datetime.utcnow() - }, + "$set": update_set, "$push": { "review.history": { "at": datetime.utcnow(), diff --git a/backend/app/api/v1/routes_tts.py b/backend/app/api/v1/routes_tts.py new file mode 100644 index 0000000..b7eca84 --- /dev/null +++ b/backend/app/api/v1/routes_tts.py @@ -0,0 +1,104 @@ +from fastapi import APIRouter, Depends, HTTPException +from fastapi.responses import Response +from pydantic import BaseModel + +from ...core.config import settings +from ...core.logging import get_logger +from ...services.gemini_tts import gemini_tts_service +from ..deps import get_current_user + +logger = get_logger(__name__) + +router = APIRouter(prefix="/tts", tags=["tts"]) + + +class VoicePreviewRequest(BaseModel): + """Request to generate a voice preview""" + voice_name: str + language: str = "en" + + +class VoicesResponse(BaseModel): + """Available TTS voices""" + voices: list[str] + default: str + + +class LanguagesResponse(BaseModel): + """Supported TTS languages""" + languages: dict[str, str] # code -> display name + preview_samples: dict[str, str] # code -> sample text + + +@router.get("/voices", response_model=VoicesResponse) +async def 
list_voices( + current_user=Depends(get_current_user) +) -> VoicesResponse: + """ + List all available Gemini TTS voices. + """ + return VoicesResponse( + voices=settings.gemini_tts_voices, + default=settings.gemini_tts_default_voice + ) + + +@router.get("/languages", response_model=LanguagesResponse) +async def list_languages( + current_user=Depends(get_current_user) +) -> LanguagesResponse: + """ + List all supported TTS languages with display names and preview samples. + """ + return LanguagesResponse( + languages=settings.gemini_tts_language_names, + preview_samples=settings.gemini_tts_preview_samples + ) + + +@router.post("/preview") +async def preview_voice( + request: VoicePreviewRequest, + current_user=Depends(get_current_user) +) -> Response: + """ + Generate a voice preview audio sample. + Returns MP3 audio data. + """ + # Validate voice name + if request.voice_name not in settings.gemini_tts_voices: + raise HTTPException( + status_code=400, + detail=f"Invalid voice name. Available voices: {', '.join(settings.gemini_tts_voices)}" + ) + + # Validate language + if request.language not in settings.gemini_tts_languages: + raise HTTPException( + status_code=400, + detail=f"Unsupported language. 
Available languages: {', '.join(settings.gemini_tts_languages.keys())}" + ) + + try: + logger.info(f"Generating voice preview: voice={request.voice_name}, language={request.language}") + + # Generate preview audio + audio_data = await gemini_tts_service.synthesize_preview( + voice_name=request.voice_name, + language=request.language + ) + + return Response( + content=audio_data, + media_type="audio/mpeg", + headers={ + "Content-Disposition": f"inline; filename=preview_{request.voice_name}_{request.language}.mp3" + } + ) + + except Exception as e: + logger.error(f"Voice preview generation failed: {e}") + raise HTTPException( + status_code=500, + detail=f"Failed to generate voice preview: {str(e)}" + ) from e diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 2b82d0b..05cf1fe 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -39,20 +39,109 @@ class Settings(BaseSettings): google_tts_credentials: str = "" # TTS Voice Configuration - tts_provider: str = "google" # "google" or "elevenlabs" + tts_provider: str = "gemini" # "gemini", "google", or "elevenlabs" google_tts_voices: dict[str, str] = { "en-US": "en-US-Neural2-D", - "es-ES": "es-ES-Neural2-A", + "es-ES": "es-ES-Neural2-A", "fr-FR": "fr-FR-Neural2-A", "de-DE": "de-DE-Neural2-B" } elevenlabs_voices: dict[str, str] = { "en-US": "21m00Tcm4TlvDq8ikWAM", "es-ES": "VR6AewLTigWG4xSOukaG", - "fr-FR": "TxGEqnHWrfWFTfGW9XjX", + "fr-FR": "TxGEqnHWrfWFTfGW9XjX", "de-DE": "pNInz6obpgDQGcFmaJgB" } + # Gemini TTS Configuration + gemini_tts_model: str = "gemini-2.5-flash-preview-tts" + gemini_tts_default_voice: str = "Kore" + gemini_tts_voices: list[str] = [ + "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede", + "Callirrhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba", + "Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar", + "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi", + "Vindemiatrix", "Sadachbia", 
"Sadaltager", "Sulafat"
+    ]
+    gemini_tts_languages: dict[str, str] = {
+        "en": "en-US",
+        "es": "es-US",
+        "fr": "fr-FR",
+        "de": "de-DE",
+        "it": "it-IT",
+        "pt": "pt-BR",
+        "ja": "ja-JP",
+        "ko": "ko-KR",
+        "ar": "ar-EG",
+        "hi": "hi-IN",
+        "id": "id-ID",
+        "nl": "nl-NL",
+        "pl": "pl-PL",
+        "ru": "ru-RU",
+        "th": "th-TH",
+        "tr": "tr-TR",
+        "vi": "vi-VN",
+        "ro": "ro-RO",
+        "uk": "uk-UA",
+        "bn": "bn-BD",
+        "mr": "mr-IN",
+        "ta": "ta-IN",
+        "te": "te-IN",
+        "zh": "zh-CN"
+    }
+    gemini_tts_language_names: dict[str, str] = {
+        "en": "English",
+        "es": "Spanish",
+        "fr": "French",
+        "de": "German",
+        "it": "Italian",
+        "pt": "Portuguese",
+        "ja": "Japanese",
+        "ko": "Korean",
+        "ar": "Arabic",
+        "hi": "Hindi",
+        "id": "Indonesian",
+        "nl": "Dutch",
+        "pl": "Polish",
+        "ru": "Russian",
+        "th": "Thai",
+        "tr": "Turkish",
+        "vi": "Vietnamese",
+        "ro": "Romanian",
+        "uk": "Ukrainian",
+        "bn": "Bengali",
+        "mr": "Marathi",
+        "ta": "Tamil",
+        "te": "Telugu",
+        "zh": "Chinese"
+    }
+    gemini_tts_preview_samples: dict[str, str] = {  # spoken verbatim by the TTS engine: keep native diacritics, they change pronunciation
+        "en": "This is a preview of the audio description voice.",
+        "es": "Esta es una vista previa de la voz de audiodescripción.",
+        "fr": "Ceci est un aperçu de la voix de l'audiodescription.",
+        "de": "Dies ist eine Vorschau der Audiodeskriptionsstimme.",
+        "it": "Questa è un'anteprima della voce dell'audiodescrizione.",
+        "pt": "Esta é uma prévia da voz da audiodescrição.",
+        "ja": "これは音声解説の声のプレビューです。",
+        "ko": "이것은 오디오 설명 음성의 미리보기입니다.",
+        "ar": "هذه معاينة لصوت الوصف الصوتي.",
+        "hi": "यह ऑडियो विवरण आवाज का पूर्वावलोकन है।",
+        "id": "Ini adalah pratinjau suara deskripsi audio.",
+        "nl": "Dit is een voorbeeld van de audiodescriptiestem.",
+        "pl": "To jest podgląd głosu audiodeskrypcji.",
+        "ru": "Это предварительный просмотр голоса аудиоописания.",
+        "th": "นี่คือตัวอย่างเสียงบรรยายภาพ",
+        "tr": "Bu, sesli betimleme sesinin bir önizlemesidir.",
+        "vi": "Đây là bản xem trước giọng mô tả âm thanh.",
+        "ro": "Aceasta este o previzualizare a vocii 
descrierii audio.", + "uk": "Це попередній перегляд голосу аудіоопису.", + "bn": "এটি অডিও বর্ণনা ভয়েসের একটি প্রিভিউ।", + "mr": "हे ऑडिओ वर्णन आवाजाचे पूर्वावलोकन आहे.", + "ta": "இது ஆடியோ விளக்க குரலின் முன்னோட்டம்.", + "te": "ఇది ఆడియో వివరణ స్వరం యొక్క ప్రివ్యూ.", + "zh": "这是音频描述语音的预览。" + } + # Email sendgrid_api_key: str email_from: str diff --git a/backend/app/main.py b/backend/app/main.py index ad3324b..70ed45c 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -14,6 +14,7 @@ from .api.v1.routes_admin import router as admin_router from .api.v1.routes_auth import router as auth_router from .api.v1.routes_files import router as files_router from .api.v1.routes_jobs import router as jobs_router +from .api.v1.routes_tts import router as tts_router from .api.v1.routes_websockets import router as websockets_router from .services.websocket import connection_manager from .core.config import settings @@ -241,6 +242,7 @@ async def validation_middleware(request, call_next): app.include_router(auth_router, prefix="/api/v1") app.include_router(files_router, prefix="/api/v1") app.include_router(jobs_router, prefix="/api/v1") +app.include_router(tts_router, prefix="/api/v1") app.include_router(admin_router, prefix="/api/v1") app.include_router(websockets_router, prefix="/api/v1") diff --git a/backend/app/models/job.py b/backend/app/models/job.py index 5ea6c9a..793d962 100644 --- a/backend/app/models/job.py +++ b/backend/app/models/job.py @@ -35,12 +35,20 @@ class Source(BaseModel): detected_language: Optional[str] = None # AI-detected language from Gemini +class TTSPreferences(BaseModel): + """TTS voice preferences for audio description generation""" + provider: Literal["gemini", "google", "elevenlabs"] = "gemini" + default_voice: str = "Kore" # Default Gemini voice + voices_per_language: dict[str, str] = {} # {"en": "Kore", "es": "Aoede"} + + class RequestedOutputs(BaseModel): captions_vtt: bool = True audio_description_vtt: bool = True audio_description_mp3: 
bool = True languages: list[str] = [] transcreation: list[str] = [] + tts_preferences: Optional[TTSPreferences] = None class LangOutput(BaseModel): diff --git a/backend/app/schemas/job.py b/backend/app/schemas/job.py index 3015889..0ed9ed3 100644 --- a/backend/app/schemas/job.py +++ b/backend/app/schemas/job.py @@ -1,8 +1,8 @@ -from typing import Any, Optional, Union +from typing import Any, Literal, Optional, Union from pydantic import BaseModel -from ..models.job import JobStatus, LangOutput, RequestedOutputs, Review +from ..models.job import JobStatus, LangOutput, RequestedOutputs, Review, TTSPreferences class JobResponse(BaseModel): @@ -43,6 +43,7 @@ class ApproveEnglishRequest(BaseModel): class ApproveSourceRequest(BaseModel): """Request to approve source language content (works for any language)""" notes: Optional[str] = None + tts_preferences: Optional[TTSPreferences] = None # Override TTS voice settings class RejectJobRequest(BaseModel): diff --git a/backend/app/services/gemini_tts.py b/backend/app/services/gemini_tts.py new file mode 100644 index 0000000..2fd2384 --- /dev/null +++ b/backend/app/services/gemini_tts.py @@ -0,0 +1,256 @@ +import io +import wave + +from google import genai +from google.genai import types +from pydub import AudioSegment + +from ..core.config import settings +from ..core.logging import get_logger + +logger = get_logger(__name__) + + +class GeminiTTSService: + """Text-to-Speech service using Gemini TTS API""" + + def __init__(self): + self.client = genai.Client(api_key=settings.gemini_api_key) + self.model = settings.gemini_tts_model + self.default_voice = settings.gemini_tts_default_voice + logger.info(f"Gemini TTS service initialized with model: {self.model}") + + async def synthesize_text( + self, + text: str, + voice_name: str, + language: str = "en" + ) -> bytes: + """ + Synthesize text to audio using Gemini TTS. + Returns MP3 audio bytes. 
+        """
+        if not text.strip():
+            raise ValueError("Text cannot be empty")
+
+        # Validate voice
+        if voice_name not in settings.gemini_tts_voices:
+            logger.warning(f"Unknown voice '{voice_name}', using default '{self.default_voice}'")
+            voice_name = self.default_voice
+
+        try:
+            # Use the SDK's async client (client.aio): the sync call would block the event loop inside this coroutine
+            response = await self.client.aio.models.generate_content(
+                model=self.model,
+                contents=text,
+                config=types.GenerateContentConfig(
+                    response_modalities=["AUDIO"],
+                    speech_config=types.SpeechConfig(
+                        voice_config=types.VoiceConfig(
+                            prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                                voice_name=voice_name,
+                            )
+                        )
+                    ),
+                )
+            )
+
+            # Extract PCM audio data from response
+            if not response.candidates or not response.candidates[0].content.parts:
+                raise ValueError("No audio data in Gemini TTS response")
+
+            pcm_data = response.candidates[0].content.parts[0].inline_data.data
+
+            # Convert PCM to MP3
+            mp3_data = self._pcm_to_mp3(pcm_data)
+
+            return mp3_data
+
+        except Exception as e:
+            logger.error(f"Gemini TTS synthesis failed: {e}")
+            raise
+
+    async def synthesize_preview(
+        self,
+        voice_name: str,
+        language: str = "en"
+    ) -> bytes:
+        """
+        Generate a preview audio sample for voice selection.
+        Uses language-specific sample text.
+        """
+        # Get preview sample text for the language
+        sample_text = settings.gemini_tts_preview_samples.get(
+            language,
+            settings.gemini_tts_preview_samples.get("en", "This is a voice preview.")
+        )
+
+        return await self.synthesize_text(sample_text, voice_name, language)
+
+    async def synthesize_audio_description(
+        self,
+        ad_vtt_content: str,
+        language: str = "en",
+        voice_name: str | None = None
+    ) -> bytes:
+        """
+        Synthesize full audio description from VTT content.
+        Maintains timing alignment with original VTT cues. 
+ """ + if voice_name is None: + voice_name = self.default_voice + + # Validate voice + if voice_name not in settings.gemini_tts_voices: + logger.warning(f"Unknown voice '{voice_name}', using default '{self.default_voice}'") + voice_name = self.default_voice + + # Parse VTT cues + cues = self._parse_ad_cues(ad_vtt_content) + + if not cues: + raise ValueError("No audio description cues found in VTT content") + + logger.info(f"Synthesizing {len(cues)} audio description cues with voice '{voice_name}'") + + # Synthesize each cue with precise timing anchoring + audio_segments = [] + current_audio_position = 0.0 + + for i, cue in enumerate(cues): + target_start_time = cue["start_time"] + + # Add silence to reach the exact VTT start time + if target_start_time > current_audio_position: + silence_duration = target_start_time - current_audio_position + silence = AudioSegment.silent(duration=int(silence_duration * 1000)) + audio_segments.append(silence) + current_audio_position = target_start_time + + # Synthesize this cue's text + text = cue["text"].strip() + if text: + # Ensure proper punctuation for natural TTS flow + if not text.endswith(('.', '!', '?')): + text += "." 
+ + try: + audio_data = await self.synthesize_text(text, voice_name, language) + + # Convert to AudioSegment and get actual duration + audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3") + audio_segments.append(audio_segment) + + # Update position based on actual audio duration + actual_audio_duration = len(audio_segment) / 1000.0 + current_audio_position += actual_audio_duration + + except Exception as e: + logger.warning(f"Failed to synthesize cue {i}: {e}") + # Add silence for failed cue + cue_duration = cue["end_time"] - cue["start_time"] + silence = AudioSegment.silent(duration=int(cue_duration * 1000)) + audio_segments.append(silence) + current_audio_position += cue_duration + + # Combine all segments + if audio_segments: + final_audio = sum(audio_segments, AudioSegment.empty()) + else: + final_audio = AudioSegment.silent(duration=1000) + + # Export to MP3 + output_buffer = io.BytesIO() + final_audio.export(output_buffer, format="mp3", bitrate="128k") + + logger.info(f"Audio description synthesized: {len(output_buffer.getvalue())} bytes") + return output_buffer.getvalue() + + def _pcm_to_mp3(self, pcm_data: bytes) -> bytes: + """ + Convert raw PCM audio (24kHz, 16-bit, mono) to MP3. + Gemini TTS outputs PCM at 24000 Hz sample rate. 
+        """
+        # Create WAV from PCM data
+        wav_buffer = io.BytesIO()
+        with wave.open(wav_buffer, "wb") as wf:
+            wf.setnchannels(1)  # Mono
+            wf.setsampwidth(2)  # 16-bit (2 bytes)
+            wf.setframerate(24000)  # 24kHz
+            wf.writeframes(pcm_data)
+
+        # Convert WAV to MP3 using pydub
+        wav_buffer.seek(0)
+        audio_segment = AudioSegment.from_wav(wav_buffer)
+
+        # Export as MP3
+        mp3_buffer = io.BytesIO()
+        audio_segment.export(mp3_buffer, format="mp3", bitrate="128k")
+
+        return mp3_buffer.getvalue()
+
+    def _parse_ad_cues(self, vtt_content: str) -> list[dict]:
+        """Parse audio description VTT into a list of {"start_time", "end_time", "text"} dicts (times in seconds); cues without text are dropped."""
+        lines = vtt_content.strip().split('\n')
+        cues = []
+
+        i = 0
+        while i < len(lines):
+            line = lines[i].strip()
+
+            # Skip header and empty lines
+            if line == "WEBVTT" or line == "" or line.startswith("NOTE"):
+                i += 1
+                continue
+
+            # Check for timing line
+            if " --> " in line:
+                timing_parts = line.split(" --> ")
+                start_time = self._parse_timestamp(timing_parts[0].strip())
+                end_time = self._parse_timestamp((timing_parts[1].split() or [""])[0])  # first token only: cue settings (align:, position:, ...) may follow the end time
+
+                # Get text from next line(s)
+                i += 1
+                text_lines = []
+                while i < len(lines) and lines[i].strip() != "":
+                    text_lines.append(lines[i].strip())
+                    i += 1
+
+                if text_lines:
+                    cues.append({
+                        "start_time": start_time,
+                        "end_time": end_time,
+                        "text": " ".join(text_lines)
+                    })
+            else:
+                i += 1
+
+        return cues
+
+    def _parse_timestamp(self, timestamp: str) -> float:
+        """Convert a VTT timestamp (HH:MM:SS.mmm or MM:SS.mmm) to seconds; raises ValueError on any other shape."""
+        parts = timestamp.split(":")
+
+        if len(parts) == 3:  # HH:MM:SS.mmm
+            hours, minutes, seconds = parts
+        elif len(parts) == 2:  # MM:SS.mmm
+            hours, minutes, seconds = "0", parts[0], parts[1]
+        else:
+            raise ValueError(f"Invalid timestamp format: {timestamp}")
+
+        sec_parts = seconds.split(".")
+        seconds_val = int(sec_parts[0])
+        milliseconds = int(sec_parts[1][:3].ljust(3, "0")) if len(sec_parts) > 1 else 0  # normalise the fraction to 3 digits so ".5" is 500 ms, not 5 ms
+
+        total_seconds = (
+            int(hours) * 3600 +
+            int(minutes) * 60 +
+            seconds_val +
+            milliseconds / 1000.0
+        )
+
+        return total_seconds
+
+
+# Global service instance +gemini_tts_service = GeminiTTSService() diff --git a/backend/app/services/tts.py b/backend/app/services/tts.py index b335ee5..ff06109 100644 --- a/backend/app/services/tts.py +++ b/backend/app/services/tts.py @@ -7,47 +7,74 @@ from pydub import AudioSegment from ..core.config import settings from ..core.logging import get_logger +from .gemini_tts import gemini_tts_service logger = get_logger(__name__) + class TTSService: def __init__(self): - # Initialize Google TTS (uses GOOGLE_APPLICATION_CREDENTIALS env var) - # The same GCP credentials used for GCS also work for TTS + # Check Gemini TTS availability (uses same API key as other Gemini services) + self.gemini_available = bool(settings.gemini_api_key) + + # Initialize Google Cloud TTS (uses GOOGLE_APPLICATION_CREDENTIALS env var) try: self.google_client = texttospeech.TextToSpeechClient() - logger.info("Google TTS client initialized successfully") + logger.info("Google Cloud TTS client initialized successfully") except Exception as e: - logger.warning(f"Google TTS credentials not configured: {e}") + logger.warning(f"Google Cloud TTS credentials not configured: {e}") self.google_client = None # Check ElevenLabs availability self.elevenlabs_available = bool(settings.elevenlabs_api_key) + # Log configured provider + logger.info(f"TTS provider configured: {settings.tts_provider}") + async def synthesize_audio_description( self, ad_vtt_content: str, language_code: str = "en-US", - voice_name: Optional[str] = None + voice_name: Optional[str] = None, + provider: Optional[str] = None ) -> bytes: """ - Generate MP3 audio from audio description VTT content - Synthesizes each cue separately and stitches them together with timing - Uses Google TTS with ElevenLabs fallback + Generate MP3 audio from audio description VTT content. + Synthesizes each cue separately and stitches them together with timing. 
+ + Provider priority: specified provider > settings.tts_provider > fallback chain + Fallback chain: Gemini -> Google Cloud TTS -> ElevenLabs """ - # Try Google TTS first, fallback to ElevenLabs - try: - if self.google_client: - return await self._synthesize_with_google(ad_vtt_content, language_code, voice_name) - elif self.elevenlabs_available: - return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name) - else: - raise ValueError("No TTS service configured") - except Exception as e: - if self.elevenlabs_available and self.google_client: - logger.warning(f"Google TTS failed, trying ElevenLabs: {e}") - return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name) - raise + # Determine which provider to use + active_provider = provider or settings.tts_provider + + # Extract simple language code for Gemini (e.g., "en-US" -> "en") + simple_lang = language_code.split("-")[0] if "-" in language_code else language_code + + # Try the configured provider first, then fallback + if active_provider == "gemini" and self.gemini_available: + try: + logger.info(f"Using Gemini TTS for language: {simple_lang}, voice: {voice_name}") + return await gemini_tts_service.synthesize_audio_description( + ad_vtt_content, simple_lang, voice_name + ) + except Exception as e: + logger.warning(f"Gemini TTS failed, falling back: {e}") + # Fall through to Google/ElevenLabs + + if active_provider == "google" or (active_provider == "gemini" and self.google_client): + try: + if self.google_client: + logger.info(f"Using Google Cloud TTS for language: {language_code}") + return await self._synthesize_with_google(ad_vtt_content, language_code, voice_name) + except Exception as e: + logger.warning(f"Google Cloud TTS failed: {e}") + + if self.elevenlabs_available: + logger.info(f"Using ElevenLabs TTS for language: {language_code}") + return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name) + + raise ValueError("No 
TTS service available") async def _synthesize_with_google( self, diff --git a/backend/app/tasks/translate_and_synthesize.py b/backend/app/tasks/translate_and_synthesize.py index 5c492c1..0b2cfcc 100644 --- a/backend/app/tasks/translate_and_synthesize.py +++ b/backend/app/tasks/translate_and_synthesize.py @@ -269,7 +269,9 @@ async def _async_translate_and_synthesize(job_id: str): # Generate TTS for languages that need MP3 if job_doc["requested_outputs"]["audio_description_mp3"]: - await _generate_tts_for_languages(job_id, updated_outputs, db, source_language) + # Get TTS preferences from job + tts_preferences = job_doc["requested_outputs"].get("tts_preferences", {}) + await _generate_tts_for_languages(job_id, updated_outputs, db, source_language, tts_preferences) # Update final status await db.jobs.update_one( @@ -323,33 +325,53 @@ async def _async_translate_and_synthesize(job_id: str): client.close() -async def _generate_tts_for_languages(job_id: str, outputs: dict[str, Any], db, source_language: str = "en"): +async def _generate_tts_for_languages( + job_id: str, + outputs: dict[str, Any], + db, + source_language: str = "en", + tts_preferences: dict = None +): """Generate TTS audio for each language's audio description""" + if tts_preferences is None: + tts_preferences = {} # Always generate source language MP3 first if source_language in outputs and "ad_vtt_gcs" in outputs[source_language]: - await _generate_language_tts(job_id, source_language, outputs[source_language], db) + await _generate_language_tts(job_id, source_language, outputs[source_language], db, tts_preferences) # Generate for other languages for language, lang_output in outputs.items(): if language != source_language and "ad_vtt_gcs" in lang_output: - await _generate_language_tts(job_id, language, lang_output, db) + await _generate_language_tts(job_id, language, lang_output, db, tts_preferences) -async def _generate_language_tts(job_id: str, language: str, lang_output: dict, db): +async def 
_generate_language_tts(job_id: str, language: str, lang_output: dict, db, tts_preferences: dict = None): """Generate TTS for a specific language""" + if tts_preferences is None: + tts_preferences = {} + try: # Download AD VTT content ad_blob_path = lang_output["ad_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "") ad_blob = gcs_service.bucket.blob(ad_blob_path) ad_vtt_content = ad_blob.download_as_text() + # Get voice for this language from preferences + voices_per_language = tts_preferences.get("voices_per_language", {}) + voice_name = voices_per_language.get(language, tts_preferences.get("default_voice")) + provider = tts_preferences.get("provider", "gemini") + # Generate MP3 with retry language_code = f"{language}-US" if language == "en" else f"{language}-{language.upper()}" - + + logger.info(f"Generating TTS for {language} with voice={voice_name}, provider={provider}") + async def synthesize(): - return await tts_service.synthesize_audio_description(ad_vtt_content, language_code) - + return await tts_service.synthesize_audio_description( + ad_vtt_content, language_code, voice_name=voice_name, provider=provider + ) + mp3_data = await retry_with_backoff(synthesize, max_retries=3) # Upload MP3 to GCS diff --git a/frontend/src/components/VoicePreviewButton.tsx b/frontend/src/components/VoicePreviewButton.tsx new file mode 100644 index 0000000..21aa6b0 --- /dev/null +++ b/frontend/src/components/VoicePreviewButton.tsx @@ -0,0 +1,138 @@ +import { useState, useRef } from 'react'; +import { api } from '../lib/api'; + +interface VoicePreviewButtonProps { + voiceName: string; + language: string; + disabled?: boolean; +} + +export function VoicePreviewButton({ voiceName, language, disabled }: VoicePreviewButtonProps) { + const [isLoading, setIsLoading] = useState(false); + const [isPlaying, setIsPlaying] = useState(false); + const [error, setError] = useState(null); + const audioRef = useRef(null); + const audioUrlRef = useRef(null); + + const handlePreview = async () 
=> { + setError(null); + + // If already playing, stop + if (isPlaying && audioRef.current) { + audioRef.current.pause(); + audioRef.current.currentTime = 0; + setIsPlaying(false); + return; + } + + // If we have cached audio, play it + if (audioUrlRef.current && audioRef.current) { + audioRef.current.play(); + setIsPlaying(true); + return; + } + + // Fetch new audio + setIsLoading(true); + try { + const blob = await api.previewVoice(voiceName, language); + const url = URL.createObjectURL(blob); + + // Clean up old URL if exists + if (audioUrlRef.current) { + URL.revokeObjectURL(audioUrlRef.current); + } + + audioUrlRef.current = url; + + // Create and play audio + const audio = new Audio(url); + audioRef.current = audio; + + audio.onended = () => { + setIsPlaying(false); + }; + + audio.onerror = () => { + setError('Failed to play audio'); + setIsPlaying(false); + }; + + await audio.play(); + setIsPlaying(true); + } catch (err) { + setError('Failed to generate preview'); + console.error('Voice preview error:', err); + } finally { + setIsLoading(false); + } + }; + + // Cleanup on unmount + // Note: We don't add cleanup in useEffect to allow audio caching within component lifecycle + + return ( +
+ + {error && {error}} +
+ ); +} diff --git a/frontend/src/components/VoiceSelector.tsx b/frontend/src/components/VoiceSelector.tsx new file mode 100644 index 0000000..483e9b4 --- /dev/null +++ b/frontend/src/components/VoiceSelector.tsx @@ -0,0 +1,216 @@ +import { useState, useEffect } from 'react'; +import { api } from '../lib/api'; +import { VoicePreviewButton } from './VoicePreviewButton'; +import type { TTSPreferences, VoicesResponse, LanguagesResponse } from '../types/api'; + +interface VoiceSelectorProps { + selectedLanguages: string[]; + preferences: TTSPreferences; + onChange: (preferences: TTSPreferences) => void; + disabled?: boolean; +} + +export function VoiceSelector({ + selectedLanguages, + preferences, + onChange, + disabled +}: VoiceSelectorProps) { + const [voices, setVoices] = useState(null); + const [languages, setLanguages] = useState(null); + const [activeLanguage, setActiveLanguage] = useState(selectedLanguages[0] || 'en'); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(null); + + // Fetch voices and languages on mount + useEffect(() => { + const fetchData = async () => { + try { + setIsLoading(true); + const [voicesData, languagesData] = await Promise.all([ + api.getVoices(), + api.getLanguages() + ]); + setVoices(voicesData); + setLanguages(languagesData); + } catch (err) { + setError('Failed to load voice options'); + console.error('Voice selector error:', err); + } finally { + setIsLoading(false); + } + }; + + fetchData(); + }, []); + + // Update active language when selected languages change + useEffect(() => { + if (selectedLanguages.length > 0 && !selectedLanguages.includes(activeLanguage)) { + setActiveLanguage(selectedLanguages[0]); + } + }, [selectedLanguages, activeLanguage]); + + const handleDefaultVoiceChange = (voice: string) => { + onChange({ + ...preferences, + default_voice: voice + }); + }; + + const handleLanguageVoiceChange = (language: string, voice: string) => { + const newVoicesPerLanguage = { + 
...preferences.voices_per_language, + [language]: voice + }; + + // If voice matches default, remove from per-language overrides + if (voice === preferences.default_voice) { + delete newVoicesPerLanguage[language]; + } + + onChange({ + ...preferences, + voices_per_language: newVoicesPerLanguage + }); + }; + + const getVoiceForLanguage = (language: string): string => { + return preferences.voices_per_language[language] || preferences.default_voice; + }; + + if (isLoading) { + return ( +
+ + + + + Loading voice options... +
+ ); + } + + if (error || !voices || !languages) { + return ( +
+ {error || 'Failed to load voice options'} +
+ ); + } + + // Filter languages to only show selected ones, always include English + const displayLanguages = selectedLanguages.length > 0 + ? ['en', ...selectedLanguages.filter(l => l !== 'en')] + : ['en']; + + return ( +
+ {/* Default Voice Selection */} +
+ +

+ This voice will be used for all languages unless overridden below. +

+
+ + +
+
+ + {/* Per-Language Voice Overrides */} + {displayLanguages.length > 1 && ( +
+ +

+ Optionally choose different voices for specific languages. +

+ + {/* Language Tabs */} +
+ {displayLanguages.map((lang) => ( + + ))} +
+ + {/* Voice Selection for Active Language */} +
+ + +
+ + {/* Sample Text Display */} +
+ Preview text: + + "{languages.preview_samples[activeLanguage] || languages.preview_samples['en']}" + +
+
+ )} +
+ ); +} diff --git a/frontend/src/hooks/useJob.ts b/frontend/src/hooks/useJob.ts index b7cd408..059135a 100644 --- a/frontend/src/hooks/useJob.ts +++ b/frontend/src/hooks/useJob.ts @@ -1,10 +1,11 @@ import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'; import { apiClient } from '../lib/api'; -import type { - Job, +import type { + Job, JobCreateRequest, VttUpdateRequest, - BulkDeleteRequest + BulkDeleteRequest, + TTSPreferences } from '../types/api'; // Query hooks @@ -88,8 +89,8 @@ export function useApproveEnglish() { const queryClient = useQueryClient(); return useMutation({ - mutationFn: ({ id, notes }: { id: string; notes?: string }) => - apiClient.approveEnglish(id, notes), + mutationFn: ({ id, notes, tts_preferences }: { id: string; notes?: string; tts_preferences?: TTSPreferences }) => + apiClient.approveSource(id, notes, tts_preferences), onSuccess: (_, { id }) => { queryClient.invalidateQueries({ queryKey: ['jobs', id] }); queryClient.invalidateQueries({ queryKey: ['jobs'] }); @@ -101,8 +102,8 @@ export function useApproveSource() { const queryClient = useQueryClient(); return useMutation({ - mutationFn: ({ id, notes }: { id: string; notes?: string }) => - apiClient.approveSource(id, notes), + mutationFn: ({ id, notes, tts_preferences }: { id: string; notes?: string; tts_preferences?: TTSPreferences }) => + apiClient.approveSource(id, notes, tts_preferences), onSuccess: (_, { id }) => { queryClient.invalidateQueries({ queryKey: ['jobs', id] }); queryClient.invalidateQueries({ queryKey: ['jobs'] }); diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index 8e897fe..d0f806f 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -21,6 +21,9 @@ import type { UpdateUserRequest, ResetPasswordResponse, AdminStatsResponse, + VoicesResponse, + LanguagesResponse, + TTSPreferences, } from '../types/api'; const API_BASE_URL = import.meta.env.VITE_API_BASE_URL || 'http://localhost:8000'; @@ -175,8 +178,11 @@ class 
ApiClient { return this.approveSource(id, notes); } - async approveSource(id: string, notes?: string): Promise { - const response = await this.client.post(`/jobs/${id}/actions/approve_source`, { notes }); + async approveSource(id: string, notes?: string, tts_preferences?: TTSPreferences): Promise { + const response = await this.client.post(`/jobs/${id}/actions/approve_source`, { + notes, + tts_preferences + }); return response.data; } @@ -287,6 +293,26 @@ class ApiClient { const response = await this.client.get('/admin/stats'); return response.data; } + + // TTS endpoints + async getVoices(): Promise { + const response = await this.client.get('/tts/voices'); + return response.data; + } + + async getLanguages(): Promise { + const response = await this.client.get('/tts/languages'); + return response.data; + } + + async previewVoice(voiceName: string, language: string): Promise { + const response = await this.client.post( + '/tts/preview', + { voice_name: voiceName, language }, + { responseType: 'blob' } + ); + return response.data; + } } export const apiClient = new ApiClient(); diff --git a/frontend/src/routes/admin/QCDetail.tsx b/frontend/src/routes/admin/QCDetail.tsx index 3b6a6d2..f23b801 100644 --- a/frontend/src/routes/admin/QCDetail.tsx +++ b/frontend/src/routes/admin/QCDetail.tsx @@ -4,7 +4,9 @@ import { useJob, useApproveEnglish, useRejectJob, useJobVttContent, useUpdateJob import { StatusBadge } from '../../components/StatusBadge'; import { VttEditor } from '../../components/VttEditor/VttEditor'; import { VideoWithCaptions } from '../../components/VideoWithCaptions'; +import { VoiceSelector } from '../../components/VoiceSelector'; import { useToastContext } from '../../contexts/ToastContext'; +import type { TTSPreferences } from '../../types/api'; export function QCDetail() { const { id } = useParams<{ id: string }>(); @@ -30,6 +32,12 @@ export function QCDetail() { const [timingOffset, setTimingOffset] = useState(0); const [adjustCaptions, 
setAdjustCaptions] = useState(true); const [adjustAudioDescription, setAdjustAudioDescription] = useState(true); + const [showVoiceSettings, setShowVoiceSettings] = useState(false); + const [ttsPreferences, setTtsPreferences] = useState({ + provider: 'gemini', + default_voice: 'Kore', + voices_per_language: {} + }); const isProcessing = approveEnglishMutation.isPending || rejectJobMutation.isPending || updateVttMutation.isPending || adjustTimingMutation.isPending; @@ -47,6 +55,13 @@ export function QCDetail() { } }, [vttContent]); + // Initialize TTS preferences from job when loaded + useEffect(() => { + if (job?.requested_outputs?.tts_preferences) { + setTtsPreferences(job.requested_outputs.tts_preferences); + } + }, [job]); + // Keyboard shortcuts useEffect(() => { const handleKeyPress = (event: KeyboardEvent) => { @@ -131,16 +146,20 @@ export function QCDetail() { const handleApprove = async () => { if (!id) return; - + // Save any pending changes first if (hasUnsavedChanges) { await saveVttChanges(); } - + try { - await approveEnglishMutation.mutateAsync({ - id, - notes: reviewNotes + // Only pass TTS preferences if MP3 generation is requested + const ttsPrefsToSend = job?.requested_outputs?.audio_description_mp3 ? ttsPreferences : undefined; + + await approveEnglishMutation.mutateAsync({ + id, + notes: reviewNotes, + tts_preferences: ttsPrefsToSend }); toast.toastOnly.success('Job approved successfully'); navigate('/admin/qc'); @@ -486,6 +505,45 @@ export function QCDetail() { )} + {/* Voice Settings - Only show if MP3 generation is requested */} + {job?.requested_outputs?.audio_description_mp3 && ( +
+ + {showVoiceSettings && ( +
+ +
+ )} +
+ )} + {/* Review Notes */}
+ {/* Voice Settings - Collapsible */} + {audioDescriptionMp3 && ( +
+ + {showVoiceSettings && ( +
+ +
+ )} +
+ )} + {/* Target Languages */}