From 1e177a6d5cd8a6a9c9b7122a910d77bb065ba868 Mon Sep 17 00:00:00 2001 From: Vadym Samoilenko Date: Tue, 3 Mar 2026 13:58:56 +0000 Subject: [PATCH] feat: add ElevenLabs voice selection to frontend and backend Add dynamic ElevenLabs voice catalog with provider toggle in the UI, allowing users to browse ElevenLabs voices, configure stability and similarity boost settings, and preview/synthesize with ElevenLabs TTS. Backend: - New elevenlabs_voices.py service with 1-hour cached API fetching - TTS routes support ?provider= query param for voices and options - Preview endpoint routes to ElevenLabs or Gemini based on provider - stability/similarity_boost params flow through TTS synthesis pipeline - TTSPreferences model extended with ElevenLabs-specific fields - Deprecated hardcoded elevenlabs_voices config (now fetched dynamically) Frontend: - Provider toggle (Gemini/ElevenLabs) in VoiceSelector - ElevenLabsSettingsPanel with stability and similarity boost sliders - VoicePreviewButton supports provider-specific preview parameters - API client passes provider param to voices, options, and preview endpoints - New VoiceInfo, ProviderVoicesResponse, ProviderOptionsResponse types Co-Authored-By: Claude Opus 4.6 --- backend/app/api/v1/routes_tts.py | 171 +++++++++++++++--- backend/app/core/config.py | 9 +- backend/app/models/job.py | 3 + backend/app/services/elevenlabs_voices.py | 101 +++++++++++ backend/app/services/tts.py | 39 +++- backend/app/tasks/tts_synthesis.py | 23 ++- .../components/ElevenLabsSettingsPanel.tsx | 64 +++++++ frontend/src/components/TTSSettingsPanel.tsx | 8 +- .../src/components/VoicePreviewButton.tsx | 20 +- frontend/src/components/VoiceSelector.tsx | 118 +++++++++--- frontend/src/lib/api.ts | 23 ++- frontend/src/routes/admin/QCDetail.tsx | 4 +- frontend/src/routes/jobs/NewJob.tsx | 4 +- frontend/src/types/api.ts | 38 ++++ 14 files changed, 537 insertions(+), 88 deletions(-) create mode 100644 backend/app/services/elevenlabs_voices.py create mode 100644 frontend/src/components/ElevenLabsSettingsPanel.tsx diff --git a/backend/app/api/v1/routes_tts.py b/backend/app/api/v1/routes_tts.py index 87bd4c2..c715de3 100644 --- a/backend/app/api/v1/routes_tts.py +++ b/backend/app/api/v1/routes_tts.py @@ -1,12 +1,14 @@ from typing import Literal, Optional -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException, Query from fastapi.responses import Response from pydantic import BaseModel, Field from ...core.config import settings from ...core.logging import get_logger from ...services.gemini_tts import gemini_tts_service +from ...services.elevenlabs_voices import elevenlabs_voice_service +from ...services.tts import tts_service from ...core.dependencies import get_current_user logger = get_logger(__name__) @@ -18,17 +20,33 @@ class VoicePreviewRequest(BaseModel): """Request to generate a voice preview""" voice_name: str language: str = "en" + provider: Literal["gemini", "elevenlabs"] = "gemini" + # Gemini-specific model: Literal["flash", "pro"] = "flash" speed: float = Field(default=1.0, ge=0.5, le=2.0) style_preset: Literal[ "neutral", "calm", "energetic", "professional", "warm", "documentary", "custom" ] = "neutral" custom_style_prompt: Optional[str] = None + # ElevenLabs-specific + stability: Optional[float] = Field(default=None, ge=0.0, le=1.0) + similarity_boost: Optional[float] = Field(default=None, ge=0.0, le=1.0) -class VoicesResponse(BaseModel): - """Available TTS voices""" - voices: list[str] +class VoiceInfo(BaseModel): + """Structured voice information for any provider.""" + id: str + name: str + description: Optional[str] = None + preview_url: Optional[str] = None + labels: Optional[dict[str, str]] = None + category: Optional[str] = None + + +class ProviderVoicesResponse(BaseModel): + """Available TTS voices for a specific provider.""" + provider: str + voices: list[VoiceInfo] default: str @@ -52,23 +70,63 @@ class SpeedRange(BaseModel): step: float -class TTSOptionsResponse(BaseModel): - """Available TTS configuration options""" - models: list[TTSOptionItem] - style_presets: list[TTSOptionItem] - speed_range: SpeedRange +class FloatRange(BaseModel): + """Generic float range for sliders.""" + min: float + max: float + default: float + step: float -@router.get("/voices", response_model=VoicesResponse) +class ProviderOptionsResponse(BaseModel): + """Available TTS configuration options for a provider.""" + provider: str + # Gemini-specific + models: Optional[list[TTSOptionItem]] = None + style_presets: Optional[list[TTSOptionItem]] = None + speed_range: Optional[SpeedRange] = None + # ElevenLabs-specific + stability_range: Optional[FloatRange] = None + similarity_boost_range: Optional[FloatRange] = None + + +@router.get("/voices", response_model=ProviderVoicesResponse) async def list_voices( - current_user=Depends(get_current_user) -) -> VoicesResponse: + provider: str = Query("gemini", description="TTS provider: gemini or elevenlabs"), + current_user=Depends(get_current_user), +) -> ProviderVoicesResponse: """ - List all available Gemini TTS voices. + List available TTS voices for the specified provider. """ - return VoicesResponse( - voices=settings.gemini_tts_voices, - default=settings.gemini_tts_default_voice + if provider == "elevenlabs": + el_voices = await elevenlabs_voice_service.get_voices() + voices = [ + VoiceInfo( + id=v.voice_id, + name=v.name, + description=v.description or None, + preview_url=v.preview_url or None, + labels=v.labels or None, + category=v.category or None, + ) + for v in el_voices + ] + default_id = voices[0].id if voices else "" + return ProviderVoicesResponse( + provider="elevenlabs", + voices=voices, + default=default_id, + ) + + # Default: Gemini + voices = [ + VoiceInfo(id=name, name=name) + for name in settings.gemini_tts_voices + ] + return ProviderVoicesResponse( + provider="gemini", + voices=voices, + default=settings.gemini_tts_default_voice, ) @@ -85,14 +143,24 @@ async def list_languages( ) -@router.get("/options", response_model=TTSOptionsResponse) +@router.get("/options", response_model=ProviderOptionsResponse) async def get_tts_options( - current_user=Depends(get_current_user) -) -> TTSOptionsResponse: + provider: str = Query("gemini", description="TTS provider: gemini or elevenlabs"), + current_user=Depends(get_current_user), +) -> ProviderOptionsResponse: """ - Get available TTS configuration options including models, style presets, and speed range. + Get available TTS configuration options for the specified provider. """ - return TTSOptionsResponse( + if provider == "elevenlabs": + return ProviderOptionsResponse( + provider="elevenlabs", + stability_range=FloatRange(min=0.0, max=1.0, default=0.5, step=0.05), + similarity_boost_range=FloatRange(min=0.0, max=1.0, default=0.5, step=0.05), + ) + + # Default: Gemini + return ProviderOptionsResponse( + provider="gemini", models=[ TTSOptionItem(value="flash", label="Flash (Fast, Cost-efficient)"), TTSOptionItem(value="pro", label="Pro (Higher Quality)"), @@ -111,7 +179,7 @@ async def get_tts_options( max=settings.gemini_tts_speed_max, default=settings.gemini_tts_speed_default, step=settings.gemini_tts_speed_step - ) + ), ) @@ -124,6 +192,14 @@ async def preview_voice( Generate a voice preview audio sample with all TTS settings applied. Returns MP3 audio data. """ + if request.provider == "elevenlabs": + return await _preview_elevenlabs(request) + + return await _preview_gemini(request) + + +async def _preview_gemini(request: VoicePreviewRequest) -> Response: + """Generate a Gemini TTS voice preview.""" # Validate voice name if request.voice_name not in settings.gemini_tts_voices: raise HTTPException( @@ -146,11 +222,10 @@ async def preview_voice( try: logger.info( - f"Generating voice preview: voice={request.voice_name}, language={request.language}, " + f"Generating Gemini voice preview: voice={request.voice_name}, language={request.language}, " f"model={request.model}, speed={request.speed}x, style={request.style_preset}" ) - # Generate preview audio with all settings audio_data = await gemini_tts_service.synthesize_preview( voice_name=request.voice_name, language=request.language, @@ -168,7 +243,53 @@ async def preview_voice( ) except Exception as e: - logger.error(f"Voice preview generation failed: {e}") + logger.error(f"Gemini voice preview generation failed: {e}") + raise HTTPException( + status_code=500, + detail=f"Failed to generate voice preview: {str(e)}" + ) from e + + +async def _preview_elevenlabs(request: VoicePreviewRequest) -> Response: + """Generate an ElevenLabs TTS voice preview.""" + if not tts_service.elevenlabs_available: + raise HTTPException( + status_code=400, + detail="ElevenLabs TTS is not configured" + ) + + # Get sample text for the language + sample_text = settings.gemini_tts_preview_samples.get( + request.language, + settings.gemini_tts_preview_samples.get("en", "This is a preview of the audio description voice.") + ) + + stability = request.stability if request.stability is not None else 0.5 + similarity_boost = request.similarity_boost if request.similarity_boost is not None else 0.5 + + try: + logger.info( + f"Generating ElevenLabs voice preview: voice={request.voice_name}, language={request.language}, " + f"stability={stability}, similarity_boost={similarity_boost}" + ) + + audio_data = await tts_service._synthesize_text_elevenlabs( + text=sample_text, + voice_id=request.voice_name, + stability=stability, + similarity_boost=similarity_boost, + ) + + return Response( + content=audio_data, + media_type="audio/mpeg", + headers={ + "Content-Disposition": f"inline; filename=preview_{request.voice_name}_{request.language}.mp3" + } + ) + + except Exception as e: + logger.error(f"ElevenLabs voice preview generation failed: {e}") raise HTTPException( status_code=500, detail=f"Failed to generate voice preview: {str(e)}" diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 86e7025..e82befc 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -45,12 +45,9 @@ class Settings(BaseSettings): "fr-FR": "fr-FR-Neural2-A", "de-DE": "de-DE-Neural2-B" } - elevenlabs_voices: dict[str, str] = { - "en-US": "21m00Tcm4TlvDq8ikWAM", - "es-ES": "VR6AewLTigWG4xSOukaG", - "fr-FR": "TxGEqnHWrfWFTfGW9XjX", - "de-DE": "pNInz6obpgDQGcFmaJgB" - } + # Deprecated: ElevenLabs voices are now fetched dynamically via the API. + # This fallback map is only used by _get_elevenlabs_voice() when no voice_name is provided. + elevenlabs_voices: dict[str, str] = {} # Gemini TTS Configuration gemini_tts_model: str = "gemini-2.5-flash-preview-tts" diff --git a/backend/app/models/job.py b/backend/app/models/job.py index 2aa0490..caea03d 100644 --- a/backend/app/models/job.py +++ b/backend/app/models/job.py @@ -51,6 +51,9 @@ class TTSPreferences(BaseModel): "neutral", "calm", "energetic", "professional", "warm", "documentary", "custom" ] = "neutral" custom_style_prompt: Optional[str] = None # Used when style_preset is "custom" + # ElevenLabs-specific settings + stability: Optional[float] = None # 0.0-1.0, default 0.5 when used + similarity_boost: Optional[float] = None # 0.0-1.0, default 0.5 when used class RequestedOutputs(BaseModel): diff --git a/backend/app/services/elevenlabs_voices.py b/backend/app/services/elevenlabs_voices.py new file mode 100644 index 0000000..e03e331 --- /dev/null +++ b/backend/app/services/elevenlabs_voices.py @@ -0,0 +1,101 @@ +""" +ElevenLabs Voice Catalog Service. + +Fetches and caches available voices from the ElevenLabs API. +""" + +import time +from dataclasses import dataclass, field +from typing import Optional + +import aiohttp + +from ..core.config import settings +from ..core.logging import get_logger + +logger = get_logger(__name__) + +CACHE_TTL_SECONDS = 3600 # 1 hour + + +@dataclass +class ElevenLabsVoice: + """Structured voice data from ElevenLabs.""" + voice_id: str + name: str + category: str = "" + description: str = "" + preview_url: str = "" + labels: dict[str, str] = field(default_factory=dict) + + +class ElevenLabsVoiceService: + def __init__(self): + self._cache: list[ElevenLabsVoice] = [] + self._cache_time: float = 0.0 + + def _is_cache_valid(self) -> bool: + return bool(self._cache) and (time.time() - self._cache_time) < CACHE_TTL_SECONDS + + async def get_voices(self) -> list[ElevenLabsVoice]: + """ + Fetch voices from ElevenLabs API with in-memory cache (1-hour TTL). + Falls back to stale cache on API failure. + """ + if self._is_cache_valid(): + return self._cache + + if not settings.elevenlabs_api_key: + logger.warning("ElevenLabs API key not configured") + return self._cache # Return stale cache or empty + + try: + voices = await self._fetch_voices() + self._cache = voices + self._cache_time = time.time() + logger.info(f"Fetched {len(voices)} voices from ElevenLabs API") + return voices + except Exception as e: + logger.warning(f"Failed to fetch ElevenLabs voices, using stale cache: {e}") + return self._cache # Stale cache fallback + + async def _fetch_voices(self) -> list[ElevenLabsVoice]: + """Fetch voices from the ElevenLabs API.""" + url = "https://api.elevenlabs.io/v1/voices" + headers = { + "xi-api-key": settings.elevenlabs_api_key, + "Accept": "application/json", + } + + async with aiohttp.ClientSession() as session: + async with session.get(url, headers=headers) as response: + if response.status != 200: + error_text = await response.text() + raise ValueError(f"ElevenLabs API error: {response.status} - {error_text}") + + data = await response.json() + + voices: list[ElevenLabsVoice] = [] + for v in data.get("voices", []): + voices.append(ElevenLabsVoice( + voice_id=v.get("voice_id", ""), + name=v.get("name", ""), + category=v.get("category", ""), + description=v.get("description", ""), + preview_url=v.get("preview_url", ""), + labels=v.get("labels", {}), + )) + + return voices + + async def get_voice_by_id(self, voice_id: str) -> Optional[ElevenLabsVoice]: + """Look up a specific voice by ID.""" + voices = await self.get_voices() + for v in voices: + if v.voice_id == voice_id: + return v + return None + + +# Singleton instance +elevenlabs_voice_service = ElevenLabsVoiceService() diff --git a/backend/app/services/tts.py b/backend/app/services/tts.py index fd49528..dd66e35 100644 --- a/backend/app/services/tts.py +++ b/backend/app/services/tts.py @@ -51,7 +51,9 @@ class TTSService: provider: Optional[str] = None, model: str = "flash", speed: float = 1.0, - style_prompt: str = "" + style_prompt: str = "", + stability: float = 0.5, + similarity_boost: float = 0.5, ) -> bytes: """ Generate MP3 audio from audio description VTT content. @@ -104,7 +106,10 @@ class TTSService: if self.elevenlabs_available: logger.info(f"Using ElevenLabs TTS for language: {language_code}") - return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name) + return await self._synthesize_with_elevenlabs( + ad_vtt_content, language_code, voice_name, + stability=stability, similarity_boost=similarity_boost, + ) raise ValueError("No TTS service available") @@ -116,7 +121,9 @@ class TTSService: provider: Optional[str] = None, model: str = "flash", speed: float = 1.0, - style_prompt: str = "" + style_prompt: str = "", + stability: float = 0.5, + similarity_boost: float = 0.5, ) -> tuple[bytes, list[TTSCueSegment]]: """ Generate MP3 audio from audio description VTT content AND return individual segments. @@ -168,7 +175,10 @@ class TTSService: audio_data = await self._synthesize_text_google(text, language_code, voice_name) elif self.elevenlabs_available: voice_id = self._get_elevenlabs_voice(language_code, voice_name) - audio_data = await self._synthesize_text_elevenlabs(text, voice_id) + audio_data = await self._synthesize_text_elevenlabs( + text, voice_id, + stability=stability, similarity_boost=similarity_boost, + ) else: raise ValueError("No TTS service available") @@ -277,7 +287,9 @@ class TTSService: self, ad_vtt_content: str, language_code: str = "en-US", - voice_name: Optional[str] = None + voice_name: Optional[str] = None, + stability: float = 0.5, + similarity_boost: float = 0.5, ) -> bytes: """Generate MP3 using ElevenLabs TTS""" # Parse VTT cues @@ -307,7 +319,10 @@ class TTSService: # Synthesize this cue with ElevenLabs text = cue["text"].strip() if text: - audio_data = await self._synthesize_text_elevenlabs(text, voice_id) + audio_data = await self._synthesize_text_elevenlabs( + text, voice_id, + stability=stability, similarity_boost=similarity_boost, + ) # Convert to AudioSegment and get actual duration audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3") @@ -360,7 +375,13 @@ class TTSService: return response.audio_content - async def _synthesize_text_elevenlabs(self, text: str, voice_id: str) -> bytes: + async def _synthesize_text_elevenlabs( + self, + text: str, + voice_id: str, + stability: float = 0.5, + similarity_boost: float = 0.5, + ) -> bytes: """Synthesize text using ElevenLabs API""" url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}" @@ -374,8 +395,8 @@ class TTSService: "text": text, "model_id": "eleven_multilingual_v2", "voice_settings": { - "stability": 0.5, - "similarity_boost": 0.5, + "stability": stability, + "similarity_boost": similarity_boost, "style": 0.0, "use_speaker_boost": True } diff --git a/backend/app/tasks/tts_synthesis.py b/backend/app/tasks/tts_synthesis.py index 66cdb3f..07c4aee 100644 --- a/backend/app/tasks/tts_synthesis.py +++ b/backend/app/tasks/tts_synthesis.py @@ -44,7 +44,9 @@ def synthesize_cue_task( provider: str, model: str, speed: float, - style_prompt: str + style_prompt: str, + stability: float = 0.5, + similarity_boost: float = 0.5, ) -> dict: """ Synthesize a single AD cue and upload to GCS immediately. @@ -84,7 +86,9 @@ def synthesize_cue_task( provider=provider, model=model, speed=speed, - style_prompt=style_prompt + style_prompt=style_prompt, + stability=stability, + similarity_boost=similarity_boost, ) ) @@ -154,7 +158,9 @@ async def _synthesize_single_cue( provider: str, model: str, speed: float, - style_prompt: str + style_prompt: str, + stability: float = 0.5, + similarity_boost: float = 0.5, ) -> tuple[bytes, float]: """ Synthesize a single cue's text to audio. @@ -186,7 +192,10 @@ async def _synthesize_single_cue( elif provider == "elevenlabs": language_code = f"{simple_lang}-US" if simple_lang == "en" else f"{simple_lang}-{simple_lang.upper()}" voice_id = tts_service._get_elevenlabs_voice(language_code, voice_name) - audio_bytes = await tts_service._synthesize_text_elevenlabs(text, voice_id) + audio_bytes = await tts_service._synthesize_text_elevenlabs( + text, voice_id, + stability=stability, similarity_boost=similarity_boost, + ) else: raise ValueError(f"Unknown TTS provider: {provider}") @@ -262,6 +271,8 @@ def dispatch_language_tts( speed = tts_preferences.get("speed", 1.0) style_preset = tts_preferences.get("style_preset", "neutral") custom_style_prompt = tts_preferences.get("custom_style_prompt") + stability = tts_preferences.get("stability") if tts_preferences.get("stability") is not None else 0.5 + similarity_boost = tts_preferences.get("similarity_boost") if tts_preferences.get("similarity_boost") is not None else 0.5 # Resolve style prompt from preset or custom if style_preset == "custom" and custom_style_prompt: @@ -287,7 +298,9 @@ def dispatch_language_tts( provider=provider, model=model, speed=speed, - style_prompt=style_prompt + style_prompt=style_prompt, + stability=stability, + similarity_boost=similarity_boost, ) for i, cue in enumerate(cues) if cue.get("text", "").strip() # Skip empty cues diff --git a/frontend/src/components/ElevenLabsSettingsPanel.tsx b/frontend/src/components/ElevenLabsSettingsPanel.tsx new file mode 100644 index 0000000..92c7196 --- /dev/null +++ b/frontend/src/components/ElevenLabsSettingsPanel.tsx @@ -0,0 +1,64 @@ +import type { TTSPreferences } from '../types/api'; + +interface ElevenLabsSettingsPanelProps { + preferences: TTSPreferences; + onChange: (preferences: TTSPreferences) => void; + disabled?: boolean; +} + +export function ElevenLabsSettingsPanel({ preferences, onChange, disabled }: ElevenLabsSettingsPanelProps) { + const stability = preferences.stability ?? 0.5; + const similarityBoost = preferences.similarity_boost ?? 0.5; + + return ( +
+ {/* Stability Slider */} +
+ +

+ Controls how consistent the voice is between regenerations. Higher values are more stable. +

+ onChange({ ...preferences, stability: parseFloat(e.target.value) })} + disabled={disabled} + className="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer disabled:cursor-not-allowed" + /> +
+ More Variable + More Stable +
+
+ + {/* Similarity Boost Slider */} +
+ +

+ Controls how closely the voice tries to match the original. Higher values increase similarity. +

+ onChange({ ...preferences, similarity_boost: parseFloat(e.target.value) })} + disabled={disabled} + className="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer disabled:cursor-not-allowed" + /> +
+ Low + High +
+
+
+ ); +} diff --git a/frontend/src/components/TTSSettingsPanel.tsx b/frontend/src/components/TTSSettingsPanel.tsx index 78dd743..060412a 100644 --- a/frontend/src/components/TTSSettingsPanel.tsx +++ b/frontend/src/components/TTSSettingsPanel.tsx @@ -1,6 +1,6 @@ import { useState, useEffect } from 'react'; import { api } from '../lib/api'; -import type { TTSPreferences, TTSOptionsResponse, TTSModel, TTSStylePreset } from '../types/api'; +import type { TTSPreferences, ProviderOptionsResponse, TTSModel, TTSStylePreset } from '../types/api'; interface TTSSettingsPanelProps { preferences: TTSPreferences; @@ -9,7 +9,7 @@ interface TTSSettingsPanelProps { } export function TTSSettingsPanel({ preferences, onChange, disabled }: TTSSettingsPanelProps) { - const [options, setOptions] = useState(null); + const [options, setOptions] = useState(null); const [loading, setLoading] = useState(true); const [error, setError] = useState(null); @@ -18,7 +18,7 @@ export function TTSSettingsPanel({ preferences, onChange, disabled }: TTSSetting const fetchOptions = async () => { try { setLoading(true); - const data = await api.getTTSOptions(); + const data = await api.getTTSOptions('gemini'); setOptions(data); setError(null); } catch (err) { @@ -50,7 +50,7 @@ export function TTSSettingsPanel({ preferences, onChange, disabled }: TTSSetting ); } - if (!options) return null; + if (!options || !options.models || !options.style_presets || !options.speed_range) return null; return (
diff --git a/frontend/src/components/VoicePreviewButton.tsx b/frontend/src/components/VoicePreviewButton.tsx index 4afe917..2a5b095 100644 --- a/frontend/src/components/VoicePreviewButton.tsx +++ b/frontend/src/components/VoicePreviewButton.tsx @@ -1,25 +1,31 @@ import { useState, useRef, useEffect } from 'react'; import { api } from '../lib/api'; -import type { TTSStylePreset } from '../types/api'; +import type { TTSStylePreset, TTSProvider } from '../types/api'; interface VoicePreviewButtonProps { voiceName: string; language: string; disabled?: boolean; + provider?: TTSProvider; model?: string; speed?: number; stylePreset?: TTSStylePreset; customStylePrompt?: string; + stability?: number; + similarityBoost?: number; } export function VoicePreviewButton({ voiceName, language, disabled, + provider, model, speed, stylePreset, - customStylePrompt + customStylePrompt, + stability, + similarityBoost, }: VoicePreviewButtonProps) { const [isLoading, setIsLoading] = useState(false); const [isPlaying, setIsPlaying] = useState(false); @@ -41,7 +47,7 @@ export function VoicePreviewButton({ } setIsPlaying(false); setError(null); - }, [voiceName, language, model, speed, stylePreset, customStylePrompt]); + }, [voiceName, language, provider, model, speed, stylePreset, customStylePrompt, stability, similarityBoost]); const handlePreview = async () => { setError(null); @@ -70,7 +76,10 @@ export function VoicePreviewButton({ model, speed, stylePreset, - customStylePrompt + customStylePrompt, + provider, + stability, + similarityBoost, ); const url = URL.createObjectURL(blob); @@ -104,9 +113,6 @@ export function VoicePreviewButton({ } }; - // Cleanup on unmount - // Note: We don't add cleanup in useEffect to allow audio caching within component lifecycle - return (
+ +
+
+ {/* Default Voice Selection */}
- {/* TTS Settings (Model, Speed, Style) */} - + {/* TTS Settings - Provider-specific */} + {preferences.provider === 'elevenlabs' ? ( + + ) : ( + + )} {/* Per-Language Voice Overrides */} {displayLanguages.length > 1 && ( @@ -202,9 +272,10 @@ export function VoiceSelector({ className="flex-1 rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 text-sm" > {voices.voices.map((voice) => ( - ))} @@ -212,10 +283,13 @@ export function VoiceSelector({ voiceName={getVoiceForLanguage(activeLanguage)} language={activeLanguage} disabled={disabled} + provider={preferences.provider} model={preferences.model} speed={preferences.speed} stylePreset={preferences.style_preset} customStylePrompt={preferences.custom_style_prompt} + stability={preferences.stability} + similarityBoost={preferences.similarity_boost} /> diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index b935817..70fa1ca 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -25,11 +25,12 @@ import type { UpdateUserRequest, ResetPasswordResponse, AdminStatsResponse, - VoicesResponse, + ProviderVoicesResponse, LanguagesResponse, TTSPreferences, - TTSOptionsResponse, + ProviderOptionsResponse, TTSStylePreset, + TTSProvider, AccessibleVideoMethod, ReviewNote, ReviewNoteCreateRequest, @@ -349,8 +350,8 @@ class ApiClient { } // TTS endpoints - async getVoices(): Promise { - const response = await this.client.get('/tts/voices'); + async getVoices(provider: TTSProvider = 'gemini'): Promise { + const response = await this.client.get(`/tts/voices?provider=${provider}`); return response.data; } @@ -359,8 +360,8 @@ class ApiClient { return response.data; } - async getTTSOptions(): Promise { - const response = await this.client.get('/tts/options'); + async getTTSOptions(provider: TTSProvider = 'gemini'): Promise { + const response = await this.client.get(`/tts/options?provider=${provider}`); return response.data; } @@ -370,17 +371,23 @@ class ApiClient { model?: string, speed?: number, stylePreset?: TTSStylePreset, - customStylePrompt?: string + customStylePrompt?: string, + provider?: TTSProvider, + stability?: number, + similarityBoost?: number, ): Promise { const response = await this.client.post( '/tts/preview', { voice_name: voiceName, language, + provider: provider || 'gemini', model: model || 'flash', speed: speed || 1.0, style_preset: stylePreset || 'neutral', - custom_style_prompt: customStylePrompt + custom_style_prompt: customStylePrompt, + stability: stability, + similarity_boost: similarityBoost, }, { responseType: 'blob' } ); diff --git a/frontend/src/routes/admin/QCDetail.tsx b/frontend/src/routes/admin/QCDetail.tsx index 4e9d4b8..2b8a510 100644 --- a/frontend/src/routes/admin/QCDetail.tsx +++ b/frontend/src/routes/admin/QCDetail.tsx @@ -91,7 +91,9 @@ export function QCDetail() { model: 'flash', speed: 1.0, style_preset: 'neutral', - custom_style_prompt: undefined + custom_style_prompt: undefined, + stability: undefined, + similarity_boost: undefined, }); const [originalTtsPreferences, setOriginalTtsPreferences] = useState(null); diff --git a/frontend/src/routes/jobs/NewJob.tsx b/frontend/src/routes/jobs/NewJob.tsx index 55a7ac1..117ba28 100644 --- a/frontend/src/routes/jobs/NewJob.tsx +++ b/frontend/src/routes/jobs/NewJob.tsx @@ -44,7 +44,9 @@ export function NewJob() { model: 'flash', speed: 1.0, style_preset: 'neutral', - custom_style_prompt: undefined + custom_style_prompt: undefined, + stability: undefined, + similarity_boost: undefined, }); const [accessibleVideoMethod, setAccessibleVideoMethod] = useState('pause_insert'); diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts index 5c8e167..8d76ef4 100644 --- a/frontend/src/types/api.ts +++ b/frontend/src/types/api.ts @@ -55,6 +55,9 @@ export interface TTSPreferences { speed: number; style_preset: TTSStylePreset; custom_style_prompt?: string; + // ElevenLabs-specific settings + stability?: number; + similarity_boost?: number; } export interface RequestedOutputs { @@ -69,6 +72,22 @@ export interface RequestedOutputs { translation_mode?: TranslationMode; // "video_native" (default) or "traditional" } +export interface VoiceInfo { + id: string; + name: string; + description?: string; + preview_url?: string; + labels?: Record; + category?: string; +} + +export interface ProviderVoicesResponse { + provider: string; + voices: VoiceInfo[]; + default: string; +} + +/** @deprecated Use ProviderVoicesResponse instead */ export interface VoicesResponse { voices: string[]; default: string; @@ -91,6 +110,25 @@ export interface SpeedRange { step: number; } +export interface FloatRange { + min: number; + max: number; + default: number; + step: number; +} + +export interface ProviderOptionsResponse { + provider: string; + // Gemini-specific + models?: TTSOptionItem[]; + style_presets?: TTSOptionItem[]; + speed_range?: SpeedRange; + // ElevenLabs-specific + stability_range?: FloatRange; + similarity_boost_range?: FloatRange; +} + +/** @deprecated Use ProviderOptionsResponse instead */ export interface TTSOptionsResponse { models: TTSOptionItem[]; style_presets: TTSOptionItem[];