diff --git a/backend/app/api/v1/routes_tts.py b/backend/app/api/v1/routes_tts.py index 87bd4c2..c715de3 100644 --- a/backend/app/api/v1/routes_tts.py +++ b/backend/app/api/v1/routes_tts.py @@ -1,12 +1,14 @@ from typing import Literal, Optional -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException, Query from fastapi.responses import Response from pydantic import BaseModel, Field from ...core.config import settings from ...core.logging import get_logger from ...services.gemini_tts import gemini_tts_service +from ...services.elevenlabs_voices import elevenlabs_voice_service +from ...services.tts import tts_service from ...core.dependencies import get_current_user logger = get_logger(__name__) @@ -18,17 +20,33 @@ class VoicePreviewRequest(BaseModel): """Request to generate a voice preview""" voice_name: str language: str = "en" + provider: Literal["gemini", "elevenlabs"] = "gemini" + # Gemini-specific model: Literal["flash", "pro"] = "flash" speed: float = Field(default=1.0, ge=0.5, le=2.0) style_preset: Literal[ "neutral", "calm", "energetic", "professional", "warm", "documentary", "custom" ] = "neutral" custom_style_prompt: Optional[str] = None + # ElevenLabs-specific + stability: Optional[float] = Field(default=None, ge=0.0, le=1.0) + similarity_boost: Optional[float] = Field(default=None, ge=0.0, le=1.0) -class VoicesResponse(BaseModel): - """Available TTS voices""" - voices: list[str] +class VoiceInfo(BaseModel): + """Structured voice information for any provider.""" + id: str + name: str + description: Optional[str] = None + preview_url: Optional[str] = None + labels: Optional[dict[str, str]] = None + category: Optional[str] = None + + +class ProviderVoicesResponse(BaseModel): + """Available TTS voices for a specific provider.""" + provider: str + voices: list[VoiceInfo] default: str @@ -52,23 +70,63 @@ class SpeedRange(BaseModel): step: float -class TTSOptionsResponse(BaseModel): - """Available TTS configuration options""" - models: list[TTSOptionItem] - style_presets: list[TTSOptionItem] - speed_range: SpeedRange +class FloatRange(BaseModel): + """Generic float range for sliders.""" + min: float + max: float + default: float + step: float -@router.get("/voices", response_model=VoicesResponse) +class ProviderOptionsResponse(BaseModel): + """Available TTS configuration options for a provider.""" + provider: str + # Gemini-specific + models: Optional[list[TTSOptionItem]] = None + style_presets: Optional[list[TTSOptionItem]] = None + speed_range: Optional[SpeedRange] = None + # ElevenLabs-specific + stability_range: Optional[FloatRange] = None + similarity_boost_range: Optional[FloatRange] = None + + +@router.get("/voices", response_model=ProviderVoicesResponse) async def list_voices( - current_user=Depends(get_current_user) -) -> VoicesResponse: + provider: str = Query("gemini", description="TTS provider: gemini or elevenlabs"), + current_user=Depends(get_current_user), +) -> ProviderVoicesResponse: """ - List all available Gemini TTS voices. + List available TTS voices for the specified provider. """ - return VoicesResponse( - voices=settings.gemini_tts_voices, - default=settings.gemini_tts_default_voice + if provider == "elevenlabs": + el_voices = await elevenlabs_voice_service.get_voices() + voices = [ + VoiceInfo( + id=v.voice_id, + name=v.name, + description=v.description or None, + preview_url=v.preview_url or None, + labels=v.labels or None, + category=v.category or None, + ) + for v in el_voices + ] + default_id = voices[0].id if voices else "" + return ProviderVoicesResponse( + provider="elevenlabs", + voices=voices, + default=default_id, + ) + + # Default: Gemini + voices = [ + VoiceInfo(id=name, name=name) + for name in settings.gemini_tts_voices + ] + return ProviderVoicesResponse( + provider="gemini", + voices=voices, + default=settings.gemini_tts_default_voice, ) @@ -85,14 +143,24 @@ async def list_languages( ) -@router.get("/options", response_model=TTSOptionsResponse) +@router.get("/options", response_model=ProviderOptionsResponse) async def get_tts_options( - current_user=Depends(get_current_user) -) -> TTSOptionsResponse: + provider: str = Query("gemini", description="TTS provider: gemini or elevenlabs"), + current_user=Depends(get_current_user), +) -> ProviderOptionsResponse: """ - Get available TTS configuration options including models, style presets, and speed range. + Get available TTS configuration options for the specified provider. """ - return TTSOptionsResponse( + if provider == "elevenlabs": + return ProviderOptionsResponse( + provider="elevenlabs", + stability_range=FloatRange(min=0.0, max=1.0, default=0.5, step=0.05), + similarity_boost_range=FloatRange(min=0.0, max=1.0, default=0.5, step=0.05), + ) + + # Default: Gemini + return ProviderOptionsResponse( + provider="gemini", models=[ TTSOptionItem(value="flash", label="Flash (Fast, Cost-efficient)"), TTSOptionItem(value="pro", label="Pro (Higher Quality)"), @@ -111,7 +179,7 @@ async def get_tts_options( max=settings.gemini_tts_speed_max, default=settings.gemini_tts_speed_default, step=settings.gemini_tts_speed_step - ) + ), ) @@ -124,6 +192,14 @@ async def preview_voice( Generate a voice preview audio sample with all TTS settings applied. Returns MP3 audio data. """ + if request.provider == "elevenlabs": + return await _preview_elevenlabs(request) + + return await _preview_gemini(request) + + +async def _preview_gemini(request: VoicePreviewRequest) -> Response: + """Generate a Gemini TTS voice preview.""" # Validate voice name if request.voice_name not in settings.gemini_tts_voices: raise HTTPException( @@ -146,11 +222,10 @@ async def preview_voice( try: logger.info( - f"Generating voice preview: voice={request.voice_name}, language={request.language}, " + f"Generating Gemini voice preview: voice={request.voice_name}, language={request.language}, " f"model={request.model}, speed={request.speed}x, style={request.style_preset}" ) - # Generate preview audio with all settings audio_data = await gemini_tts_service.synthesize_preview( voice_name=request.voice_name, language=request.language, @@ -168,7 +243,53 @@ async def preview_voice( ) except Exception as e: - logger.error(f"Voice preview generation failed: {e}") + logger.error(f"Gemini voice preview generation failed: {e}") + raise HTTPException( + status_code=500, + detail=f"Failed to generate voice preview: {str(e)}" + ) from e + + +async def _preview_elevenlabs(request: VoicePreviewRequest) -> Response: + """Generate an ElevenLabs TTS voice preview.""" + if not tts_service.elevenlabs_available: + raise HTTPException( + status_code=400, + detail="ElevenLabs TTS is not configured" + ) + + # Get sample text for the language + sample_text = settings.gemini_tts_preview_samples.get( + request.language, + settings.gemini_tts_preview_samples.get("en", "This is a preview of the audio description voice.") + ) + + stability = request.stability if request.stability is not None else 0.5 + similarity_boost = request.similarity_boost if request.similarity_boost is not None else 0.5 + + try: + logger.info( + f"Generating ElevenLabs voice preview: voice={request.voice_name}, language={request.language}, " + f"stability={stability}, similarity_boost={similarity_boost}" + ) + + audio_data = await tts_service._synthesize_text_elevenlabs( + text=sample_text, + voice_id=request.voice_name, + stability=stability, + similarity_boost=similarity_boost, + ) + + return Response( + content=audio_data, + media_type="audio/mpeg", + headers={ + "Content-Disposition": f"inline; filename=preview_{request.voice_name}_{request.language}.mp3" + } + ) + + except Exception as e: + logger.error(f"ElevenLabs voice preview generation failed: {e}") raise HTTPException( status_code=500, detail=f"Failed to generate voice preview: {str(e)}" diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 86e7025..e82befc 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -45,12 +45,9 @@ class Settings(BaseSettings): "fr-FR": "fr-FR-Neural2-A", "de-DE": "de-DE-Neural2-B" } - elevenlabs_voices: dict[str, str] = { - "en-US": "21m00Tcm4TlvDq8ikWAM", - "es-ES": "VR6AewLTigWG4xSOukaG", - "fr-FR": "TxGEqnHWrfWFTfGW9XjX", - "de-DE": "pNInz6obpgDQGcFmaJgB" - } + # Deprecated: ElevenLabs voices are now fetched dynamically via the API. + # This fallback map is only used by _get_elevenlabs_voice() when no voice_name is provided. + elevenlabs_voices: dict[str, str] = {} # Gemini TTS Configuration gemini_tts_model: str = "gemini-2.5-flash-preview-tts" diff --git a/backend/app/models/job.py b/backend/app/models/job.py index 2aa0490..caea03d 100644 --- a/backend/app/models/job.py +++ b/backend/app/models/job.py @@ -51,6 +51,9 @@ class TTSPreferences(BaseModel): "neutral", "calm", "energetic", "professional", "warm", "documentary", "custom" ] = "neutral" custom_style_prompt: Optional[str] = None # Used when style_preset is "custom" + # ElevenLabs-specific settings + stability: Optional[float] = None # 0.0-1.0, default 0.5 when used + similarity_boost: Optional[float] = None # 0.0-1.0, default 0.5 when used class RequestedOutputs(BaseModel): diff --git a/backend/app/services/elevenlabs_voices.py b/backend/app/services/elevenlabs_voices.py new file mode 100644 index 0000000..e03e331 --- /dev/null +++ b/backend/app/services/elevenlabs_voices.py @@ -0,0 +1,101 @@ +""" +ElevenLabs Voice Catalog Service. + +Fetches and caches available voices from the ElevenLabs API. +""" + +import time +from dataclasses import dataclass, field +from typing import Optional + +import aiohttp + +from ..core.config import settings +from ..core.logging import get_logger + +logger = get_logger(__name__) + +CACHE_TTL_SECONDS = 3600 # 1 hour + + +@dataclass +class ElevenLabsVoice: + """Structured voice data from ElevenLabs.""" + voice_id: str + name: str + category: str = "" + description: str = "" + preview_url: str = "" + labels: dict[str, str] = field(default_factory=dict) + + +class ElevenLabsVoiceService: + def __init__(self): + self._cache: list[ElevenLabsVoice] = [] + self._cache_time: float = 0.0 + + def _is_cache_valid(self) -> bool: + return bool(self._cache) and (time.time() - self._cache_time) < CACHE_TTL_SECONDS + + async def get_voices(self) -> list[ElevenLabsVoice]: + """ + Fetch voices from ElevenLabs API with in-memory cache (1-hour TTL). + Falls back to stale cache on API failure. + """ + if self._is_cache_valid(): + return self._cache + + if not settings.elevenlabs_api_key: + logger.warning("ElevenLabs API key not configured") + return self._cache # Return stale cache or empty + + try: + voices = await self._fetch_voices() + self._cache = voices + self._cache_time = time.time() + logger.info(f"Fetched {len(voices)} voices from ElevenLabs API") + return voices + except Exception as e: + logger.warning(f"Failed to fetch ElevenLabs voices, using stale cache: {e}") + return self._cache # Stale cache fallback + + async def _fetch_voices(self) -> list[ElevenLabsVoice]: + """Fetch voices from the ElevenLabs API.""" + url = "https://api.elevenlabs.io/v1/voices" + headers = { + "xi-api-key": settings.elevenlabs_api_key, + "Accept": "application/json", + } + + async with aiohttp.ClientSession() as session: + async with session.get(url, headers=headers) as response: + if response.status != 200: + error_text = await response.text() + raise ValueError(f"ElevenLabs API error: {response.status} - {error_text}") + + data = await response.json() + + voices: list[ElevenLabsVoice] = [] + for v in data.get("voices", []): + voices.append(ElevenLabsVoice( + voice_id=v.get("voice_id", ""), + name=v.get("name", ""), + category=v.get("category", ""), + description=v.get("description", ""), + preview_url=v.get("preview_url", ""), + labels=v.get("labels", {}), + )) + + return voices + + async def get_voice_by_id(self, voice_id: str) -> Optional[ElevenLabsVoice]: + """Look up a specific voice by ID.""" + voices = await self.get_voices() + for v in voices: + if v.voice_id == voice_id: + return v + return None + + +# Singleton instance +elevenlabs_voice_service = ElevenLabsVoiceService() diff --git a/backend/app/services/tts.py b/backend/app/services/tts.py index fd49528..dd66e35 100644 --- a/backend/app/services/tts.py +++ b/backend/app/services/tts.py @@ -51,7 +51,9 @@ class TTSService: provider: Optional[str] = None, model: str = "flash", speed: float = 1.0, - style_prompt: str = "" + style_prompt: str = "", + stability: float = 0.5, + similarity_boost: float = 0.5, ) -> bytes: """ Generate MP3 audio from audio description VTT content. @@ -104,7 +106,10 @@ class TTSService: if self.elevenlabs_available: logger.info(f"Using ElevenLabs TTS for language: {language_code}") - return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name) + return await self._synthesize_with_elevenlabs( + ad_vtt_content, language_code, voice_name, + stability=stability, similarity_boost=similarity_boost, + ) raise ValueError("No TTS service available") @@ -116,7 +121,9 @@ class TTSService: provider: Optional[str] = None, model: str = "flash", speed: float = 1.0, - style_prompt: str = "" + style_prompt: str = "", + stability: float = 0.5, + similarity_boost: float = 0.5, ) -> tuple[bytes, list[TTSCueSegment]]: """ Generate MP3 audio from audio description VTT content AND return individual segments. @@ -168,7 +175,10 @@ class TTSService: audio_data = await self._synthesize_text_google(text, language_code, voice_name) elif self.elevenlabs_available: voice_id = self._get_elevenlabs_voice(language_code, voice_name) - audio_data = await self._synthesize_text_elevenlabs(text, voice_id) + audio_data = await self._synthesize_text_elevenlabs( + text, voice_id, + stability=stability, similarity_boost=similarity_boost, + ) else: raise ValueError("No TTS service available") @@ -277,7 +287,9 @@ class TTSService: self, ad_vtt_content: str, language_code: str = "en-US", - voice_name: Optional[str] = None + voice_name: Optional[str] = None, + stability: float = 0.5, + similarity_boost: float = 0.5, ) -> bytes: """Generate MP3 using ElevenLabs TTS""" # Parse VTT cues @@ -307,7 +319,10 @@ class TTSService: # Synthesize this cue with ElevenLabs text = cue["text"].strip() if text: - audio_data = await self._synthesize_text_elevenlabs(text, voice_id) + audio_data = await self._synthesize_text_elevenlabs( + text, voice_id, + stability=stability, similarity_boost=similarity_boost, + ) # Convert to AudioSegment and get actual duration audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3") @@ -360,7 +375,13 @@ class TTSService: return response.audio_content - async def _synthesize_text_elevenlabs(self, text: str, voice_id: str) -> bytes: + async def _synthesize_text_elevenlabs( + self, + text: str, + voice_id: str, + stability: float = 0.5, + similarity_boost: float = 0.5, + ) -> bytes: """Synthesize text using ElevenLabs API""" url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}" @@ -374,8 +395,8 @@ class TTSService: "text": text, "model_id": "eleven_multilingual_v2", "voice_settings": { - "stability": 0.5, - "similarity_boost": 0.5, + "stability": stability, + "similarity_boost": similarity_boost, "style": 0.0, "use_speaker_boost": True } diff --git a/backend/app/tasks/tts_synthesis.py b/backend/app/tasks/tts_synthesis.py index 66cdb3f..07c4aee 100644 --- a/backend/app/tasks/tts_synthesis.py +++ b/backend/app/tasks/tts_synthesis.py @@ -44,7 +44,9 @@ def synthesize_cue_task( provider: str, model: str, speed: float, - style_prompt: str + style_prompt: str, + stability: float = 0.5, + similarity_boost: float = 0.5, ) -> dict: """ Synthesize a single AD cue and upload to GCS immediately. @@ -84,7 +86,9 @@ def synthesize_cue_task( provider=provider, model=model, speed=speed, - style_prompt=style_prompt + style_prompt=style_prompt, + stability=stability, + similarity_boost=similarity_boost, ) ) @@ -154,7 +158,9 @@ async def _synthesize_single_cue( provider: str, model: str, speed: float, - style_prompt: str + style_prompt: str, + stability: float = 0.5, + similarity_boost: float = 0.5, ) -> tuple[bytes, float]: """ Synthesize a single cue's text to audio. @@ -186,7 +192,10 @@ async def _synthesize_single_cue( elif provider == "elevenlabs": language_code = f"{simple_lang}-US" if simple_lang == "en" else f"{simple_lang}-{simple_lang.upper()}" voice_id = tts_service._get_elevenlabs_voice(language_code, voice_name) - audio_bytes = await tts_service._synthesize_text_elevenlabs(text, voice_id) + audio_bytes = await tts_service._synthesize_text_elevenlabs( + text, voice_id, + stability=stability, similarity_boost=similarity_boost, + ) else: raise ValueError(f"Unknown TTS provider: {provider}") @@ -262,6 +271,8 @@ def dispatch_language_tts( speed = tts_preferences.get("speed", 1.0) style_preset = tts_preferences.get("style_preset", "neutral") custom_style_prompt = tts_preferences.get("custom_style_prompt") + stability = tts_preferences.get("stability") if tts_preferences.get("stability") is not None else 0.5 + similarity_boost = tts_preferences.get("similarity_boost") if tts_preferences.get("similarity_boost") is not None else 0.5 # Resolve style prompt from preset or custom if style_preset == "custom" and custom_style_prompt: @@ -287,7 +298,9 @@ def dispatch_language_tts( provider=provider, model=model, speed=speed, - style_prompt=style_prompt + style_prompt=style_prompt, + stability=stability, + similarity_boost=similarity_boost, ) for i, cue in enumerate(cues) if cue.get("text", "").strip() # Skip empty cues diff --git a/frontend/src/components/ElevenLabsSettingsPanel.tsx b/frontend/src/components/ElevenLabsSettingsPanel.tsx new file mode 100644 index 0000000..92c7196 --- /dev/null +++ b/frontend/src/components/ElevenLabsSettingsPanel.tsx @@ -0,0 +1,64 @@ +import type { TTSPreferences } from '../types/api'; + +interface ElevenLabsSettingsPanelProps { + preferences: TTSPreferences; + onChange: (preferences: TTSPreferences) => void; + disabled?: boolean; +} + +export function ElevenLabsSettingsPanel({ preferences, onChange, disabled }: ElevenLabsSettingsPanelProps) { + const stability = preferences.stability ?? 0.5; + const similarityBoost = preferences.similarity_boost ?? 0.5; + + return ( +
+ {/* Stability Slider */} +
+ +

+ Controls how consistent the voice is between regenerations. Higher values are more stable. +

+ onChange({ ...preferences, stability: parseFloat(e.target.value) })} + disabled={disabled} + className="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer disabled:cursor-not-allowed" + /> +
+ More Variable + More Stable +
+
+ + {/* Similarity Boost Slider */} +
+ +

+ Controls how closely the voice tries to match the original. Higher values increase similarity. +

+ onChange({ ...preferences, similarity_boost: parseFloat(e.target.value) })} + disabled={disabled} + className="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer disabled:cursor-not-allowed" + /> +
+ Low + High +
+
+
+ ); +} diff --git a/frontend/src/components/TTSSettingsPanel.tsx b/frontend/src/components/TTSSettingsPanel.tsx index 78dd743..060412a 100644 --- a/frontend/src/components/TTSSettingsPanel.tsx +++ b/frontend/src/components/TTSSettingsPanel.tsx @@ -1,6 +1,6 @@ import { useState, useEffect } from 'react'; import { api } from '../lib/api'; -import type { TTSPreferences, TTSOptionsResponse, TTSModel, TTSStylePreset } from '../types/api'; +import type { TTSPreferences, ProviderOptionsResponse, TTSModel, TTSStylePreset } from '../types/api'; interface TTSSettingsPanelProps { preferences: TTSPreferences; @@ -9,7 +9,7 @@ interface TTSSettingsPanelProps { } export function TTSSettingsPanel({ preferences, onChange, disabled }: TTSSettingsPanelProps) { - const [options, setOptions] = useState(null); + const [options, setOptions] = useState(null); const [loading, setLoading] = useState(true); const [error, setError] = useState(null); @@ -18,7 +18,7 @@ export function TTSSettingsPanel({ preferences, onChange, disabled }: TTSSetting const fetchOptions = async () => { try { setLoading(true); - const data = await api.getTTSOptions(); + const data = await api.getTTSOptions('gemini'); setOptions(data); setError(null); } catch (err) { @@ -50,7 +50,7 @@ export function TTSSettingsPanel({ preferences, onChange, disabled }: TTSSetting ); } - if (!options) return null; + if (!options || !options.models || !options.style_presets || !options.speed_range) return null; return (
diff --git a/frontend/src/components/VoicePreviewButton.tsx b/frontend/src/components/VoicePreviewButton.tsx index 4afe917..2a5b095 100644 --- a/frontend/src/components/VoicePreviewButton.tsx +++ b/frontend/src/components/VoicePreviewButton.tsx @@ -1,25 +1,31 @@ import { useState, useRef, useEffect } from 'react'; import { api } from '../lib/api'; -import type { TTSStylePreset } from '../types/api'; +import type { TTSStylePreset, TTSProvider } from '../types/api'; interface VoicePreviewButtonProps { voiceName: string; language: string; disabled?: boolean; + provider?: TTSProvider; model?: string; speed?: number; stylePreset?: TTSStylePreset; customStylePrompt?: string; + stability?: number; + similarityBoost?: number; } export function VoicePreviewButton({ voiceName, language, disabled, + provider, model, speed, stylePreset, - customStylePrompt + customStylePrompt, + stability, + similarityBoost, }: VoicePreviewButtonProps) { const [isLoading, setIsLoading] = useState(false); const [isPlaying, setIsPlaying] = useState(false); @@ -41,7 +47,7 @@ export function VoicePreviewButton({ } setIsPlaying(false); setError(null); - }, [voiceName, language, model, speed, stylePreset, customStylePrompt]); + }, [voiceName, language, provider, model, speed, stylePreset, customStylePrompt, stability, similarityBoost]); const handlePreview = async () => { setError(null); @@ -70,7 +76,10 @@ export function VoicePreviewButton({ model, speed, stylePreset, - customStylePrompt + customStylePrompt, + provider, + stability, + similarityBoost, ); const url = URL.createObjectURL(blob); @@ -104,9 +113,6 @@ export function VoicePreviewButton({ } }; - // Cleanup on unmount - // Note: We don't add cleanup in useEffect to allow audio caching within component lifecycle - return (
+ +
+
+ {/* Default Voice Selection */}
- {/* TTS Settings (Model, Speed, Style) */} - + {/* TTS Settings - Provider-specific */} + {preferences.provider === 'elevenlabs' ? ( + + ) : ( + + )} {/* Per-Language Voice Overrides */} {displayLanguages.length > 1 && ( @@ -202,9 +272,10 @@ export function VoiceSelector({ className="flex-1 rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 text-sm" > {voices.voices.map((voice) => ( - ))} @@ -212,10 +283,13 @@ export function VoiceSelector({ voiceName={getVoiceForLanguage(activeLanguage)} language={activeLanguage} disabled={disabled} + provider={preferences.provider} model={preferences.model} speed={preferences.speed} stylePreset={preferences.style_preset} customStylePrompt={preferences.custom_style_prompt} + stability={preferences.stability} + similarityBoost={preferences.similarity_boost} /> diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index b935817..70fa1ca 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -25,11 +25,12 @@ import type { UpdateUserRequest, ResetPasswordResponse, AdminStatsResponse, - VoicesResponse, + ProviderVoicesResponse, LanguagesResponse, TTSPreferences, - TTSOptionsResponse, + ProviderOptionsResponse, TTSStylePreset, + TTSProvider, AccessibleVideoMethod, ReviewNote, ReviewNoteCreateRequest, @@ -349,8 +350,8 @@ class ApiClient { } // TTS endpoints - async getVoices(): Promise { - const response = await this.client.get('/tts/voices'); + async getVoices(provider: TTSProvider = 'gemini'): Promise { + const response = await this.client.get(`/tts/voices?provider=${provider}`); return response.data; } @@ -359,8 +360,8 @@ class ApiClient { return response.data; } - async getTTSOptions(): Promise { - const response = await this.client.get('/tts/options'); + async getTTSOptions(provider: TTSProvider = 'gemini'): Promise { + const response = await this.client.get(`/tts/options?provider=${provider}`); return response.data; } @@ -370,17 +371,23 @@ class ApiClient { model?: string, speed?: number, stylePreset?: TTSStylePreset, - customStylePrompt?: string + customStylePrompt?: string, + provider?: TTSProvider, + stability?: number, + similarityBoost?: number, ): Promise { const response = await this.client.post( '/tts/preview', { voice_name: voiceName, language, + provider: provider || 'gemini', model: model || 'flash', speed: speed || 1.0, style_preset: stylePreset || 'neutral', - custom_style_prompt: customStylePrompt + custom_style_prompt: customStylePrompt, + stability: stability, + similarity_boost: similarityBoost, }, { responseType: 'blob' } ); diff --git a/frontend/src/routes/admin/QCDetail.tsx b/frontend/src/routes/admin/QCDetail.tsx index 4e9d4b8..2b8a510 100644 --- a/frontend/src/routes/admin/QCDetail.tsx +++ b/frontend/src/routes/admin/QCDetail.tsx @@ -91,7 +91,9 @@ export function QCDetail() { model: 'flash', speed: 1.0, style_preset: 'neutral', - custom_style_prompt: undefined + custom_style_prompt: undefined, + stability: undefined, + similarity_boost: undefined, }); const [originalTtsPreferences, setOriginalTtsPreferences] = useState(null); diff --git a/frontend/src/routes/jobs/NewJob.tsx b/frontend/src/routes/jobs/NewJob.tsx index 55a7ac1..117ba28 100644 --- a/frontend/src/routes/jobs/NewJob.tsx +++ b/frontend/src/routes/jobs/NewJob.tsx @@ -44,7 +44,9 @@ export function NewJob() { model: 'flash', speed: 1.0, style_preset: 'neutral', - custom_style_prompt: undefined + custom_style_prompt: undefined, + stability: undefined, + similarity_boost: undefined, }); const [accessibleVideoMethod, setAccessibleVideoMethod] = useState('pause_insert'); diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts index 5c8e167..8d76ef4 100644 --- a/frontend/src/types/api.ts +++ b/frontend/src/types/api.ts @@ -55,6 +55,9 @@ export interface TTSPreferences { speed: number; style_preset: TTSStylePreset; custom_style_prompt?: string; + // ElevenLabs-specific settings + stability?: number; + similarity_boost?: number; } export interface RequestedOutputs { @@ -69,6 +72,22 @@ export interface RequestedOutputs { translation_mode?: TranslationMode; // "video_native" (default) or "traditional" } +export interface VoiceInfo { + id: string; + name: string; + description?: string; + preview_url?: string; + labels?: Record; + category?: string; +} + +export interface ProviderVoicesResponse { + provider: string; + voices: VoiceInfo[]; + default: string; +} + +/** @deprecated Use ProviderVoicesResponse instead */ export interface VoicesResponse { voices: string[]; default: string; @@ -91,6 +110,25 @@ export interface SpeedRange { step: number; } +export interface FloatRange { + min: number; + max: number; + default: number; + step: number; +} + +export interface ProviderOptionsResponse { + provider: string; + // Gemini-specific + models?: TTSOptionItem[]; + style_presets?: TTSOptionItem[]; + speed_range?: SpeedRange; + // ElevenLabs-specific + stability_range?: FloatRange; + similarity_boost_range?: FloatRange; +} + +/** @deprecated Use ProviderOptionsResponse instead */ export interface TTSOptionsResponse { models: TTSOptionItem[]; style_presets: TTSOptionItem[];