feat: add ElevenLabs voice selection to frontend and backend

Add dynamic ElevenLabs voice catalog with provider toggle in the UI,
allowing users to browse ElevenLabs voices, configure stability and
similarity boost settings, and preview/synthesize with ElevenLabs TTS.

Backend:
- New elevenlabs_voices.py service with 1-hour cached API fetching
- TTS routes support ?provider= query param for voices and options
- Preview endpoint routes to ElevenLabs or Gemini based on provider
- stability/similarity_boost params flow through TTS synthesis pipeline
- TTSPreferences model extended with ElevenLabs-specific fields
- Deprecated hardcoded elevenlabs_voices config (now fetched dynamically)

Frontend:
- Provider toggle (Gemini/ElevenLabs) in VoiceSelector
- ElevenLabsSettingsPanel with stability and similarity boost sliders
- VoicePreviewButton supports provider-specific preview parameters
- API client passes provider param to voices, options, and preview endpoints
- New VoiceInfo, ProviderVoicesResponse, ProviderOptionsResponse types

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-03-03 13:58:56 +00:00
parent 31b7be0a2f
commit 1e177a6d5c
14 changed files with 537 additions and 88 deletions

View file

@ -1,12 +1,14 @@
from typing import Literal, Optional
from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import Response
from pydantic import BaseModel, Field
from ...core.config import settings
from ...core.logging import get_logger
from ...services.gemini_tts import gemini_tts_service
from ...services.elevenlabs_voices import elevenlabs_voice_service
from ...services.tts import tts_service
from ...core.dependencies import get_current_user
logger = get_logger(__name__)
@ -18,17 +20,33 @@ class VoicePreviewRequest(BaseModel):
"""Request to generate a voice preview"""
voice_name: str
language: str = "en"
provider: Literal["gemini", "elevenlabs"] = "gemini"
# Gemini-specific
model: Literal["flash", "pro"] = "flash"
speed: float = Field(default=1.0, ge=0.5, le=2.0)
style_preset: Literal[
"neutral", "calm", "energetic", "professional", "warm", "documentary", "custom"
] = "neutral"
custom_style_prompt: Optional[str] = None
# ElevenLabs-specific
stability: Optional[float] = Field(default=None, ge=0.0, le=1.0)
similarity_boost: Optional[float] = Field(default=None, ge=0.0, le=1.0)
class VoicesResponse(BaseModel):
"""Available TTS voices"""
voices: list[str]
class VoiceInfo(BaseModel):
    """Structured voice information for any provider.

    Gemini voices are listed with only ``id`` and ``name`` (both set to the
    configured voice name). The optional fields are populated from the
    ElevenLabs voice catalog when that provider is requested, and are left
    as ``None`` when the catalog value is empty.
    """
    # Provider-specific identifier: the ElevenLabs voice_id, or the Gemini voice name.
    id: str
    # Display name of the voice.
    name: str
    # Free-text description from the provider catalog, if any.
    description: Optional[str] = None
    # presumably a URL of a provider-hosted audio sample — TODO confirm against the ElevenLabs API
    preview_url: Optional[str] = None
    # Provider-supplied string-to-string metadata, if any.
    labels: Optional[dict[str, str]] = None
    # Provider-supplied category string, if any.
    category: Optional[str] = None
class ProviderVoicesResponse(BaseModel):
    """Available TTS voices for a specific provider.

    Response body of ``GET /voices``.
    """
    # Provider the voices belong to; the endpoint returns "gemini" or "elevenlabs".
    provider: str
    # Voices offered by that provider (may be empty for ElevenLabs).
    voices: list[VoiceInfo]
    # Id of the default voice. For ElevenLabs this is the first listed voice's id,
    # or "" when no voices are available; for Gemini it is the configured default voice.
    default: str
@ -52,23 +70,63 @@ class SpeedRange(BaseModel):
step: float
class TTSOptionsResponse(BaseModel):
"""Available TTS configuration options"""
models: list[TTSOptionItem]
style_presets: list[TTSOptionItem]
speed_range: SpeedRange
class FloatRange(BaseModel):
    """Generic float range for sliders.

    Used to describe the ElevenLabs ``stability`` and ``similarity_boost``
    controls in the options response.
    """
    # Lowest selectable value.
    min: float
    # Highest selectable value.
    max: float
    # Value to start from when nothing has been chosen yet.
    default: float
    # Increment between adjacent selectable values.
    step: float
@router.get("/voices", response_model=VoicesResponse)
class ProviderOptionsResponse(BaseModel):
    """Available TTS configuration options for a provider.

    Response body of ``GET /options``. Only the group of fields matching
    ``provider`` is populated; the other group is left as ``None``.
    """
    # Provider these options apply to; the endpoint returns "gemini" or "elevenlabs".
    provider: str
    # Gemini-specific
    # Selectable synthesis models.
    models: Optional[list[TTSOptionItem]] = None
    # Selectable speaking-style presets.
    style_presets: Optional[list[TTSOptionItem]] = None
    # Allowed playback-speed range.
    speed_range: Optional[SpeedRange] = None
    # ElevenLabs-specific
    # Allowed range for the "stability" voice setting.
    stability_range: Optional[FloatRange] = None
    # Allowed range for the "similarity_boost" voice setting.
    similarity_boost_range: Optional[FloatRange] = None
@router.get("/voices", response_model=ProviderVoicesResponse)
async def list_voices(
current_user=Depends(get_current_user)
) -> VoicesResponse:
provider: str = Query("gemini", description="TTS provider: gemini or elevenlabs"),
current_user=Depends(get_current_user),
) -> ProviderVoicesResponse:
"""
List all available Gemini TTS voices.
List available TTS voices for the specified provider.
"""
return VoicesResponse(
voices=settings.gemini_tts_voices,
default=settings.gemini_tts_default_voice
if provider == "elevenlabs":
el_voices = await elevenlabs_voice_service.get_voices()
voices = [
VoiceInfo(
id=v.voice_id,
name=v.name,
description=v.description or None,
preview_url=v.preview_url or None,
labels=v.labels or None,
category=v.category or None,
)
for v in el_voices
]
default_id = voices[0].id if voices else ""
return ProviderVoicesResponse(
provider="elevenlabs",
voices=voices,
default=default_id,
)
# Default: Gemini
voices = [
VoiceInfo(id=name, name=name)
for name in settings.gemini_tts_voices
]
return ProviderVoicesResponse(
provider="gemini",
voices=voices,
default=settings.gemini_tts_default_voice,
)
@ -85,14 +143,24 @@ async def list_languages(
)
@router.get("/options", response_model=TTSOptionsResponse)
@router.get("/options", response_model=ProviderOptionsResponse)
async def get_tts_options(
current_user=Depends(get_current_user)
) -> TTSOptionsResponse:
provider: str = Query("gemini", description="TTS provider: gemini or elevenlabs"),
current_user=Depends(get_current_user),
) -> ProviderOptionsResponse:
"""
Get available TTS configuration options including models, style presets, and speed range.
Get available TTS configuration options for the specified provider.
"""
return TTSOptionsResponse(
if provider == "elevenlabs":
return ProviderOptionsResponse(
provider="elevenlabs",
stability_range=FloatRange(min=0.0, max=1.0, default=0.5, step=0.05),
similarity_boost_range=FloatRange(min=0.0, max=1.0, default=0.5, step=0.05),
)
# Default: Gemini
return ProviderOptionsResponse(
provider="gemini",
models=[
TTSOptionItem(value="flash", label="Flash (Fast, Cost-efficient)"),
TTSOptionItem(value="pro", label="Pro (Higher Quality)"),
@ -111,7 +179,7 @@ async def get_tts_options(
max=settings.gemini_tts_speed_max,
default=settings.gemini_tts_speed_default,
step=settings.gemini_tts_speed_step
)
),
)
@ -124,6 +192,14 @@ async def preview_voice(
Generate a voice preview audio sample with all TTS settings applied.
Returns MP3 audio data.
"""
if request.provider == "elevenlabs":
return await _preview_elevenlabs(request)
return await _preview_gemini(request)
async def _preview_gemini(request: VoicePreviewRequest) -> Response:
"""Generate a Gemini TTS voice preview."""
# Validate voice name
if request.voice_name not in settings.gemini_tts_voices:
raise HTTPException(
@ -146,11 +222,10 @@ async def preview_voice(
try:
logger.info(
f"Generating voice preview: voice={request.voice_name}, language={request.language}, "
f"Generating Gemini voice preview: voice={request.voice_name}, language={request.language}, "
f"model={request.model}, speed={request.speed}x, style={request.style_preset}"
)
# Generate preview audio with all settings
audio_data = await gemini_tts_service.synthesize_preview(
voice_name=request.voice_name,
language=request.language,
@ -168,7 +243,53 @@ async def preview_voice(
)
except Exception as e:
logger.error(f"Voice preview generation failed: {e}")
logger.error(f"Gemini voice preview generation failed: {e}")
raise HTTPException(
status_code=500,
detail=f"Failed to generate voice preview: {str(e)}"
) from e
async def _preview_elevenlabs(request: VoicePreviewRequest) -> Response:
"""Generate an ElevenLabs TTS voice preview."""
if not tts_service.elevenlabs_available:
raise HTTPException(
status_code=400,
detail="ElevenLabs TTS is not configured"
)
# Get sample text for the language
sample_text = settings.gemini_tts_preview_samples.get(
request.language,
settings.gemini_tts_preview_samples.get("en", "This is a preview of the audio description voice.")
)
stability = request.stability if request.stability is not None else 0.5
similarity_boost = request.similarity_boost if request.similarity_boost is not None else 0.5
try:
logger.info(
f"Generating ElevenLabs voice preview: voice={request.voice_name}, language={request.language}, "
f"stability={stability}, similarity_boost={similarity_boost}"
)
audio_data = await tts_service._synthesize_text_elevenlabs(
text=sample_text,
voice_id=request.voice_name,
stability=stability,
similarity_boost=similarity_boost,
)
return Response(
content=audio_data,
media_type="audio/mpeg",
headers={
"Content-Disposition": f"inline; filename=preview_{request.voice_name}_{request.language}.mp3"
}
)
except Exception as e:
logger.error(f"ElevenLabs voice preview generation failed: {e}")
raise HTTPException(
status_code=500,
detail=f"Failed to generate voice preview: {str(e)}"

View file

@ -45,12 +45,9 @@ class Settings(BaseSettings):
"fr-FR": "fr-FR-Neural2-A",
"de-DE": "de-DE-Neural2-B"
}
elevenlabs_voices: dict[str, str] = {
"en-US": "21m00Tcm4TlvDq8ikWAM",
"es-ES": "VR6AewLTigWG4xSOukaG",
"fr-FR": "TxGEqnHWrfWFTfGW9XjX",
"de-DE": "pNInz6obpgDQGcFmaJgB"
}
# Deprecated: ElevenLabs voices are now fetched dynamically via the API.
# This fallback map is only used by _get_elevenlabs_voice() when no voice_name is provided.
elevenlabs_voices: dict[str, str] = {}
# Gemini TTS Configuration
gemini_tts_model: str = "gemini-2.5-flash-preview-tts"

View file

@ -51,6 +51,9 @@ class TTSPreferences(BaseModel):
"neutral", "calm", "energetic", "professional", "warm", "documentary", "custom"
] = "neutral"
custom_style_prompt: Optional[str] = None # Used when style_preset is "custom"
# ElevenLabs-specific settings
stability: Optional[float] = None # 0.0-1.0, default 0.5 when used
similarity_boost: Optional[float] = None # 0.0-1.0, default 0.5 when used
class RequestedOutputs(BaseModel):

View file

@ -0,0 +1,101 @@
"""
ElevenLabs Voice Catalog Service.
Fetches and caches available voices from the ElevenLabs API.
"""
import time
from dataclasses import dataclass, field
from typing import Optional
import aiohttp
from ..core.config import settings
from ..core.logging import get_logger
logger = get_logger(__name__)
CACHE_TTL_SECONDS = 3600 # 1 hour
@dataclass
class ElevenLabsVoice:
    """Structured voice data from ElevenLabs.

    One instance is built for each entry of the ``voices`` array returned by
    the ElevenLabs ``/v1/voices`` endpoint. Every field other than
    ``voice_id`` and ``name`` defaults to an empty value.
    """
    # Value of the entry's "voice_id" key.
    voice_id: str
    # Value of the entry's "name" key.
    name: str
    # Value of the entry's "category" key; "" when absent.
    category: str = ""
    # Value of the entry's "description" key; "" when absent.
    description: str = ""
    # Value of the entry's "preview_url" key; "" when absent.
    preview_url: str = ""
    # Value of the entry's "labels" key; a fresh empty dict per instance when absent.
    labels: dict[str, str] = field(default_factory=dict)
class ElevenLabsVoiceService:
    """Fetch the ElevenLabs voice catalog and cache it in memory.

    The catalog is kept for ``CACHE_TTL_SECONDS``. When a refresh is not
    possible (no API key configured, or the request fails) the previously
    cached list is returned instead, which is an empty list if nothing was
    ever fetched.
    """

    # Upper bound, in seconds, for one catalog request. Without an explicit
    # value aiohttp applies its 5-minute default, which would stall the
    # voices endpoint for that long when the upstream API hangs.
    _REQUEST_TIMEOUT_SECONDS = 10.0

    def __init__(self):
        self._cache: list[ElevenLabsVoice] = []
        # time.time() reading taken when the cache was last filled.
        self._cache_time: float = 0.0

    def _is_cache_valid(self) -> bool:
        """Return True when the cache is non-empty and younger than the TTL."""
        return bool(self._cache) and (time.time() - self._cache_time) < CACHE_TTL_SECONDS

    async def get_voices(self) -> list[ElevenLabsVoice]:
        """
        Fetch voices from ElevenLabs API with in-memory cache (1-hour TTL).

        Falls back to stale cache on API failure.

        Returns:
            The cached or freshly fetched voices. The list is empty when no
            API key is configured or the first fetch fails.
        """
        if self._is_cache_valid():
            return self._cache

        if not settings.elevenlabs_api_key:
            logger.warning("ElevenLabs API key not configured")
            return self._cache  # Return stale cache or empty

        # Deliberately broad: any failure (network, timeout, bad payload)
        # must degrade to the stale cache rather than break the caller.
        try:
            voices = await self._fetch_voices()
        except Exception as e:
            logger.warning(f"Failed to fetch ElevenLabs voices, using stale cache: {e}")
            return self._cache  # Stale cache fallback

        self._cache = voices
        self._cache_time = time.time()
        logger.info(f"Fetched {len(voices)} voices from ElevenLabs API")
        return voices

    @staticmethod
    def _normalize_voice_fields(raw: dict) -> dict:
        """Map one raw API voice entry to ElevenLabsVoice keyword arguments.

        Keys that are missing *or* explicitly ``null`` in the JSON payload
        are replaced by the empty value of the matching field type, so the
        dataclass never receives ``None`` for a ``str`` or ``dict`` field.
        """
        return {
            "voice_id": raw.get("voice_id") or "",
            "name": raw.get("name") or "",
            "category": raw.get("category") or "",
            "description": raw.get("description") or "",
            "preview_url": raw.get("preview_url") or "",
            "labels": raw.get("labels") or {},
        }

    async def _fetch_voices(self) -> list[ElevenLabsVoice]:
        """Fetch voices from the ElevenLabs API.

        Raises:
            ValueError: If the API answers with a non-200 status.
        """
        url = "https://api.elevenlabs.io/v1/voices"
        headers = {
            "xi-api-key": settings.elevenlabs_api_key,
            "Accept": "application/json",
        }
        timeout = aiohttp.ClientTimeout(total=self._REQUEST_TIMEOUT_SECONDS)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url, headers=headers) as response:
                if response.status != 200:
                    error_text = await response.text()
                    raise ValueError(f"ElevenLabs API error: {response.status} - {error_text}")
                data = await response.json()

        # "voices" may be absent or null; treat both as an empty catalog.
        parsed = (self._normalize_voice_fields(raw) for raw in data.get("voices") or [])
        # An entry without an id cannot be selected or synthesized, so drop it.
        return [ElevenLabsVoice(**fields) for fields in parsed if fields["voice_id"]]

    async def get_voice_by_id(self, voice_id: str) -> Optional[ElevenLabsVoice]:
        """Look up a specific voice by ID; return None when it is not in the catalog."""
        voices = await self.get_voices()
        return next((v for v in voices if v.voice_id == voice_id), None)
# Singleton instance
elevenlabs_voice_service = ElevenLabsVoiceService()

View file

@ -51,7 +51,9 @@ class TTSService:
provider: Optional[str] = None,
model: str = "flash",
speed: float = 1.0,
style_prompt: str = ""
style_prompt: str = "",
stability: float = 0.5,
similarity_boost: float = 0.5,
) -> bytes:
"""
Generate MP3 audio from audio description VTT content.
@ -104,7 +106,10 @@ class TTSService:
if self.elevenlabs_available:
logger.info(f"Using ElevenLabs TTS for language: {language_code}")
return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name)
return await self._synthesize_with_elevenlabs(
ad_vtt_content, language_code, voice_name,
stability=stability, similarity_boost=similarity_boost,
)
raise ValueError("No TTS service available")
@ -116,7 +121,9 @@ class TTSService:
provider: Optional[str] = None,
model: str = "flash",
speed: float = 1.0,
style_prompt: str = ""
style_prompt: str = "",
stability: float = 0.5,
similarity_boost: float = 0.5,
) -> tuple[bytes, list[TTSCueSegment]]:
"""
Generate MP3 audio from audio description VTT content AND return individual segments.
@ -168,7 +175,10 @@ class TTSService:
audio_data = await self._synthesize_text_google(text, language_code, voice_name)
elif self.elevenlabs_available:
voice_id = self._get_elevenlabs_voice(language_code, voice_name)
audio_data = await self._synthesize_text_elevenlabs(text, voice_id)
audio_data = await self._synthesize_text_elevenlabs(
text, voice_id,
stability=stability, similarity_boost=similarity_boost,
)
else:
raise ValueError("No TTS service available")
@ -277,7 +287,9 @@ class TTSService:
self,
ad_vtt_content: str,
language_code: str = "en-US",
voice_name: Optional[str] = None
voice_name: Optional[str] = None,
stability: float = 0.5,
similarity_boost: float = 0.5,
) -> bytes:
"""Generate MP3 using ElevenLabs TTS"""
# Parse VTT cues
@ -307,7 +319,10 @@ class TTSService:
# Synthesize this cue with ElevenLabs
text = cue["text"].strip()
if text:
audio_data = await self._synthesize_text_elevenlabs(text, voice_id)
audio_data = await self._synthesize_text_elevenlabs(
text, voice_id,
stability=stability, similarity_boost=similarity_boost,
)
# Convert to AudioSegment and get actual duration
audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
@ -360,7 +375,13 @@ class TTSService:
return response.audio_content
async def _synthesize_text_elevenlabs(self, text: str, voice_id: str) -> bytes:
async def _synthesize_text_elevenlabs(
self,
text: str,
voice_id: str,
stability: float = 0.5,
similarity_boost: float = 0.5,
) -> bytes:
"""Synthesize text using ElevenLabs API"""
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
@ -374,8 +395,8 @@ class TTSService:
"text": text,
"model_id": "eleven_multilingual_v2",
"voice_settings": {
"stability": 0.5,
"similarity_boost": 0.5,
"stability": stability,
"similarity_boost": similarity_boost,
"style": 0.0,
"use_speaker_boost": True
}

View file

@ -44,7 +44,9 @@ def synthesize_cue_task(
provider: str,
model: str,
speed: float,
style_prompt: str
style_prompt: str,
stability: float = 0.5,
similarity_boost: float = 0.5,
) -> dict:
"""
Synthesize a single AD cue and upload to GCS immediately.
@ -84,7 +86,9 @@ def synthesize_cue_task(
provider=provider,
model=model,
speed=speed,
style_prompt=style_prompt
style_prompt=style_prompt,
stability=stability,
similarity_boost=similarity_boost,
)
)
@ -154,7 +158,9 @@ async def _synthesize_single_cue(
provider: str,
model: str,
speed: float,
style_prompt: str
style_prompt: str,
stability: float = 0.5,
similarity_boost: float = 0.5,
) -> tuple[bytes, float]:
"""
Synthesize a single cue's text to audio.
@ -186,7 +192,10 @@ async def _synthesize_single_cue(
elif provider == "elevenlabs":
language_code = f"{simple_lang}-US" if simple_lang == "en" else f"{simple_lang}-{simple_lang.upper()}"
voice_id = tts_service._get_elevenlabs_voice(language_code, voice_name)
audio_bytes = await tts_service._synthesize_text_elevenlabs(text, voice_id)
audio_bytes = await tts_service._synthesize_text_elevenlabs(
text, voice_id,
stability=stability, similarity_boost=similarity_boost,
)
else:
raise ValueError(f"Unknown TTS provider: {provider}")
@ -262,6 +271,8 @@ def dispatch_language_tts(
speed = tts_preferences.get("speed", 1.0)
style_preset = tts_preferences.get("style_preset", "neutral")
custom_style_prompt = tts_preferences.get("custom_style_prompt")
stability = tts_preferences.get("stability") if tts_preferences.get("stability") is not None else 0.5
similarity_boost = tts_preferences.get("similarity_boost") if tts_preferences.get("similarity_boost") is not None else 0.5
# Resolve style prompt from preset or custom
if style_preset == "custom" and custom_style_prompt:
@ -287,7 +298,9 @@ def dispatch_language_tts(
provider=provider,
model=model,
speed=speed,
style_prompt=style_prompt
style_prompt=style_prompt,
stability=stability,
similarity_boost=similarity_boost,
)
for i, cue in enumerate(cues)
if cue.get("text", "").strip() # Skip empty cues

View file

@ -0,0 +1,64 @@
import type { TTSPreferences } from '../types/api';
/** Props for the ElevenLabs stability / similarity-boost settings panel. */
interface ElevenLabsSettingsPanelProps {
  /** Current TTS preferences; `stability` and `similarity_boost` are read from here. */
  preferences: TTSPreferences;
  /** Called with an updated copy of the preferences whenever a slider changes. */
  onChange: (preferences: TTSPreferences) => void;
  /** When true, both sliders are disabled. */
  disabled?: boolean;
}
/**
 * Settings panel with the two ElevenLabs voice controls: stability and
 * similarity boost.
 *
 * Each slider covers 0 to 1 in steps of 0.05 and shows 0.5 while the
 * corresponding preference is unset. Changing a slider calls `onChange` with
 * a copy of `preferences` in which only that one field is replaced.
 */
export function ElevenLabsSettingsPanel({ preferences, onChange, disabled }: ElevenLabsSettingsPanelProps) {
  // Display fallback only: the stored preference stays undefined until the user moves a slider.
  const stability = preferences.stability ?? 0.5;
  const similarityBoost = preferences.similarity_boost ?? 0.5;

  return (
    <div className="space-y-4">
      {/* Stability Slider */}
      <div className="bg-gray-50 rounded-lg p-4">
        <label className="block text-sm font-medium text-gray-700 mb-2">
          Stability: {stability.toFixed(2)}
        </label>
        <p className="text-xs text-gray-500 mb-3">
          Controls how consistent the voice is between regenerations. Higher values are more stable.
        </p>
        {/* The visible <label> is not associated with this input, so give it an accessible name directly. */}
        <input
          type="range"
          aria-label="Stability"
          min={0}
          max={1}
          step={0.05}
          value={stability}
          onChange={(e) => onChange({ ...preferences, stability: parseFloat(e.target.value) })}
          disabled={disabled}
          className="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer disabled:cursor-not-allowed"
        />
        <div className="flex justify-between text-xs text-gray-500 mt-1">
          <span>More Variable</span>
          <span>More Stable</span>
        </div>
      </div>

      {/* Similarity Boost Slider */}
      <div className="bg-gray-50 rounded-lg p-4">
        <label className="block text-sm font-medium text-gray-700 mb-2">
          Similarity Boost: {similarityBoost.toFixed(2)}
        </label>
        <p className="text-xs text-gray-500 mb-3">
          Controls how closely the voice tries to match the original. Higher values increase similarity.
        </p>
        {/* Same as above: name the control for assistive technology. */}
        <input
          type="range"
          aria-label="Similarity Boost"
          min={0}
          max={1}
          step={0.05}
          value={similarityBoost}
          onChange={(e) => onChange({ ...preferences, similarity_boost: parseFloat(e.target.value) })}
          disabled={disabled}
          className="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer disabled:cursor-not-allowed"
        />
        <div className="flex justify-between text-xs text-gray-500 mt-1">
          <span>Low</span>
          <span>High</span>
        </div>
      </div>
    </div>
  );
}

View file

@ -1,6 +1,6 @@
import { useState, useEffect } from 'react';
import { api } from '../lib/api';
import type { TTSPreferences, TTSOptionsResponse, TTSModel, TTSStylePreset } from '../types/api';
import type { TTSPreferences, ProviderOptionsResponse, TTSModel, TTSStylePreset } from '../types/api';
interface TTSSettingsPanelProps {
preferences: TTSPreferences;
@ -9,7 +9,7 @@ interface TTSSettingsPanelProps {
}
export function TTSSettingsPanel({ preferences, onChange, disabled }: TTSSettingsPanelProps) {
const [options, setOptions] = useState<TTSOptionsResponse | null>(null);
const [options, setOptions] = useState<ProviderOptionsResponse | null>(null);
const [loading, setLoading] = useState(true);
const [error, setError] = useState<string | null>(null);
@ -18,7 +18,7 @@ export function TTSSettingsPanel({ preferences, onChange, disabled }: TTSSetting
const fetchOptions = async () => {
try {
setLoading(true);
const data = await api.getTTSOptions();
const data = await api.getTTSOptions('gemini');
setOptions(data);
setError(null);
} catch (err) {
@ -50,7 +50,7 @@ export function TTSSettingsPanel({ preferences, onChange, disabled }: TTSSetting
);
}
if (!options) return null;
if (!options || !options.models || !options.style_presets || !options.speed_range) return null;
return (
<div className="space-y-4">

View file

@ -1,25 +1,31 @@
import { useState, useRef, useEffect } from 'react';
import { api } from '../lib/api';
import type { TTSStylePreset } from '../types/api';
import type { TTSStylePreset, TTSProvider } from '../types/api';
interface VoicePreviewButtonProps {
voiceName: string;
language: string;
disabled?: boolean;
provider?: TTSProvider;
model?: string;
speed?: number;
stylePreset?: TTSStylePreset;
customStylePrompt?: string;
stability?: number;
similarityBoost?: number;
}
export function VoicePreviewButton({
voiceName,
language,
disabled,
provider,
model,
speed,
stylePreset,
customStylePrompt
customStylePrompt,
stability,
similarityBoost,
}: VoicePreviewButtonProps) {
const [isLoading, setIsLoading] = useState(false);
const [isPlaying, setIsPlaying] = useState(false);
@ -41,7 +47,7 @@ export function VoicePreviewButton({
}
setIsPlaying(false);
setError(null);
}, [voiceName, language, model, speed, stylePreset, customStylePrompt]);
}, [voiceName, language, provider, model, speed, stylePreset, customStylePrompt, stability, similarityBoost]);
const handlePreview = async () => {
setError(null);
@ -70,7 +76,10 @@ export function VoicePreviewButton({
model,
speed,
stylePreset,
customStylePrompt
customStylePrompt,
provider,
stability,
similarityBoost,
);
const url = URL.createObjectURL(blob);
@ -104,9 +113,6 @@ export function VoicePreviewButton({
}
};
// Cleanup on unmount
// Note: We don't add cleanup in useEffect to allow audio caching within component lifecycle
return (
<div className="inline-flex items-center gap-2">
<button

View file

@ -2,7 +2,8 @@ import { useState, useEffect } from 'react';
import { api } from '../lib/api';
import { VoicePreviewButton } from './VoicePreviewButton';
import { TTSSettingsPanel } from './TTSSettingsPanel';
import type { TTSPreferences, VoicesResponse, LanguagesResponse } from '../types/api';
import { ElevenLabsSettingsPanel } from './ElevenLabsSettingsPanel';
import type { TTSPreferences, ProviderVoicesResponse, LanguagesResponse, TTSProvider } from '../types/api';
interface VoiceSelectorProps {
selectedLanguages: string[];
@ -17,23 +18,37 @@ export function VoiceSelector({
onChange,
disabled
}: VoiceSelectorProps) {
const [voices, setVoices] = useState<VoicesResponse | null>(null);
const [voices, setVoices] = useState<ProviderVoicesResponse | null>(null);
const [languages, setLanguages] = useState<LanguagesResponse | null>(null);
const [activeLanguage, setActiveLanguage] = useState<string>(selectedLanguages[0] || 'en');
const [isLoading, setIsLoading] = useState(true);
const [error, setError] = useState<string | null>(null);
// Fetch voices and languages on mount
// Fetch voices and languages when provider changes
useEffect(() => {
const fetchData = async () => {
try {
setIsLoading(true);
setError(null);
const [voicesData, languagesData] = await Promise.all([
api.getVoices(),
api.getVoices(preferences.provider),
api.getLanguages()
]);
setVoices(voicesData);
setLanguages(languagesData);
// Set default voice from API response if switching providers
if (voicesData.default && voicesData.voices.length > 0) {
// Only reset default voice if the current one isn't in the new voice list
const currentVoiceExists = voicesData.voices.some(v => v.id === preferences.default_voice);
if (!currentVoiceExists) {
onChange({
...preferences,
default_voice: voicesData.default,
voices_per_language: {},
});
}
}
} catch (err) {
setError('Failed to load voice options');
console.error('Voice selector error:', err);
@ -43,7 +58,8 @@ export function VoiceSelector({
};
fetchData();
}, []);
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [preferences.provider]);
// Update active language when selected languages change
useEffect(() => {
@ -52,21 +68,31 @@ export function VoiceSelector({
}
}, [selectedLanguages, activeLanguage]);
const handleDefaultVoiceChange = (voice: string) => {
const handleProviderChange = (provider: TTSProvider) => {
if (provider === preferences.provider) return;
onChange({
...preferences,
default_voice: voice
provider,
default_voice: '', // Will be set after fetch
voices_per_language: {},
});
};
const handleLanguageVoiceChange = (language: string, voice: string) => {
const handleDefaultVoiceChange = (voiceId: string) => {
onChange({
...preferences,
default_voice: voiceId
});
};
const handleLanguageVoiceChange = (language: string, voiceId: string) => {
const newVoicesPerLanguage = {
...preferences.voices_per_language,
[language]: voice
[language]: voiceId
};
// If voice matches default, remove from per-language overrides
if (voice === preferences.default_voice) {
if (voiceId === preferences.default_voice) {
delete newVoicesPerLanguage[language];
}
@ -113,13 +139,45 @@ export function VoiceSelector({
}
// Filter languages to only show selected ones
// The first language in selectedLanguages is the primary/source language
const displayLanguages = selectedLanguages.length > 0
? selectedLanguages
: ['en'];
return (
<div className="space-y-4">
{/* Provider Toggle */}
<div className="bg-gray-50 rounded-lg p-4">
<label className="block text-sm font-medium text-gray-700 mb-2">
TTS Provider
</label>
<div className="flex rounded-lg border border-gray-300 overflow-hidden">
<button
type="button"
onClick={() => handleProviderChange('gemini')}
disabled={disabled}
className={`flex-1 px-4 py-2 text-sm font-medium transition-colors ${
preferences.provider === 'gemini'
? 'bg-blue-600 text-white'
: 'bg-white text-gray-700 hover:bg-gray-50'
} disabled:opacity-50`}
>
Gemini
</button>
<button
type="button"
onClick={() => handleProviderChange('elevenlabs')}
disabled={disabled}
className={`flex-1 px-4 py-2 text-sm font-medium border-l border-gray-300 transition-colors ${
preferences.provider === 'elevenlabs'
? 'bg-blue-600 text-white'
: 'bg-white text-gray-700 hover:bg-gray-50'
} disabled:opacity-50`}
>
ElevenLabs
</button>
</div>
</div>
{/* Default Voice Selection */}
<div className="bg-gray-50 rounded-lg p-4">
<label className="block text-sm font-medium text-gray-700 mb-2">
@ -136,8 +194,9 @@ export function VoiceSelector({
className="flex-1 rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 text-sm"
>
{voices.voices.map((voice) => (
<option key={voice} value={voice}>
{voice}
<option key={voice.id} value={voice.id}>
{voice.name}
{voice.category ? ` (${voice.category})` : ''}
</option>
))}
</select>
@ -145,20 +204,31 @@ export function VoiceSelector({
voiceName={preferences.default_voice}
language={displayLanguages[0] || 'en'}
disabled={disabled}
provider={preferences.provider}
model={preferences.model}
speed={preferences.speed}
stylePreset={preferences.style_preset}
customStylePrompt={preferences.custom_style_prompt}
stability={preferences.stability}
similarityBoost={preferences.similarity_boost}
/>
</div>
</div>
{/* TTS Settings (Model, Speed, Style) */}
<TTSSettingsPanel
preferences={preferences}
onChange={onChange}
disabled={disabled}
/>
{/* TTS Settings - Provider-specific */}
{preferences.provider === 'elevenlabs' ? (
<ElevenLabsSettingsPanel
preferences={preferences}
onChange={onChange}
disabled={disabled}
/>
) : (
<TTSSettingsPanel
preferences={preferences}
onChange={onChange}
disabled={disabled}
/>
)}
{/* Per-Language Voice Overrides */}
{displayLanguages.length > 1 && (
@ -202,9 +272,10 @@ export function VoiceSelector({
className="flex-1 rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 text-sm"
>
{voices.voices.map((voice) => (
<option key={voice} value={voice}>
{voice}
{voice === preferences.default_voice ? ' (default)' : ''}
<option key={voice.id} value={voice.id}>
{voice.name}
{voice.category ? ` (${voice.category})` : ''}
{voice.id === preferences.default_voice ? ' (default)' : ''}
</option>
))}
</select>
@ -212,10 +283,13 @@ export function VoiceSelector({
voiceName={getVoiceForLanguage(activeLanguage)}
language={activeLanguage}
disabled={disabled}
provider={preferences.provider}
model={preferences.model}
speed={preferences.speed}
stylePreset={preferences.style_preset}
customStylePrompt={preferences.custom_style_prompt}
stability={preferences.stability}
similarityBoost={preferences.similarity_boost}
/>
</div>

View file

@ -25,11 +25,12 @@ import type {
UpdateUserRequest,
ResetPasswordResponse,
AdminStatsResponse,
VoicesResponse,
ProviderVoicesResponse,
LanguagesResponse,
TTSPreferences,
TTSOptionsResponse,
ProviderOptionsResponse,
TTSStylePreset,
TTSProvider,
AccessibleVideoMethod,
ReviewNote,
ReviewNoteCreateRequest,
@ -349,8 +350,8 @@ class ApiClient {
}
// TTS endpoints
async getVoices(): Promise<VoicesResponse> {
const response = await this.client.get('/tts/voices');
async getVoices(provider: TTSProvider = 'gemini'): Promise<ProviderVoicesResponse> {
const response = await this.client.get(`/tts/voices?provider=${provider}`);
return response.data;
}
@ -359,8 +360,8 @@ class ApiClient {
return response.data;
}
async getTTSOptions(): Promise<TTSOptionsResponse> {
const response = await this.client.get('/tts/options');
async getTTSOptions(provider: TTSProvider = 'gemini'): Promise<ProviderOptionsResponse> {
const response = await this.client.get(`/tts/options?provider=${provider}`);
return response.data;
}
@ -370,17 +371,23 @@ class ApiClient {
model?: string,
speed?: number,
stylePreset?: TTSStylePreset,
customStylePrompt?: string
customStylePrompt?: string,
provider?: TTSProvider,
stability?: number,
similarityBoost?: number,
): Promise<Blob> {
const response = await this.client.post(
'/tts/preview',
{
voice_name: voiceName,
language,
provider: provider || 'gemini',
model: model || 'flash',
speed: speed || 1.0,
style_preset: stylePreset || 'neutral',
custom_style_prompt: customStylePrompt
custom_style_prompt: customStylePrompt,
stability: stability,
similarity_boost: similarityBoost,
},
{ responseType: 'blob' }
);

View file

@ -91,7 +91,9 @@ export function QCDetail() {
model: 'flash',
speed: 1.0,
style_preset: 'neutral',
custom_style_prompt: undefined
custom_style_prompt: undefined,
stability: undefined,
similarity_boost: undefined,
});
const [originalTtsPreferences, setOriginalTtsPreferences] = useState<TTSPreferences | null>(null);

View file

@ -44,7 +44,9 @@ export function NewJob() {
model: 'flash',
speed: 1.0,
style_preset: 'neutral',
custom_style_prompt: undefined
custom_style_prompt: undefined,
stability: undefined,
similarity_boost: undefined,
});
const [accessibleVideoMethod, setAccessibleVideoMethod] = useState<AccessibleVideoMethod>('pause_insert');

View file

@ -55,6 +55,9 @@ export interface TTSPreferences {
speed: number;
style_preset: TTSStylePreset;
custom_style_prompt?: string;
// ElevenLabs-specific settings
stability?: number;
similarity_boost?: number;
}
export interface RequestedOutputs {
@ -69,6 +72,22 @@ export interface RequestedOutputs {
translation_mode?: TranslationMode; // "video_native" (default) or "traditional"
}
/** One selectable TTS voice, as returned in the `voices` list of a provider voices response. */
export interface VoiceInfo {
  /** Identifier used as the option value and stored in the TTS preferences. */
  id: string;
  /** Display name shown in the voice dropdowns. */
  name: string;
  /** Optional free-text description of the voice. */
  description?: string;
  /** Optional preview URL — presumably a provider-hosted audio sample; confirm against the backend. */
  preview_url?: string;
  /** Optional string-to-string metadata for the voice. */
  labels?: Record<string, string>;
  /** Optional category; appended in parentheses after the name in the dropdowns. */
  category?: string;
}
/** Response of the voices endpoint for a single TTS provider. */
export interface ProviderVoicesResponse {
  /** Provider the voices belong to. */
  provider: string;
  /** Voices offered by that provider. */
  voices: VoiceInfo[];
  /** Id of the provider's default voice; may be an empty string when no voices are available. */
  default: string;
}
/** @deprecated Use ProviderVoicesResponse instead */
export interface VoicesResponse {
voices: string[];
default: string;
@ -91,6 +110,25 @@ export interface SpeedRange {
step: number;
}
/** Numeric range describing a slider-style setting. */
export interface FloatRange {
  /** Lowest selectable value. */
  min: number;
  /** Highest selectable value. */
  max: number;
  /** Value to start from when nothing has been chosen yet. */
  default: number;
  /** Increment between adjacent selectable values. */
  step: number;
}
/**
 * Response of the options endpoint for a single TTS provider.
 * Every provider-specific field is optional, so consumers must check for
 * presence before use.
 */
export interface ProviderOptionsResponse {
  /** Provider these options apply to. */
  provider: string;
  // Gemini-specific
  /** Selectable synthesis models. */
  models?: TTSOptionItem[];
  /** Selectable speaking-style presets. */
  style_presets?: TTSOptionItem[];
  /** Allowed playback-speed range. */
  speed_range?: SpeedRange;
  // ElevenLabs-specific
  /** Allowed range for the stability setting. */
  stability_range?: FloatRange;
  /** Allowed range for the similarity-boost setting. */
  similarity_boost_range?: FloatRange;
}
/** @deprecated Use ProviderOptionsResponse instead */
export interface TTSOptionsResponse {
models: TTSOptionItem[];
style_presets: TTSOptionItem[];