feat: add ElevenLabs voice selection to frontend and backend
Add a dynamic ElevenLabs voice catalog with a provider toggle in the UI, allowing users to browse ElevenLabs voices, configure stability and similarity boost settings, and preview/synthesize with ElevenLabs TTS.

Backend:
- New elevenlabs_voices.py service with 1-hour cached API fetching
- TTS routes support ?provider= query param for voices and options
- Preview endpoint routes to ElevenLabs or Gemini based on provider
- stability/similarity_boost params flow through the TTS synthesis pipeline
- TTSPreferences model extended with ElevenLabs-specific fields
- Deprecated the hardcoded elevenlabs_voices config (voices are now fetched dynamically)

Frontend:
- Provider toggle (Gemini/ElevenLabs) in VoiceSelector
- ElevenLabsSettingsPanel with stability and similarity boost sliders
- VoicePreviewButton supports provider-specific preview parameters
- API client passes the provider param to the voices, options, and preview endpoints
- New VoiceInfo, ProviderVoicesResponse, ProviderOptionsResponse types

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
31b7be0a2f
commit
1e177a6d5c
14 changed files with 537 additions and 88 deletions
|
|
@ -1,12 +1,14 @@
|
|||
from typing import Literal, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from fastapi.responses import Response
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from ...core.config import settings
|
||||
from ...core.logging import get_logger
|
||||
from ...services.gemini_tts import gemini_tts_service
|
||||
from ...services.elevenlabs_voices import elevenlabs_voice_service
|
||||
from ...services.tts import tts_service
|
||||
from ...core.dependencies import get_current_user
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
|
@ -18,17 +20,33 @@ class VoicePreviewRequest(BaseModel):
|
|||
"""Request to generate a voice preview"""
|
||||
voice_name: str
|
||||
language: str = "en"
|
||||
provider: Literal["gemini", "elevenlabs"] = "gemini"
|
||||
# Gemini-specific
|
||||
model: Literal["flash", "pro"] = "flash"
|
||||
speed: float = Field(default=1.0, ge=0.5, le=2.0)
|
||||
style_preset: Literal[
|
||||
"neutral", "calm", "energetic", "professional", "warm", "documentary", "custom"
|
||||
] = "neutral"
|
||||
custom_style_prompt: Optional[str] = None
|
||||
# ElevenLabs-specific
|
||||
stability: Optional[float] = Field(default=None, ge=0.0, le=1.0)
|
||||
similarity_boost: Optional[float] = Field(default=None, ge=0.0, le=1.0)
|
||||
|
||||
|
||||
class VoicesResponse(BaseModel):
|
||||
"""Available TTS voices"""
|
||||
voices: list[str]
|
||||
class VoiceInfo(BaseModel):
    """Structured voice information for any provider."""

    # Value to send back when selecting the voice: the ElevenLabs voice_id,
    # or the voice name itself for Gemini voices.
    id: str
    # Human-readable label shown in the voice picker.
    name: str

    # The remaining fields are only filled for ElevenLabs voices; Gemini
    # entries are built from a name alone and leave them as None.
    description: Optional[str] = None
    preview_url: Optional[str] = None
    labels: Optional[dict[str, str]] = None
    category: Optional[str] = None
|
||||
|
||||
|
||||
class ProviderVoicesResponse(BaseModel):
    """Available TTS voices for a specific provider."""

    # Provider the list belongs to ("gemini" or "elevenlabs").
    provider: str
    # Every voice offered by that provider.
    voices: list[VoiceInfo]
    # ID of the voice to preselect; an empty string when the provider
    # returned no voices at all.
    default: str
|
||||
|
||||
|
||||
|
|
@ -52,23 +70,63 @@ class SpeedRange(BaseModel):
|
|||
step: float
|
||||
|
||||
|
||||
class TTSOptionsResponse(BaseModel):
|
||||
"""Available TTS configuration options"""
|
||||
models: list[TTSOptionItem]
|
||||
style_presets: list[TTSOptionItem]
|
||||
speed_range: SpeedRange
|
||||
class FloatRange(BaseModel):
    """Generic float range for sliders."""

    # Lowest and highest selectable values.
    min: float
    max: float
    # Value the slider starts at when the user has not chosen one.
    default: float
    # Increment between adjacent slider positions.
    step: float
|
||||
|
||||
|
||||
@router.get("/voices", response_model=VoicesResponse)
|
||||
class ProviderOptionsResponse(BaseModel):
    """Available TTS configuration options for a provider."""

    # Provider these options apply to ("gemini" or "elevenlabs").
    provider: str

    # Gemini-specific; left as None in an ElevenLabs response.
    models: Optional[list[TTSOptionItem]] = None
    style_presets: Optional[list[TTSOptionItem]] = None
    speed_range: Optional[SpeedRange] = None

    # ElevenLabs-specific; left as None in a Gemini response.
    stability_range: Optional[FloatRange] = None
    similarity_boost_range: Optional[FloatRange] = None
|
||||
|
||||
|
||||
@router.get("/voices", response_model=ProviderVoicesResponse)
|
||||
async def list_voices(
|
||||
current_user=Depends(get_current_user)
|
||||
) -> VoicesResponse:
|
||||
provider: str = Query("gemini", description="TTS provider: gemini or elevenlabs"),
|
||||
current_user=Depends(get_current_user),
|
||||
) -> ProviderVoicesResponse:
|
||||
"""
|
||||
List all available Gemini TTS voices.
|
||||
List available TTS voices for the specified provider.
|
||||
"""
|
||||
return VoicesResponse(
|
||||
voices=settings.gemini_tts_voices,
|
||||
default=settings.gemini_tts_default_voice
|
||||
if provider == "elevenlabs":
|
||||
el_voices = await elevenlabs_voice_service.get_voices()
|
||||
voices = [
|
||||
VoiceInfo(
|
||||
id=v.voice_id,
|
||||
name=v.name,
|
||||
description=v.description or None,
|
||||
preview_url=v.preview_url or None,
|
||||
labels=v.labels or None,
|
||||
category=v.category or None,
|
||||
)
|
||||
for v in el_voices
|
||||
]
|
||||
default_id = voices[0].id if voices else ""
|
||||
return ProviderVoicesResponse(
|
||||
provider="elevenlabs",
|
||||
voices=voices,
|
||||
default=default_id,
|
||||
)
|
||||
|
||||
# Default: Gemini
|
||||
voices = [
|
||||
VoiceInfo(id=name, name=name)
|
||||
for name in settings.gemini_tts_voices
|
||||
]
|
||||
return ProviderVoicesResponse(
|
||||
provider="gemini",
|
||||
voices=voices,
|
||||
default=settings.gemini_tts_default_voice,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -85,14 +143,24 @@ async def list_languages(
|
|||
)
|
||||
|
||||
|
||||
@router.get("/options", response_model=TTSOptionsResponse)
|
||||
@router.get("/options", response_model=ProviderOptionsResponse)
|
||||
async def get_tts_options(
|
||||
current_user=Depends(get_current_user)
|
||||
) -> TTSOptionsResponse:
|
||||
provider: str = Query("gemini", description="TTS provider: gemini or elevenlabs"),
|
||||
current_user=Depends(get_current_user),
|
||||
) -> ProviderOptionsResponse:
|
||||
"""
|
||||
Get available TTS configuration options including models, style presets, and speed range.
|
||||
Get available TTS configuration options for the specified provider.
|
||||
"""
|
||||
return TTSOptionsResponse(
|
||||
if provider == "elevenlabs":
|
||||
return ProviderOptionsResponse(
|
||||
provider="elevenlabs",
|
||||
stability_range=FloatRange(min=0.0, max=1.0, default=0.5, step=0.05),
|
||||
similarity_boost_range=FloatRange(min=0.0, max=1.0, default=0.5, step=0.05),
|
||||
)
|
||||
|
||||
# Default: Gemini
|
||||
return ProviderOptionsResponse(
|
||||
provider="gemini",
|
||||
models=[
|
||||
TTSOptionItem(value="flash", label="Flash (Fast, Cost-efficient)"),
|
||||
TTSOptionItem(value="pro", label="Pro (Higher Quality)"),
|
||||
|
|
@ -111,7 +179,7 @@ async def get_tts_options(
|
|||
max=settings.gemini_tts_speed_max,
|
||||
default=settings.gemini_tts_speed_default,
|
||||
step=settings.gemini_tts_speed_step
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -124,6 +192,14 @@ async def preview_voice(
|
|||
Generate a voice preview audio sample with all TTS settings applied.
|
||||
Returns MP3 audio data.
|
||||
"""
|
||||
if request.provider == "elevenlabs":
|
||||
return await _preview_elevenlabs(request)
|
||||
|
||||
return await _preview_gemini(request)
|
||||
|
||||
|
||||
async def _preview_gemini(request: VoicePreviewRequest) -> Response:
|
||||
"""Generate a Gemini TTS voice preview."""
|
||||
# Validate voice name
|
||||
if request.voice_name not in settings.gemini_tts_voices:
|
||||
raise HTTPException(
|
||||
|
|
@ -146,11 +222,10 @@ async def preview_voice(
|
|||
|
||||
try:
|
||||
logger.info(
|
||||
f"Generating voice preview: voice={request.voice_name}, language={request.language}, "
|
||||
f"Generating Gemini voice preview: voice={request.voice_name}, language={request.language}, "
|
||||
f"model={request.model}, speed={request.speed}x, style={request.style_preset}"
|
||||
)
|
||||
|
||||
# Generate preview audio with all settings
|
||||
audio_data = await gemini_tts_service.synthesize_preview(
|
||||
voice_name=request.voice_name,
|
||||
language=request.language,
|
||||
|
|
@ -168,7 +243,53 @@ async def preview_voice(
|
|||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Voice preview generation failed: {e}")
|
||||
logger.error(f"Gemini voice preview generation failed: {e}")
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"Failed to generate voice preview: {str(e)}"
|
||||
) from e
|
||||
|
||||
|
||||
async def _preview_elevenlabs(request: VoicePreviewRequest) -> Response:
|
||||
"""Generate an ElevenLabs TTS voice preview."""
|
||||
if not tts_service.elevenlabs_available:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="ElevenLabs TTS is not configured"
|
||||
)
|
||||
|
||||
# Get sample text for the language
|
||||
sample_text = settings.gemini_tts_preview_samples.get(
|
||||
request.language,
|
||||
settings.gemini_tts_preview_samples.get("en", "This is a preview of the audio description voice.")
|
||||
)
|
||||
|
||||
stability = request.stability if request.stability is not None else 0.5
|
||||
similarity_boost = request.similarity_boost if request.similarity_boost is not None else 0.5
|
||||
|
||||
try:
|
||||
logger.info(
|
||||
f"Generating ElevenLabs voice preview: voice={request.voice_name}, language={request.language}, "
|
||||
f"stability={stability}, similarity_boost={similarity_boost}"
|
||||
)
|
||||
|
||||
audio_data = await tts_service._synthesize_text_elevenlabs(
|
||||
text=sample_text,
|
||||
voice_id=request.voice_name,
|
||||
stability=stability,
|
||||
similarity_boost=similarity_boost,
|
||||
)
|
||||
|
||||
return Response(
|
||||
content=audio_data,
|
||||
media_type="audio/mpeg",
|
||||
headers={
|
||||
"Content-Disposition": f"inline; filename=preview_{request.voice_name}_{request.language}.mp3"
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"ElevenLabs voice preview generation failed: {e}")
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"Failed to generate voice preview: {str(e)}"
|
||||
|
|
|
|||
|
|
@ -45,12 +45,9 @@ class Settings(BaseSettings):
|
|||
"fr-FR": "fr-FR-Neural2-A",
|
||||
"de-DE": "de-DE-Neural2-B"
|
||||
}
|
||||
elevenlabs_voices: dict[str, str] = {
|
||||
"en-US": "21m00Tcm4TlvDq8ikWAM",
|
||||
"es-ES": "VR6AewLTigWG4xSOukaG",
|
||||
"fr-FR": "TxGEqnHWrfWFTfGW9XjX",
|
||||
"de-DE": "pNInz6obpgDQGcFmaJgB"
|
||||
}
|
||||
# Deprecated: ElevenLabs voices are now fetched dynamically via the API.
|
||||
# This fallback map is only used by _get_elevenlabs_voice() when no voice_name is provided.
|
||||
elevenlabs_voices: dict[str, str] = {}
|
||||
|
||||
# Gemini TTS Configuration
|
||||
gemini_tts_model: str = "gemini-2.5-flash-preview-tts"
|
||||
|
|
|
|||
|
|
@ -51,6 +51,9 @@ class TTSPreferences(BaseModel):
|
|||
"neutral", "calm", "energetic", "professional", "warm", "documentary", "custom"
|
||||
] = "neutral"
|
||||
custom_style_prompt: Optional[str] = None # Used when style_preset is "custom"
|
||||
# ElevenLabs-specific settings
|
||||
stability: Optional[float] = None # 0.0-1.0, default 0.5 when used
|
||||
similarity_boost: Optional[float] = None # 0.0-1.0, default 0.5 when used
|
||||
|
||||
|
||||
class RequestedOutputs(BaseModel):
|
||||
|
|
|
|||
101
backend/app/services/elevenlabs_voices.py
Normal file
101
backend/app/services/elevenlabs_voices.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
"""
|
||||
ElevenLabs Voice Catalog Service.
|
||||
|
||||
Fetches and caches available voices from the ElevenLabs API.
|
||||
"""
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import aiohttp
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
CACHE_TTL_SECONDS = 3600 # 1 hour
|
||||
|
||||
|
||||
@dataclass
class ElevenLabsVoice:
    """One entry of the ElevenLabs voice catalog.

    Only ``voice_id`` and ``name`` are required; every other attribute falls
    back to an empty value when it is not supplied.
    """

    # Identifier used to address the voice in ElevenLabs API calls.
    voice_id: str
    # Display name of the voice.
    name: str
    # Optional descriptive metadata; empty string means "not provided".
    category: str = ""
    description: str = ""
    preview_url: str = ""
    # Free-form string tags; default_factory gives each instance its own dict.
    labels: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
class ElevenLabsVoiceService:
    """Fetch the ElevenLabs voice catalog and keep it in an in-memory cache.

    A successful fetch is reused for ``CACHE_TTL_SECONDS``. When a refresh
    fails, or no API key is configured, the previously cached list is returned
    instead; that list is empty if no fetch has ever succeeded.
    """

    # Upper bound, in seconds, for one catalog request. Without an explicit
    # value aiohttp applies its own much longer default total timeout, which
    # would let a stalled upstream block the calling request handler.
    _REQUEST_TIMEOUT_SECONDS = 15

    def __init__(self):
        self._cache: list[ElevenLabsVoice] = []
        # time.monotonic() reading taken when the cache was last filled.
        self._cache_time: float = 0.0

    def _is_cache_valid(self) -> bool:
        """Return True when the cache is non-empty and younger than the TTL."""
        if not self._cache:
            return False
        # A monotonic clock keeps the TTL immune to wall-clock adjustments.
        return (time.monotonic() - self._cache_time) < CACHE_TTL_SECONDS

    async def get_voices(self) -> list[ElevenLabsVoice]:
        """
        Fetch voices from ElevenLabs API with in-memory cache (1-hour TTL).
        Falls back to stale cache on API failure.

        Returns:
            The cached or freshly fetched voices. May be empty when the API
            key is missing or the first fetch failed.
        """
        if self._is_cache_valid():
            return self._cache

        if not settings.elevenlabs_api_key:
            logger.warning("ElevenLabs API key not configured")
            return self._cache  # Return stale cache or empty

        try:
            voices = await self._fetch_voices()
        except Exception as e:
            # Deliberate best effort: a catalog outage must not break callers.
            logger.warning(f"Failed to fetch ElevenLabs voices, using stale cache: {e}")
            return self._cache  # Stale cache fallback

        self._cache = voices
        self._cache_time = time.monotonic()
        logger.info(f"Fetched {len(voices)} voices from ElevenLabs API")
        return voices

    async def _fetch_voices(self) -> list[ElevenLabsVoice]:
        """Fetch voices from the ElevenLabs API.

        Returns:
            One ``ElevenLabsVoice`` per catalog entry that carries a voice ID.

        Raises:
            ValueError: If the API answers with a status other than 200.
            Exception: Network, timeout, and JSON decoding errors raised by
                aiohttp propagate unchanged.
        """
        url = "https://api.elevenlabs.io/v1/voices"
        headers = {
            "xi-api-key": settings.elevenlabs_api_key,
            "Accept": "application/json",
        }
        timeout = aiohttp.ClientTimeout(total=self._REQUEST_TIMEOUT_SECONDS)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url, headers=headers) as response:
                if response.status != 200:
                    error_text = await response.text()
                    raise ValueError(f"ElevenLabs API error: {response.status} - {error_text}")

                data = await response.json()

        voices: list[ElevenLabsVoice] = []
        # `or` instead of a .get() default: a key that is present with a JSON
        # null value would bypass the default and leak None into the result.
        for v in data.get("voices") or []:
            voice_id = v.get("voice_id")
            if not voice_id:
                # An entry without an ID cannot be selected or synthesized.
                continue
            voices.append(ElevenLabsVoice(
                voice_id=voice_id,
                name=v.get("name") or "",
                category=v.get("category") or "",
                description=v.get("description") or "",
                preview_url=v.get("preview_url") or "",
                labels=v.get("labels") or {},
            ))

        return voices

    async def get_voice_by_id(self, voice_id: str) -> Optional[ElevenLabsVoice]:
        """Look up a specific voice by ID.

        Returns:
            The first voice whose ``voice_id`` equals ``voice_id``, or None
            when the catalog holds no such voice.
        """
        voices = await self.get_voices()
        return next((v for v in voices if v.voice_id == voice_id), None)
|
||||
|
||||
|
||||
# Singleton instance
|
||||
elevenlabs_voice_service = ElevenLabsVoiceService()
|
||||
|
|
@ -51,7 +51,9 @@ class TTSService:
|
|||
provider: Optional[str] = None,
|
||||
model: str = "flash",
|
||||
speed: float = 1.0,
|
||||
style_prompt: str = ""
|
||||
style_prompt: str = "",
|
||||
stability: float = 0.5,
|
||||
similarity_boost: float = 0.5,
|
||||
) -> bytes:
|
||||
"""
|
||||
Generate MP3 audio from audio description VTT content.
|
||||
|
|
@ -104,7 +106,10 @@ class TTSService:
|
|||
|
||||
if self.elevenlabs_available:
|
||||
logger.info(f"Using ElevenLabs TTS for language: {language_code}")
|
||||
return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name)
|
||||
return await self._synthesize_with_elevenlabs(
|
||||
ad_vtt_content, language_code, voice_name,
|
||||
stability=stability, similarity_boost=similarity_boost,
|
||||
)
|
||||
|
||||
raise ValueError("No TTS service available")
|
||||
|
||||
|
|
@ -116,7 +121,9 @@ class TTSService:
|
|||
provider: Optional[str] = None,
|
||||
model: str = "flash",
|
||||
speed: float = 1.0,
|
||||
style_prompt: str = ""
|
||||
style_prompt: str = "",
|
||||
stability: float = 0.5,
|
||||
similarity_boost: float = 0.5,
|
||||
) -> tuple[bytes, list[TTSCueSegment]]:
|
||||
"""
|
||||
Generate MP3 audio from audio description VTT content AND return individual segments.
|
||||
|
|
@ -168,7 +175,10 @@ class TTSService:
|
|||
audio_data = await self._synthesize_text_google(text, language_code, voice_name)
|
||||
elif self.elevenlabs_available:
|
||||
voice_id = self._get_elevenlabs_voice(language_code, voice_name)
|
||||
audio_data = await self._synthesize_text_elevenlabs(text, voice_id)
|
||||
audio_data = await self._synthesize_text_elevenlabs(
|
||||
text, voice_id,
|
||||
stability=stability, similarity_boost=similarity_boost,
|
||||
)
|
||||
else:
|
||||
raise ValueError("No TTS service available")
|
||||
|
||||
|
|
@ -277,7 +287,9 @@ class TTSService:
|
|||
self,
|
||||
ad_vtt_content: str,
|
||||
language_code: str = "en-US",
|
||||
voice_name: Optional[str] = None
|
||||
voice_name: Optional[str] = None,
|
||||
stability: float = 0.5,
|
||||
similarity_boost: float = 0.5,
|
||||
) -> bytes:
|
||||
"""Generate MP3 using ElevenLabs TTS"""
|
||||
# Parse VTT cues
|
||||
|
|
@ -307,7 +319,10 @@ class TTSService:
|
|||
# Synthesize this cue with ElevenLabs
|
||||
text = cue["text"].strip()
|
||||
if text:
|
||||
audio_data = await self._synthesize_text_elevenlabs(text, voice_id)
|
||||
audio_data = await self._synthesize_text_elevenlabs(
|
||||
text, voice_id,
|
||||
stability=stability, similarity_boost=similarity_boost,
|
||||
)
|
||||
|
||||
# Convert to AudioSegment and get actual duration
|
||||
audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
|
||||
|
|
@ -360,7 +375,13 @@ class TTSService:
|
|||
|
||||
return response.audio_content
|
||||
|
||||
async def _synthesize_text_elevenlabs(self, text: str, voice_id: str) -> bytes:
|
||||
async def _synthesize_text_elevenlabs(
|
||||
self,
|
||||
text: str,
|
||||
voice_id: str,
|
||||
stability: float = 0.5,
|
||||
similarity_boost: float = 0.5,
|
||||
) -> bytes:
|
||||
"""Synthesize text using ElevenLabs API"""
|
||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
|
||||
|
||||
|
|
@ -374,8 +395,8 @@ class TTSService:
|
|||
"text": text,
|
||||
"model_id": "eleven_multilingual_v2",
|
||||
"voice_settings": {
|
||||
"stability": 0.5,
|
||||
"similarity_boost": 0.5,
|
||||
"stability": stability,
|
||||
"similarity_boost": similarity_boost,
|
||||
"style": 0.0,
|
||||
"use_speaker_boost": True
|
||||
}
|
||||
|
|
|
|||
|
|
@ -44,7 +44,9 @@ def synthesize_cue_task(
|
|||
provider: str,
|
||||
model: str,
|
||||
speed: float,
|
||||
style_prompt: str
|
||||
style_prompt: str,
|
||||
stability: float = 0.5,
|
||||
similarity_boost: float = 0.5,
|
||||
) -> dict:
|
||||
"""
|
||||
Synthesize a single AD cue and upload to GCS immediately.
|
||||
|
|
@ -84,7 +86,9 @@ def synthesize_cue_task(
|
|||
provider=provider,
|
||||
model=model,
|
||||
speed=speed,
|
||||
style_prompt=style_prompt
|
||||
style_prompt=style_prompt,
|
||||
stability=stability,
|
||||
similarity_boost=similarity_boost,
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -154,7 +158,9 @@ async def _synthesize_single_cue(
|
|||
provider: str,
|
||||
model: str,
|
||||
speed: float,
|
||||
style_prompt: str
|
||||
style_prompt: str,
|
||||
stability: float = 0.5,
|
||||
similarity_boost: float = 0.5,
|
||||
) -> tuple[bytes, float]:
|
||||
"""
|
||||
Synthesize a single cue's text to audio.
|
||||
|
|
@ -186,7 +192,10 @@ async def _synthesize_single_cue(
|
|||
elif provider == "elevenlabs":
|
||||
language_code = f"{simple_lang}-US" if simple_lang == "en" else f"{simple_lang}-{simple_lang.upper()}"
|
||||
voice_id = tts_service._get_elevenlabs_voice(language_code, voice_name)
|
||||
audio_bytes = await tts_service._synthesize_text_elevenlabs(text, voice_id)
|
||||
audio_bytes = await tts_service._synthesize_text_elevenlabs(
|
||||
text, voice_id,
|
||||
stability=stability, similarity_boost=similarity_boost,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unknown TTS provider: {provider}")
|
||||
|
||||
|
|
@ -262,6 +271,8 @@ def dispatch_language_tts(
|
|||
speed = tts_preferences.get("speed", 1.0)
|
||||
style_preset = tts_preferences.get("style_preset", "neutral")
|
||||
custom_style_prompt = tts_preferences.get("custom_style_prompt")
|
||||
stability = tts_preferences.get("stability") if tts_preferences.get("stability") is not None else 0.5
|
||||
similarity_boost = tts_preferences.get("similarity_boost") if tts_preferences.get("similarity_boost") is not None else 0.5
|
||||
|
||||
# Resolve style prompt from preset or custom
|
||||
if style_preset == "custom" and custom_style_prompt:
|
||||
|
|
@ -287,7 +298,9 @@ def dispatch_language_tts(
|
|||
provider=provider,
|
||||
model=model,
|
||||
speed=speed,
|
||||
style_prompt=style_prompt
|
||||
style_prompt=style_prompt,
|
||||
stability=stability,
|
||||
similarity_boost=similarity_boost,
|
||||
)
|
||||
for i, cue in enumerate(cues)
|
||||
if cue.get("text", "").strip() # Skip empty cues
|
||||
|
|
|
|||
64
frontend/src/components/ElevenLabsSettingsPanel.tsx
Normal file
64
frontend/src/components/ElevenLabsSettingsPanel.tsx
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
import type { TTSPreferences } from '../types/api';
|
||||
|
||||
/** Props accepted by the ElevenLabs settings panel. */
interface ElevenLabsSettingsPanelProps {
  /** Current TTS preferences; the panel reads `stability` and `similarity_boost`. */
  preferences: TTSPreferences;
  /** Called with a complete, updated preferences object whenever a slider moves. */
  onChange: (preferences: TTSPreferences) => void;
  /** When true, both sliders are disabled. */
  disabled?: boolean;
}
|
||||
|
||||
/**
 * Sliders for the ElevenLabs-specific voice settings (stability and
 * similarity boost).
 *
 * Each slider change calls `onChange` with a copy of `preferences` in which
 * only the moved setting is replaced.
 */
export function ElevenLabsSettingsPanel({ preferences, onChange, disabled }: ElevenLabsSettingsPanelProps) {
  // Settings that have not been chosen yet are displayed at 0.5.
  const stability = preferences.stability ?? 0.5;
  const similarityBoost = preferences.similarity_boost ?? 0.5;

  return (
    <div className="space-y-4">
      {/* Stability Slider */}
      <div className="bg-gray-50 rounded-lg p-4">
        <label className="block text-sm font-medium text-gray-700 mb-2">
          Stability: {stability.toFixed(2)}
        </label>
        <p className="text-xs text-gray-500 mb-3">
          Controls how consistent the voice is between regenerations. Higher values are more stable.
        </p>
        {/* The visible label is not bound to the input, so name it for assistive technology. */}
        <input
          type="range"
          aria-label="Stability"
          min={0}
          max={1}
          step={0.05}
          value={stability}
          onChange={(e) => onChange({ ...preferences, stability: parseFloat(e.target.value) })}
          disabled={disabled}
          className="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer disabled:cursor-not-allowed"
        />
        <div className="flex justify-between text-xs text-gray-500 mt-1">
          <span>More Variable</span>
          <span>More Stable</span>
        </div>
      </div>

      {/* Similarity Boost Slider */}
      <div className="bg-gray-50 rounded-lg p-4">
        <label className="block text-sm font-medium text-gray-700 mb-2">
          Similarity Boost: {similarityBoost.toFixed(2)}
        </label>
        <p className="text-xs text-gray-500 mb-3">
          Controls how closely the voice tries to match the original. Higher values increase similarity.
        </p>
        {/* Same as above: give the slider an accessible name of its own. */}
        <input
          type="range"
          aria-label="Similarity Boost"
          min={0}
          max={1}
          step={0.05}
          value={similarityBoost}
          onChange={(e) => onChange({ ...preferences, similarity_boost: parseFloat(e.target.value) })}
          disabled={disabled}
          className="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer disabled:cursor-not-allowed"
        />
        <div className="flex justify-between text-xs text-gray-500 mt-1">
          <span>Low</span>
          <span>High</span>
        </div>
      </div>
    </div>
  );
}
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import { useState, useEffect } from 'react';
|
||||
import { api } from '../lib/api';
|
||||
import type { TTSPreferences, TTSOptionsResponse, TTSModel, TTSStylePreset } from '../types/api';
|
||||
import type { TTSPreferences, ProviderOptionsResponse, TTSModel, TTSStylePreset } from '../types/api';
|
||||
|
||||
interface TTSSettingsPanelProps {
|
||||
preferences: TTSPreferences;
|
||||
|
|
@ -9,7 +9,7 @@ interface TTSSettingsPanelProps {
|
|||
}
|
||||
|
||||
export function TTSSettingsPanel({ preferences, onChange, disabled }: TTSSettingsPanelProps) {
|
||||
const [options, setOptions] = useState<TTSOptionsResponse | null>(null);
|
||||
const [options, setOptions] = useState<ProviderOptionsResponse | null>(null);
|
||||
const [loading, setLoading] = useState(true);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
|
|
@ -18,7 +18,7 @@ export function TTSSettingsPanel({ preferences, onChange, disabled }: TTSSetting
|
|||
const fetchOptions = async () => {
|
||||
try {
|
||||
setLoading(true);
|
||||
const data = await api.getTTSOptions();
|
||||
const data = await api.getTTSOptions('gemini');
|
||||
setOptions(data);
|
||||
setError(null);
|
||||
} catch (err) {
|
||||
|
|
@ -50,7 +50,7 @@ export function TTSSettingsPanel({ preferences, onChange, disabled }: TTSSetting
|
|||
);
|
||||
}
|
||||
|
||||
if (!options) return null;
|
||||
if (!options || !options.models || !options.style_presets || !options.speed_range) return null;
|
||||
|
||||
return (
|
||||
<div className="space-y-4">
|
||||
|
|
|
|||
|
|
@ -1,25 +1,31 @@
|
|||
import { useState, useRef, useEffect } from 'react';
|
||||
import { api } from '../lib/api';
|
||||
import type { TTSStylePreset } from '../types/api';
|
||||
import type { TTSStylePreset, TTSProvider } from '../types/api';
|
||||
|
||||
interface VoicePreviewButtonProps {
|
||||
voiceName: string;
|
||||
language: string;
|
||||
disabled?: boolean;
|
||||
provider?: TTSProvider;
|
||||
model?: string;
|
||||
speed?: number;
|
||||
stylePreset?: TTSStylePreset;
|
||||
customStylePrompt?: string;
|
||||
stability?: number;
|
||||
similarityBoost?: number;
|
||||
}
|
||||
|
||||
export function VoicePreviewButton({
|
||||
voiceName,
|
||||
language,
|
||||
disabled,
|
||||
provider,
|
||||
model,
|
||||
speed,
|
||||
stylePreset,
|
||||
customStylePrompt
|
||||
customStylePrompt,
|
||||
stability,
|
||||
similarityBoost,
|
||||
}: VoicePreviewButtonProps) {
|
||||
const [isLoading, setIsLoading] = useState(false);
|
||||
const [isPlaying, setIsPlaying] = useState(false);
|
||||
|
|
@ -41,7 +47,7 @@ export function VoicePreviewButton({
|
|||
}
|
||||
setIsPlaying(false);
|
||||
setError(null);
|
||||
}, [voiceName, language, model, speed, stylePreset, customStylePrompt]);
|
||||
}, [voiceName, language, provider, model, speed, stylePreset, customStylePrompt, stability, similarityBoost]);
|
||||
|
||||
const handlePreview = async () => {
|
||||
setError(null);
|
||||
|
|
@ -70,7 +76,10 @@ export function VoicePreviewButton({
|
|||
model,
|
||||
speed,
|
||||
stylePreset,
|
||||
customStylePrompt
|
||||
customStylePrompt,
|
||||
provider,
|
||||
stability,
|
||||
similarityBoost,
|
||||
);
|
||||
const url = URL.createObjectURL(blob);
|
||||
|
||||
|
|
@ -104,9 +113,6 @@ export function VoicePreviewButton({
|
|||
}
|
||||
};
|
||||
|
||||
// Cleanup on unmount
|
||||
// Note: We don't add cleanup in useEffect to allow audio caching within component lifecycle
|
||||
|
||||
return (
|
||||
<div className="inline-flex items-center gap-2">
|
||||
<button
|
||||
|
|
|
|||
|
|
@ -2,7 +2,8 @@ import { useState, useEffect } from 'react';
|
|||
import { api } from '../lib/api';
|
||||
import { VoicePreviewButton } from './VoicePreviewButton';
|
||||
import { TTSSettingsPanel } from './TTSSettingsPanel';
|
||||
import type { TTSPreferences, VoicesResponse, LanguagesResponse } from '../types/api';
|
||||
import { ElevenLabsSettingsPanel } from './ElevenLabsSettingsPanel';
|
||||
import type { TTSPreferences, ProviderVoicesResponse, LanguagesResponse, TTSProvider } from '../types/api';
|
||||
|
||||
interface VoiceSelectorProps {
|
||||
selectedLanguages: string[];
|
||||
|
|
@ -17,23 +18,37 @@ export function VoiceSelector({
|
|||
onChange,
|
||||
disabled
|
||||
}: VoiceSelectorProps) {
|
||||
const [voices, setVoices] = useState<VoicesResponse | null>(null);
|
||||
const [voices, setVoices] = useState<ProviderVoicesResponse | null>(null);
|
||||
const [languages, setLanguages] = useState<LanguagesResponse | null>(null);
|
||||
const [activeLanguage, setActiveLanguage] = useState<string>(selectedLanguages[0] || 'en');
|
||||
const [isLoading, setIsLoading] = useState(true);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
// Fetch voices and languages on mount
|
||||
// Fetch voices and languages when provider changes
|
||||
useEffect(() => {
|
||||
const fetchData = async () => {
|
||||
try {
|
||||
setIsLoading(true);
|
||||
setError(null);
|
||||
const [voicesData, languagesData] = await Promise.all([
|
||||
api.getVoices(),
|
||||
api.getVoices(preferences.provider),
|
||||
api.getLanguages()
|
||||
]);
|
||||
setVoices(voicesData);
|
||||
setLanguages(languagesData);
|
||||
|
||||
// Set default voice from API response if switching providers
|
||||
if (voicesData.default && voicesData.voices.length > 0) {
|
||||
// Only reset default voice if the current one isn't in the new voice list
|
||||
const currentVoiceExists = voicesData.voices.some(v => v.id === preferences.default_voice);
|
||||
if (!currentVoiceExists) {
|
||||
onChange({
|
||||
...preferences,
|
||||
default_voice: voicesData.default,
|
||||
voices_per_language: {},
|
||||
});
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
setError('Failed to load voice options');
|
||||
console.error('Voice selector error:', err);
|
||||
|
|
@ -43,7 +58,8 @@ export function VoiceSelector({
|
|||
};
|
||||
|
||||
fetchData();
|
||||
}, []);
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [preferences.provider]);
|
||||
|
||||
// Update active language when selected languages change
|
||||
useEffect(() => {
|
||||
|
|
@ -52,21 +68,31 @@ export function VoiceSelector({
|
|||
}
|
||||
}, [selectedLanguages, activeLanguage]);
|
||||
|
||||
const handleDefaultVoiceChange = (voice: string) => {
|
||||
const handleProviderChange = (provider: TTSProvider) => {
|
||||
if (provider === preferences.provider) return;
|
||||
onChange({
|
||||
...preferences,
|
||||
default_voice: voice
|
||||
provider,
|
||||
default_voice: '', // Will be set after fetch
|
||||
voices_per_language: {},
|
||||
});
|
||||
};
|
||||
|
||||
const handleLanguageVoiceChange = (language: string, voice: string) => {
|
||||
const handleDefaultVoiceChange = (voiceId: string) => {
|
||||
onChange({
|
||||
...preferences,
|
||||
default_voice: voiceId
|
||||
});
|
||||
};
|
||||
|
||||
const handleLanguageVoiceChange = (language: string, voiceId: string) => {
|
||||
const newVoicesPerLanguage = {
|
||||
...preferences.voices_per_language,
|
||||
[language]: voice
|
||||
[language]: voiceId
|
||||
};
|
||||
|
||||
// If voice matches default, remove from per-language overrides
|
||||
if (voice === preferences.default_voice) {
|
||||
if (voiceId === preferences.default_voice) {
|
||||
delete newVoicesPerLanguage[language];
|
||||
}
|
||||
|
||||
|
|
@ -113,13 +139,45 @@ export function VoiceSelector({
|
|||
}
|
||||
|
||||
// Filter languages to only show selected ones
|
||||
// The first language in selectedLanguages is the primary/source language
|
||||
const displayLanguages = selectedLanguages.length > 0
|
||||
? selectedLanguages
|
||||
: ['en'];
|
||||
|
||||
return (
|
||||
<div className="space-y-4">
|
||||
{/* Provider Toggle */}
|
||||
<div className="bg-gray-50 rounded-lg p-4">
|
||||
<label className="block text-sm font-medium text-gray-700 mb-2">
|
||||
TTS Provider
|
||||
</label>
|
||||
<div className="flex rounded-lg border border-gray-300 overflow-hidden">
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => handleProviderChange('gemini')}
|
||||
disabled={disabled}
|
||||
className={`flex-1 px-4 py-2 text-sm font-medium transition-colors ${
|
||||
preferences.provider === 'gemini'
|
||||
? 'bg-blue-600 text-white'
|
||||
: 'bg-white text-gray-700 hover:bg-gray-50'
|
||||
} disabled:opacity-50`}
|
||||
>
|
||||
Gemini
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => handleProviderChange('elevenlabs')}
|
||||
disabled={disabled}
|
||||
className={`flex-1 px-4 py-2 text-sm font-medium border-l border-gray-300 transition-colors ${
|
||||
preferences.provider === 'elevenlabs'
|
||||
? 'bg-blue-600 text-white'
|
||||
: 'bg-white text-gray-700 hover:bg-gray-50'
|
||||
} disabled:opacity-50`}
|
||||
>
|
||||
ElevenLabs
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Default Voice Selection */}
|
||||
<div className="bg-gray-50 rounded-lg p-4">
|
||||
<label className="block text-sm font-medium text-gray-700 mb-2">
|
||||
|
|
@ -136,8 +194,9 @@ export function VoiceSelector({
|
|||
className="flex-1 rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 text-sm"
|
||||
>
|
||||
{voices.voices.map((voice) => (
|
||||
<option key={voice} value={voice}>
|
||||
{voice}
|
||||
<option key={voice.id} value={voice.id}>
|
||||
{voice.name}
|
||||
{voice.category ? ` (${voice.category})` : ''}
|
||||
</option>
|
||||
))}
|
||||
</select>
|
||||
|
|
@ -145,20 +204,31 @@ export function VoiceSelector({
|
|||
voiceName={preferences.default_voice}
|
||||
language={displayLanguages[0] || 'en'}
|
||||
disabled={disabled}
|
||||
provider={preferences.provider}
|
||||
model={preferences.model}
|
||||
speed={preferences.speed}
|
||||
stylePreset={preferences.style_preset}
|
||||
customStylePrompt={preferences.custom_style_prompt}
|
||||
stability={preferences.stability}
|
||||
similarityBoost={preferences.similarity_boost}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* TTS Settings (Model, Speed, Style) */}
|
||||
<TTSSettingsPanel
|
||||
preferences={preferences}
|
||||
onChange={onChange}
|
||||
disabled={disabled}
|
||||
/>
|
||||
{/* TTS Settings - Provider-specific */}
|
||||
{preferences.provider === 'elevenlabs' ? (
|
||||
<ElevenLabsSettingsPanel
|
||||
preferences={preferences}
|
||||
onChange={onChange}
|
||||
disabled={disabled}
|
||||
/>
|
||||
) : (
|
||||
<TTSSettingsPanel
|
||||
preferences={preferences}
|
||||
onChange={onChange}
|
||||
disabled={disabled}
|
||||
/>
|
||||
)}
|
||||
|
||||
{/* Per-Language Voice Overrides */}
|
||||
{displayLanguages.length > 1 && (
|
||||
|
|
@ -202,9 +272,10 @@ export function VoiceSelector({
|
|||
className="flex-1 rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 text-sm"
|
||||
>
|
||||
{voices.voices.map((voice) => (
|
||||
<option key={voice} value={voice}>
|
||||
{voice}
|
||||
{voice === preferences.default_voice ? ' (default)' : ''}
|
||||
<option key={voice.id} value={voice.id}>
|
||||
{voice.name}
|
||||
{voice.category ? ` (${voice.category})` : ''}
|
||||
{voice.id === preferences.default_voice ? ' (default)' : ''}
|
||||
</option>
|
||||
))}
|
||||
</select>
|
||||
|
|
@ -212,10 +283,13 @@ export function VoiceSelector({
|
|||
voiceName={getVoiceForLanguage(activeLanguage)}
|
||||
language={activeLanguage}
|
||||
disabled={disabled}
|
||||
provider={preferences.provider}
|
||||
model={preferences.model}
|
||||
speed={preferences.speed}
|
||||
stylePreset={preferences.style_preset}
|
||||
customStylePrompt={preferences.custom_style_prompt}
|
||||
stability={preferences.stability}
|
||||
similarityBoost={preferences.similarity_boost}
|
||||
/>
|
||||
</div>
|
||||
|
||||
|
|
|
|||
|
|
@ -25,11 +25,12 @@ import type {
|
|||
UpdateUserRequest,
|
||||
ResetPasswordResponse,
|
||||
AdminStatsResponse,
|
||||
VoicesResponse,
|
||||
ProviderVoicesResponse,
|
||||
LanguagesResponse,
|
||||
TTSPreferences,
|
||||
TTSOptionsResponse,
|
||||
ProviderOptionsResponse,
|
||||
TTSStylePreset,
|
||||
TTSProvider,
|
||||
AccessibleVideoMethod,
|
||||
ReviewNote,
|
||||
ReviewNoteCreateRequest,
|
||||
|
|
@ -349,8 +350,8 @@ class ApiClient {
|
|||
}
|
||||
|
||||
// TTS endpoints
|
||||
async getVoices(): Promise<VoicesResponse> {
|
||||
const response = await this.client.get('/tts/voices');
|
||||
async getVoices(provider: TTSProvider = 'gemini'): Promise<ProviderVoicesResponse> {
|
||||
const response = await this.client.get(`/tts/voices?provider=${provider}`);
|
||||
return response.data;
|
||||
}
|
||||
|
||||
|
|
@ -359,8 +360,8 @@ class ApiClient {
|
|||
return response.data;
|
||||
}
|
||||
|
||||
async getTTSOptions(): Promise<TTSOptionsResponse> {
|
||||
const response = await this.client.get('/tts/options');
|
||||
async getTTSOptions(provider: TTSProvider = 'gemini'): Promise<ProviderOptionsResponse> {
|
||||
const response = await this.client.get(`/tts/options?provider=${provider}`);
|
||||
return response.data;
|
||||
}
|
||||
|
||||
|
|
@ -370,17 +371,23 @@ class ApiClient {
|
|||
model?: string,
|
||||
speed?: number,
|
||||
stylePreset?: TTSStylePreset,
|
||||
customStylePrompt?: string
|
||||
customStylePrompt?: string,
|
||||
provider?: TTSProvider,
|
||||
stability?: number,
|
||||
similarityBoost?: number,
|
||||
): Promise<Blob> {
|
||||
const response = await this.client.post(
|
||||
'/tts/preview',
|
||||
{
|
||||
voice_name: voiceName,
|
||||
language,
|
||||
provider: provider || 'gemini',
|
||||
model: model || 'flash',
|
||||
speed: speed || 1.0,
|
||||
style_preset: stylePreset || 'neutral',
|
||||
custom_style_prompt: customStylePrompt
|
||||
custom_style_prompt: customStylePrompt,
|
||||
stability: stability,
|
||||
similarity_boost: similarityBoost,
|
||||
},
|
||||
{ responseType: 'blob' }
|
||||
);
|
||||
|
|
|
|||
|
|
@ -91,7 +91,9 @@ export function QCDetail() {
|
|||
model: 'flash',
|
||||
speed: 1.0,
|
||||
style_preset: 'neutral',
|
||||
custom_style_prompt: undefined
|
||||
custom_style_prompt: undefined,
|
||||
stability: undefined,
|
||||
similarity_boost: undefined,
|
||||
});
|
||||
const [originalTtsPreferences, setOriginalTtsPreferences] = useState<TTSPreferences | null>(null);
|
||||
|
||||
|
|
|
|||
|
|
@ -44,7 +44,9 @@ export function NewJob() {
|
|||
model: 'flash',
|
||||
speed: 1.0,
|
||||
style_preset: 'neutral',
|
||||
custom_style_prompt: undefined
|
||||
custom_style_prompt: undefined,
|
||||
stability: undefined,
|
||||
similarity_boost: undefined,
|
||||
});
|
||||
const [accessibleVideoMethod, setAccessibleVideoMethod] = useState<AccessibleVideoMethod>('pause_insert');
|
||||
|
||||
|
|
|
|||
|
|
@ -55,6 +55,9 @@ export interface TTSPreferences {
|
|||
speed: number;
|
||||
style_preset: TTSStylePreset;
|
||||
custom_style_prompt?: string;
|
||||
// ElevenLabs-specific settings
|
||||
stability?: number;
|
||||
similarity_boost?: number;
|
||||
}
|
||||
|
||||
export interface RequestedOutputs {
|
||||
|
|
@ -69,6 +72,22 @@ export interface RequestedOutputs {
|
|||
translation_mode?: TranslationMode; // "video_native" (default) or "traditional"
|
||||
}
|
||||
|
||||
/**
 * One selectable TTS voice as listed in ProviderVoicesResponse.voices.
 *
 * The voice selectors render each entry as an <option> keyed and valued by
 * `id`, labelled with `name`, and suffixed with ` (category)` when `category`
 * is set.
 */
export interface VoiceInfo {
|
||||
id: string; // Identifier used as the <option> value and compared against preferences.default_voice.
|
||||
name: string; // Display label shown in the voice dropdowns.
|
||||
description?: string; // Optional free-text description (not rendered by the selector code visible here).
|
||||
preview_url?: string; // Optional; presumably a provider-hosted sample clip URL — TODO confirm against the backend.
|
||||
labels?: Record<string, string>; // Optional string-to-string metadata; exact keys are provider-defined — TODO confirm.
|
||||
category?: string; // Optional; when present it is appended to the dropdown label in parentheses.
|
||||
}
|
||||
|
||||
/**
 * Response shape of ApiClient.getVoices(provider), i.e. GET /tts/voices?provider=…
 */
export interface ProviderVoicesResponse {
|
||||
provider: string; // Provider the list belongs to; presumably echoes the requested provider — TODO confirm.
|
||||
voices: VoiceInfo[]; // Voices offered to the user in the selector dropdowns.
|
||||
default: string; // Presumably the id of the provider's default voice — TODO confirm against the backend.
|
||||
}
|
||||
|
||||
/** @deprecated Use ProviderVoicesResponse instead */
|
||||
export interface VoicesResponse {
|
||||
voices: string[];
|
||||
default: string;
|
||||
|
|
@ -91,6 +110,25 @@ export interface SpeedRange {
|
|||
step: number;
|
||||
}
|
||||
|
||||
/**
 * Describes a numeric setting by its bounds, default value and step.
 * Used by ProviderOptionsResponse for stability_range and similarity_boost_range.
 */
export interface FloatRange {
|
||||
min: number; // Lower bound of the setting.
|
||||
max: number; // Upper bound of the setting.
|
||||
default: number; // Default value for the setting.
|
||||
step: number; // Step size; presumably the slider increment — TODO confirm in ElevenLabsSettingsPanel.
|
||||
}
|
||||
|
||||
/**
 * Response shape of ApiClient.getTTSOptions(provider), i.e. GET /tts/options?provider=…
 *
 * Every option group is optional, so callers must check for presence before
 * use; the groups are split by provider as marked below.
 */
export interface ProviderOptionsResponse {
|
||||
provider: string; // Provider these options describe.
|
||||
// Gemini-specific
|
||||
models?: TTSOptionItem[]; // Selectable TTS models.
|
||||
style_presets?: TTSOptionItem[]; // Selectable style presets.
|
||||
speed_range?: SpeedRange; // Range description for the speed setting.
|
||||
// ElevenLabs-specific
|
||||
stability_range?: FloatRange; // Range description for the `stability` preference.
|
||||
similarity_boost_range?: FloatRange; // Range description for the `similarity_boost` preference.
|
||||
}
|
||||
|
||||
/** @deprecated Use ProviderOptionsResponse instead */
|
||||
export interface TTSOptionsResponse {
|
||||
models: TTSOptionItem[];
|
||||
style_presets: TTSOptionItem[];
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue