video-accessibility/backend/app/core/config.py
Vadym Samoilenko 1e177a6d5c feat: add ElevenLabs voice selection to frontend and backend
Add dynamic ElevenLabs voice catalog with provider toggle in the UI,
allowing users to browse ElevenLabs voices, configure stability and
similarity boost settings, and preview/synthesize with ElevenLabs TTS.

Backend:
- New elevenlabs_voices.py service with 1-hour cached API fetching
- TTS routes support ?provider= query param for voices and options
- Preview endpoint routes to ElevenLabs or Gemini based on provider
- stability/similarity_boost params flow through TTS synthesis pipeline
- TTSPreferences model extended with ElevenLabs-specific fields
- Deprecated hardcoded elevenlabs_voices config (now fetched dynamically)

Frontend:
- Provider toggle (Gemini/ElevenLabs) in VoiceSelector
- ElevenLabsSettingsPanel with stability and similarity boost sliders
- VoicePreviewButton supports provider-specific preview parameters
- API client passes provider param to voices, options, and preview endpoints
- New VoiceInfo, ProviderVoicesResponse, ProviderOptionsResponse types

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-03 13:58:56 +00:00

218 lines
8.5 KiB
Python

from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Central application configuration.

    Values are populated by pydantic-settings from the process environment
    and from the ``.env`` file named in the inner ``Config`` class. Fields
    declared without a default (for example ``jwt_secret`` or
    ``mongodb_uri``) have no fallback value here and must be supplied
    externally.
    """

    # App
    app_env: str = "dev"
    api_base_url: str = "http://localhost:8000"

    # Auth
    jwt_secret: str
    jwt_alg: str = "HS256"
    jwt_access_ttl_min: int = 15
    jwt_refresh_ttl_days: int = 7
    cookie_domain: str = "localhost"
    cookie_secure: bool = False
    cookie_samesite: str = "Lax"

    # Database
    mongodb_uri: str
    mongodb_db: str = "accessible_video"

    # Redis
    redis_url: str

    # Celery
    celery_broker_url: str = ""
    celery_result_backend: str = ""

    # GCP
    gcp_project_id: str
    gcs_bucket: str = "accessible-video"
    google_application_credentials: str = ""

    # AI Services
    gemini_api_key: str
    elevenlabs_api_key: str = ""
    google_tts_credentials: str = ""

    # TTS Voice Configuration
    tts_provider: str = "gemini"  # "gemini", "google", or "elevenlabs"
    google_tts_voices: dict[str, str] = {
        "en-US": "en-US-Neural2-D",
        "es-ES": "es-ES-Neural2-A",
        "fr-FR": "fr-FR-Neural2-A",
        "de-DE": "de-DE-Neural2-B",
    }
    # Deprecated: ElevenLabs voices are now fetched dynamically via the API.
    # This fallback map is only used by _get_elevenlabs_voice() when no voice_name is provided.
    elevenlabs_voices: dict[str, str] = {}

    # Gemini TTS Configuration
    gemini_tts_model: str = "gemini-2.5-flash-preview-tts"
    gemini_tts_default_voice: str = "Kore"
    gemini_tts_voices: list[str] = [
        "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
        "Callirrhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
        "Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
        "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
        "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
    ]
    # Short language code -> locale code.
    gemini_tts_languages: dict[str, str] = {
        "en": "en-US",
        "es": "es-US",
        "fr": "fr-FR",
        "de": "de-DE",
        "it": "it-IT",
        "pt": "pt-BR",
        "ja": "ja-JP",
        "ko": "ko-KR",
        "ar": "ar-EG",
        "hi": "hi-IN",
        "id": "id-ID",
        "nl": "nl-NL",
        "pl": "pl-PL",
        "ru": "ru-RU",
        "th": "th-TH",
        "tr": "tr-TR",
        "vi": "vi-VN",
        "ro": "ro-RO",
        "uk": "uk-UA",
        "bn": "bn-BD",
        "mr": "mr-IN",
        "ta": "ta-IN",
        "te": "te-IN",
        "zh": "zh-CN",
    }
    # Short language code -> English display name (same keys as gemini_tts_languages).
    gemini_tts_language_names: dict[str, str] = {
        "en": "English",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "it": "Italian",
        "pt": "Portuguese",
        "ja": "Japanese",
        "ko": "Korean",
        "ar": "Arabic",
        "hi": "Hindi",
        "id": "Indonesian",
        "nl": "Dutch",
        "pl": "Polish",
        "ru": "Russian",
        "th": "Thai",
        "tr": "Turkish",
        "vi": "Vietnamese",
        "ro": "Romanian",
        "uk": "Ukrainian",
        "bn": "Bengali",
        "mr": "Marathi",
        "ta": "Tamil",
        "te": "Telugu",
        "zh": "Chinese",
    }
    # Short language code -> sample sentence spoken by the voice preview.
    # Samples are written in full native orthography (diacritics included):
    # the text is fed to a speech engine, and stripped accents change the
    # words and therefore how they are pronounced.
    gemini_tts_preview_samples: dict[str, str] = {
        "en": "This is a preview of the audio description voice.",
        "es": "Esta es una vista previa de la voz de audiodescripción.",
        "fr": "Ceci est un aperçu de la voix de l'audiodescription.",
        "de": "Dies ist eine Vorschau der Audiodeskriptionsstimme.",
        "it": "Questa è un'anteprima della voce dell'audiodescrizione.",
        "pt": "Esta é uma prévia da voz da audiodescrição.",
        "ja": "これは音声解説の声のプレビューです。",
        "ko": "이것은 오디오 설명 음성의 미리보기입니다.",
        "ar": "هذه معاينة لصوت الوصف الصوتي.",
        "hi": "यह ऑडियो विवरण आवाज का पूर्वावलोकन है।",
        "id": "Ini adalah pratinjau suara deskripsi audio.",
        "nl": "Dit is een voorbeeld van de audiodescriptiestem.",
        "pl": "To jest podgląd głosu audiodeskrypcji.",
        "ru": "Это предварительный просмотр голоса аудиоописания.",
        "th": "นี่คือตัวอย่างเสียงบรรยายภาพ",
        "tr": "Bu, sesli betimleme sesinin bir önizlemesidir.",
        "vi": "Đây là bản xem trước giọng mô tả âm thanh.",
        "ro": "Aceasta este o previzualizare a vocii descrierii audio.",
        "uk": "Це попередній перегляд голосу аудіоопису.",
        "bn": "এটি অডিও বর্ণনা ভয়েসের একটি প্রিভিউ।",
        "mr": "हे ऑडिओ वर्णन आवाजाचे पूर्वावलोकन आहे.",
        "ta": "இது ஆடியோ விளக்க குரலின் முன்னோட்டம்.",
        "te": "ఇది ఆడియో వివరణ స్వరం యొక్క ప్రివ్యూ.",
        "zh": "这是音频描述语音的预览。",
    }

    # Gemini TTS Model Options
    gemini_tts_models: dict[str, str] = {
        "flash": "gemini-2.5-flash-preview-tts",  # Fast, cost-efficient
        "pro": "gemini-2.5-pro-preview-tts",  # Higher quality
    }

    # Gemini TTS Style Presets - prompts prepended to text for style control.
    # Each non-empty prompt ends with a space so it can be joined directly
    # to the text that follows it.
    gemini_tts_style_prompts: dict[str, str] = {
        "neutral": "",  # No modification
        "calm": "Speak in a calm, gentle, and soothing manner with a relaxed pace. ",
        "energetic": "Speak with energy and enthusiasm, maintaining an upbeat and dynamic tone. ",
        "professional": "Speak in a clear, professional, and authoritative manner suitable for corporate content. ",
        "warm": "Speak in a warm, friendly, and approachable manner as if speaking to a friend. ",
        "documentary": "Speak in a measured, informative tone similar to a documentary narrator, with clear enunciation and appropriate pauses. ",
    }

    # TTS Speed range configuration
    gemini_tts_speed_min: float = 0.5
    gemini_tts_speed_max: float = 2.0
    gemini_tts_speed_default: float = 1.0
    gemini_tts_speed_step: float = 0.1

    # Whisper Configuration (for pause point refinement)
    whisper_model: str = "medium"  # Options: tiny, base, small, medium, large-v3
    whisper_max_search_window: float = 30.0  # Max seconds to search for speech gap after Gemini point
    whisper_sentence_gap_threshold: float = 0.5  # Gap duration to classify as sentence boundary
    whisper_phrase_gap_threshold: float = 0.3  # Gap duration to classify as phrase boundary
    whisper_min_gap_threshold: float = 0.15  # Minimum gap duration to consider

    # Cloud Run Service URLs (empty = use local processing)
    # When set, CPU-intensive work is offloaded to Cloud Run with autoscaling
    whisper_service_url: str = ""  # e.g., "https://whisper-service-xxx.run.app"
    ffmpeg_service_url: str = ""  # e.g., "https://ffmpeg-service-xxx.run.app"

    # Celery Worker Concurrency Settings
    # When using Cloud Run, workers just make HTTP calls so can handle more concurrent tasks
    # When running locally, concurrency is limited by CPU/RAM constraints
    #
    # Recommended settings:
    # Cloud Run mode: WHISPER_WORKER_CONCURRENCY=10, FFMPEG_WORKER_CONCURRENCY=20
    # Local mode: WHISPER_WORKER_CONCURRENCY=1, FFMPEG_WORKER_CONCURRENCY=2
    worker_concurrency: int = 8  # Main worker (default,ingest,notify,render)
    whisper_worker_concurrency: int = 1  # Whisper worker (default: 1 for local RAM constraints)
    ffmpeg_worker_concurrency: int = 4  # FFmpeg tasks on main worker
    tts_worker_concurrency: int = 8  # TTS worker

    # Email
    sendgrid_api_key: str
    email_from: str
    client_base_url: str

    # Microsoft Authentication (Azure AD)
    azure_client_id: str = ""
    azure_authority: str = ""
    azure_redirect_uri: str = ""

    # Observability
    sentry_dsn: str = ""
    otel_exporter_otlp_endpoint: str = ""

    # CORS - comma-separated list of allowed origins
    cors_origins: str = "http://localhost:5173,http://localhost:5174,http://localhost:3000,http://localhost:6001"

    @property
    def cors_origins_list(self) -> list[str]:
        """Return ``cors_origins`` split on commas into a list.

        Whitespace around each entry is stripped and empty entries
        (for example from a trailing comma) are dropped.
        """
        return [origin.strip() for origin in self.cors_origins.split(",") if origin.strip()]

    class Config:
        # Dotenv file consulted in addition to the process environment.
        env_file = ".env"
# Shared module-level instance, built once when this module is imported;
# get_settings() returns this same object.
settings = Settings()
def get_settings() -> "Settings":
    """Return the shared module-level ``settings`` object.

    Intended as a dependency-injection hook: every call hands back the
    same instance rather than constructing a new one.
    """
    return settings