"""Text to Speech Service - ElevenLabs Supported Models (December 2025): - eleven_multilingual_v2: Highest quality, 32 languages (default) - eleven_flash_v2_5: Ultra-low 75ms latency for real-time/chatbots - eleven_turbo_v2_5: Emotion & drama - great for dialogue, characters, storytelling - eleven_monolingual_v1: English only (legacy) - eleven_v3: Latest model with high emotional range (alpha, multilingual only) Model Selection Guide: - Quality & Languages → eleven_multilingual_v2 - Speed/Real-time (chatbots, live agents) → eleven_flash_v2_5 - Emotion & Drama (dialogue, characters) → eleven_turbo_v2_5 Voice Settings: - stability: 0.0-1.0 (higher = more consistent, lower = more expressive) - similarity_boost: 0.0-1.0 (higher = closer to original voice) - style: 0.0-1.0 (style exaggeration, v2+ models only) - use_speaker_boost: boolean (enhance voice clarity) - speed: 0.7-1.2 (speech speed, default 1.0) Advanced Features: - seed: Integer for reproducible output (same seed + params = same result) - previous_text: Context for better prosody continuation - next_text: Lookahead context for natural flow - apply_text_normalization: 'auto', 'on', 'off' (number/date spelling) - language_code: Override auto-detection (e.g., 'en', 'es', 'fr') Output Formats: - MP3: mp3_44100_128, mp3_44100_192, mp3_22050_32 - PCM: pcm_16000, pcm_22050, pcm_24000, pcm_44100, pcm_48000 - Opus: opus_48000, opus_64000 - Other: ulaw_8000, alaw_8000 Voice Cloning: - Instant Voice Cloning (IVC): Quick replication from short samples - Professional Voice Cloning (PVC): 30+ min audio for highest fidelity """ import httpx import os from uuid import uuid4 from datetime import datetime from typing import Optional, Dict, Any from app.database import SessionLocal from app.models.job import Job from app.models.asset import Asset from app.config import settings # Available models with their descriptions ELEVENLABS_MODELS = { "eleven_multilingual_v2": { "name": "Multilingual v2", "description": "Highest quality, supports 32 languages", "latency": "medium", "use_case": "quality", "supports_style": True, "languages": 32 }, "eleven_flash_v2_5": { "name": "Flash v2.5", "description": "Ultra-low 75ms latency for real-time apps", "latency": "ultra-low", "use_case": "realtime", "supports_style": True, "languages": 32 }, "eleven_turbo_v2_5": { "name": "Turbo v2.5", "description": "Emotion & drama - dialogue, characters, storytelling", "latency": "low", "use_case": "emotion", "supports_style": True, "languages": 32 }, "eleven_v3": { "name": "Eleven v3 (Alpha)", "description": "Latest model with high emotional range", "latency": "medium", "use_case": "emotion", "supports_style": True, "languages": 32 }, "eleven_monolingual_v1": { "name": "English v1", "description": "English only, legacy model", "latency": "medium", "use_case": "legacy", "supports_style": False, "languages": 1 } } OUTPUT_FORMATS = { # MP3 formats "mp3_44100_128": {"ext": "mp3", "mime": "audio/mpeg"}, "mp3_44100_192": {"ext": "mp3", "mime": "audio/mpeg"}, "mp3_22050_32": {"ext": "mp3", "mime": "audio/mpeg"}, # PCM formats (raw audio) "pcm_16000": {"ext": "wav", "mime": "audio/wav"}, "pcm_22050": {"ext": "wav", "mime": "audio/wav"}, "pcm_24000": {"ext": "wav", "mime": "audio/wav"}, "pcm_44100": {"ext": "wav", "mime": "audio/wav"}, "pcm_48000": {"ext": "wav", "mime": "audio/wav"}, # Opus formats "opus_48000": {"ext": "opus", "mime": "audio/opus"}, "opus_64000": {"ext": "opus", "mime": "audio/opus"}, # Telephony formats "ulaw_8000": {"ext": "wav", "mime": "audio/wav"}, "alaw_8000": {"ext": "wav", "mime": "audio/wav"} } async def synthesize(job_id: str): """Synthesize speech from text using ElevenLabs Input parameters: - text: The text to convert to speech - voice_id: ElevenLabs voice ID - model_id: Model to use (see ELEVENLABS_MODELS) - stability: Voice stability 0.0-1.0 (default 0.5) - similarity_boost: Voice similarity 0.0-1.0 (default 0.75) - style: Style exaggeration 0.0-1.0 (v2+ models, default 0.0) - use_speaker_boost: Enhance voice clarity (default true) - speed: Speech speed 0.7-1.2 (default 1.0) - output_format: Audio format (default mp3_44100_128) - seed: Optional seed for reproducible output - language_code: Override auto-detection (e.g., 'en', 'es', 'fr', 'de') - previous_text: Context from before for better prosody - next_text: Lookahead context for natural flow - apply_text_normalization: 'auto', 'on', 'off' (how to spell numbers/dates) """ db = SessionLocal() try: job = db.query(Job).filter(Job.id == job_id).first() if not job: return input_data = job.input_data # Extract all parameters with defaults text = input_data.get("text", "") voice_id = input_data.get("voice_id", "21m00Tcm4TlvDq8ikWAM") model_id = input_data.get("model_id", "eleven_multilingual_v2") stability = float(input_data.get("stability", 0.5)) similarity_boost = float(input_data.get("similarity_boost", 0.75)) style = float(input_data.get("style", 0.0)) use_speaker_boost = input_data.get("use_speaker_boost", True) speed = float(input_data.get("speed", 1.0)) output_format = input_data.get("output_format", "mp3_44100_128") seed = input_data.get("seed") # New advanced parameters language_code = input_data.get("language_code") previous_text = input_data.get("previous_text") next_text = input_data.get("next_text") apply_text_normalization = input_data.get("apply_text_normalization", "auto") # Validate speed range speed = max(0.7, min(1.2, speed)) job.progress = 10 job.api_provider = "elevenlabs" job.api_model = model_id db.commit() # Get model config to check supported features model_config = ELEVENLABS_MODELS.get(model_id, ELEVENLABS_MODELS["eleven_multilingual_v2"]) # Build voice settings voice_settings: Dict[str, Any] = { "stability": stability, "similarity_boost": similarity_boost, "use_speaker_boost": use_speaker_boost } # Style only supported in v2+ models if model_config.get("supports_style", False): voice_settings["style"] = style # Build request payload payload: Dict[str, Any] = { "text": text, "model_id": model_id, "voice_settings": voice_settings } # Add optional parameters if speed != 1.0: payload["speed"] = speed if seed is not None: payload["seed"] = seed if language_code: payload["language_code"] = language_code if previous_text: payload["previous_text"] = previous_text if next_text: payload["next_text"] = next_text if apply_text_normalization and apply_text_normalization != "auto": payload["apply_text_normalization"] = apply_text_normalization # Determine accept header based on format format_info = OUTPUT_FORMATS.get(output_format, OUTPUT_FORMATS["mp3_44100_128"]) async with httpx.AsyncClient(timeout=120) as client: response = await client.post( f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}", headers={ "xi-api-key": settings.elevenlabs_api_key, "Content-Type": "application/json", "Accept": f"audio/mpeg" # ElevenLabs returns mp3 by default }, params={"output_format": output_format}, json=payload ) response.raise_for_status() audio_data = response.content job.progress = 80 db.commit() # Save audio file filename = f"tts_{uuid4()}.mp3" storage_path = os.path.join(settings.storage_path, "audio") os.makedirs(storage_path, exist_ok=True) file_path = os.path.join(storage_path, filename) with open(file_path, "wb") as f: f.write(audio_data) # Create asset asset = Asset( user_id=job.user_id, project_id=job.project_id, original_filename=filename, stored_filename=filename, file_path=file_path, file_type="audio", mime_type="audio/mpeg", file_size_bytes=len(audio_data), source_module="text_to_speech", source_job_id=job.id, asset_metadata={ "text_length": len(text), "voice_id": voice_id, "model_id": model_id } ) db.add(asset) db.commit() db.refresh(asset) job.output_asset_ids = [asset.id] job.output_data = {"asset_id": str(asset.id), "file_path": file_path} job.progress = 100 job.status = "completed" job.completed_at = datetime.utcnow() db.commit() except Exception as e: job.status = "failed" job.error_message = str(e) db.commit() finally: db.close() async def speech_to_speech(job_id: str): """Convert voice to another voice using ElevenLabs""" db = SessionLocal() try: job = db.query(Job).filter(Job.id == job_id).first() if not job: return input_data = job.input_data input_asset_ids = job.input_asset_ids if not input_asset_ids: raise ValueError("No input asset provided") input_asset = db.query(Asset).filter(Asset.id == input_asset_ids[0]).first() if not input_asset: raise ValueError("Input asset not found") job.progress = 10 job.api_provider = "elevenlabs" job.api_model = "eleven_english_sts_v2" db.commit() voice_id = input_data.get("voice_id") if not voice_id: raise ValueError("No voice_id provided") # Read input audio with open(input_asset.file_path, "rb") as f: audio_data = f.read() job.progress = 20 db.commit() async with httpx.AsyncClient(timeout=120) as client: response = await client.post( f"https://api.elevenlabs.io/v1/speech-to-speech/{voice_id}", headers={ "xi-api-key": settings.elevenlabs_api_key, "Accept": "audio/mpeg" }, files={"audio": (input_asset.original_filename, audio_data, input_asset.mime_type)}, data={ "model_id": "eleven_english_sts_v2", "voice_settings": '{"stability": 0.5, "similarity_boost": 0.5}' } ) response.raise_for_status() converted_audio = response.content job.progress = 80 db.commit() # Save converted audio filename = f"sts_{uuid4()}.mp3" storage_path = os.path.join(settings.storage_path, "audio") os.makedirs(storage_path, exist_ok=True) file_path = os.path.join(storage_path, filename) with open(file_path, "wb") as f: f.write(converted_audio) # Create asset asset = Asset( user_id=job.user_id, project_id=job.project_id, original_filename=filename, stored_filename=filename, file_path=file_path, file_type="audio", mime_type="audio/mpeg", file_size_bytes=len(converted_audio), source_module="speech_to_speech", source_job_id=job.id, parent_asset_id=input_asset.id, asset_metadata={"voice_id": voice_id} ) db.add(asset) db.commit() db.refresh(asset) job.output_asset_ids = [asset.id] job.output_data = {"asset_id": str(asset.id), "file_path": file_path} job.progress = 100 job.status = "completed" job.completed_at = datetime.utcnow() db.commit() except Exception as e: job.status = "failed" job.error_message = str(e) db.commit() finally: db.close() async def get_voices() -> list: """Get available ElevenLabs voices""" if not settings.elevenlabs_api_key: # Return default voices when API key is not configured return [ {"voice_id": "21m00Tcm4TlvDq8ikWAM", "name": "Rachel (Default)", "category": "premade", "labels": {"accent": "american", "gender": "female"}}, {"voice_id": "AZnzlk1XvdvUeBnXmlld", "name": "Domi", "category": "premade", "labels": {"accent": "american", "gender": "female"}}, {"voice_id": "EXAVITQu4vr4xnSDxMaL", "name": "Bella", "category": "premade", "labels": {"accent": "american", "gender": "female"}}, {"voice_id": "ErXwobaYiN019PkySvjV", "name": "Antoni", "category": "premade", "labels": {"accent": "american", "gender": "male"}}, {"voice_id": "MF3mGyEYCl7XYWbV9V6O", "name": "Elli", "category": "premade", "labels": {"accent": "american", "gender": "female"}}, {"voice_id": "TxGEqnHWrfWFTfGW9XjX", "name": "Josh", "category": "premade", "labels": {"accent": "american", "gender": "male"}}, {"voice_id": "VR6AewLTigWG4xSOukaG", "name": "Arnold", "category": "premade", "labels": {"accent": "american", "gender": "male"}}, {"voice_id": "pNInz6obpgDQGcFmaJgB", "name": "Adam", "category": "premade", "labels": {"accent": "american", "gender": "male"}}, {"voice_id": "yoZ06aMxZJJ28mfd3POQ", "name": "Sam", "category": "premade", "labels": {"accent": "american", "gender": "male"}}, ] try: async with httpx.AsyncClient(timeout=30) as client: response = await client.get( "https://api.elevenlabs.io/v1/voices", headers={"xi-api-key": settings.elevenlabs_api_key} ) response.raise_for_status() data = response.json() voices = [] for voice in data.get("voices", []): voices.append({ "voice_id": voice.get("voice_id"), "name": voice.get("name"), "preview_url": voice.get("preview_url"), "category": voice.get("category"), "labels": voice.get("labels", {}) }) return voices except Exception: # Return default voices on error return [ {"voice_id": "21m00Tcm4TlvDq8ikWAM", "name": "Rachel (Default)", "category": "premade"}, {"voice_id": "ErXwobaYiN019PkySvjV", "name": "Antoni", "category": "premade"}, {"voice_id": "TxGEqnHWrfWFTfGW9XjX", "name": "Josh", "category": "premade"}, ]