forge/backend/app/services/text_to_speech.py
DJP 7a804e896d Initial commit - FORGE AI unified platform
Features:
- Image generation (OpenAI, Gemini, Leonardo, Bria, Stability, Flux)
- Nano Banana iterative editing
- Video generation and upscaling
- Audio TTS, STT, sound effects (ElevenLabs)
- Text prompt studio and alt text
- User authentication with JWT/cookies
- Admin panel with voice management
- Job queue with Celery
- PostgreSQL + Redis backend
- Next.js 15 + FastAPI architecture

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
2025-12-09 20:39:00 -05:00

406 lines
15 KiB
Python

"""Text to Speech Service - ElevenLabs
Supported Models (December 2025):
- eleven_multilingual_v2: Highest quality, 32 languages (default)
- eleven_flash_v2_5: Ultra-low 75ms latency for real-time/chatbots
- eleven_turbo_v2_5: Emotion & drama - great for dialogue, characters, storytelling
- eleven_monolingual_v1: English only (legacy)
- eleven_v3: Latest model with high emotional range (alpha, multilingual only)
Model Selection Guide:
- Quality & Languages → eleven_multilingual_v2
- Speed/Real-time (chatbots, live agents) → eleven_flash_v2_5
- Emotion & Drama (dialogue, characters) → eleven_turbo_v2_5
Voice Settings:
- stability: 0.0-1.0 (higher = more consistent, lower = more expressive)
- similarity_boost: 0.0-1.0 (higher = closer to original voice)
- style: 0.0-1.0 (style exaggeration, v2+ models only)
- use_speaker_boost: boolean (enhance voice clarity)
- speed: 0.7-1.2 (speech speed, default 1.0)
Advanced Features:
- seed: Integer for reproducible output (same seed + params = same result)
- previous_text: Context for better prosody continuation
- next_text: Lookahead context for natural flow
- apply_text_normalization: 'auto', 'on', 'off' (number/date spelling)
- language_code: Override auto-detection (e.g., 'en', 'es', 'fr')
Output Formats:
- MP3: mp3_44100_128, mp3_44100_192, mp3_22050_32
- PCM: pcm_16000, pcm_22050, pcm_24000, pcm_44100, pcm_48000
- Opus: opus_48000, opus_64000
- Other: ulaw_8000, alaw_8000
Voice Cloning:
- Instant Voice Cloning (IVC): Quick replication from short samples
- Professional Voice Cloning (PVC): 30+ min audio for highest fidelity
"""
import httpx
import os
from uuid import uuid4
from datetime import datetime
from typing import Optional, Dict, Any
from app.database import SessionLocal
from app.models.job import Job
from app.models.asset import Asset
from app.config import settings
# Catalogue of the ElevenLabs models this service knows about, keyed by the
# model_id that is sent to the API. Each entry carries a display name, a short
# description, a latency class, a use-case tag, whether the "style" voice
# setting is sent for the model, and the number of languages it covers.
ELEVENLABS_MODELS = {
    # Default model: best quality
    "eleven_multilingual_v2": {
        "name": "Multilingual v2",
        "description": "Highest quality, supports 32 languages",
        "latency": "medium",
        "use_case": "quality",
        "supports_style": True,
        "languages": 32,
    },
    # Real-time oriented model
    "eleven_flash_v2_5": {
        "name": "Flash v2.5",
        "description": "Ultra-low 75ms latency for real-time apps",
        "latency": "ultra-low",
        "use_case": "realtime",
        "supports_style": True,
        "languages": 32,
    },
    # Expressive model for dialogue and storytelling
    "eleven_turbo_v2_5": {
        "name": "Turbo v2.5",
        "description": "Emotion & drama - dialogue, characters, storytelling",
        "latency": "low",
        "use_case": "emotion",
        "supports_style": True,
        "languages": 32,
    },
    # Alpha-stage model
    "eleven_v3": {
        "name": "Eleven v3 (Alpha)",
        "description": "Latest model with high emotional range",
        "latency": "medium",
        "use_case": "emotion",
        "supports_style": True,
        "languages": 32,
    },
    # Legacy English-only model; the only entry without style support
    "eleven_monolingual_v1": {
        "name": "English v1",
        "description": "English only, legacy model",
        "latency": "medium",
        "use_case": "legacy",
        "supports_style": False,
        "languages": 1,
    },
}
# Maps each ElevenLabs output_format identifier to a file extension ("ext")
# and a MIME type ("mime"). Built from a flat row table so every format is a
# single line; each entry still gets its own independent dict.
OUTPUT_FORMATS = {
    format_id: {"ext": extension, "mime": mime_type}
    for format_id, extension, mime_type in (
        # MP3 formats
        ("mp3_44100_128", "mp3", "audio/mpeg"),
        ("mp3_44100_192", "mp3", "audio/mpeg"),
        ("mp3_22050_32", "mp3", "audio/mpeg"),
        # PCM formats (raw audio)
        ("pcm_16000", "wav", "audio/wav"),
        ("pcm_22050", "wav", "audio/wav"),
        ("pcm_24000", "wav", "audio/wav"),
        ("pcm_44100", "wav", "audio/wav"),
        ("pcm_48000", "wav", "audio/wav"),
        # Opus formats
        ("opus_48000", "opus", "audio/opus"),
        ("opus_64000", "opus", "audio/opus"),
        # Telephony formats
        ("ulaw_8000", "wav", "audio/wav"),
        ("alaw_8000", "wav", "audio/wav"),
    )
}
async def synthesize(job_id: str):
    """Synthesize speech from text using ElevenLabs

    Loads the job with the given id, posts its text to the ElevenLabs
    text-to-speech endpoint, writes the returned audio under
    ``<settings.storage_path>/audio`` and records it as an Asset linked to
    the job. Progress and the final status ("completed" / "failed") are
    written back to the job row. Returns None; if the job does not exist the
    function returns without doing anything. Errors are not re-raised: they
    are stored in ``job.error_message``.

    Input parameters (read from ``job.input_data``):
    - text: The text to convert to speech (required, must be non-empty)
    - voice_id: ElevenLabs voice ID
    - model_id: Model to use (see ELEVENLABS_MODELS)
    - stability: Voice stability 0.0-1.0 (default 0.5)
    - similarity_boost: Voice similarity 0.0-1.0 (default 0.75)
    - style: Style exaggeration 0.0-1.0 (v2+ models, default 0.0)
    - use_speaker_boost: Enhance voice clarity (default true)
    - speed: Speech speed, clamped to 0.7-1.2 (default 1.0)
    - output_format: Audio format (default mp3_44100_128)
    - seed: Optional seed for reproducible output
    - language_code: Override auto-detection (e.g., 'en', 'es', 'fr', 'de')
    - previous_text: Context from before for better prosody
    - next_text: Lookahead context for natural flow
    - apply_text_normalization: 'auto', 'on', 'off' (how to spell numbers/dates)
    """
    db = SessionLocal()
    # Bound before the try so the error handler can tell "job never loaded"
    # apart from "job loaded, processing failed".
    job = None
    try:
        job = db.query(Job).filter(Job.id == job_id).first()
        if not job:
            return
        input_data = job.input_data

        # Extract all parameters with defaults
        text = input_data.get("text", "")
        voice_id = input_data.get("voice_id", "21m00Tcm4TlvDq8ikWAM")
        model_id = input_data.get("model_id", "eleven_multilingual_v2")
        stability = float(input_data.get("stability", 0.5))
        similarity_boost = float(input_data.get("similarity_boost", 0.75))
        style = float(input_data.get("style", 0.0))
        use_speaker_boost = input_data.get("use_speaker_boost", True)
        speed = float(input_data.get("speed", 1.0))
        output_format = input_data.get("output_format", "mp3_44100_128")
        seed = input_data.get("seed")
        # Advanced parameters
        language_code = input_data.get("language_code")
        previous_text = input_data.get("previous_text")
        next_text = input_data.get("next_text")
        apply_text_normalization = input_data.get("apply_text_normalization", "auto")

        # Clamp speed into the supported range
        speed = max(0.7, min(1.2, speed))

        job.progress = 10
        job.api_provider = "elevenlabs"
        job.api_model = model_id
        db.commit()

        # Fail fast instead of spending an API round trip on an empty request.
        if not text:
            raise ValueError("No text provided")

        # Get model config to check supported features; unknown model ids are
        # treated like the default model for feature detection only.
        model_config = ELEVENLABS_MODELS.get(model_id, ELEVENLABS_MODELS["eleven_multilingual_v2"])

        # Build voice settings
        voice_settings: Dict[str, Any] = {
            "stability": stability,
            "similarity_boost": similarity_boost,
            "use_speaker_boost": use_speaker_boost,
        }
        # Style only supported in v2+ models
        if model_config.get("supports_style", False):
            voice_settings["style"] = style
        # Speed is a voice setting in the ElevenLabs API, not a top-level body
        # field; only send it when it differs from the default.
        if speed != 1.0:
            voice_settings["speed"] = speed

        # Build request payload
        payload: Dict[str, Any] = {
            "text": text,
            "model_id": model_id,
            "voice_settings": voice_settings,
        }
        # Add optional parameters
        if seed is not None:
            payload["seed"] = seed
        if language_code:
            payload["language_code"] = language_code
        if previous_text:
            payload["previous_text"] = previous_text
        if next_text:
            payload["next_text"] = next_text
        if apply_text_normalization and apply_text_normalization != "auto":
            payload["apply_text_normalization"] = apply_text_normalization

        # Extension / MIME type used when storing the result. Unknown formats
        # fall back to the default MP3 entry.
        format_info = OUTPUT_FORMATS.get(output_format, OUTPUT_FORMATS["mp3_44100_128"])

        async with httpx.AsyncClient(timeout=120) as client:
            response = await client.post(
                f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
                headers={
                    "xi-api-key": settings.elevenlabs_api_key,
                    "Content-Type": "application/json",
                    # Kept as-is; the encoding of the response is requested
                    # through the output_format query parameter below.
                    "Accept": "audio/mpeg",
                },
                params={"output_format": output_format},
                json=payload,
            )
            response.raise_for_status()
            audio_data = response.content

        job.progress = 80
        db.commit()

        # Save audio file using the extension that matches the requested
        # format (previously every result was stored as .mp3).
        # NOTE(review): the bytes are written exactly as received; pcm_*,
        # ulaw_* and alaw_* are labelled "wav" in OUTPUT_FORMATS but no
        # container header is added here - confirm what consumers expect.
        filename = f"tts_{uuid4()}.{format_info['ext']}"
        storage_path = os.path.join(settings.storage_path, "audio")
        os.makedirs(storage_path, exist_ok=True)
        file_path = os.path.join(storage_path, filename)
        with open(file_path, "wb") as f:
            f.write(audio_data)

        # Create asset
        asset = Asset(
            user_id=job.user_id,
            project_id=job.project_id,
            original_filename=filename,
            stored_filename=filename,
            file_path=file_path,
            file_type="audio",
            mime_type=format_info["mime"],
            file_size_bytes=len(audio_data),
            source_module="text_to_speech",
            source_job_id=job.id,
            metadata={
                "text_length": len(text),
                "voice_id": voice_id,
                "model_id": model_id,
                "output_format": output_format,
            },
        )
        db.add(asset)
        db.commit()
        db.refresh(asset)

        job.output_asset_ids = [asset.id]
        job.output_data = {"asset_id": str(asset.id), "file_path": file_path}
        job.progress = 100
        job.status = "completed"
        job.completed_at = datetime.utcnow()
        db.commit()
    except Exception as e:
        # Drop whatever half-finished transaction the failure left behind so
        # the failure record below can be committed on a clean session.
        db.rollback()
        # job is None when the initial lookup itself failed; there is then no
        # row to mark as failed.
        if job is not None:
            job.status = "failed"
            job.error_message = str(e)
            db.commit()
    finally:
        db.close()
async def speech_to_speech(job_id: str):
    """Convert voice to another voice using ElevenLabs

    Loads the job with the given id, reads the audio file of its first input
    asset, posts it to the ElevenLabs speech-to-speech endpoint for the
    ``voice_id`` found in ``job.input_data``, stores the converted audio as
    an MP3 under ``<settings.storage_path>/audio`` and records it as a new
    Asset whose parent is the input asset. Progress and the final status
    ("completed" / "failed") are written back to the job row.

    Returns None; if the job does not exist the function returns without
    doing anything. Errors (missing input asset, missing voice_id, HTTP or
    file errors) are not re-raised: they are stored in ``job.error_message``.
    """
    db = SessionLocal()
    # Bound before the try so the error handler can tell "job never loaded"
    # apart from "job loaded, processing failed".
    job = None
    try:
        job = db.query(Job).filter(Job.id == job_id).first()
        if not job:
            return
        input_data = job.input_data
        input_asset_ids = job.input_asset_ids
        if not input_asset_ids:
            raise ValueError("No input asset provided")

        # Only the first input asset is converted.
        input_asset = db.query(Asset).filter(Asset.id == input_asset_ids[0]).first()
        if not input_asset:
            raise ValueError("Input asset not found")

        job.progress = 10
        job.api_provider = "elevenlabs"
        job.api_model = "eleven_english_sts_v2"
        db.commit()

        voice_id = input_data.get("voice_id")
        if not voice_id:
            raise ValueError("No voice_id provided")

        # Read input audio
        with open(input_asset.file_path, "rb") as f:
            audio_data = f.read()

        job.progress = 20
        db.commit()

        async with httpx.AsyncClient(timeout=120) as client:
            response = await client.post(
                f"https://api.elevenlabs.io/v1/speech-to-speech/{voice_id}",
                headers={
                    "xi-api-key": settings.elevenlabs_api_key,
                    "Accept": "audio/mpeg",
                },
                # Multipart upload: the audio goes in the "audio" file part,
                # the settings travel as ordinary form fields.
                files={"audio": (input_asset.original_filename, audio_data, input_asset.mime_type)},
                data={
                    "model_id": "eleven_english_sts_v2",
                    # voice_settings is sent as a JSON-encoded string field
                    "voice_settings": '{"stability": 0.5, "similarity_boost": 0.5}',
                },
            )
            response.raise_for_status()
            converted_audio = response.content

        job.progress = 80
        db.commit()

        # Save converted audio
        filename = f"sts_{uuid4()}.mp3"
        storage_path = os.path.join(settings.storage_path, "audio")
        os.makedirs(storage_path, exist_ok=True)
        file_path = os.path.join(storage_path, filename)
        with open(file_path, "wb") as f:
            f.write(converted_audio)

        # Create asset
        asset = Asset(
            user_id=job.user_id,
            project_id=job.project_id,
            original_filename=filename,
            stored_filename=filename,
            file_path=file_path,
            file_type="audio",
            mime_type="audio/mpeg",
            file_size_bytes=len(converted_audio),
            source_module="speech_to_speech",
            source_job_id=job.id,
            parent_asset_id=input_asset.id,
            metadata={"voice_id": voice_id},
        )
        db.add(asset)
        db.commit()
        db.refresh(asset)

        job.output_asset_ids = [asset.id]
        job.output_data = {"asset_id": str(asset.id), "file_path": file_path}
        job.progress = 100
        job.status = "completed"
        job.completed_at = datetime.utcnow()
        db.commit()
    except Exception as e:
        # Drop whatever half-finished transaction the failure left behind so
        # the failure record below can be committed on a clean session.
        db.rollback()
        # job is None when the initial lookup itself failed; there is then no
        # row to mark as failed.
        if job is not None:
            job.status = "failed"
            job.error_message = str(e)
            db.commit()
    finally:
        db.close()
async def get_voices() -> list:
    """Get available ElevenLabs voices

    Returns a list of dicts describing voices. With an API key configured the
    list comes from the ElevenLabs ``/v1/voices`` endpoint and each entry has
    ``voice_id``, ``name``, ``preview_url``, ``category`` and ``labels``.

    Without an API key a built-in list of premade voices is returned. If the
    API call fails for any reason the failure is logged and a short built-in
    list (Rachel, Antoni, Josh) is returned instead; this function never
    raises for API errors. Built-in entries have ``voice_id``, ``name``,
    ``category`` and ``labels`` (no ``preview_url``).
    """
    # Imported locally so this function does not depend on a module-level
    # logging import.
    import logging

    # Single source of truth for the built-in voices used by both fallbacks.
    default_voices = [
        {"voice_id": "21m00Tcm4TlvDq8ikWAM", "name": "Rachel (Default)", "category": "premade", "labels": {"accent": "american", "gender": "female"}},
        {"voice_id": "AZnzlk1XvdvUeBnXmlld", "name": "Domi", "category": "premade", "labels": {"accent": "american", "gender": "female"}},
        {"voice_id": "EXAVITQu4vr4xnSDxMaL", "name": "Bella", "category": "premade", "labels": {"accent": "american", "gender": "female"}},
        {"voice_id": "ErXwobaYiN019PkySvjV", "name": "Antoni", "category": "premade", "labels": {"accent": "american", "gender": "male"}},
        {"voice_id": "MF3mGyEYCl7XYWbV9V6O", "name": "Elli", "category": "premade", "labels": {"accent": "american", "gender": "female"}},
        {"voice_id": "TxGEqnHWrfWFTfGW9XjX", "name": "Josh", "category": "premade", "labels": {"accent": "american", "gender": "male"}},
        {"voice_id": "VR6AewLTigWG4xSOukaG", "name": "Arnold", "category": "premade", "labels": {"accent": "american", "gender": "male"}},
        {"voice_id": "pNInz6obpgDQGcFmaJgB", "name": "Adam", "category": "premade", "labels": {"accent": "american", "gender": "male"}},
        {"voice_id": "yoZ06aMxZJJ28mfd3POQ", "name": "Sam", "category": "premade", "labels": {"accent": "american", "gender": "male"}},
    ]

    if not settings.elevenlabs_api_key:
        # Return default voices when API key is not configured
        return default_voices

    try:
        async with httpx.AsyncClient(timeout=30) as client:
            response = await client.get(
                "https://api.elevenlabs.io/v1/voices",
                headers={"xi-api-key": settings.elevenlabs_api_key},
            )
            response.raise_for_status()
            data = response.json()
        # Keep only the fields this service exposes.
        return [
            {
                "voice_id": voice.get("voice_id"),
                "name": voice.get("name"),
                "preview_url": voice.get("preview_url"),
                "category": voice.get("category"),
                "labels": voice.get("labels", {}),
            }
            for voice in data.get("voices", [])
        ]
    except Exception:
        # Best-effort by design: callers always get a usable list. The error
        # is logged rather than silently discarded.
        logging.getLogger(__name__).warning(
            "Could not fetch ElevenLabs voices; returning built-in defaults",
            exc_info=True,
        )
        # Same three voices as before, now with the "labels" key so entries
        # have the same shape as the no-API-key defaults.
        fallback_ids = {
            "21m00Tcm4TlvDq8ikWAM",
            "ErXwobaYiN019PkySvjV",
            "TxGEqnHWrfWFTfGW9XjX",
        }
        return [voice for voice in default_voices if voice["voice_id"] in fallback_ids]