video-accessibility/backend/app/api/v1/routes_tts.py
Vadym Samoilenko 2b721d182b feat: Client → Team → Project isolation system with Project Manager role
Backend:
- New UserRole.PROJECT_MANAGER with pm_client_ids[] on User model
- New models: Client (slug-based), Team (member_user_ids[]), Project (client-scoped)
- Job model gains project_id field
- New GET/POST/PATCH/DELETE /clients, /clients/{id}/teams, /clients/{id}/projects,
  /clients/{id}/pm routes (admin-only client CRUD; PM or admin for teams/projects)
- get_accessible_project_ids() helper: staff→all, PM→their clients' projects,
  CLIENT→projects from teams they belong to (with legacy owner fallback)
- list_jobs, get_job, bulk_download, get_vtt_content, delete_job all use new isolation

Frontend:
- UserRole type gains 'project_manager'
- Job, JobCreateRequest gain project_id field
- Client, Team, Project, PMUser types added
- ApiClient: full client/team/project/PM CRUD methods
- useClients hook with all query/mutation hooks
- Admin pages: ClientList + ClientDetail (teams, members, projects, PM assignment)
- NewJob form: client + project picker (shown when clients exist)
- Sidebar: Clients nav item for admin and project_manager roles
- Routes: /admin/clients and /admin/clients/:clientId behind RoleGate

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 15:11:13 +01:00

347 lines
11 KiB
Python

import asyncio
import time
from typing import Literal, Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import Response
from pydantic import BaseModel, Field
from ...core.config import settings
from ...core.logging import get_logger
from ...services.gemini_tts import gemini_tts_service
from ...services.elevenlabs_voices import elevenlabs_voice_service
from ...services.tts import tts_service
from ...services import cost_tracker
from ...core.dependencies import get_current_user
logger = get_logger(__name__)
router = APIRouter(prefix="/tts", tags=["tts"])
class VoicePreviewRequest(BaseModel):
    """Payload accepted by the voice preview endpoint.

    One schema serves both providers: the provider-specific groups below are
    simply left at their defaults when the other provider is selected.
    """

    # Identifier of the voice to audition.
    voice_name: str
    language: str = "en"
    provider: Literal["gemini", "elevenlabs"] = "gemini"

    # --- Gemini-specific settings ---
    model: Literal["flash", "pro"] = "flash"
    # Playback speed multiplier, constrained to 0.5x .. 2.0x.
    speed: float = Field(1.0, ge=0.5, le=2.0)
    style_preset: Literal["neutral", "calm", "energetic", "professional", "warm", "documentary", "custom"] = "neutral"
    custom_style_prompt: Optional[str] = None

    # --- ElevenLabs-specific settings (each constrained to 0.0 .. 1.0) ---
    stability: Optional[float] = Field(None, ge=0.0, le=1.0)
    similarity_boost: Optional[float] = Field(None, ge=0.0, le=1.0)
class VoiceInfo(BaseModel):
    """A single selectable voice, in a shape shared by every TTS provider."""

    # Only the identifier and display name are mandatory.
    id: str
    name: str

    # Optional metadata; each field defaults to None when not supplied.
    description: Optional[str] = None
    preview_url: Optional[str] = None
    labels: Optional[dict[str, str]] = None
    category: Optional[str] = None
class ProviderVoicesResponse(BaseModel):
    """Voice catalogue returned for one TTS provider."""

    provider: str
    voices: list[VoiceInfo]
    # Identifier of the voice to preselect.
    default: str
    # Availability flag; defaults to True when not set explicitly.
    available: bool = True
    # Optional error text accompanying the response.
    error: Optional[str] = None
class LanguagesResponse(BaseModel):
    """TTS languages offered to the client."""

    # Maps language code -> human-readable display name.
    languages: dict[str, str]
    # Maps language code -> sample sentence used for previews.
    preview_samples: dict[str, str]
class TTSOptionItem(BaseModel):
    """One selectable option: a machine value paired with its display label."""

    value: str
    label: str
class SpeedRange(BaseModel):
    """Bounds, default and increment for the speech-speed slider."""

    min: float
    max: float
    default: float
    step: float
class FloatRange(BaseModel):
    """Bounds, default and increment for a generic float-valued slider."""

    min: float
    max: float
    default: float
    step: float
class ProviderOptionsResponse(BaseModel):
    """Configuration options exposed for one TTS provider.

    Every option group is optional and defaults to None, so a response only
    carries the groups that were explicitly populated for its provider.
    """

    provider: str

    # --- Gemini-specific option groups ---
    models: Optional[list[TTSOptionItem]] = None
    style_presets: Optional[list[TTSOptionItem]] = None
    speed_range: Optional[SpeedRange] = None

    # --- ElevenLabs-specific option groups ---
    stability_range: Optional[FloatRange] = None
    similarity_boost_range: Optional[FloatRange] = None
@router.get("/voices", response_model=ProviderVoicesResponse)
async def list_voices(
    provider: str = Query("gemini", description="TTS provider: gemini or elevenlabs"),
    current_user=Depends(get_current_user),
) -> ProviderVoicesResponse:
    """
    Return the voice catalogue for the requested TTS provider.

    Any provider value other than "elevenlabs" yields the Gemini voice list
    taken from settings. For ElevenLabs the voices are fetched from the
    voice service; if the provider is not available or the fetch fails, an
    empty catalogue flagged ``available=False`` is returned instead of an
    HTTP error.
    """
    if provider != "elevenlabs":
        # Gemini path: each configured voice name doubles as id and label.
        gemini_voices = []
        for voice_name in settings.gemini_tts_voices:
            gemini_voices.append(VoiceInfo(id=voice_name, name=voice_name))
        return ProviderVoicesResponse(
            provider="gemini",
            voices=gemini_voices,
            default=settings.gemini_tts_default_voice,
        )

    # Fields shared by both "ElevenLabs cannot be used right now" responses.
    unavailable = dict(provider="elevenlabs", voices=[], default="", available=False)

    if not tts_service.elevenlabs_available:
        return ProviderVoicesResponse(**unavailable)

    try:
        remote_voices = await elevenlabs_voice_service.get_voices()
    except Exception as e:
        # Degrade gracefully: report the failure in the payload rather than
        # failing the whole request.
        logger.warning(f"ElevenLabs get_voices failed: {e}")
        return ProviderVoicesResponse(**unavailable, error=str(e))

    catalogue = []
    for entry in remote_voices:
        catalogue.append(
            VoiceInfo(
                id=entry.voice_id,
                name=entry.name,
                # Falsy metadata (empty string / empty dict) is normalised to None.
                description=entry.description or None,
                preview_url=entry.preview_url or None,
                labels=entry.labels or None,
                category=entry.category or None,
            )
        )

    # The first returned voice acts as the default; empty string when none exist.
    if catalogue:
        first_id = catalogue[0].id
    else:
        first_id = ""

    return ProviderVoicesResponse(
        provider="elevenlabs",
        voices=catalogue,
        default=first_id,
        available=True,
    )
@router.get("/languages", response_model=LanguagesResponse)
async def list_languages(
    current_user=Depends(get_current_user)
) -> LanguagesResponse:
    """
    Return the supported TTS languages.

    Both mappings (language code -> display name, language code -> preview
    sample text) are read straight from the Gemini TTS settings.
    """
    display_names = settings.gemini_tts_language_names
    sample_texts = settings.gemini_tts_preview_samples
    return LanguagesResponse(languages=display_names, preview_samples=sample_texts)
@router.get("/options", response_model=ProviderOptionsResponse)
async def get_tts_options(
    provider: str = Query("gemini", description="TTS provider: gemini or elevenlabs"),
    current_user=Depends(get_current_user),
) -> ProviderOptionsResponse:
    """
    Return the TTS configuration options for the requested provider.

    "elevenlabs" yields the stability / similarity-boost slider ranges; any
    other provider value yields the Gemini models, style presets and the
    speed range configured in settings.
    """
    if provider == "elevenlabs":
        # Both ElevenLabs sliders share the same 0..1 range definition.
        unit_slider = {"min": 0.0, "max": 1.0, "default": 0.5, "step": 0.05}
        return ProviderOptionsResponse(
            provider="elevenlabs",
            stability_range=FloatRange(**unit_slider),
            similarity_boost_range=FloatRange(**unit_slider),
        )

    # Default: Gemini. Options are declared as (value, label) pairs.
    model_choices = (
        ("flash", "Flash (Fast, Cost-efficient)"),
        ("pro", "Pro (Higher Quality)"),
    )
    preset_choices = (
        ("neutral", "Neutral"),
        ("calm", "Calm & Soothing"),
        ("energetic", "Energetic"),
        ("professional", "Professional"),
        ("warm", "Warm & Friendly"),
        ("documentary", "Documentary"),
        ("custom", "Custom Prompt"),
    )
    return ProviderOptionsResponse(
        provider="gemini",
        models=[TTSOptionItem(value=value, label=label) for value, label in model_choices],
        style_presets=[TTSOptionItem(value=value, label=label) for value, label in preset_choices],
        speed_range=SpeedRange(
            min=settings.gemini_tts_speed_min,
            max=settings.gemini_tts_speed_max,
            default=settings.gemini_tts_speed_default,
            step=settings.gemini_tts_speed_step,
        ),
    )
@router.post("/preview")
async def preview_voice(
    request: VoicePreviewRequest,
    current_user=Depends(get_current_user)
) -> Response:
    """
    Produce a short audio sample for the requested voice and settings.

    The request is delegated to the provider-specific preview helper and its
    response is returned unchanged (MP3 audio data).
    """
    # Identity used for cost attribution: email first, then the "sub" entry,
    # then a fixed placeholder.
    user_id: str = current_user.get("email")
    if not user_id:
        user_id = current_user.get("sub")
    if not user_id:
        user_id = "unknown"

    generate = _preview_elevenlabs if request.provider == "elevenlabs" else _preview_gemini
    return await generate(request, user_id)
# Strong references to in-flight Gemini cost-recording tasks. The event loop
# only keeps weak references to tasks, so a fire-and-forget task with no other
# reference can be garbage-collected before it finishes.
_pending_gemini_cost_tasks: set[asyncio.Task] = set()


async def _preview_gemini(request: VoicePreviewRequest, user_id: str) -> Response:
    """Generate a Gemini TTS voice preview.

    Args:
        request: Preview settings; ``voice_name`` and ``language`` are checked
            against the configured Gemini voices and languages.
        user_id: Identifier the usage is attributed to in cost tracking.

    Returns:
        A ``Response`` carrying the synthesized audio as ``audio/mpeg``.

    Raises:
        HTTPException: 400 for an unknown voice or unsupported language,
            500 if synthesis (or building the response) fails.
    """
    # Validate voice name
    if request.voice_name not in settings.gemini_tts_voices:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid voice name. Available voices: {', '.join(settings.gemini_tts_voices)}"
        )

    # Validate language
    if request.language not in settings.gemini_tts_languages:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language. Available languages: {', '.join(settings.gemini_tts_languages.keys())}"
        )

    # Resolve style prompt from preset or custom. A "custom" preset without a
    # prompt falls back to whatever settings define for that preset (or "").
    if request.style_preset == "custom" and request.custom_style_prompt:
        style_prompt = request.custom_style_prompt
    else:
        style_prompt = settings.gemini_tts_style_prompts.get(request.style_preset, "")

    # Only used for the character count in cost tracking below; the text is
    # not passed to synthesize_preview.
    # NOTE(review): presumably the service selects the same sample itself - confirm.
    sample_text = settings.gemini_tts_preview_samples.get(
        request.language,
        settings.gemini_tts_preview_samples.get("en", "This is a voice preview.")
    )

    try:
        logger.info(
            f"Generating Gemini voice preview: voice={request.voice_name}, language={request.language}, "
            f"model={request.model}, speed={request.speed}x, style={request.style_preset}"
        )
        t0 = time.monotonic()
        audio_data = await gemini_tts_service.synthesize_preview(
            voice_name=request.voice_name,
            language=request.language,
            model=request.model,
            speed=request.speed,
            style_prompt=style_prompt
        )
        elapsed_ms = int((time.monotonic() - t0) * 1000)

        model_id = settings.gemini_tts_models.get(request.model, settings.gemini_tts_model)

        # Record cost in the background so the response is not delayed. Keep
        # a reference until the task completes so it cannot be collected
        # mid-flight, then drop it to avoid unbounded growth.
        cost_task = asyncio.create_task(cost_tracker.aio_record(
            model=model_id,
            provider="google",
            user_external_id=user_id,
            chars=len(sample_text),
            latency_ms=elapsed_ms,
        ))
        _pending_gemini_cost_tasks.add(cost_task)
        cost_task.add_done_callback(_pending_gemini_cost_tasks.discard)

        return Response(
            content=audio_data,
            media_type="audio/mpeg",
            headers={
                "Content-Disposition": f"inline; filename=preview_{request.voice_name}_{request.language}.mp3"
            }
        )
    except Exception as e:
        logger.error(f"Gemini voice preview generation failed: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to generate voice preview: {str(e)}"
        ) from e
# Strong references to in-flight ElevenLabs cost-recording tasks. The event
# loop only keeps weak references to tasks, so a fire-and-forget task with no
# other reference can be garbage-collected before it finishes.
_pending_elevenlabs_cost_tasks: set[asyncio.Task] = set()


def _elevenlabs_filename_part(value: str) -> str:
    """Reduce *value* to characters that are safe in a header filename.

    ASCII letters, digits, ``-``, ``_`` and ``.`` are kept; every other
    character (including CR/LF, quotes and non-ASCII text) becomes ``_``.
    The result is capped at 64 characters and is never empty.
    """
    cleaned = "".join(
        ch if (ch.isascii() and ch.isalnum()) or ch in "-_." else "_"
        for ch in value
    )
    return cleaned[:64] or "unknown"


async def _preview_elevenlabs(request: VoicePreviewRequest, user_id: str) -> Response:
    """Generate an ElevenLabs TTS voice preview.

    Args:
        request: Preview settings; ``voice_name`` is passed on as the
            ElevenLabs voice id. Missing ``stability`` / ``similarity_boost``
            values default to 0.5.
        user_id: Identifier the usage is attributed to in cost tracking.

    Returns:
        A ``Response`` carrying the synthesized audio as ``audio/mpeg``.

    Raises:
        HTTPException: 400 if ElevenLabs is not configured, 500 if synthesis
            (or building the response) fails.
    """
    if not tts_service.elevenlabs_available:
        raise HTTPException(
            status_code=400,
            detail="ElevenLabs TTS is not configured"
        )

    # Get sample text for the language (falls back to English, then to a
    # built-in sentence).
    sample_text = settings.gemini_tts_preview_samples.get(
        request.language,
        settings.gemini_tts_preview_samples.get("en", "This is a preview of the audio description voice.")
    )

    stability = request.stability if request.stability is not None else 0.5
    similarity_boost = request.similarity_boost if request.similarity_boost is not None else 0.5

    # voice_name and language are not validated on this path, so they must not
    # be interpolated raw into a response header: control characters would
    # allow header injection and non-latin-1 text would break header encoding
    # after the synthesis call has already been made.
    safe_voice = _elevenlabs_filename_part(request.voice_name)
    safe_language = _elevenlabs_filename_part(request.language)

    try:
        logger.info(
            f"Generating ElevenLabs voice preview: voice={request.voice_name}, language={request.language}, "
            f"stability={stability}, similarity_boost={similarity_boost}"
        )
        t0 = time.monotonic()
        # NOTE(review): this calls a private method of tts_service; a public
        # wrapper would be preferable if one exists.
        audio_data = await tts_service._synthesize_text_elevenlabs(
            text=sample_text,
            voice_id=request.voice_name,
            stability=stability,
            similarity_boost=similarity_boost,
        )
        elapsed_ms = int((time.monotonic() - t0) * 1000)

        # Record cost in the background so the response is not delayed. Keep
        # a reference until the task completes so it cannot be collected
        # mid-flight, then drop it to avoid unbounded growth.
        cost_task = asyncio.create_task(cost_tracker.aio_record(
            model="eleven_multilingual_v2",
            provider="elevenlabs",
            user_external_id=user_id,
            chars=len(sample_text),
            latency_ms=elapsed_ms,
        ))
        _pending_elevenlabs_cost_tasks.add(cost_task)
        cost_task.add_done_callback(_pending_elevenlabs_cost_tasks.discard)

        return Response(
            content=audio_data,
            media_type="audio/mpeg",
            headers={
                "Content-Disposition": f"inline; filename=preview_{safe_voice}_{safe_language}.mp3"
            }
        )
    except Exception as e:
        logger.error(f"ElevenLabs voice preview generation failed: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to generate voice preview: {str(e)}"
        ) from e