upgrade TTS to Gemini TTS with voice selection and preview

- Add Gemini TTS service with 30 voices and 24 languages
- Add TTS API endpoints for voice listing and preview
- Add per-language voice selection in job creation form
- Add voice override at QC approval stage
- Add VoiceSelector and VoicePreviewButton components
- Update TTSPreferences model with provider and voice mapping

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
michael 2025-12-22 14:41:57 -06:00
parent 46b6f25fd0
commit 29643f6683
16 changed files with 1075 additions and 55 deletions

View file

@ -369,15 +369,23 @@ async def approve_source(
source_language = job_doc["source"].get("language", "en")
new_status = JobStatus.APPROVED_ENGLISH if source_language == "en" else JobStatus.APPROVED_SOURCE
# Build update operations
update_set = {
"status": new_status.value,
"review.notes": request.notes or "",
"review.reviewer_id": str(current_user.id),
"updated_at": datetime.utcnow()
}
# If TTS preferences override provided, update requested_outputs.tts_preferences
if request.tts_preferences:
update_set["requested_outputs.tts_preferences"] = request.tts_preferences.model_dump()
logger.info(f"Updating TTS preferences for job {job_id}: {request.tts_preferences}")
result = await db.jobs.find_one_and_update(
{"_id": job_id, "status": JobStatus.PENDING_QC.value},
{
"$set": {
"status": new_status.value,
"review.notes": request.notes or "",
"review.reviewer_id": str(current_user.id),
"updated_at": datetime.utcnow()
},
"$set": update_set,
"$push": {
"review.history": {
"at": datetime.utcnow(),

View file

@ -0,0 +1,104 @@
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import Response
from pydantic import BaseModel
from ...core.config import settings
from ...core.logging import get_logger
from ...services.gemini_tts import gemini_tts_service
from ..deps import get_current_user
logger = get_logger(__name__)
router = APIRouter(prefix="/tts", tags=["tts"])
class VoicePreviewRequest(BaseModel):
    """Request to generate a voice preview (body of POST /tts/preview)."""
    # Voice to sample; the preview endpoint rejects names that are not in
    # settings.gemini_tts_voices with a 400.
    voice_name: str
    # Short language code; the preview endpoint rejects codes that are not
    # keys of settings.gemini_tts_languages with a 400.
    language: str = "en"
class VoicesResponse(BaseModel):
    """Available TTS voices (response of GET /tts/voices)."""
    # Voice names, filled from settings.gemini_tts_voices by list_voices.
    voices: list[str]
    # Default voice name, filled from settings.gemini_tts_default_voice.
    default: str
class LanguagesResponse(BaseModel):
    """Supported TTS languages (response of GET /tts/languages)."""
    # Filled from settings.gemini_tts_language_names by list_languages.
    languages: dict[str, str]  # code -> display name
    # Filled from settings.gemini_tts_preview_samples by list_languages.
    preview_samples: dict[str, str]  # code -> sample text
@router.get("/voices", response_model=VoicesResponse)
async def list_voices(
    current_user=Depends(get_current_user)
) -> VoicesResponse:
    """
    Return every configured Gemini TTS voice name plus the default voice.

    The values are read directly from application settings; the
    ``current_user`` dependency only enforces authentication.
    """
    payload = {
        "voices": settings.gemini_tts_voices,
        "default": settings.gemini_tts_default_voice,
    }
    return VoicesResponse(**payload)
@router.get("/languages", response_model=LanguagesResponse)
async def list_languages(
    current_user=Depends(get_current_user)
) -> LanguagesResponse:
    """
    Return the supported TTS languages: display names and preview sample texts.

    Both mappings are keyed by short language code and come straight from
    application settings; the ``current_user`` dependency only enforces
    authentication.
    """
    payload = {
        "languages": settings.gemini_tts_language_names,
        "preview_samples": settings.gemini_tts_preview_samples,
    }
    return LanguagesResponse(**payload)
@router.post("/preview")
async def preview_voice(
    request: VoicePreviewRequest,
    current_user=Depends(get_current_user)
) -> Response:
    """
    Synthesize a short sample for the requested voice and language.

    Responds with the audio bytes as ``audio/mpeg``.

    Raises:
        HTTPException 400: the voice or the language is not configured.
        HTTPException 500: synthesis (or building the response) failed.
    """
    # Reject unknown voices up front so no synthesis call is made for them.
    if request.voice_name not in settings.gemini_tts_voices:
        voice_error = f"Invalid voice name. Available voices: {', '.join(settings.gemini_tts_voices)}"
        raise HTTPException(status_code=400, detail=voice_error)

    # Same for languages: only codes present in the settings mapping are accepted.
    if request.language not in settings.gemini_tts_languages:
        language_error = f"Unsupported language. Available languages: {', '.join(settings.gemini_tts_languages.keys())}"
        raise HTTPException(status_code=400, detail=language_error)

    try:
        logger.info(f"Generating voice preview: voice={request.voice_name}, language={request.language}")

        audio_data = await gemini_tts_service.synthesize_preview(
            voice_name=request.voice_name,
            language=request.language
        )

        # Both filename components were validated against the settings lists above.
        response_headers = {
            "Content-Disposition": f"inline; filename=preview_{request.voice_name}_{request.language}.mp3"
        }
        return Response(
            content=audio_data,
            media_type="audio/mpeg",
            headers=response_headers
        )
    except Exception as e:
        logger.error(f"Voice preview generation failed: {e}")
        failure_detail = f"Failed to generate voice preview: {str(e)}"
        raise HTTPException(status_code=500, detail=failure_detail) from e

View file

@ -39,20 +39,109 @@ class Settings(BaseSettings):
google_tts_credentials: str = ""
# TTS Voice Configuration
tts_provider: str = "google" # "google" or "elevenlabs"
tts_provider: str = "gemini" # "gemini", "google", or "elevenlabs"
google_tts_voices: dict[str, str] = {
"en-US": "en-US-Neural2-D",
"es-ES": "es-ES-Neural2-A",
"es-ES": "es-ES-Neural2-A",
"fr-FR": "fr-FR-Neural2-A",
"de-DE": "de-DE-Neural2-B"
}
elevenlabs_voices: dict[str, str] = {
"en-US": "21m00Tcm4TlvDq8ikWAM",
"es-ES": "VR6AewLTigWG4xSOukaG",
"fr-FR": "TxGEqnHWrfWFTfGW9XjX",
"fr-FR": "TxGEqnHWrfWFTfGW9XjX",
"de-DE": "pNInz6obpgDQGcFmaJgB"
}
# Gemini TTS Configuration
gemini_tts_model: str = "gemini-2.5-flash-preview-tts"
gemini_tts_default_voice: str = "Kore"
gemini_tts_voices: list[str] = [
"Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
"Callirrhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
"Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
"Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
"Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat"
]
gemini_tts_languages: dict[str, str] = {
"en": "en-US",
"es": "es-US",
"fr": "fr-FR",
"de": "de-DE",
"it": "it-IT",
"pt": "pt-BR",
"ja": "ja-JP",
"ko": "ko-KR",
"ar": "ar-EG",
"hi": "hi-IN",
"id": "id-ID",
"nl": "nl-NL",
"pl": "pl-PL",
"ru": "ru-RU",
"th": "th-TH",
"tr": "tr-TR",
"vi": "vi-VN",
"ro": "ro-RO",
"uk": "uk-UA",
"bn": "bn-BD",
"mr": "mr-IN",
"ta": "ta-IN",
"te": "te-IN",
"zh": "zh-CN"
}
gemini_tts_language_names: dict[str, str] = {
"en": "English",
"es": "Spanish",
"fr": "French",
"de": "German",
"it": "Italian",
"pt": "Portuguese",
"ja": "Japanese",
"ko": "Korean",
"ar": "Arabic",
"hi": "Hindi",
"id": "Indonesian",
"nl": "Dutch",
"pl": "Polish",
"ru": "Russian",
"th": "Thai",
"tr": "Turkish",
"vi": "Vietnamese",
"ro": "Romanian",
"uk": "Ukrainian",
"bn": "Bengali",
"mr": "Marathi",
"ta": "Tamil",
"te": "Telugu",
"zh": "Chinese"
}
gemini_tts_preview_samples: dict[str, str] = {
"en": "This is a preview of the audio description voice.",
"es": "Esta es una vista previa de la voz de audiodescripcion.",
"fr": "Ceci est un apercu de la voix de l'audiodescription.",
"de": "Dies ist eine Vorschau der Audiodeskriptionsstimme.",
"it": "Questa e un'anteprima della voce dell'audiodescrizione.",
"pt": "Esta e uma previa da voz da audiodescricao.",
"ja": "これは音声解説の声のプレビューです。",
"ko": "이것은 오디오 설명 음성의 미리보기입니다.",
"ar": "هذه معاينة لصوت الوصف الصوتي.",
"hi": "यह ऑडियो विवरण आवाज का पूर्वावलोकन है।",
"id": "Ini adalah pratinjau suara deskripsi audio.",
"nl": "Dit is een voorbeeld van de audiodescriptiestem.",
"pl": "To jest podglad glosu audiodeskrypcji.",
"ru": "Это предварительный просмотр голоса аудиоописания.",
"th": "นี่คือตัวอย่างเสียงบรรยายภาพ",
"tr": "Bu, sesli betimleme sesinin bir onizlemesidir.",
"vi": "Day la ban xem truoc giong mo ta am thanh.",
"ro": "Aceasta este o previzualizare a vocii descrierii audio.",
"uk": "Це попередній перегляд голосу аудіоопису.",
"bn": "এটি অডিও বর্ণনা ভয়েসের একটি প্রিভিউ।",
"mr": "हे ऑडिओ वर्णन आवाजाचे पूर्वावलोकन आहे.",
"ta": "இது ஆடியோ விளக்க குரலின் முன்னோட்டம்.",
"te": "ఇది ఆడియో వివరణ స్వరం యొక్క ప్రివ్యూ.",
"zh": "这是音频描述语音的预览。"
}
# Email
sendgrid_api_key: str
email_from: str

View file

@ -14,6 +14,7 @@ from .api.v1.routes_admin import router as admin_router
from .api.v1.routes_auth import router as auth_router
from .api.v1.routes_files import router as files_router
from .api.v1.routes_jobs import router as jobs_router
from .api.v1.routes_tts import router as tts_router
from .api.v1.routes_websockets import router as websockets_router
from .services.websocket import connection_manager
from .core.config import settings
@ -241,6 +242,7 @@ async def validation_middleware(request, call_next):
app.include_router(auth_router, prefix="/api/v1")
app.include_router(files_router, prefix="/api/v1")
app.include_router(jobs_router, prefix="/api/v1")
app.include_router(tts_router, prefix="/api/v1")
app.include_router(admin_router, prefix="/api/v1")
app.include_router(websockets_router, prefix="/api/v1")

View file

@ -35,12 +35,20 @@ class Source(BaseModel):
detected_language: Optional[str] = None # AI-detected language from Gemini
class TTSPreferences(BaseModel):
    """TTS voice preferences for audio description generation"""
    # TTS backend to use; restricted to the three literals below, "gemini" by default.
    provider: Literal["gemini", "google", "elevenlabs"] = "gemini"
    # Voice used for any language that has no entry in voices_per_language.
    default_voice: str = "Kore"  # Default Gemini voice
    # Per-language overrides keyed by short language code.
    voices_per_language: dict[str, str] = {}  # {"en": "Kore", "es": "Aoede"}
class RequestedOutputs(BaseModel):
    # Flags for the artifacts a job should produce. The translate/synthesize
    # worker only generates TTS when audio_description_mp3 is true.
    captions_vtt: bool = True
    audio_description_vtt: bool = True
    audio_description_mp3: bool = True
    languages: list[str] = []
    transcreation: list[str] = []
    # Optional voice/provider selection; may be replaced when the source is
    # approved with a tts_preferences override. None means none were supplied.
    tts_preferences: Optional[TTSPreferences] = None
class LangOutput(BaseModel):

View file

@ -1,8 +1,8 @@
from typing import Any, Optional, Union
from typing import Any, Literal, Optional, Union
from pydantic import BaseModel
from ..models.job import JobStatus, LangOutput, RequestedOutputs, Review
from ..models.job import JobStatus, LangOutput, RequestedOutputs, Review, TTSPreferences
class JobResponse(BaseModel):
@ -43,6 +43,7 @@ class ApproveEnglishRequest(BaseModel):
class ApproveSourceRequest(BaseModel):
    """Request to approve source language content (works for any language)"""
    # Reviewer notes; the approve_source handler stores "" when omitted.
    notes: Optional[str] = None
    # When provided, the approve_source handler writes this (via model_dump) to
    # requested_outputs.tts_preferences on the job; when omitted the job's
    # existing preferences are left as they are.
    tts_preferences: Optional[TTSPreferences] = None  # Override TTS voice settings
class RejectJobRequest(BaseModel):

View file

@ -0,0 +1,256 @@
import io
import wave
from google import genai
from google.genai import types
from pydub import AudioSegment
from ..core.config import settings
from ..core.logging import get_logger
logger = get_logger(__name__)
class GeminiTTSService:
"""Text-to-Speech service using Gemini TTS API"""
def __init__(self):
self.client = genai.Client(api_key=settings.gemini_api_key)
self.model = settings.gemini_tts_model
self.default_voice = settings.gemini_tts_default_voice
logger.info(f"Gemini TTS service initialized with model: {self.model}")
async def synthesize_text(
self,
text: str,
voice_name: str,
language: str = "en"
) -> bytes:
"""
Synthesize text to audio using Gemini TTS.
Returns MP3 audio bytes.
"""
if not text.strip():
raise ValueError("Text cannot be empty")
# Validate voice
if voice_name not in settings.gemini_tts_voices:
logger.warning(f"Unknown voice '{voice_name}', using default '{self.default_voice}'")
voice_name = self.default_voice
try:
# Generate audio using Gemini TTS
response = self.client.models.generate_content(
model=self.model,
contents=text,
config=types.GenerateContentConfig(
response_modalities=["AUDIO"],
speech_config=types.SpeechConfig(
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(
voice_name=voice_name,
)
)
),
)
)
# Extract PCM audio data from response
if not response.candidates or not response.candidates[0].content.parts:
raise ValueError("No audio data in Gemini TTS response")
pcm_data = response.candidates[0].content.parts[0].inline_data.data
# Convert PCM to MP3
mp3_data = self._pcm_to_mp3(pcm_data)
return mp3_data
except Exception as e:
logger.error(f"Gemini TTS synthesis failed: {e}")
raise
async def synthesize_preview(
self,
voice_name: str,
language: str = "en"
) -> bytes:
"""
Generate a preview audio sample for voice selection.
Uses language-specific sample text.
"""
# Get preview sample text for the language
sample_text = settings.gemini_tts_preview_samples.get(
language,
settings.gemini_tts_preview_samples.get("en", "This is a voice preview.")
)
return await self.synthesize_text(sample_text, voice_name, language)
async def synthesize_audio_description(
self,
ad_vtt_content: str,
language: str = "en",
voice_name: str | None = None
) -> bytes:
"""
Synthesize full audio description from VTT content.
Maintains timing alignment with original VTT cues.
"""
if voice_name is None:
voice_name = self.default_voice
# Validate voice
if voice_name not in settings.gemini_tts_voices:
logger.warning(f"Unknown voice '{voice_name}', using default '{self.default_voice}'")
voice_name = self.default_voice
# Parse VTT cues
cues = self._parse_ad_cues(ad_vtt_content)
if not cues:
raise ValueError("No audio description cues found in VTT content")
logger.info(f"Synthesizing {len(cues)} audio description cues with voice '{voice_name}'")
# Synthesize each cue with precise timing anchoring
audio_segments = []
current_audio_position = 0.0
for i, cue in enumerate(cues):
target_start_time = cue["start_time"]
# Add silence to reach the exact VTT start time
if target_start_time > current_audio_position:
silence_duration = target_start_time - current_audio_position
silence = AudioSegment.silent(duration=int(silence_duration * 1000))
audio_segments.append(silence)
current_audio_position = target_start_time
# Synthesize this cue's text
text = cue["text"].strip()
if text:
# Ensure proper punctuation for natural TTS flow
if not text.endswith(('.', '!', '?')):
text += "."
try:
audio_data = await self.synthesize_text(text, voice_name, language)
# Convert to AudioSegment and get actual duration
audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
audio_segments.append(audio_segment)
# Update position based on actual audio duration
actual_audio_duration = len(audio_segment) / 1000.0
current_audio_position += actual_audio_duration
except Exception as e:
logger.warning(f"Failed to synthesize cue {i}: {e}")
# Add silence for failed cue
cue_duration = cue["end_time"] - cue["start_time"]
silence = AudioSegment.silent(duration=int(cue_duration * 1000))
audio_segments.append(silence)
current_audio_position += cue_duration
# Combine all segments
if audio_segments:
final_audio = sum(audio_segments, AudioSegment.empty())
else:
final_audio = AudioSegment.silent(duration=1000)
# Export to MP3
output_buffer = io.BytesIO()
final_audio.export(output_buffer, format="mp3", bitrate="128k")
logger.info(f"Audio description synthesized: {len(output_buffer.getvalue())} bytes")
return output_buffer.getvalue()
def _pcm_to_mp3(self, pcm_data: bytes) -> bytes:
"""
Convert raw PCM audio (24kHz, 16-bit, mono) to MP3.
Gemini TTS outputs PCM at 24000 Hz sample rate.
"""
# Create WAV from PCM data
wav_buffer = io.BytesIO()
with wave.open(wav_buffer, "wb") as wf:
wf.setnchannels(1) # Mono
wf.setsampwidth(2) # 16-bit (2 bytes)
wf.setframerate(24000) # 24kHz
wf.writeframes(pcm_data)
# Convert WAV to MP3 using pydub
wav_buffer.seek(0)
audio_segment = AudioSegment.from_wav(wav_buffer)
# Export as MP3
mp3_buffer = io.BytesIO()
audio_segment.export(mp3_buffer, format="mp3", bitrate="128k")
return mp3_buffer.getvalue()
def _parse_ad_cues(self, vtt_content: str) -> list[dict]:
"""Parse audio description VTT and extract timing + text"""
lines = vtt_content.strip().split('\n')
cues = []
i = 0
while i < len(lines):
line = lines[i].strip()
# Skip header and empty lines
if line == "WEBVTT" or line == "" or line.startswith("NOTE"):
i += 1
continue
# Check for timing line
if " --> " in line:
timing_parts = line.split(" --> ")
start_time = self._parse_timestamp(timing_parts[0].strip())
end_time = self._parse_timestamp(timing_parts[1].strip())
# Get text from next line(s)
i += 1
text_lines = []
while i < len(lines) and lines[i].strip() != "":
text_lines.append(lines[i].strip())
i += 1
if text_lines:
cues.append({
"start_time": start_time,
"end_time": end_time,
"text": " ".join(text_lines)
})
else:
i += 1
return cues
def _parse_timestamp(self, timestamp: str) -> float:
"""Convert VTT timestamp to seconds"""
parts = timestamp.split(":")
if len(parts) == 3: # HH:MM:SS.mmm
hours, minutes, seconds = parts
elif len(parts) == 2: # MM:SS.mmm
hours, minutes, seconds = "0", parts[0], parts[1]
else:
raise ValueError(f"Invalid timestamp format: {timestamp}")
sec_parts = seconds.split(".")
seconds_val = int(sec_parts[0])
milliseconds = int(sec_parts[1]) if len(sec_parts) > 1 else 0
total_seconds = (
int(hours) * 3600 +
int(minutes) * 60 +
seconds_val +
milliseconds / 1000.0
)
return total_seconds
# Global service instance shared by importers of this module.
# It is constructed at import time, so importing the module runs
# GeminiTTSService.__init__, which builds the genai client from
# settings.gemini_api_key and logs the configured model.
gemini_tts_service = GeminiTTSService()

View file

@ -7,47 +7,74 @@ from pydub import AudioSegment
from ..core.config import settings
from ..core.logging import get_logger
from .gemini_tts import gemini_tts_service
logger = get_logger(__name__)
class TTSService:
def __init__(self):
# Initialize Google TTS (uses GOOGLE_APPLICATION_CREDENTIALS env var)
# The same GCP credentials used for GCS also work for TTS
# Check Gemini TTS availability (uses same API key as other Gemini services)
self.gemini_available = bool(settings.gemini_api_key)
# Initialize Google Cloud TTS (uses GOOGLE_APPLICATION_CREDENTIALS env var)
try:
self.google_client = texttospeech.TextToSpeechClient()
logger.info("Google TTS client initialized successfully")
logger.info("Google Cloud TTS client initialized successfully")
except Exception as e:
logger.warning(f"Google TTS credentials not configured: {e}")
logger.warning(f"Google Cloud TTS credentials not configured: {e}")
self.google_client = None
# Check ElevenLabs availability
self.elevenlabs_available = bool(settings.elevenlabs_api_key)
# Log configured provider
logger.info(f"TTS provider configured: {settings.tts_provider}")
async def synthesize_audio_description(
self,
ad_vtt_content: str,
language_code: str = "en-US",
voice_name: Optional[str] = None
voice_name: Optional[str] = None,
provider: Optional[str] = None
) -> bytes:
"""
Generate MP3 audio from audio description VTT content
Synthesizes each cue separately and stitches them together with timing
Uses Google TTS with ElevenLabs fallback
Generate MP3 audio from audio description VTT content.
Synthesizes each cue separately and stitches them together with timing.
Provider priority: specified provider > settings.tts_provider > fallback chain
Fallback chain: Gemini -> Google Cloud TTS -> ElevenLabs
"""
# Try Google TTS first, fallback to ElevenLabs
try:
if self.google_client:
return await self._synthesize_with_google(ad_vtt_content, language_code, voice_name)
elif self.elevenlabs_available:
return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name)
else:
raise ValueError("No TTS service configured")
except Exception as e:
if self.elevenlabs_available and self.google_client:
logger.warning(f"Google TTS failed, trying ElevenLabs: {e}")
return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name)
raise
# Determine which provider to use
active_provider = provider or settings.tts_provider
# Extract simple language code for Gemini (e.g., "en-US" -> "en")
simple_lang = language_code.split("-")[0] if "-" in language_code else language_code
# Try the configured provider first, then fallback
if active_provider == "gemini" and self.gemini_available:
try:
logger.info(f"Using Gemini TTS for language: {simple_lang}, voice: {voice_name}")
return await gemini_tts_service.synthesize_audio_description(
ad_vtt_content, simple_lang, voice_name
)
except Exception as e:
logger.warning(f"Gemini TTS failed, falling back: {e}")
# Fall through to Google/ElevenLabs
if active_provider == "google" or (active_provider == "gemini" and self.google_client):
try:
if self.google_client:
logger.info(f"Using Google Cloud TTS for language: {language_code}")
return await self._synthesize_with_google(ad_vtt_content, language_code, voice_name)
except Exception as e:
logger.warning(f"Google Cloud TTS failed: {e}")
if self.elevenlabs_available:
logger.info(f"Using ElevenLabs TTS for language: {language_code}")
return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name)
raise ValueError("No TTS service available")
async def _synthesize_with_google(
self,

View file

@ -269,7 +269,9 @@ async def _async_translate_and_synthesize(job_id: str):
# Generate TTS for languages that need MP3
if job_doc["requested_outputs"]["audio_description_mp3"]:
await _generate_tts_for_languages(job_id, updated_outputs, db, source_language)
# Get TTS preferences from job
tts_preferences = job_doc["requested_outputs"].get("tts_preferences", {})
await _generate_tts_for_languages(job_id, updated_outputs, db, source_language, tts_preferences)
# Update final status
await db.jobs.update_one(
@ -323,33 +325,53 @@ async def _async_translate_and_synthesize(job_id: str):
client.close()
async def _generate_tts_for_languages(job_id: str, outputs: dict[str, Any], db, source_language: str = "en"):
async def _generate_tts_for_languages(
job_id: str,
outputs: dict[str, Any],
db,
source_language: str = "en",
tts_preferences: dict = None
):
"""Generate TTS audio for each language's audio description"""
if tts_preferences is None:
tts_preferences = {}
# Always generate source language MP3 first
if source_language in outputs and "ad_vtt_gcs" in outputs[source_language]:
await _generate_language_tts(job_id, source_language, outputs[source_language], db)
await _generate_language_tts(job_id, source_language, outputs[source_language], db, tts_preferences)
# Generate for other languages
for language, lang_output in outputs.items():
if language != source_language and "ad_vtt_gcs" in lang_output:
await _generate_language_tts(job_id, language, lang_output, db)
await _generate_language_tts(job_id, language, lang_output, db, tts_preferences)
async def _generate_language_tts(job_id: str, language: str, lang_output: dict, db):
async def _generate_language_tts(job_id: str, language: str, lang_output: dict, db, tts_preferences: dict = None):
"""Generate TTS for a specific language"""
if tts_preferences is None:
tts_preferences = {}
try:
# Download AD VTT content
ad_blob_path = lang_output["ad_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
ad_blob = gcs_service.bucket.blob(ad_blob_path)
ad_vtt_content = ad_blob.download_as_text()
# Get voice for this language from preferences
voices_per_language = tts_preferences.get("voices_per_language", {})
voice_name = voices_per_language.get(language, tts_preferences.get("default_voice"))
provider = tts_preferences.get("provider", "gemini")
# Generate MP3 with retry
language_code = f"{language}-US" if language == "en" else f"{language}-{language.upper()}"
logger.info(f"Generating TTS for {language} with voice={voice_name}, provider={provider}")
async def synthesize():
return await tts_service.synthesize_audio_description(ad_vtt_content, language_code)
return await tts_service.synthesize_audio_description(
ad_vtt_content, language_code, voice_name=voice_name, provider=provider
)
mp3_data = await retry_with_backoff(synthesize, max_retries=3)
# Upload MP3 to GCS

View file

@ -0,0 +1,138 @@
import { useEffect, useRef, useState } from 'react';
import { api } from '../lib/api';
interface VoicePreviewButtonProps {
voiceName: string;
language: string;
disabled?: boolean;
}
/**
 * Button that fetches, plays and stops a short TTS sample for one voice/language pair.
 *
 * The fetched clip is cached for replay while `voiceName` and `language` stay
 * the same. When either prop changes, or the component unmounts, playback is
 * stopped and the cached clip is released so the next click fetches a sample
 * for the current selection.
 */
export function VoicePreviewButton({ voiceName, language, disabled }: VoicePreviewButtonProps) {
  const [isLoading, setIsLoading] = useState(false);
  const [isPlaying, setIsPlaying] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const audioUrlRef = useRef<string | null>(null);
  // Bumped every time the cached clip is invalidated; a fetch that started
  // under an older value is stale and its result must be ignored.
  const generationRef = useRef(0);

  useEffect(() => {
    // The cleanup runs before the effect re-runs for a new voice/language and
    // on unmount. It intentionally reads the refs' latest values.
    return () => {
      generationRef.current += 1;
      if (audioRef.current) {
        audioRef.current.pause();
        audioRef.current = null;
      }
      if (audioUrlRef.current) {
        URL.revokeObjectURL(audioUrlRef.current);
        audioUrlRef.current = null;
      }
      setIsPlaying(false);
      setError(null);
    };
  }, [voiceName, language]);

  const handlePreview = async () => {
    setError(null);

    // If already playing, stop
    if (isPlaying && audioRef.current) {
      audioRef.current.pause();
      audioRef.current.currentTime = 0;
      setIsPlaying(false);
      return;
    }

    // If we have cached audio for the current voice/language, replay it
    if (audioUrlRef.current && audioRef.current) {
      try {
        await audioRef.current.play();
        setIsPlaying(true);
      } catch (err) {
        setError('Failed to play audio');
        console.error('Voice preview error:', err);
      }
      return;
    }

    // Fetch new audio
    const generation = generationRef.current;
    setIsLoading(true);
    try {
      const blob = await api.previewVoice(voiceName, language);

      // The selection changed (or the component unmounted) while the request
      // was in flight: drop the stale sample instead of playing it.
      if (generation !== generationRef.current) {
        return;
      }

      const url = URL.createObjectURL(blob);

      // Clean up old URL if exists
      if (audioUrlRef.current) {
        URL.revokeObjectURL(audioUrlRef.current);
      }
      audioUrlRef.current = url;

      // Create and play audio
      const audio = new Audio(url);
      audioRef.current = audio;

      audio.onended = () => {
        setIsPlaying(false);
      };
      audio.onerror = () => {
        setError('Failed to play audio');
        setIsPlaying(false);
      };

      await audio.play();
      if (generation === generationRef.current) {
        setIsPlaying(true);
      }
    } catch (err) {
      // Errors caused by invalidating the clip mid-flight are not user-facing.
      if (generation === generationRef.current) {
        setError('Failed to generate preview');
        console.error('Voice preview error:', err);
      }
    } finally {
      setIsLoading(false);
    }
  };

  return (
    <div className="inline-flex items-center gap-2">
      <button
        type="button"
        onClick={handlePreview}
        disabled={disabled || isLoading}
        className={`
          inline-flex items-center gap-1.5 px-3 py-1.5 text-sm font-medium rounded-md
          transition-colors duration-150
          ${disabled || isLoading
            ? 'bg-gray-100 text-gray-400 cursor-not-allowed'
            : isPlaying
              ? 'bg-red-100 text-red-700 hover:bg-red-200'
              : 'bg-blue-100 text-blue-700 hover:bg-blue-200'
          }
        `}
      >
        {isLoading ? (
          <>
            <svg className="animate-spin h-4 w-4" viewBox="0 0 24 24">
              <circle
                className="opacity-25"
                cx="12"
                cy="12"
                r="10"
                stroke="currentColor"
                strokeWidth="4"
                fill="none"
              />
              <path
                className="opacity-75"
                fill="currentColor"
                d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z"
              />
            </svg>
            Loading...
          </>
        ) : isPlaying ? (
          <>
            <svg className="h-4 w-4" fill="currentColor" viewBox="0 0 20 20">
              <path
                fillRule="evenodd"
                d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zM7 8a1 1 0 011-1h4a1 1 0 110 2H8a1 1 0 01-1-1zm0 4a1 1 0 011-1h4a1 1 0 110 2H8a1 1 0 01-1-1z"
                clipRule="evenodd"
              />
            </svg>
            Stop
          </>
        ) : (
          <>
            <svg className="h-4 w-4" fill="currentColor" viewBox="0 0 20 20">
              <path
                fillRule="evenodd"
                d="M10 18a8 8 0 100-16 8 8 0 000 16zM9.555 7.168A1 1 0 008 8v4a1 1 0 001.555.832l3-2a1 1 0 000-1.664l-3-2z"
                clipRule="evenodd"
              />
            </svg>
            Preview
          </>
        )}
      </button>
      {error && <span className="text-xs text-red-600">{error}</span>}
    </div>
  );
}

View file

@ -0,0 +1,216 @@
import { useState, useEffect } from 'react';
import { api } from '../lib/api';
import { VoicePreviewButton } from './VoicePreviewButton';
import type { TTSPreferences, VoicesResponse, LanguagesResponse } from '../types/api';
interface VoiceSelectorProps {
selectedLanguages: string[];
preferences: TTSPreferences;
onChange: (preferences: TTSPreferences) => void;
disabled?: boolean;
}
/**
 * Form section for choosing TTS voices: one default voice plus optional
 * per-language overrides, each with a preview button.
 *
 * The available voices and languages are fetched once on mount. English is
 * always offered as a tab next to the selected languages. Changes are
 * reported through `onChange` with a new preferences object; the incoming
 * `preferences` object is never mutated.
 */
export function VoiceSelector({
  selectedLanguages,
  preferences,
  onChange,
  disabled
}: VoiceSelectorProps) {
  const [voices, setVoices] = useState<VoicesResponse | null>(null);
  const [languages, setLanguages] = useState<LanguagesResponse | null>(null);
  const [activeLanguage, setActiveLanguage] = useState<string>(selectedLanguages[0] || 'en');
  const [isLoading, setIsLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);

  // Fetch voices and languages on mount
  useEffect(() => {
    // Set by the cleanup so a response arriving after unmount is ignored.
    let cancelled = false;

    const fetchData = async () => {
      try {
        setIsLoading(true);
        const [voicesData, languagesData] = await Promise.all([
          api.getVoices(),
          api.getLanguages()
        ]);
        if (cancelled) {
          return;
        }
        setVoices(voicesData);
        setLanguages(languagesData);
      } catch (err) {
        if (cancelled) {
          return;
        }
        setError('Failed to load voice options');
        console.error('Voice selector error:', err);
      } finally {
        if (!cancelled) {
          setIsLoading(false);
        }
      }
    };

    fetchData();
    return () => {
      cancelled = true;
    };
  }, []);

  // Keep the active tab valid when the selected languages change.
  // 'en' is always rendered as a tab, so it must count as valid even when it
  // is not in selectedLanguages; otherwise clicking the English tab would
  // immediately snap back to the first selected language.
  useEffect(() => {
    const isVisible = activeLanguage === 'en' || selectedLanguages.includes(activeLanguage);
    if (!isVisible) {
      setActiveLanguage(selectedLanguages[0] || 'en');
    }
  }, [selectedLanguages, activeLanguage]);

  const handleDefaultVoiceChange = (voice: string) => {
    onChange({
      ...preferences,
      default_voice: voice
    });
  };

  const handleLanguageVoiceChange = (language: string, voice: string) => {
    const newVoicesPerLanguage = {
      ...preferences.voices_per_language,
      [language]: voice
    };

    // If voice matches default, remove from per-language overrides
    if (voice === preferences.default_voice) {
      delete newVoicesPerLanguage[language];
    }

    onChange({
      ...preferences,
      voices_per_language: newVoicesPerLanguage
    });
  };

  // Effective voice for a language: its override if present, else the default.
  const getVoiceForLanguage = (language: string): string => {
    return preferences.voices_per_language[language] || preferences.default_voice;
  };

  if (isLoading) {
    return (
      <div className="flex items-center gap-2 text-gray-500">
        <svg className="animate-spin h-5 w-5" viewBox="0 0 24 24">
          <circle
            className="opacity-25"
            cx="12"
            cy="12"
            r="10"
            stroke="currentColor"
            strokeWidth="4"
            fill="none"
          />
          <path
            className="opacity-75"
            fill="currentColor"
            d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z"
          />
        </svg>
        Loading voice options...
      </div>
    );
  }

  if (error || !voices || !languages) {
    return (
      <div className="text-red-600 text-sm">
        {error || 'Failed to load voice options'}
      </div>
    );
  }

  // Tabs to show: English first, then every other selected language.
  const displayLanguages = ['en', ...selectedLanguages.filter(l => l !== 'en')];

  return (
    <div className="space-y-4">
      {/* Default Voice Selection */}
      <div className="bg-gray-50 rounded-lg p-4">
        <label className="block text-sm font-medium text-gray-700 mb-2">
          Default Voice
        </label>
        <p className="text-xs text-gray-500 mb-3">
          This voice will be used for all languages unless overridden below.
        </p>
        <div className="flex items-center gap-3">
          <select
            value={preferences.default_voice}
            onChange={(e) => handleDefaultVoiceChange(e.target.value)}
            disabled={disabled}
            className="flex-1 rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 text-sm"
          >
            {voices.voices.map((voice) => (
              <option key={voice} value={voice}>
                {voice}
              </option>
            ))}
          </select>
          <VoicePreviewButton
            voiceName={preferences.default_voice}
            language="en"
            disabled={disabled}
          />
        </div>
      </div>

      {/* Per-Language Voice Overrides */}
      {displayLanguages.length > 1 && (
        <div className="bg-gray-50 rounded-lg p-4">
          <label className="block text-sm font-medium text-gray-700 mb-2">
            Per-Language Voice Settings
          </label>
          <p className="text-xs text-gray-500 mb-3">
            Optionally choose different voices for specific languages.
          </p>

          {/* Language Tabs */}
          <div className="flex flex-wrap gap-1 mb-4 border-b border-gray-200">
            {displayLanguages.map((lang) => (
              <button
                key={lang}
                type="button"
                onClick={() => setActiveLanguage(lang)}
                className={`
                  px-3 py-2 text-sm font-medium rounded-t-md transition-colors
                  ${activeLanguage === lang
                    ? 'bg-white text-blue-600 border-b-2 border-blue-600 -mb-px'
                    : 'text-gray-500 hover:text-gray-700 hover:bg-gray-100'
                  }
                `}
              >
                {languages.languages[lang] || lang.toUpperCase()}
                {preferences.voices_per_language[lang] && (
                  <span className="ml-1 text-xs text-blue-500">*</span>
                )}
              </button>
            ))}
          </div>

          {/* Voice Selection for Active Language */}
          <div className="flex items-center gap-3">
            <select
              value={getVoiceForLanguage(activeLanguage)}
              onChange={(e) => handleLanguageVoiceChange(activeLanguage, e.target.value)}
              disabled={disabled}
              className="flex-1 rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 text-sm"
            >
              {voices.voices.map((voice) => (
                <option key={voice} value={voice}>
                  {voice}
                  {voice === preferences.default_voice ? ' (default)' : ''}
                </option>
              ))}
            </select>
            <VoicePreviewButton
              voiceName={getVoiceForLanguage(activeLanguage)}
              language={activeLanguage}
              disabled={disabled}
            />
          </div>

          {/* Sample Text Display */}
          <div className="mt-3 p-3 bg-white rounded border border-gray-200">
            <span className="text-xs text-gray-500 block mb-1">Preview text:</span>
            <span className="text-sm text-gray-700 italic">
              "{languages.preview_samples[activeLanguage] || languages.preview_samples['en']}"
            </span>
          </div>
        </div>
      )}
    </div>
  );
}

View file

@ -1,10 +1,11 @@
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query';
import { apiClient } from '../lib/api';
import type {
Job,
import type {
Job,
JobCreateRequest,
VttUpdateRequest,
BulkDeleteRequest
BulkDeleteRequest,
TTSPreferences
} from '../types/api';
// Query hooks
@ -88,8 +89,8 @@ export function useApproveEnglish() {
const queryClient = useQueryClient();
return useMutation({
mutationFn: ({ id, notes }: { id: string; notes?: string }) =>
apiClient.approveEnglish(id, notes),
mutationFn: ({ id, notes, tts_preferences }: { id: string; notes?: string; tts_preferences?: TTSPreferences }) =>
apiClient.approveSource(id, notes, tts_preferences),
onSuccess: (_, { id }) => {
queryClient.invalidateQueries({ queryKey: ['jobs', id] });
queryClient.invalidateQueries({ queryKey: ['jobs'] });
@ -101,8 +102,8 @@ export function useApproveSource() {
const queryClient = useQueryClient();
return useMutation({
mutationFn: ({ id, notes }: { id: string; notes?: string }) =>
apiClient.approveSource(id, notes),
mutationFn: ({ id, notes, tts_preferences }: { id: string; notes?: string; tts_preferences?: TTSPreferences }) =>
apiClient.approveSource(id, notes, tts_preferences),
onSuccess: (_, { id }) => {
queryClient.invalidateQueries({ queryKey: ['jobs', id] });
queryClient.invalidateQueries({ queryKey: ['jobs'] });

View file

@ -21,6 +21,9 @@ import type {
UpdateUserRequest,
ResetPasswordResponse,
AdminStatsResponse,
VoicesResponse,
LanguagesResponse,
TTSPreferences,
} from '../types/api';
const API_BASE_URL = import.meta.env.VITE_API_BASE_URL || 'http://localhost:8000';
@ -175,8 +178,11 @@ class ApiClient {
return this.approveSource(id, notes);
}
async approveSource(id: string, notes?: string): Promise<Job> {
const response = await this.client.post(`/jobs/${id}/actions/approve_source`, { notes });
/**
 * Approve the source-language review of a job.
 *
 * Posts to `/jobs/{id}/actions/approve_source` with the reviewer notes and an
 * optional TTS preferences override, and resolves with the response body.
 */
async approveSource(id: string, notes?: string, tts_preferences?: TTSPreferences): Promise<Job> {
  const payload = { notes, tts_preferences };
  const { data } = await this.client.post(`/jobs/${id}/actions/approve_source`, payload);
  return data;
}
}
@ -287,6 +293,26 @@ class ApiClient {
const response = await this.client.get('/admin/stats');
return response.data;
}
// TTS endpoints
/** Fetch the voice list from `GET /tts/voices` and resolve with the response body. */
async getVoices(): Promise<VoicesResponse> {
  const { data } = await this.client.get('/tts/voices');
  return data;
}
/** Fetch the language list from `GET /tts/languages` and resolve with the response body. */
async getLanguages(): Promise<LanguagesResponse> {
  const { data } = await this.client.get('/tts/languages');
  return data;
}
/**
 * Request a preview clip for a voice/language pair from `POST /tts/preview`.
 *
 * The request sets `responseType: 'blob'`, so the response body is returned
 * as a Blob instead of being parsed.
 */
async previewVoice(voiceName: string, language: string): Promise<Blob> {
  const body = { voice_name: voiceName, language };
  const { data } = await this.client.post('/tts/preview', body, { responseType: 'blob' });
  return data;
}
}
export const apiClient = new ApiClient();

View file

@ -4,7 +4,9 @@ import { useJob, useApproveEnglish, useRejectJob, useJobVttContent, useUpdateJob
import { StatusBadge } from '../../components/StatusBadge';
import { VttEditor } from '../../components/VttEditor/VttEditor';
import { VideoWithCaptions } from '../../components/VideoWithCaptions';
import { VoiceSelector } from '../../components/VoiceSelector';
import { useToastContext } from '../../contexts/ToastContext';
import type { TTSPreferences } from '../../types/api';
export function QCDetail() {
const { id } = useParams<{ id: string }>();
@ -30,6 +32,12 @@ export function QCDetail() {
const [timingOffset, setTimingOffset] = useState(0);
const [adjustCaptions, setAdjustCaptions] = useState(true);
const [adjustAudioDescription, setAdjustAudioDescription] = useState(true);
const [showVoiceSettings, setShowVoiceSettings] = useState(false);
const [ttsPreferences, setTtsPreferences] = useState<TTSPreferences>({
provider: 'gemini',
default_voice: 'Kore',
voices_per_language: {}
});
const isProcessing = approveEnglishMutation.isPending || rejectJobMutation.isPending || updateVttMutation.isPending || adjustTimingMutation.isPending;
@ -47,6 +55,13 @@ export function QCDetail() {
}
}, [vttContent]);
// Seed the voice settings from the job's stored TTS preferences once per job.
// The effect is keyed on the route id and on whether the job has loaded, NOT on
// the `job` object itself: any refetch that produces a new `job` object (for
// example after the 'jobs' queries are invalidated) would otherwise re-run this
// effect and silently overwrite voice overrides the reviewer has selected but
// not yet submitted.
useEffect(() => {
  const savedPreferences = job?.requested_outputs?.tts_preferences;
  if (savedPreferences) {
    setTtsPreferences(savedPreferences);
  }
  // eslint-disable-next-line react-hooks/exhaustive-deps -- intentionally not keyed on `job`
}, [id, Boolean(job)]);
// Keyboard shortcuts
useEffect(() => {
const handleKeyPress = (event: KeyboardEvent) => {
@ -131,16 +146,20 @@ export function QCDetail() {
const handleApprove = async () => {
if (!id) return;
// Save any pending changes first
if (hasUnsavedChanges) {
await saveVttChanges();
}
try {
await approveEnglishMutation.mutateAsync({
id,
notes: reviewNotes
// Only pass TTS preferences if MP3 generation is requested
const ttsPrefsToSend = job?.requested_outputs?.audio_description_mp3 ? ttsPreferences : undefined;
await approveEnglishMutation.mutateAsync({
id,
notes: reviewNotes,
tts_preferences: ttsPrefsToSend
});
toast.toastOnly.success('Job approved successfully');
navigate('/admin/qc');
@ -486,6 +505,45 @@ export function QCDetail() {
</div>
)}
{/* Voice Settings - Only show if MP3 generation is requested */}
{job?.requested_outputs?.audio_description_mp3 && (
<div className="mb-6 border border-gray-200 rounded-lg overflow-hidden">
<button
type="button"
onClick={() => setShowVoiceSettings(!showVoiceSettings)}
className="w-full flex items-center justify-between px-4 py-3 bg-gray-50 hover:bg-gray-100 transition-colors"
>
<div className="flex items-center gap-2">
<svg
className={`w-4 h-4 text-gray-500 transition-transform ${showVoiceSettings ? 'rotate-90' : ''}`}
fill="none"
stroke="currentColor"
viewBox="0 0 24 24"
>
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
</svg>
<span className="text-sm font-medium text-gray-700">Voice Settings for Audio Description</span>
{job.requested_outputs.tts_preferences && (
<span className="text-xs text-blue-600">(Configured)</span>
)}
</div>
<span className="text-xs text-gray-500">
{showVoiceSettings ? 'Click to collapse' : 'Click to customize or override TTS voice'}
</span>
</button>
{showVoiceSettings && (
<div className="p-4 border-t border-gray-200">
<VoiceSelector
selectedLanguages={['en', ...(job.requested_outputs.languages || [])]}
preferences={ttsPreferences}
onChange={setTtsPreferences}
disabled={isProcessing}
/>
</div>
)}
</div>
)}
{/* Review Notes */}
<div className="mb-6">
<label className="block text-sm font-medium text-gray-700 mb-2">

View file

@ -4,9 +4,10 @@ import { useForm } from 'react-hook-form';
import { zodResolver } from '@hookform/resolvers/zod';
import { z } from 'zod';
import { UploadDropzone } from '../../components/UploadDropzone/UploadDropzone';
import { VoiceSelector } from '../../components/VoiceSelector';
import { useCreateJob } from '../../hooks/useJob';
import { useToastContext } from '../../contexts/ToastContext';
import type { JobCreateRequest } from '../../types/api';
import type { JobCreateRequest, TTSPreferences } from '../../types/api';
const jobSchema = z.object({
title: z.string().min(1, 'Title is required'),
@ -25,6 +26,12 @@ export function NewJob() {
const [selectedFile, setSelectedFile] = useState<File | null>(null);
const [uploadProgress, setUploadProgress] = useState(0);
const [createdJob, setCreatedJob] = useState<string | null>(null);
const [showVoiceSettings, setShowVoiceSettings] = useState(false);
const [ttsPreferences, setTtsPreferences] = useState<TTSPreferences>({
provider: 'gemini',
default_voice: 'Kore',
voices_per_language: {}
});
const navigate = useNavigate();
const toast = useToastContext();
const createJobMutation = useCreateJob();
@ -51,6 +58,7 @@ export function NewJob() {
const languages = watch('languages');
const transcreation = watch('transcreation');
const sourceIsEnglish = watch('sourceIsEnglish');
const audioDescriptionMp3 = watch('audio_description_mp3');
const onSubmit = async (data: JobFormData) => {
if (!selectedFile) {
@ -68,6 +76,7 @@ export function NewJob() {
audio_description_mp3: data.audio_description_mp3,
languages: data.languages,
transcreation: data.transcreation,
tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
}
};
@ -325,6 +334,42 @@ export function NewJob() {
</div>
</div>
{/* Voice Settings - Collapsible */}
{audioDescriptionMp3 && (
<div className="border border-gray-200 rounded-lg overflow-hidden">
<button
type="button"
onClick={() => setShowVoiceSettings(!showVoiceSettings)}
className="w-full flex items-center justify-between px-4 py-3 bg-gray-50 hover:bg-gray-100 transition-colors"
>
<div className="flex items-center gap-2">
<svg
className={`w-4 h-4 text-gray-500 transition-transform ${showVoiceSettings ? 'rotate-90' : ''}`}
fill="none"
stroke="currentColor"
viewBox="0 0 24 24"
>
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
</svg>
<span className="text-sm font-medium text-gray-700">Voice Settings</span>
</div>
<span className="text-xs text-gray-500">
{showVoiceSettings ? 'Click to collapse' : 'Click to customize TTS voice'}
</span>
</button>
{showVoiceSettings && (
<div className="p-4 border-t border-gray-200">
<VoiceSelector
selectedLanguages={['en', ...languages]}
preferences={ttsPreferences}
onChange={setTtsPreferences}
disabled={createJobMutation.isPending}
/>
</div>
)}
</div>
)}
{/* Target Languages */}
<div>
<label className="block text-sm font-medium text-gray-700 mb-2">

View file

@ -37,12 +37,31 @@ export interface Source {
detected_language?: string; // AI-detected language from Gemini
}
/** Identifier of a text-to-speech provider; the forms in this change default to "gemini". */
export type TTSProvider = "gemini" | "google" | "elevenlabs";
/** Voice selection for audio-description speech, sent with job creation and QC approval. */
export interface TTSPreferences {
// Which TTS provider to use.
provider: TTSProvider;
// Voice name marked as "(default)" in the voice selector.
default_voice: string;
// Per-language voice overrides, keyed by language code with a voice name as value.
voices_per_language: Record<string, string>;
}
/** Outputs requested for a job. */
export interface RequestedOutputs {
captions_vtt: boolean;
audio_description_vtt: boolean;
// When true, the job form and the QC page expose voice settings and send `tts_preferences`.
audio_description_mp3: boolean;
// Language codes; the voice selector is given these in addition to 'en'.
languages: string[];
transcreation: string[];
// Optional voice selection; the forms in this change only send it when `audio_description_mp3` is set.
tts_preferences?: TTSPreferences;
}
/** Response body of `GET /tts/voices`. */
export interface VoicesResponse {
// Voice names; each one is rendered as an option in the voice selector.
voices: string[];
// NOTE(review): presumably the server-side default voice name — confirm against the TTS endpoint.
default: string;
}
/** Response body of `GET /tts/languages`. */
export interface LanguagesResponse {
// Display label per language code; the UI falls back to the upper-cased code when an entry is missing.
languages: Record<string, string>;
// Preview text per language code; the UI falls back to the 'en' sample when an entry is missing.
preview_samples: Record<string, string>;
}
export interface LangOutput {