upgrade TTS to Gemini TTS with voice selection and preview
- Add Gemini TTS service with 30 voices and 24 languages - Add TTS API endpoints for voice listing and preview - Add per-language voice selection in job creation form - Add voice override at QC approval stage - Add VoiceSelector and VoicePreviewButton components - Update TTSPreferences model with provider and voice mapping 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
46b6f25fd0
commit
29643f6683
16 changed files with 1075 additions and 55 deletions
|
|
@ -369,15 +369,23 @@ async def approve_source(
|
|||
source_language = job_doc["source"].get("language", "en")
|
||||
new_status = JobStatus.APPROVED_ENGLISH if source_language == "en" else JobStatus.APPROVED_SOURCE
|
||||
|
||||
# Build update operations
|
||||
update_set = {
|
||||
"status": new_status.value,
|
||||
"review.notes": request.notes or "",
|
||||
"review.reviewer_id": str(current_user.id),
|
||||
"updated_at": datetime.utcnow()
|
||||
}
|
||||
|
||||
# If TTS preferences override provided, update requested_outputs.tts_preferences
|
||||
if request.tts_preferences:
|
||||
update_set["requested_outputs.tts_preferences"] = request.tts_preferences.model_dump()
|
||||
logger.info(f"Updating TTS preferences for job {job_id}: {request.tts_preferences}")
|
||||
|
||||
result = await db.jobs.find_one_and_update(
|
||||
{"_id": job_id, "status": JobStatus.PENDING_QC.value},
|
||||
{
|
||||
"$set": {
|
||||
"status": new_status.value,
|
||||
"review.notes": request.notes or "",
|
||||
"review.reviewer_id": str(current_user.id),
|
||||
"updated_at": datetime.utcnow()
|
||||
},
|
||||
"$set": update_set,
|
||||
"$push": {
|
||||
"review.history": {
|
||||
"at": datetime.utcnow(),
|
||||
|
|
|
|||
104
backend/app/api/v1/routes_tts.py
Normal file
104
backend/app/api/v1/routes_tts.py
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from fastapi.responses import Response
|
||||
from pydantic import BaseModel
|
||||
|
||||
from ...core.config import settings
|
||||
from ...core.logging import get_logger
|
||||
from ...services.gemini_tts import gemini_tts_service
|
||||
from ..deps import get_current_user
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/tts", tags=["tts"])
|
||||
|
||||
|
||||
class VoicePreviewRequest(BaseModel):
    """Request body for the voice preview endpoint."""

    # Voice to audition; the preview endpoint rejects names that are not in
    # settings.gemini_tts_voices.
    voice_name: str
    # Language code used to pick the sample sentence; English when omitted.
    language: str = "en"
|
||||
|
||||
|
||||
class VoicesResponse(BaseModel):
    """Response body listing the selectable TTS voices."""

    # Every voice name a client may choose from.
    voices: list[str]
    # Voice name reported as the default choice.
    default: str
|
||||
|
||||
|
||||
class LanguagesResponse(BaseModel):
    """Response body describing the languages TTS can be previewed in."""

    # Language code -> human-readable display name.
    languages: dict[str, str]
    # Language code -> sample sentence spoken by the preview.
    preview_samples: dict[str, str]
|
||||
|
||||
|
||||
@router.get("/voices", response_model=VoicesResponse)
async def list_voices(
    current_user=Depends(get_current_user)
) -> VoicesResponse:
    """
    Return the configured Gemini TTS voice names and the default voice.

    The ``get_current_user`` dependency must resolve for the request to be
    served; the resolved user is not otherwise used.
    """
    available_voices = settings.gemini_tts_voices
    default_voice = settings.gemini_tts_default_voice
    return VoicesResponse(voices=available_voices, default=default_voice)
|
||||
|
||||
|
||||
@router.get("/languages", response_model=LanguagesResponse)
async def list_languages(
    current_user=Depends(get_current_user)
) -> LanguagesResponse:
    """
    Return the supported TTS languages: display names and preview sentences.

    Both mappings are read straight from the application settings and are
    keyed by language code.
    """
    display_names = settings.gemini_tts_language_names
    sample_sentences = settings.gemini_tts_preview_samples
    return LanguagesResponse(languages=display_names, preview_samples=sample_sentences)
|
||||
|
||||
|
||||
@router.post("/preview")
async def preview_voice(
    request: VoicePreviewRequest,
    current_user=Depends(get_current_user)
) -> Response:
    """
    Generate a short audio sample for one voice in one language.

    Args:
        request: Voice name and language code to preview.
        current_user: Resolved by ``get_current_user``; only used to require
            an authenticated caller.

    Returns:
        A ``Response`` carrying MP3 bytes (``audio/mpeg``).

    Raises:
        HTTPException: 400 when the voice or language is not configured,
            500 when synthesis fails.
    """
    # Validate voice name
    if request.voice_name not in settings.gemini_tts_voices:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid voice name. Available voices: {', '.join(settings.gemini_tts_voices)}"
        )

    # Validate language
    if request.language not in settings.gemini_tts_languages:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language. Available languages: {', '.join(settings.gemini_tts_languages)}"
        )

    logger.info(f"Generating voice preview: voice={request.voice_name}, language={request.language}")

    # Only the synthesis call is guarded, so the handler below cannot mask
    # unrelated errors.
    try:
        audio_data = await gemini_tts_service.synthesize_preview(
            voice_name=request.voice_name,
            language=request.language
        )
    except Exception as e:
        # The full error goes to the server log; the client gets a generic
        # message so upstream/internal details are not exposed in the response.
        logger.error(f"Voice preview generation failed: {e}")
        raise HTTPException(
            status_code=500,
            detail="Failed to generate voice preview"
        ) from e

    # voice_name and language were validated against the configured values
    # above, so they are safe to embed in the header.
    return Response(
        content=audio_data,
        media_type="audio/mpeg",
        headers={
            "Content-Disposition": f"inline; filename=preview_{request.voice_name}_{request.language}.mp3"
        }
    )
|
||||
|
|
@ -39,20 +39,109 @@ class Settings(BaseSettings):
|
|||
google_tts_credentials: str = ""
|
||||
|
||||
# TTS Voice Configuration
|
||||
tts_provider: str = "google" # "google" or "elevenlabs"
|
||||
tts_provider: str = "gemini" # "gemini", "google", or "elevenlabs"
|
||||
google_tts_voices: dict[str, str] = {
|
||||
"en-US": "en-US-Neural2-D",
|
||||
"es-ES": "es-ES-Neural2-A",
|
||||
"es-ES": "es-ES-Neural2-A",
|
||||
"fr-FR": "fr-FR-Neural2-A",
|
||||
"de-DE": "de-DE-Neural2-B"
|
||||
}
|
||||
elevenlabs_voices: dict[str, str] = {
|
||||
"en-US": "21m00Tcm4TlvDq8ikWAM",
|
||||
"es-ES": "VR6AewLTigWG4xSOukaG",
|
||||
"fr-FR": "TxGEqnHWrfWFTfGW9XjX",
|
||||
"fr-FR": "TxGEqnHWrfWFTfGW9XjX",
|
||||
"de-DE": "pNInz6obpgDQGcFmaJgB"
|
||||
}
|
||||
|
||||
# Gemini TTS Configuration
|
||||
gemini_tts_model: str = "gemini-2.5-flash-preview-tts"
|
||||
gemini_tts_default_voice: str = "Kore"
|
||||
gemini_tts_voices: list[str] = [
|
||||
"Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
|
||||
"Callirrhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
|
||||
"Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
|
||||
"Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
|
||||
"Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat"
|
||||
]
|
||||
gemini_tts_languages: dict[str, str] = {
|
||||
"en": "en-US",
|
||||
"es": "es-US",
|
||||
"fr": "fr-FR",
|
||||
"de": "de-DE",
|
||||
"it": "it-IT",
|
||||
"pt": "pt-BR",
|
||||
"ja": "ja-JP",
|
||||
"ko": "ko-KR",
|
||||
"ar": "ar-EG",
|
||||
"hi": "hi-IN",
|
||||
"id": "id-ID",
|
||||
"nl": "nl-NL",
|
||||
"pl": "pl-PL",
|
||||
"ru": "ru-RU",
|
||||
"th": "th-TH",
|
||||
"tr": "tr-TR",
|
||||
"vi": "vi-VN",
|
||||
"ro": "ro-RO",
|
||||
"uk": "uk-UA",
|
||||
"bn": "bn-BD",
|
||||
"mr": "mr-IN",
|
||||
"ta": "ta-IN",
|
||||
"te": "te-IN",
|
||||
"zh": "zh-CN"
|
||||
}
|
||||
gemini_tts_language_names: dict[str, str] = {
|
||||
"en": "English",
|
||||
"es": "Spanish",
|
||||
"fr": "French",
|
||||
"de": "German",
|
||||
"it": "Italian",
|
||||
"pt": "Portuguese",
|
||||
"ja": "Japanese",
|
||||
"ko": "Korean",
|
||||
"ar": "Arabic",
|
||||
"hi": "Hindi",
|
||||
"id": "Indonesian",
|
||||
"nl": "Dutch",
|
||||
"pl": "Polish",
|
||||
"ru": "Russian",
|
||||
"th": "Thai",
|
||||
"tr": "Turkish",
|
||||
"vi": "Vietnamese",
|
||||
"ro": "Romanian",
|
||||
"uk": "Ukrainian",
|
||||
"bn": "Bengali",
|
||||
"mr": "Marathi",
|
||||
"ta": "Tamil",
|
||||
"te": "Telugu",
|
||||
"zh": "Chinese"
|
||||
}
|
||||
# Sample sentence spoken by the voice preview, keyed by language code.
# The text is handed to the TTS engine as written, so each sentence uses the
# language's correct orthography (including diacritics).
gemini_tts_preview_samples: dict[str, str] = {
    "en": "This is a preview of the audio description voice.",
    "es": "Esta es una vista previa de la voz de audiodescripción.",
    "fr": "Ceci est un aperçu de la voix de l'audiodescription.",
    "de": "Dies ist eine Vorschau der Audiodeskriptionsstimme.",
    "it": "Questa è un'anteprima della voce dell'audiodescrizione.",
    "pt": "Esta é uma prévia da voz da audiodescrição.",
    "ja": "これは音声解説の声のプレビューです。",
    "ko": "이것은 오디오 설명 음성의 미리보기입니다.",
    "ar": "هذه معاينة لصوت الوصف الصوتي.",
    "hi": "यह ऑडियो विवरण आवाज का पूर्वावलोकन है।",
    "id": "Ini adalah pratinjau suara deskripsi audio.",
    "nl": "Dit is een voorbeeld van de audiodescriptiestem.",
    "pl": "To jest podgląd głosu audiodeskrypcji.",
    "ru": "Это предварительный просмотр голоса аудиоописания.",
    "th": "นี่คือตัวอย่างเสียงบรรยายภาพ",
    "tr": "Bu, sesli betimleme sesinin bir önizlemesidir.",
    "vi": "Đây là bản xem trước giọng mô tả âm thanh.",
    "ro": "Aceasta este o previzualizare a vocii descrierii audio.",
    "uk": "Це попередній перегляд голосу аудіоопису.",
    "bn": "এটি অডিও বর্ণনা ভয়েসের একটি প্রিভিউ।",
    "mr": "हे ऑडिओ वर्णन आवाजाचे पूर्वावलोकन आहे.",
    "ta": "இது ஆடியோ விளக்க குரலின் முன்னோட்டம்.",
    "te": "ఇది ఆడియో వివరణ స్వరం యొక్క ప్రివ్యూ.",
    "zh": "这是音频描述语音的预览。"
}
|
||||
|
||||
# Email
|
||||
sendgrid_api_key: str
|
||||
email_from: str
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ from .api.v1.routes_admin import router as admin_router
|
|||
from .api.v1.routes_auth import router as auth_router
|
||||
from .api.v1.routes_files import router as files_router
|
||||
from .api.v1.routes_jobs import router as jobs_router
|
||||
from .api.v1.routes_tts import router as tts_router
|
||||
from .api.v1.routes_websockets import router as websockets_router
|
||||
from .services.websocket import connection_manager
|
||||
from .core.config import settings
|
||||
|
|
@ -241,6 +242,7 @@ async def validation_middleware(request, call_next):
|
|||
app.include_router(auth_router, prefix="/api/v1")
|
||||
app.include_router(files_router, prefix="/api/v1")
|
||||
app.include_router(jobs_router, prefix="/api/v1")
|
||||
app.include_router(tts_router, prefix="/api/v1")
|
||||
app.include_router(admin_router, prefix="/api/v1")
|
||||
app.include_router(websockets_router, prefix="/api/v1")
|
||||
|
||||
|
|
|
|||
|
|
@ -35,12 +35,20 @@ class Source(BaseModel):
|
|||
detected_language: Optional[str] = None # AI-detected language from Gemini
|
||||
|
||||
|
||||
class TTSPreferences(BaseModel):
    """Voice settings applied when generating audio description speech."""

    # TTS backend to use.
    provider: Literal["gemini", "google", "elevenlabs"] = "gemini"
    # Voice used for any language without its own entry below; "Kore" is a
    # Gemini voice name.
    default_voice: str = "Kore"
    # Per-language override, language code -> voice name,
    # e.g. {"en": "Kore", "es": "Aoede"}.
    voices_per_language: dict[str, str] = {}
|
||||
|
||||
|
||||
class RequestedOutputs(BaseModel):
    """Output selection stored on a job as ``requested_outputs``."""

    # Flags for the individual artefacts; all enabled by default.
    captions_vtt: bool = True
    audio_description_vtt: bool = True
    audio_description_mp3: bool = True
    # Language codes -- presumably the target languages for the outputs;
    # confirm against the job creation flow.
    languages: list[str] = []
    # NOTE(review): looks like the languages to transcreate rather than
    # translate literally -- confirm.
    transcreation: list[str] = []
    # Optional voice configuration; None means no preferences were supplied.
    tts_preferences: Optional[TTSPreferences] = None
|
||||
|
||||
|
||||
class LangOutput(BaseModel):
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
from typing import Any, Optional, Union
|
||||
from typing import Any, Literal, Optional, Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from ..models.job import JobStatus, LangOutput, RequestedOutputs, Review
|
||||
from ..models.job import JobStatus, LangOutput, RequestedOutputs, Review, TTSPreferences
|
||||
|
||||
|
||||
class JobResponse(BaseModel):
|
||||
|
|
@ -43,6 +43,7 @@ class ApproveEnglishRequest(BaseModel):
|
|||
class ApproveSourceRequest(BaseModel):
    """Request to approve source language content (works for any language)"""

    # Free-form reviewer notes; optional.
    notes: Optional[str] = None
    # Optional override of the job's TTS voice settings, supplied at approval
    # time; None leaves the existing settings untouched.
    tts_preferences: Optional[TTSPreferences] = None
|
||||
|
||||
|
||||
class RejectJobRequest(BaseModel):
|
||||
|
|
|
|||
256
backend/app/services/gemini_tts.py
Normal file
256
backend/app/services/gemini_tts.py
Normal file
|
|
@ -0,0 +1,256 @@
|
|||
import io
|
||||
import wave
|
||||
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
from pydub import AudioSegment
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class GeminiTTSService:
|
||||
"""Text-to-Speech service using Gemini TTS API"""
|
||||
|
||||
def __init__(self):
|
||||
self.client = genai.Client(api_key=settings.gemini_api_key)
|
||||
self.model = settings.gemini_tts_model
|
||||
self.default_voice = settings.gemini_tts_default_voice
|
||||
logger.info(f"Gemini TTS service initialized with model: {self.model}")
|
||||
|
||||
async def synthesize_text(
|
||||
self,
|
||||
text: str,
|
||||
voice_name: str,
|
||||
language: str = "en"
|
||||
) -> bytes:
|
||||
"""
|
||||
Synthesize text to audio using Gemini TTS.
|
||||
Returns MP3 audio bytes.
|
||||
"""
|
||||
if not text.strip():
|
||||
raise ValueError("Text cannot be empty")
|
||||
|
||||
# Validate voice
|
||||
if voice_name not in settings.gemini_tts_voices:
|
||||
logger.warning(f"Unknown voice '{voice_name}', using default '{self.default_voice}'")
|
||||
voice_name = self.default_voice
|
||||
|
||||
try:
|
||||
# Generate audio using Gemini TTS
|
||||
response = self.client.models.generate_content(
|
||||
model=self.model,
|
||||
contents=text,
|
||||
config=types.GenerateContentConfig(
|
||||
response_modalities=["AUDIO"],
|
||||
speech_config=types.SpeechConfig(
|
||||
voice_config=types.VoiceConfig(
|
||||
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
||||
voice_name=voice_name,
|
||||
)
|
||||
)
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# Extract PCM audio data from response
|
||||
if not response.candidates or not response.candidates[0].content.parts:
|
||||
raise ValueError("No audio data in Gemini TTS response")
|
||||
|
||||
pcm_data = response.candidates[0].content.parts[0].inline_data.data
|
||||
|
||||
# Convert PCM to MP3
|
||||
mp3_data = self._pcm_to_mp3(pcm_data)
|
||||
|
||||
return mp3_data
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Gemini TTS synthesis failed: {e}")
|
||||
raise
|
||||
|
||||
async def synthesize_preview(
|
||||
self,
|
||||
voice_name: str,
|
||||
language: str = "en"
|
||||
) -> bytes:
|
||||
"""
|
||||
Generate a preview audio sample for voice selection.
|
||||
Uses language-specific sample text.
|
||||
"""
|
||||
# Get preview sample text for the language
|
||||
sample_text = settings.gemini_tts_preview_samples.get(
|
||||
language,
|
||||
settings.gemini_tts_preview_samples.get("en", "This is a voice preview.")
|
||||
)
|
||||
|
||||
return await self.synthesize_text(sample_text, voice_name, language)
|
||||
|
||||
async def synthesize_audio_description(
|
||||
self,
|
||||
ad_vtt_content: str,
|
||||
language: str = "en",
|
||||
voice_name: str | None = None
|
||||
) -> bytes:
|
||||
"""
|
||||
Synthesize full audio description from VTT content.
|
||||
Maintains timing alignment with original VTT cues.
|
||||
"""
|
||||
if voice_name is None:
|
||||
voice_name = self.default_voice
|
||||
|
||||
# Validate voice
|
||||
if voice_name not in settings.gemini_tts_voices:
|
||||
logger.warning(f"Unknown voice '{voice_name}', using default '{self.default_voice}'")
|
||||
voice_name = self.default_voice
|
||||
|
||||
# Parse VTT cues
|
||||
cues = self._parse_ad_cues(ad_vtt_content)
|
||||
|
||||
if not cues:
|
||||
raise ValueError("No audio description cues found in VTT content")
|
||||
|
||||
logger.info(f"Synthesizing {len(cues)} audio description cues with voice '{voice_name}'")
|
||||
|
||||
# Synthesize each cue with precise timing anchoring
|
||||
audio_segments = []
|
||||
current_audio_position = 0.0
|
||||
|
||||
for i, cue in enumerate(cues):
|
||||
target_start_time = cue["start_time"]
|
||||
|
||||
# Add silence to reach the exact VTT start time
|
||||
if target_start_time > current_audio_position:
|
||||
silence_duration = target_start_time - current_audio_position
|
||||
silence = AudioSegment.silent(duration=int(silence_duration * 1000))
|
||||
audio_segments.append(silence)
|
||||
current_audio_position = target_start_time
|
||||
|
||||
# Synthesize this cue's text
|
||||
text = cue["text"].strip()
|
||||
if text:
|
||||
# Ensure proper punctuation for natural TTS flow
|
||||
if not text.endswith(('.', '!', '?')):
|
||||
text += "."
|
||||
|
||||
try:
|
||||
audio_data = await self.synthesize_text(text, voice_name, language)
|
||||
|
||||
# Convert to AudioSegment and get actual duration
|
||||
audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
|
||||
audio_segments.append(audio_segment)
|
||||
|
||||
# Update position based on actual audio duration
|
||||
actual_audio_duration = len(audio_segment) / 1000.0
|
||||
current_audio_position += actual_audio_duration
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to synthesize cue {i}: {e}")
|
||||
# Add silence for failed cue
|
||||
cue_duration = cue["end_time"] - cue["start_time"]
|
||||
silence = AudioSegment.silent(duration=int(cue_duration * 1000))
|
||||
audio_segments.append(silence)
|
||||
current_audio_position += cue_duration
|
||||
|
||||
# Combine all segments
|
||||
if audio_segments:
|
||||
final_audio = sum(audio_segments, AudioSegment.empty())
|
||||
else:
|
||||
final_audio = AudioSegment.silent(duration=1000)
|
||||
|
||||
# Export to MP3
|
||||
output_buffer = io.BytesIO()
|
||||
final_audio.export(output_buffer, format="mp3", bitrate="128k")
|
||||
|
||||
logger.info(f"Audio description synthesized: {len(output_buffer.getvalue())} bytes")
|
||||
return output_buffer.getvalue()
|
||||
|
||||
def _pcm_to_mp3(self, pcm_data: bytes) -> bytes:
|
||||
"""
|
||||
Convert raw PCM audio (24kHz, 16-bit, mono) to MP3.
|
||||
Gemini TTS outputs PCM at 24000 Hz sample rate.
|
||||
"""
|
||||
# Create WAV from PCM data
|
||||
wav_buffer = io.BytesIO()
|
||||
with wave.open(wav_buffer, "wb") as wf:
|
||||
wf.setnchannels(1) # Mono
|
||||
wf.setsampwidth(2) # 16-bit (2 bytes)
|
||||
wf.setframerate(24000) # 24kHz
|
||||
wf.writeframes(pcm_data)
|
||||
|
||||
# Convert WAV to MP3 using pydub
|
||||
wav_buffer.seek(0)
|
||||
audio_segment = AudioSegment.from_wav(wav_buffer)
|
||||
|
||||
# Export as MP3
|
||||
mp3_buffer = io.BytesIO()
|
||||
audio_segment.export(mp3_buffer, format="mp3", bitrate="128k")
|
||||
|
||||
return mp3_buffer.getvalue()
|
||||
|
||||
def _parse_ad_cues(self, vtt_content: str) -> list[dict]:
|
||||
"""Parse audio description VTT and extract timing + text"""
|
||||
lines = vtt_content.strip().split('\n')
|
||||
cues = []
|
||||
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
line = lines[i].strip()
|
||||
|
||||
# Skip header and empty lines
|
||||
if line == "WEBVTT" or line == "" or line.startswith("NOTE"):
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Check for timing line
|
||||
if " --> " in line:
|
||||
timing_parts = line.split(" --> ")
|
||||
start_time = self._parse_timestamp(timing_parts[0].strip())
|
||||
end_time = self._parse_timestamp(timing_parts[1].strip())
|
||||
|
||||
# Get text from next line(s)
|
||||
i += 1
|
||||
text_lines = []
|
||||
while i < len(lines) and lines[i].strip() != "":
|
||||
text_lines.append(lines[i].strip())
|
||||
i += 1
|
||||
|
||||
if text_lines:
|
||||
cues.append({
|
||||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
"text": " ".join(text_lines)
|
||||
})
|
||||
else:
|
||||
i += 1
|
||||
|
||||
return cues
|
||||
|
||||
def _parse_timestamp(self, timestamp: str) -> float:
|
||||
"""Convert VTT timestamp to seconds"""
|
||||
parts = timestamp.split(":")
|
||||
|
||||
if len(parts) == 3: # HH:MM:SS.mmm
|
||||
hours, minutes, seconds = parts
|
||||
elif len(parts) == 2: # MM:SS.mmm
|
||||
hours, minutes, seconds = "0", parts[0], parts[1]
|
||||
else:
|
||||
raise ValueError(f"Invalid timestamp format: {timestamp}")
|
||||
|
||||
sec_parts = seconds.split(".")
|
||||
seconds_val = int(sec_parts[0])
|
||||
milliseconds = int(sec_parts[1]) if len(sec_parts) > 1 else 0
|
||||
|
||||
total_seconds = (
|
||||
int(hours) * 3600 +
|
||||
int(minutes) * 60 +
|
||||
seconds_val +
|
||||
milliseconds / 1000.0
|
||||
)
|
||||
|
||||
return total_seconds
|
||||
|
||||
|
||||
# Global service instance
|
||||
gemini_tts_service = GeminiTTSService()
|
||||
|
|
@ -7,47 +7,74 @@ from pydub import AudioSegment
|
|||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
from .gemini_tts import gemini_tts_service
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class TTSService:
|
||||
def __init__(self):
|
||||
# Initialize Google TTS (uses GOOGLE_APPLICATION_CREDENTIALS env var)
|
||||
# The same GCP credentials used for GCS also work for TTS
|
||||
# Check Gemini TTS availability (uses same API key as other Gemini services)
|
||||
self.gemini_available = bool(settings.gemini_api_key)
|
||||
|
||||
# Initialize Google Cloud TTS (uses GOOGLE_APPLICATION_CREDENTIALS env var)
|
||||
try:
|
||||
self.google_client = texttospeech.TextToSpeechClient()
|
||||
logger.info("Google TTS client initialized successfully")
|
||||
logger.info("Google Cloud TTS client initialized successfully")
|
||||
except Exception as e:
|
||||
logger.warning(f"Google TTS credentials not configured: {e}")
|
||||
logger.warning(f"Google Cloud TTS credentials not configured: {e}")
|
||||
self.google_client = None
|
||||
|
||||
# Check ElevenLabs availability
|
||||
self.elevenlabs_available = bool(settings.elevenlabs_api_key)
|
||||
|
||||
# Log configured provider
|
||||
logger.info(f"TTS provider configured: {settings.tts_provider}")
|
||||
|
||||
async def synthesize_audio_description(
|
||||
self,
|
||||
ad_vtt_content: str,
|
||||
language_code: str = "en-US",
|
||||
voice_name: Optional[str] = None
|
||||
voice_name: Optional[str] = None,
|
||||
provider: Optional[str] = None
|
||||
) -> bytes:
|
||||
"""
|
||||
Generate MP3 audio from audio description VTT content
|
||||
Synthesizes each cue separately and stitches them together with timing
|
||||
Uses Google TTS with ElevenLabs fallback
|
||||
Generate MP3 audio from audio description VTT content.
|
||||
Synthesizes each cue separately and stitches them together with timing.
|
||||
|
||||
Provider priority: specified provider > settings.tts_provider > fallback chain
|
||||
Fallback chain: Gemini -> Google Cloud TTS -> ElevenLabs
|
||||
"""
|
||||
# Try Google TTS first, fallback to ElevenLabs
|
||||
try:
|
||||
if self.google_client:
|
||||
return await self._synthesize_with_google(ad_vtt_content, language_code, voice_name)
|
||||
elif self.elevenlabs_available:
|
||||
return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name)
|
||||
else:
|
||||
raise ValueError("No TTS service configured")
|
||||
except Exception as e:
|
||||
if self.elevenlabs_available and self.google_client:
|
||||
logger.warning(f"Google TTS failed, trying ElevenLabs: {e}")
|
||||
return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name)
|
||||
raise
|
||||
# Determine which provider to use
|
||||
active_provider = provider or settings.tts_provider
|
||||
|
||||
# Extract simple language code for Gemini (e.g., "en-US" -> "en")
|
||||
simple_lang = language_code.split("-")[0] if "-" in language_code else language_code
|
||||
|
||||
# Try the configured provider first, then fallback
|
||||
if active_provider == "gemini" and self.gemini_available:
|
||||
try:
|
||||
logger.info(f"Using Gemini TTS for language: {simple_lang}, voice: {voice_name}")
|
||||
return await gemini_tts_service.synthesize_audio_description(
|
||||
ad_vtt_content, simple_lang, voice_name
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Gemini TTS failed, falling back: {e}")
|
||||
# Fall through to Google/ElevenLabs
|
||||
|
||||
if active_provider == "google" or (active_provider == "gemini" and self.google_client):
|
||||
try:
|
||||
if self.google_client:
|
||||
logger.info(f"Using Google Cloud TTS for language: {language_code}")
|
||||
return await self._synthesize_with_google(ad_vtt_content, language_code, voice_name)
|
||||
except Exception as e:
|
||||
logger.warning(f"Google Cloud TTS failed: {e}")
|
||||
|
||||
if self.elevenlabs_available:
|
||||
logger.info(f"Using ElevenLabs TTS for language: {language_code}")
|
||||
return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name)
|
||||
|
||||
raise ValueError("No TTS service available")
|
||||
|
||||
async def _synthesize_with_google(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -269,7 +269,9 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
|
||||
# Generate TTS for languages that need MP3
|
||||
if job_doc["requested_outputs"]["audio_description_mp3"]:
|
||||
await _generate_tts_for_languages(job_id, updated_outputs, db, source_language)
|
||||
# Get TTS preferences from job
|
||||
tts_preferences = job_doc["requested_outputs"].get("tts_preferences", {})
|
||||
await _generate_tts_for_languages(job_id, updated_outputs, db, source_language, tts_preferences)
|
||||
|
||||
# Update final status
|
||||
await db.jobs.update_one(
|
||||
|
|
@ -323,33 +325,53 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
client.close()
|
||||
|
||||
|
||||
async def _generate_tts_for_languages(job_id: str, outputs: dict[str, Any], db, source_language: str = "en"):
|
||||
async def _generate_tts_for_languages(
|
||||
job_id: str,
|
||||
outputs: dict[str, Any],
|
||||
db,
|
||||
source_language: str = "en",
|
||||
tts_preferences: dict = None
|
||||
):
|
||||
"""Generate TTS audio for each language's audio description"""
|
||||
if tts_preferences is None:
|
||||
tts_preferences = {}
|
||||
|
||||
# Always generate source language MP3 first
|
||||
if source_language in outputs and "ad_vtt_gcs" in outputs[source_language]:
|
||||
await _generate_language_tts(job_id, source_language, outputs[source_language], db)
|
||||
await _generate_language_tts(job_id, source_language, outputs[source_language], db, tts_preferences)
|
||||
|
||||
# Generate for other languages
|
||||
for language, lang_output in outputs.items():
|
||||
if language != source_language and "ad_vtt_gcs" in lang_output:
|
||||
await _generate_language_tts(job_id, language, lang_output, db)
|
||||
await _generate_language_tts(job_id, language, lang_output, db, tts_preferences)
|
||||
|
||||
|
||||
async def _generate_language_tts(job_id: str, language: str, lang_output: dict, db):
|
||||
async def _generate_language_tts(job_id: str, language: str, lang_output: dict, db, tts_preferences: dict = None):
|
||||
"""Generate TTS for a specific language"""
|
||||
if tts_preferences is None:
|
||||
tts_preferences = {}
|
||||
|
||||
try:
|
||||
# Download AD VTT content
|
||||
ad_blob_path = lang_output["ad_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
|
||||
ad_blob = gcs_service.bucket.blob(ad_blob_path)
|
||||
ad_vtt_content = ad_blob.download_as_text()
|
||||
|
||||
# Get voice for this language from preferences
|
||||
voices_per_language = tts_preferences.get("voices_per_language", {})
|
||||
voice_name = voices_per_language.get(language, tts_preferences.get("default_voice"))
|
||||
provider = tts_preferences.get("provider", "gemini")
|
||||
|
||||
# Generate MP3 with retry
|
||||
language_code = f"{language}-US" if language == "en" else f"{language}-{language.upper()}"
|
||||
|
||||
|
||||
logger.info(f"Generating TTS for {language} with voice={voice_name}, provider={provider}")
|
||||
|
||||
async def synthesize():
|
||||
return await tts_service.synthesize_audio_description(ad_vtt_content, language_code)
|
||||
|
||||
return await tts_service.synthesize_audio_description(
|
||||
ad_vtt_content, language_code, voice_name=voice_name, provider=provider
|
||||
)
|
||||
|
||||
mp3_data = await retry_with_backoff(synthesize, max_retries=3)
|
||||
|
||||
# Upload MP3 to GCS
|
||||
|
|
|
|||
138
frontend/src/components/VoicePreviewButton.tsx
Normal file
138
frontend/src/components/VoicePreviewButton.tsx
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
import { useState, useRef } from 'react';
|
||||
import { api } from '../lib/api';
|
||||
|
||||
interface VoicePreviewButtonProps {
  /** Voice to audition. */
  voiceName: string;
  /** Language code of the sample sentence. */
  language: string;
  /** Disables the button when true. */
  disabled?: boolean;
}

/**
 * Button that fetches a preview clip for a voice/language pair and toggles
 * playback. The most recent clip is cached in refs and reused only while the
 * requested voice and language still match the ones it was generated for.
 */
export function VoicePreviewButton({ voiceName, language, disabled }: VoicePreviewButtonProps) {
  const [isLoading, setIsLoading] = useState(false);
  const [isPlaying, setIsPlaying] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const audioUrlRef = useRef<string | null>(null);
  // Voice/language pair the cached clip belongs to.
  const cachedKeyRef = useRef<string | null>(null);

  const handlePreview = async () => {
    setError(null);

    // If already playing, stop
    if (isPlaying && audioRef.current) {
      audioRef.current.pause();
      audioRef.current.currentTime = 0;
      setIsPlaying(false);
      return;
    }

    const requestKey = `${voiceName}:${language}`;

    // Replay the cached clip only if it was generated for the same voice and
    // language; otherwise a changed selection would keep playing the old voice.
    if (audioUrlRef.current && audioRef.current && cachedKeyRef.current === requestKey) {
      try {
        await audioRef.current.play();
        setIsPlaying(true);
      } catch (err) {
        setError('Failed to play audio');
        console.error('Voice preview error:', err);
      }
      return;
    }

    // Fetch new audio
    setIsLoading(true);
    try {
      const blob = await api.previewVoice(voiceName, language);
      const url = URL.createObjectURL(blob);

      // Clean up old URL if exists
      if (audioUrlRef.current) {
        URL.revokeObjectURL(audioUrlRef.current);
      }

      audioUrlRef.current = url;
      cachedKeyRef.current = requestKey;

      // Create and play audio
      const audio = new Audio(url);
      audioRef.current = audio;

      audio.onended = () => {
        setIsPlaying(false);
      };

      audio.onerror = () => {
        setError('Failed to play audio');
        setIsPlaying(false);
      };

      await audio.play();
      setIsPlaying(true);
    } catch (err) {
      setError('Failed to generate preview');
      console.error('Voice preview error:', err);
    } finally {
      setIsLoading(false);
    }
  };

  // Cleanup on unmount
  // Note: We don't add cleanup in useEffect to allow audio caching within component lifecycle

  return (
    <div className="inline-flex items-center gap-2">
      <button
        type="button"
        onClick={handlePreview}
        disabled={disabled || isLoading}
        className={`
          inline-flex items-center gap-1.5 px-3 py-1.5 text-sm font-medium rounded-md
          transition-colors duration-150
          ${disabled || isLoading
            ? 'bg-gray-100 text-gray-400 cursor-not-allowed'
            : isPlaying
              ? 'bg-red-100 text-red-700 hover:bg-red-200'
              : 'bg-blue-100 text-blue-700 hover:bg-blue-200'
          }
        `}
      >
        {isLoading ? (
          <>
            <svg className="animate-spin h-4 w-4" viewBox="0 0 24 24">
              <circle
                className="opacity-25"
                cx="12"
                cy="12"
                r="10"
                stroke="currentColor"
                strokeWidth="4"
                fill="none"
              />
              <path
                className="opacity-75"
                fill="currentColor"
                d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z"
              />
            </svg>
            Loading...
          </>
        ) : isPlaying ? (
          <>
            <svg className="h-4 w-4" fill="currentColor" viewBox="0 0 20 20">
              <path
                fillRule="evenodd"
                d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zM7 8a1 1 0 011-1h4a1 1 0 110 2H8a1 1 0 01-1-1zm0 4a1 1 0 011-1h4a1 1 0 110 2H8a1 1 0 01-1-1z"
                clipRule="evenodd"
              />
            </svg>
            Stop
          </>
        ) : (
          <>
            <svg className="h-4 w-4" fill="currentColor" viewBox="0 0 20 20">
              <path
                fillRule="evenodd"
                d="M10 18a8 8 0 100-16 8 8 0 000 16zM9.555 7.168A1 1 0 008 8v4a1 1 0 001.555.832l3-2a1 1 0 000-1.664l-3-2z"
                clipRule="evenodd"
              />
            </svg>
            Preview
          </>
        )}
      </button>
      {error && <span className="text-xs text-red-600">{error}</span>}
    </div>
  );
}
|
||||
216
frontend/src/components/VoiceSelector.tsx
Normal file
216
frontend/src/components/VoiceSelector.tsx
Normal file
|
|
@ -0,0 +1,216 @@
|
|||
import { useState, useEffect } from 'react';
|
||||
import { api } from '../lib/api';
|
||||
import { VoicePreviewButton } from './VoicePreviewButton';
|
||||
import type { TTSPreferences, VoicesResponse, LanguagesResponse } from '../types/api';
|
||||
|
||||
/** Props accepted by the VoiceSelector component. */
interface VoiceSelectorProps {
|
||||
// Language codes to offer per-language voice tabs for; the component always shows an 'en' tab as well.
selectedLanguages: string[];
|
||||
// Current voice preferences; the component never mutates this object and reports changes via onChange.
preferences: TTSPreferences;
|
||||
// Called with a new preferences object whenever the user picks a different voice.
onChange: (preferences: TTSPreferences) => void;
|
||||
// When true, the voice dropdowns and preview buttons are disabled.
disabled?: boolean;
|
||||
}
|
||||
|
||||
/**
 * Voice picker for text-to-speech output.
 *
 * Renders a "default voice" dropdown plus, when more than one language is
 * displayed, a tab strip for choosing a per-language override. The component
 * is controlled: it never mutates `preferences` and reports every change
 * through `onChange` with a fresh object.
 *
 * The voice list and language metadata are fetched once on mount via
 * `api.getVoices()` and `api.getLanguages()`.
 *
 * @param selectedLanguages Language codes to show tabs for ('en' is always shown).
 * @param preferences       Current voice preferences.
 * @param onChange          Receives the updated preferences object.
 * @param disabled          Disables the dropdowns and preview buttons.
 */
export function VoiceSelector({
  selectedLanguages,
  preferences,
  onChange,
  disabled
}: VoiceSelectorProps) {
  const [voices, setVoices] = useState<VoicesResponse | null>(null);
  const [languages, setLanguages] = useState<LanguagesResponse | null>(null);
  const [activeLanguage, setActiveLanguage] = useState<string>(selectedLanguages[0] || 'en');
  const [isLoading, setIsLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);

  // Preferences can come straight from a stored job; tolerate a missing
  // override map instead of throwing on the first property lookup.
  // NOTE(review): assumes older jobs may lack `voices_per_language` — confirm against the backend model.
  const voicesPerLanguage: Record<string, string> = preferences.voices_per_language || {};

  // Fetch voices and languages on mount
  useEffect(() => {
    // Flipped by the cleanup so a response that arrives after unmount is ignored.
    let isActive = true;

    const fetchData = async () => {
      try {
        setIsLoading(true);
        const [voicesData, languagesData] = await Promise.all([
          api.getVoices(),
          api.getLanguages()
        ]);
        if (!isActive) return;
        setVoices(voicesData);
        setLanguages(languagesData);
      } catch (err) {
        console.error('Voice selector error:', err);
        if (isActive) {
          setError('Failed to load voice options');
        }
      } finally {
        if (isActive) {
          setIsLoading(false);
        }
      }
    };

    fetchData();

    return () => {
      isActive = false;
    };
  }, []);

  // Keep the active tab pointing at a language that is actually displayed.
  // 'en' always has a tab (see displayLanguages below), so it must count as
  // valid even when the caller did not list it; otherwise selecting the
  // English tab would be reverted immediately.
  useEffect(() => {
    const isDisplayed = activeLanguage === 'en' || selectedLanguages.includes(activeLanguage);
    if (!isDisplayed) {
      setActiveLanguage(selectedLanguages[0] || 'en');
    }
  }, [selectedLanguages, activeLanguage]);

  const handleDefaultVoiceChange = (voice: string) => {
    onChange({
      ...preferences,
      default_voice: voice
    });
  };

  const handleLanguageVoiceChange = (language: string, voice: string) => {
    const newVoicesPerLanguage = {
      ...voicesPerLanguage,
      [language]: voice
    };

    // If voice matches default, remove from per-language overrides
    if (voice === preferences.default_voice) {
      delete newVoicesPerLanguage[language];
    }

    onChange({
      ...preferences,
      voices_per_language: newVoicesPerLanguage
    });
  };

  // Effective voice for a language: its override if present, else the default.
  const getVoiceForLanguage = (language: string): string => {
    return voicesPerLanguage[language] || preferences.default_voice;
  };

  if (isLoading) {
    return (
      <div className="flex items-center gap-2 text-gray-500">
        <svg className="animate-spin h-5 w-5" viewBox="0 0 24 24">
          <circle
            className="opacity-25"
            cx="12"
            cy="12"
            r="10"
            stroke="currentColor"
            strokeWidth="4"
            fill="none"
          />
          <path
            className="opacity-75"
            fill="currentColor"
            d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z"
          />
        </svg>
        Loading voice options...
      </div>
    );
  }

  if (error || !voices || !languages) {
    return (
      <div className="text-red-600 text-sm">
        {error || 'Failed to load voice options'}
      </div>
    );
  }

  // Filter languages to only show selected ones, always include English
  const displayLanguages = selectedLanguages.length > 0
    ? ['en', ...selectedLanguages.filter(l => l !== 'en')]
    : ['en'];

  return (
    <div className="space-y-4">
      {/* Default Voice Selection */}
      <div className="bg-gray-50 rounded-lg p-4">
        <label className="block text-sm font-medium text-gray-700 mb-2">
          Default Voice
        </label>
        <p className="text-xs text-gray-500 mb-3">
          This voice will be used for all languages unless overridden below.
        </p>
        <div className="flex items-center gap-3">
          <select
            value={preferences.default_voice}
            onChange={(e) => handleDefaultVoiceChange(e.target.value)}
            disabled={disabled}
            className="flex-1 rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 text-sm"
          >
            {voices.voices.map((voice) => (
              <option key={voice} value={voice}>
                {voice}
              </option>
            ))}
          </select>
          <VoicePreviewButton
            voiceName={preferences.default_voice}
            language="en"
            disabled={disabled}
          />
        </div>
      </div>

      {/* Per-Language Voice Overrides */}
      {displayLanguages.length > 1 && (
        <div className="bg-gray-50 rounded-lg p-4">
          <label className="block text-sm font-medium text-gray-700 mb-2">
            Per-Language Voice Settings
          </label>
          <p className="text-xs text-gray-500 mb-3">
            Optionally choose different voices for specific languages.
          </p>

          {/* Language Tabs */}
          <div className="flex flex-wrap gap-1 mb-4 border-b border-gray-200">
            {displayLanguages.map((lang) => (
              <button
                key={lang}
                type="button"
                onClick={() => setActiveLanguage(lang)}
                className={`
                  px-3 py-2 text-sm font-medium rounded-t-md transition-colors
                  ${activeLanguage === lang
                    ? 'bg-white text-blue-600 border-b-2 border-blue-600 -mb-px'
                    : 'text-gray-500 hover:text-gray-700 hover:bg-gray-100'
                  }
                `}
              >
                {languages.languages[lang] || lang.toUpperCase()}
                {/* Asterisk marks languages that carry an explicit override. */}
                {voicesPerLanguage[lang] && (
                  <span className="ml-1 text-xs text-blue-500">*</span>
                )}
              </button>
            ))}
          </div>

          {/* Voice Selection for Active Language */}
          <div className="flex items-center gap-3">
            <select
              value={getVoiceForLanguage(activeLanguage)}
              onChange={(e) => handleLanguageVoiceChange(activeLanguage, e.target.value)}
              disabled={disabled}
              className="flex-1 rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 text-sm"
            >
              {voices.voices.map((voice) => (
                <option key={voice} value={voice}>
                  {voice}
                  {voice === preferences.default_voice ? ' (default)' : ''}
                </option>
              ))}
            </select>
            <VoicePreviewButton
              voiceName={getVoiceForLanguage(activeLanguage)}
              language={activeLanguage}
              disabled={disabled}
            />
          </div>

          {/* Sample Text Display */}
          <div className="mt-3 p-3 bg-white rounded border border-gray-200">
            <span className="text-xs text-gray-500 block mb-1">Preview text:</span>
            <span className="text-sm text-gray-700 italic">
              "{languages.preview_samples[activeLanguage] || languages.preview_samples['en']}"
            </span>
          </div>
        </div>
      )}
    </div>
  );
}
|
||||
|
|
@ -1,10 +1,11 @@
|
|||
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query';
|
||||
import { apiClient } from '../lib/api';
|
||||
import type {
|
||||
Job,
|
||||
import type {
|
||||
Job,
|
||||
JobCreateRequest,
|
||||
VttUpdateRequest,
|
||||
BulkDeleteRequest
|
||||
BulkDeleteRequest,
|
||||
TTSPreferences
|
||||
} from '../types/api';
|
||||
|
||||
// Query hooks
|
||||
|
|
@ -88,8 +89,8 @@ export function useApproveEnglish() {
|
|||
const queryClient = useQueryClient();
|
||||
|
||||
return useMutation({
|
||||
mutationFn: ({ id, notes }: { id: string; notes?: string }) =>
|
||||
apiClient.approveEnglish(id, notes),
|
||||
mutationFn: ({ id, notes, tts_preferences }: { id: string; notes?: string; tts_preferences?: TTSPreferences }) =>
|
||||
apiClient.approveSource(id, notes, tts_preferences),
|
||||
onSuccess: (_, { id }) => {
|
||||
queryClient.invalidateQueries({ queryKey: ['jobs', id] });
|
||||
queryClient.invalidateQueries({ queryKey: ['jobs'] });
|
||||
|
|
@ -101,8 +102,8 @@ export function useApproveSource() {
|
|||
const queryClient = useQueryClient();
|
||||
|
||||
return useMutation({
|
||||
mutationFn: ({ id, notes }: { id: string; notes?: string }) =>
|
||||
apiClient.approveSource(id, notes),
|
||||
mutationFn: ({ id, notes, tts_preferences }: { id: string; notes?: string; tts_preferences?: TTSPreferences }) =>
|
||||
apiClient.approveSource(id, notes, tts_preferences),
|
||||
onSuccess: (_, { id }) => {
|
||||
queryClient.invalidateQueries({ queryKey: ['jobs', id] });
|
||||
queryClient.invalidateQueries({ queryKey: ['jobs'] });
|
||||
|
|
|
|||
|
|
@ -21,6 +21,9 @@ import type {
|
|||
UpdateUserRequest,
|
||||
ResetPasswordResponse,
|
||||
AdminStatsResponse,
|
||||
VoicesResponse,
|
||||
LanguagesResponse,
|
||||
TTSPreferences,
|
||||
} from '../types/api';
|
||||
|
||||
const API_BASE_URL = import.meta.env.VITE_API_BASE_URL || 'http://localhost:8000';
|
||||
|
|
@ -175,8 +178,11 @@ class ApiClient {
|
|||
return this.approveSource(id, notes);
|
||||
}
|
||||
|
||||
async approveSource(id: string, notes?: string): Promise<Job> {
|
||||
const response = await this.client.post(`/jobs/${id}/actions/approve_source`, { notes });
|
||||
async approveSource(id: string, notes?: string, tts_preferences?: TTSPreferences): Promise<Job> {
|
||||
const response = await this.client.post(`/jobs/${id}/actions/approve_source`, {
|
||||
notes,
|
||||
tts_preferences
|
||||
});
|
||||
return response.data;
|
||||
}
|
||||
|
||||
|
|
@ -287,6 +293,26 @@ class ApiClient {
|
|||
const response = await this.client.get('/admin/stats');
|
||||
return response.data;
|
||||
}
|
||||
|
||||
// TTS endpoints
|
||||
/** Requests `GET /tts/voices` and resolves with the response body. */
async getVoices(): Promise<VoicesResponse> {
  const { data } = await this.client.get('/tts/voices');
  return data;
}
|
||||
|
||||
/** Requests `GET /tts/languages` and resolves with the response body. */
async getLanguages(): Promise<LanguagesResponse> {
  const { data } = await this.client.get('/tts/languages');
  return data;
}
|
||||
|
||||
/**
 * Posts a preview request to `/tts/preview` and resolves with the response
 * body, which is requested as a Blob.
 *
 * @param voiceName Voice to preview; sent as `voice_name`.
 * @param language  Language code for the preview; sent as `language`.
 */
async previewVoice(voiceName: string, language: string): Promise<Blob> {
  const payload = { voice_name: voiceName, language };
  const requestConfig = { responseType: 'blob' };
  const { data } = await this.client.post('/tts/preview', payload, requestConfig);
  return data;
}
|
||||
}
|
||||
|
||||
export const apiClient = new ApiClient();
|
||||
|
|
|
|||
|
|
@ -4,7 +4,9 @@ import { useJob, useApproveEnglish, useRejectJob, useJobVttContent, useUpdateJob
|
|||
import { StatusBadge } from '../../components/StatusBadge';
|
||||
import { VttEditor } from '../../components/VttEditor/VttEditor';
|
||||
import { VideoWithCaptions } from '../../components/VideoWithCaptions';
|
||||
import { VoiceSelector } from '../../components/VoiceSelector';
|
||||
import { useToastContext } from '../../contexts/ToastContext';
|
||||
import type { TTSPreferences } from '../../types/api';
|
||||
|
||||
export function QCDetail() {
|
||||
const { id } = useParams<{ id: string }>();
|
||||
|
|
@ -30,6 +32,12 @@ export function QCDetail() {
|
|||
const [timingOffset, setTimingOffset] = useState(0);
|
||||
const [adjustCaptions, setAdjustCaptions] = useState(true);
|
||||
const [adjustAudioDescription, setAdjustAudioDescription] = useState(true);
|
||||
const [showVoiceSettings, setShowVoiceSettings] = useState(false);
|
||||
const [ttsPreferences, setTtsPreferences] = useState<TTSPreferences>({
|
||||
provider: 'gemini',
|
||||
default_voice: 'Kore',
|
||||
voices_per_language: {}
|
||||
});
|
||||
|
||||
const isProcessing = approveEnglishMutation.isPending || rejectJobMutation.isPending || updateVttMutation.isPending || adjustTimingMutation.isPending;
|
||||
|
||||
|
|
@ -47,6 +55,13 @@ export function QCDetail() {
|
|||
}
|
||||
}, [vttContent]);
|
||||
|
||||
// Initialize TTS preferences from job when loaded
|
||||
useEffect(() => {
|
||||
if (job?.requested_outputs?.tts_preferences) {
|
||||
setTtsPreferences(job.requested_outputs.tts_preferences);
|
||||
}
|
||||
}, [job]);
|
||||
|
||||
// Keyboard shortcuts
|
||||
useEffect(() => {
|
||||
const handleKeyPress = (event: KeyboardEvent) => {
|
||||
|
|
@ -131,16 +146,20 @@ export function QCDetail() {
|
|||
|
||||
const handleApprove = async () => {
|
||||
if (!id) return;
|
||||
|
||||
|
||||
// Save any pending changes first
|
||||
if (hasUnsavedChanges) {
|
||||
await saveVttChanges();
|
||||
}
|
||||
|
||||
|
||||
try {
|
||||
await approveEnglishMutation.mutateAsync({
|
||||
id,
|
||||
notes: reviewNotes
|
||||
// Only pass TTS preferences if MP3 generation is requested
|
||||
const ttsPrefsToSend = job?.requested_outputs?.audio_description_mp3 ? ttsPreferences : undefined;
|
||||
|
||||
await approveEnglishMutation.mutateAsync({
|
||||
id,
|
||||
notes: reviewNotes,
|
||||
tts_preferences: ttsPrefsToSend
|
||||
});
|
||||
toast.toastOnly.success('Job approved successfully');
|
||||
navigate('/admin/qc');
|
||||
|
|
@ -486,6 +505,45 @@ export function QCDetail() {
|
|||
</div>
|
||||
)}
|
||||
|
||||
{/* Voice Settings - Only show if MP3 generation is requested */}
|
||||
{job?.requested_outputs?.audio_description_mp3 && (
|
||||
<div className="mb-6 border border-gray-200 rounded-lg overflow-hidden">
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => setShowVoiceSettings(!showVoiceSettings)}
|
||||
className="w-full flex items-center justify-between px-4 py-3 bg-gray-50 hover:bg-gray-100 transition-colors"
|
||||
>
|
||||
<div className="flex items-center gap-2">
|
||||
<svg
|
||||
className={`w-4 h-4 text-gray-500 transition-transform ${showVoiceSettings ? 'rotate-90' : ''}`}
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
viewBox="0 0 24 24"
|
||||
>
|
||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
|
||||
</svg>
|
||||
<span className="text-sm font-medium text-gray-700">Voice Settings for Audio Description</span>
|
||||
{job.requested_outputs.tts_preferences && (
|
||||
<span className="text-xs text-blue-600">(Configured)</span>
|
||||
)}
|
||||
</div>
|
||||
<span className="text-xs text-gray-500">
|
||||
{showVoiceSettings ? 'Click to collapse' : 'Click to customize or override TTS voice'}
|
||||
</span>
|
||||
</button>
|
||||
{showVoiceSettings && (
|
||||
<div className="p-4 border-t border-gray-200">
|
||||
<VoiceSelector
|
||||
selectedLanguages={['en', ...(job.requested_outputs.languages || [])]}
|
||||
preferences={ttsPreferences}
|
||||
onChange={setTtsPreferences}
|
||||
disabled={isProcessing}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Review Notes */}
|
||||
<div className="mb-6">
|
||||
<label className="block text-sm font-medium text-gray-700 mb-2">
|
||||
|
|
|
|||
|
|
@ -4,9 +4,10 @@ import { useForm } from 'react-hook-form';
|
|||
import { zodResolver } from '@hookform/resolvers/zod';
|
||||
import { z } from 'zod';
|
||||
import { UploadDropzone } from '../../components/UploadDropzone/UploadDropzone';
|
||||
import { VoiceSelector } from '../../components/VoiceSelector';
|
||||
import { useCreateJob } from '../../hooks/useJob';
|
||||
import { useToastContext } from '../../contexts/ToastContext';
|
||||
import type { JobCreateRequest } from '../../types/api';
|
||||
import type { JobCreateRequest, TTSPreferences } from '../../types/api';
|
||||
|
||||
const jobSchema = z.object({
|
||||
title: z.string().min(1, 'Title is required'),
|
||||
|
|
@ -25,6 +26,12 @@ export function NewJob() {
|
|||
const [selectedFile, setSelectedFile] = useState<File | null>(null);
|
||||
const [uploadProgress, setUploadProgress] = useState(0);
|
||||
const [createdJob, setCreatedJob] = useState<string | null>(null);
|
||||
const [showVoiceSettings, setShowVoiceSettings] = useState(false);
|
||||
const [ttsPreferences, setTtsPreferences] = useState<TTSPreferences>({
|
||||
provider: 'gemini',
|
||||
default_voice: 'Kore',
|
||||
voices_per_language: {}
|
||||
});
|
||||
const navigate = useNavigate();
|
||||
const toast = useToastContext();
|
||||
const createJobMutation = useCreateJob();
|
||||
|
|
@ -51,6 +58,7 @@ export function NewJob() {
|
|||
const languages = watch('languages');
|
||||
const transcreation = watch('transcreation');
|
||||
const sourceIsEnglish = watch('sourceIsEnglish');
|
||||
const audioDescriptionMp3 = watch('audio_description_mp3');
|
||||
|
||||
const onSubmit = async (data: JobFormData) => {
|
||||
if (!selectedFile) {
|
||||
|
|
@ -68,6 +76,7 @@ export function NewJob() {
|
|||
audio_description_mp3: data.audio_description_mp3,
|
||||
languages: data.languages,
|
||||
transcreation: data.transcreation,
|
||||
tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -325,6 +334,42 @@ export function NewJob() {
|
|||
</div>
|
||||
</div>
|
||||
|
||||
{/* Voice Settings - Collapsible */}
|
||||
{audioDescriptionMp3 && (
|
||||
<div className="border border-gray-200 rounded-lg overflow-hidden">
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => setShowVoiceSettings(!showVoiceSettings)}
|
||||
className="w-full flex items-center justify-between px-4 py-3 bg-gray-50 hover:bg-gray-100 transition-colors"
|
||||
>
|
||||
<div className="flex items-center gap-2">
|
||||
<svg
|
||||
className={`w-4 h-4 text-gray-500 transition-transform ${showVoiceSettings ? 'rotate-90' : ''}`}
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
viewBox="0 0 24 24"
|
||||
>
|
||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
|
||||
</svg>
|
||||
<span className="text-sm font-medium text-gray-700">Voice Settings</span>
|
||||
</div>
|
||||
<span className="text-xs text-gray-500">
|
||||
{showVoiceSettings ? 'Click to collapse' : 'Click to customize TTS voice'}
|
||||
</span>
|
||||
</button>
|
||||
{showVoiceSettings && (
|
||||
<div className="p-4 border-t border-gray-200">
|
||||
<VoiceSelector
|
||||
selectedLanguages={['en', ...languages]}
|
||||
preferences={ttsPreferences}
|
||||
onChange={setTtsPreferences}
|
||||
disabled={createJobMutation.isPending}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Target Languages */}
|
||||
<div>
|
||||
<label className="block text-sm font-medium text-gray-700 mb-2">
|
||||
|
|
|
|||
|
|
@ -37,12 +37,31 @@ export interface Source {
|
|||
detected_language?: string; // AI-detected language from Gemini
|
||||
}
|
||||
|
||||
// Identifier of the text-to-speech backend named in TTSPreferences.provider.
export type TTSProvider = "gemini" | "google" | "elevenlabs";
|
||||
|
||||
/** Voice configuration for text-to-speech output. */
export interface TTSPreferences {
|
||||
// Which TTS backend to use.
provider: TTSProvider;
|
||||
// Voice name applied to every language that has no entry in voices_per_language.
default_voice: string;
|
||||
// Per-language overrides: language code -> voice name.
voices_per_language: Record<string, string>;
|
||||
}
|
||||
|
||||
/** Outputs requested for a job. */
export interface RequestedOutputs {
|
||||
captions_vtt: boolean;
|
||||
audio_description_vtt: boolean;
|
||||
// When true, an MP3 is requested; the UI only offers voice settings in that case.
audio_description_mp3: boolean;
|
||||
// Language codes requested for this job.
languages: string[];
|
||||
// NOTE(review): presumably language codes to transcreate rather than translate — confirm.
transcreation: string[];
|
||||
// Optional voice settings; the job form only sends them when audio_description_mp3 is enabled.
tts_preferences?: TTSPreferences;
|
||||
}
|
||||
|
||||
/** Response body of the `/tts/voices` endpoint. */
export interface VoicesResponse {
|
||||
// Voice names, rendered as the options of the voice dropdowns.
voices: string[];
|
||||
// NOTE(review): presumably the server-side default voice name — not read by the UI code shown here; confirm.
default: string;
|
||||
}
|
||||
|
||||
/** Response body of the `/tts/languages` endpoint. */
export interface LanguagesResponse {
|
||||
// Language code -> label shown on the language tabs.
languages: Record<string, string>;
|
||||
// Language code -> sample sentence shown as the "Preview text".
preview_samples: Record<string, string>;
|
||||
}
|
||||
|
||||
export interface LangOutput {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue