From e8b940aee8395d45cb01ab8c9b4f87e943ffbf90 Mon Sep 17 00:00:00 2001 From: michael Date: Tue, 30 Dec 2025 14:26:07 -0600 Subject: [PATCH] feat: add TTS_FAILED status and robust error handling for TTS synthesis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add comprehensive error handling for TTS synthesis failures: Backend: - Add TTS_FAILED status to JobStatus enum for failed synthesis jobs - Add TTSSynthesisError exception with cue index and context tracking - Improve null-safe error handling in Gemini TTS response parsing - Add _synthesize_cue_with_retry() with exponential backoff (3 attempts) - Enhanced error logging with text preview and model context Frontend: - Add TTS_FAILED status styling (red badge) in StatusBadge component - Add tts_failed to JobStatus TypeScript type 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- backend/app/models/job.py | 1 + backend/app/services/gemini_tts.py | 177 ++++++++++++++++++++---- frontend/src/components/StatusBadge.tsx | 3 + frontend/src/types/api.ts | 1 + 4 files changed, 154 insertions(+), 28 deletions(-) diff --git a/backend/app/models/job.py b/backend/app/models/job.py index 810fdba..d5c489b 100644 --- a/backend/app/models/job.py +++ b/backend/app/models/job.py @@ -16,6 +16,7 @@ class JobStatus(str, Enum): QC_FEEDBACK = "qc_feedback" TRANSLATING = "translating" TTS_GENERATING = "tts_generating" + TTS_FAILED = "tts_failed" # TTS synthesis failed after retries, requires reprocessing RENDERING_VIDEO = "rendering_video" # Accessible video rendering in progress PENDING_FINAL_REVIEW = "pending_final_review" COMPLETED = "completed" diff --git a/backend/app/services/gemini_tts.py b/backend/app/services/gemini_tts.py index fd428df..3a2a1a6 100644 --- a/backend/app/services/gemini_tts.py +++ b/backend/app/services/gemini_tts.py @@ -11,6 +11,16 @@ from ..core.logging import get_logger logger = get_logger(__name__) +class 
TTSSynthesisError(Exception): + """Raised when TTS synthesis fails after all retries; carries the cue index, cue text and last API error info.""" + + def __init__(self, message: str, cue_index: int, cue_text: str, api_response_info: "str | None" = None): + super().__init__(message) + self.cue_index = cue_index + self.cue_text = cue_text + self.api_response_info = api_response_info + + class GeminiTTSService: """Text-to-Speech service using Gemini TTS API""" @@ -87,11 +97,43 @@ class GeminiTTSService: ) ) - # Extract PCM audio data from response - if not response.candidates or not response.candidates[0].content.parts: - raise ValueError("No audio data in Gemini TTS response") + # Extract PCM audio data from response with proper null-safe checks + if not response.candidates: + logger.error( + f"Gemini TTS response missing candidates. " + f"Response type: {type(response)}, Response: {response}" + ) + raise ValueError("No candidates in Gemini TTS response") - pcm_data = response.candidates[0].content.parts[0].inline_data.data + candidate = response.candidates[0] + + if candidate.content is None: + logger.error( + f"Gemini TTS candidate has no content. " + f"Finish reason: {getattr(candidate, 'finish_reason', 'unknown')}, " + f"Safety ratings: {getattr(candidate, 'safety_ratings', 'unknown')}" + ) + raise ValueError( + f"Candidate content is None in Gemini TTS response. " + f"Finish reason: {getattr(candidate, 'finish_reason', 'unknown')}" + ) + + if not candidate.content.parts: + logger.error( + f"Gemini TTS content has no parts. " + f"Content role: {getattr(candidate.content, 'role', 'unknown')}" + ) + raise ValueError("No parts in Gemini TTS response content") + + part = candidate.content.parts[0] + if not hasattr(part, 'inline_data') or part.inline_data is None: + logger.error( + f"Gemini TTS part missing inline_data. 
" + f"Part type: {type(part)}, Part: {part}" + ) + raise ValueError("No inline_data in Gemini TTS response part") + + pcm_data = part.inline_data.data # Convert PCM to MP3 mp3_data = self._pcm_to_mp3(pcm_data) @@ -99,7 +141,17 @@ class GeminiTTSService: return mp3_data except Exception as e: - logger.error(f"Gemini TTS synthesis failed: {e}") + # Log comprehensive error information for debugging + error_context = { + "text_length": len(text), + "text_preview": text[:100] + "..." if len(text) > 100 else text, + "voice_name": voice_name, + "language": language, + "model_id": model_id, + } + logger.error( + f"Gemini TTS synthesis failed: {e}. Context: {error_context}" + ) raise async def synthesize_preview( @@ -129,6 +181,80 @@ class GeminiTTSService: style_prompt=style_prompt ) + async def _synthesize_cue_with_retry( + self, + cue_index: int, + text: str, + voice_name: str, + language: str, + model: str, + speed: float, + style_prompt: str, + max_attempts: int = 3, + base_delay: float = 1.0 + ) -> bytes: + """ + Synthesize a single cue with exponential backoff retry. 
+ + Args: + cue_index: Index of the cue (for error reporting) + text: Text to synthesize + voice_name: TTS voice name + language: Language code + model: Model variant + speed: Speech rate + style_prompt: Style instructions + max_attempts: Total attempts (1 initial + retries) + base_delay: Base delay in seconds for backoff + + Returns: + MP3 audio bytes + + Raises: + TTSSynthesisError: If all attempts fail (chained from the last underlying error) + """ + import asyncio + import random + + last_exception = None + api_response_info = None + + for attempt in range(max_attempts): + try: + return await self.synthesize_text( + text, + voice_name, + language, + model=model, + speed=speed, + style_prompt=style_prompt + ) + except Exception as e: + last_exception = e + api_response_info = str(e) + + if attempt < max_attempts - 1: + # Exponential backoff with jitter + delay = base_delay * (2 ** attempt) + random.uniform(0, 1) + logger.warning( + f"TTS synthesis attempt {attempt + 1}/{max_attempts} failed for cue {cue_index}. " + f"Retrying in {delay:.2f}s. Error: {e}" + ) + await asyncio.sleep(delay) + else: + logger.error( + f"TTS synthesis FAILED after {max_attempts} attempts for cue {cue_index}. " + f"Text: {text[:50]}{'...' if len(text) > 50 else ''}. Error: {e}" + ) + + # All retries exhausted - raise hard failure; we are outside the except block here, so chain the last error explicitly to keep its traceback + raise TTSSynthesisError( + message=f"TTS synthesis failed after {max_attempts} attempts: {last_exception}", + cue_index=cue_index, + cue_text=text, + api_response_info=api_response_info + ) from last_exception + async def synthesize_audio_description( self, ad_vtt_content: str, @@ -190,31 +316,26 @@ class GeminiTTSService: if not text.endswith(('.', '!', '?')): text += "." 
- try: - audio_data = await self.synthesize_text( - text, - voice_name, - language, - model=model, - speed=speed, - style_prompt=style_prompt - ) + # Use retry helper - will raise TTSSynthesisError on failure after retries + audio_data = await self._synthesize_cue_with_retry( + cue_index=i, + text=text, + voice_name=voice_name, + language=language, + model=model, + speed=speed, + style_prompt=style_prompt, + max_attempts=3, + base_delay=1.0 + ) - # Convert to AudioSegment and get actual duration - audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3") - audio_segments.append(audio_segment) + # Convert to AudioSegment and get actual duration + audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3") + audio_segments.append(audio_segment) - # Update position based on actual audio duration - actual_audio_duration = len(audio_segment) / 1000.0 - current_audio_position += actual_audio_duration - - except Exception as e: - logger.warning(f"Failed to synthesize cue {i}: {e}") - # Add silence for failed cue - cue_duration = cue["end_time"] - cue["start_time"] - silence = AudioSegment.silent(duration=int(cue_duration * 1000)) - audio_segments.append(silence) - current_audio_position += cue_duration + # Update position based on actual audio duration + actual_audio_duration = len(audio_segment) / 1000.0 + current_audio_position += actual_audio_duration # Combine all segments if audio_segments: diff --git a/frontend/src/components/StatusBadge.tsx b/frontend/src/components/StatusBadge.tsx index 52ad4a7..98a57f9 100644 --- a/frontend/src/components/StatusBadge.tsx +++ b/frontend/src/components/StatusBadge.tsx @@ -19,6 +19,7 @@ export function StatusBadge({ status }: StatusBadgeProps) { case 'approved_source': return 'bg-green-100 text-green-800'; case 'rejected': + case 'tts_failed': return 'bg-red-100 text-red-800'; case 'translating': return 'bg-blue-100 text-blue-800'; @@ -51,6 +52,8 @@ export function StatusBadge({ status }: 
StatusBadgeProps) { return 'Approved for Translation'; case 'rejected': return 'Rejected'; + case 'tts_failed': + return 'TTS Failed'; case 'translating': return 'Translating'; case 'tts_generating': diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts index 9465815..b9c7467 100644 --- a/frontend/src/types/api.ts +++ b/frontend/src/types/api.ts @@ -9,6 +9,7 @@ export type JobStatus = | "qc_feedback" | "translating" | "tts_generating" + | "tts_failed" // TTS synthesis failed after retries, requires reprocessing | "rendering_video" // Accessible video rendering in progress | "pending_final_review" | "completed";