From e8b940aee8395d45cb01ab8c9b4f87e943ffbf90 Mon Sep 17 00:00:00 2001
From: michael <michael@modernfreedom.com>
Date: Tue, 30 Dec 2025 14:26:07 -0600
Subject: [PATCH] feat: add TTS_FAILED status and robust error handling for TTS
 synthesis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add comprehensive error handling for TTS synthesis failures:

Backend:
- Add TTS_FAILED status to JobStatus enum for failed synthesis jobs
- Add TTSSynthesisError exception with cue index and context tracking
- Improve null-safe error handling in Gemini TTS response parsing
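- Add _synthesize_cue_with_retry() with exponential backoff and jitter (3 attempts)
- Raise TTSSynthesisError for a cue that still fails after retries instead of
  substituting silence for it
- Enhance error logging with text preview and model context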

Frontend:
- Add TTS_FAILED status styling (red badge) in StatusBadge component
- Add tts_failed to JobStatus TypeScript type

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 backend/app/models/job.py               |   1 +
 backend/app/services/gemini_tts.py      | 177 ++++++++++++++++++++----
 frontend/src/components/StatusBadge.tsx |   3 +
 frontend/src/types/api.ts               |   1 +
 4 files changed, 154 insertions(+), 28 deletions(-)

diff --git a/backend/app/models/job.py b/backend/app/models/job.py
index 810fdba..d5c489b 100644
--- a/backend/app/models/job.py
+++ b/backend/app/models/job.py
@@ -16,6 +16,7 @@ class JobStatus(str, Enum):
     QC_FEEDBACK = "qc_feedback"
     TRANSLATING = "translating"
     TTS_GENERATING = "tts_generating"
+    TTS_FAILED = "tts_failed"  # TTS synthesis failed after retries, requires reprocessing
     RENDERING_VIDEO = "rendering_video"  # Accessible video rendering in progress
     PENDING_FINAL_REVIEW = "pending_final_review"
     COMPLETED = "completed"
diff --git a/backend/app/services/gemini_tts.py b/backend/app/services/gemini_tts.py
index fd428df..3a2a1a6 100644
--- a/backend/app/services/gemini_tts.py
+++ b/backend/app/services/gemini_tts.py
@@ -11,6 +11,16 @@ from ..core.logging import get_logger
 logger = get_logger(__name__)
 
 
+class TTSSynthesisError(Exception):
+    """Raised when TTS synthesis fails after all retries; records the cue index, cue text and error info."""
+
+    def __init__(self, message: str, cue_index: int, cue_text: str, api_response_info: "str | None" = None):
+        super().__init__(message)
+        self.cue_index = cue_index
+        self.cue_text = cue_text
+        self.api_response_info = api_response_info
+
+
 class GeminiTTSService:
     """Text-to-Speech service using Gemini TTS API"""
 
@@ -87,11 +97,43 @@ class GeminiTTSService:
                 )
             )
 
-            # Extract PCM audio data from response
-            if not response.candidates or not response.candidates[0].content.parts:
-                raise ValueError("No audio data in Gemini TTS response")
+            # Extract PCM audio data from response with proper null-safe checks
+            if not response.candidates:
+                logger.error(
+                    f"Gemini TTS response missing candidates. "
+                    f"Response type: {type(response)}, Response: {response}"
+                )
+                raise ValueError("No candidates in Gemini TTS response")
 
-            pcm_data = response.candidates[0].content.parts[0].inline_data.data
+            candidate = response.candidates[0]
+
+            if candidate.content is None:
+                logger.error(
+                    f"Gemini TTS candidate has no content. "
+                    f"Finish reason: {getattr(candidate, 'finish_reason', 'unknown')}, "
+                    f"Safety ratings: {getattr(candidate, 'safety_ratings', 'unknown')}"
+                )
+                raise ValueError(
+                    f"Candidate content is None in Gemini TTS response. "
+                    f"Finish reason: {getattr(candidate, 'finish_reason', 'unknown')}"
+                )
+
+            if not candidate.content.parts:
+                logger.error(
+                    f"Gemini TTS content has no parts. "
+                    f"Content role: {getattr(candidate.content, 'role', 'unknown')}"
+                )
+                raise ValueError("No parts in Gemini TTS response content")
+
+            part = candidate.content.parts[0]
+            if not hasattr(part, 'inline_data') or part.inline_data is None:
+                logger.error(
+                    f"Gemini TTS part missing inline_data. "
+                    f"Part type: {type(part)}, Part: {part}"
+                )
+                raise ValueError("No inline_data in Gemini TTS response part")
+
+            pcm_data = part.inline_data.data
 
             # Convert PCM to MP3
             mp3_data = self._pcm_to_mp3(pcm_data)
@@ -99,7 +141,17 @@ class GeminiTTSService:
             return mp3_data
 
         except Exception as e:
-            logger.error(f"Gemini TTS synthesis failed: {e}")
+            # Log comprehensive error information for debugging
+            error_context = {
+                "text_length": len(text),
+                "text_preview": text[:100] + "..." if len(text) > 100 else text,
+                "voice_name": voice_name,
+                "language": language,
+                "model_id": model_id,
+            }
+            logger.error(
+                f"Gemini TTS synthesis failed: {e}. Context: {error_context}"
+            )
             raise
 
     async def synthesize_preview(
@@ -129,6 +181,80 @@ class GeminiTTSService:
             style_prompt=style_prompt
         )
 
+    async def _synthesize_cue_with_retry(
+        self,
+        cue_index: int,
+        text: str,
+        voice_name: str,
+        language: str,
+        model: str,
+        speed: float,
+        style_prompt: str,
+        max_attempts: int = 3,
+        base_delay: float = 1.0
+    ) -> bytes:
+        """
+        Synthesize a single cue with exponential backoff retry.
+
+        Args:
+            cue_index: Index of the cue (for error reporting)
+            text: Text to synthesize
+            voice_name: TTS voice name
+            language: Language code
+            model: Model variant
+            speed: Speech rate
+            style_prompt: Style instructions
+            max_attempts: Total attempts (1 initial + retries)
+            base_delay: Base delay in seconds for backoff
+
+        Returns:
+            MP3 audio bytes
+
+        Raises:
+            TTSSynthesisError: If all attempts fail (chained to the last error)
+        """
+        import asyncio
+        import random
+
+        last_exception = None
+        api_response_info = None
+
+        for attempt in range(max_attempts):
+            try:
+                return await self.synthesize_text(
+                    text,
+                    voice_name,
+                    language,
+                    model=model,
+                    speed=speed,
+                    style_prompt=style_prompt
+                )
+            except Exception as e:
+                last_exception = e
+                api_response_info = str(e)
+
+                if attempt < max_attempts - 1:
+                    # Exponential backoff with jitter
+                    delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
+                    logger.warning(
+                        f"TTS synthesis attempt {attempt + 1}/{max_attempts} failed for cue {cue_index}. "
+                        f"Retrying in {delay:.2f}s. Error: {e}"
+                    )
+                    await asyncio.sleep(delay)
+                else:
+                    logger.error(
+                        f"TTS synthesis FAILED after {max_attempts} attempts for cue {cue_index}. "
+                        f"Text: {text[:50]}{'...' if len(text) > 50 else ''}. Error: {e}"
+                    )
+
+        # All retries exhausted - raise hard failure, chained to the last error
+        raise TTSSynthesisError(
+            message=f"TTS synthesis failed after {max_attempts} attempts: {last_exception}",
+            cue_index=cue_index,
+            cue_text=text,
+            api_response_info=api_response_info
+        ) from last_exception
+
     async def synthesize_audio_description(
         self,
         ad_vtt_content: str,
@@ -190,31 +316,26 @@ class GeminiTTSService:
                 if not text.endswith(('.', '!', '?')):
                     text += "."
 
-                try:
-                    audio_data = await self.synthesize_text(
-                        text,
-                        voice_name,
-                        language,
-                        model=model,
-                        speed=speed,
-                        style_prompt=style_prompt
-                    )
+                # Use retry helper - will raise TTSSynthesisError on failure after retries
+                audio_data = await self._synthesize_cue_with_retry(
+                    cue_index=i,
+                    text=text,
+                    voice_name=voice_name,
+                    language=language,
+                    model=model,
+                    speed=speed,
+                    style_prompt=style_prompt,
+                    max_attempts=3,
+                    base_delay=1.0
+                )
 
-                    # Convert to AudioSegment and get actual duration
-                    audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
-                    audio_segments.append(audio_segment)
+                # Convert to AudioSegment and get actual duration
+                audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
+                audio_segments.append(audio_segment)
 
-                    # Update position based on actual audio duration
-                    actual_audio_duration = len(audio_segment) / 1000.0
-                    current_audio_position += actual_audio_duration
-
-                except Exception as e:
-                    logger.warning(f"Failed to synthesize cue {i}: {e}")
-                    # Add silence for failed cue
-                    cue_duration = cue["end_time"] - cue["start_time"]
-                    silence = AudioSegment.silent(duration=int(cue_duration * 1000))
-                    audio_segments.append(silence)
-                    current_audio_position += cue_duration
+                # Update position based on actual audio duration
+                actual_audio_duration = len(audio_segment) / 1000.0
+                current_audio_position += actual_audio_duration
 
         # Combine all segments
         if audio_segments:
diff --git a/frontend/src/components/StatusBadge.tsx b/frontend/src/components/StatusBadge.tsx
index 52ad4a7..98a57f9 100644
--- a/frontend/src/components/StatusBadge.tsx
+++ b/frontend/src/components/StatusBadge.tsx
@@ -19,6 +19,7 @@ export function StatusBadge({ status }: StatusBadgeProps) {
       case 'approved_source':
         return 'bg-green-100 text-green-800';
       case 'rejected':
+      case 'tts_failed':
         return 'bg-red-100 text-red-800';
       case 'translating':
         return 'bg-blue-100 text-blue-800';
@@ -51,6 +52,8 @@ export function StatusBadge({ status }: StatusBadgeProps) {
         return 'Approved for Translation';
       case 'rejected':
         return 'Rejected';
+      case 'tts_failed':
+        return 'TTS Failed';
       case 'translating':
         return 'Translating';
       case 'tts_generating':
diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts
index 9465815..b9c7467 100644
--- a/frontend/src/types/api.ts
+++ b/frontend/src/types/api.ts
@@ -9,6 +9,7 @@ export type JobStatus =
   | "qc_feedback"
   | "translating"
   | "tts_generating"
+  | "tts_failed"  // TTS synthesis failed after retries, requires reprocessing
   | "rendering_video"  // Accessible video rendering in progress
   | "pending_final_review"
   | "completed";