feat: add TTS_FAILED status and robust error handling for TTS synthesis

Add comprehensive error handling for TTS synthesis failures.

Backend:
- Add TTS_FAILED status to JobStatus enum for failed synthesis jobs
- Add TTSSynthesisError exception with cue index and context tracking
- Improve null-safe error handling in Gemini TTS response parsing
- Add _synthesize_cue_with_retry() with exponential backoff (3 attempts)
- Enhance error logging with text preview and model context

Frontend:
- Add TTS_FAILED status styling (red badge) in StatusBadge component
- Add tts_failed to JobStatus TypeScript type

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-30 14:26:07 -06:00 · 2025-12-30 14:26:07 -06:00 · e8b940aee8
commit e8b940aee8
parent 6689778be7
4 changed files with 154 additions and 28 deletions
--- a/backend/app/models/job.py
+++ b/backend/app/models/job.py
@ -16,6 +16,7 @@ class JobStatus(str, Enum):
    QC_FEEDBACK = "qc_feedback"
    TRANSLATING = "translating"
    TTS_GENERATING = "tts_generating"
+    TTS_FAILED = "tts_failed"  # TTS synthesis failed after retries, requires reprocessing
    RENDERING_VIDEO = "rendering_video"  # Accessible video rendering in progress
    PENDING_FINAL_REVIEW = "pending_final_review"
    COMPLETED = "completed"
--- a/backend/app/services/gemini_tts.py
+++ b/backend/app/services/gemini_tts.py
@ -11,6 +11,16 @@ from ..core.logging import get_logger
 logger = get_logger(__name__)


+class TTSSynthesisError(Exception):
+    """Raised when TTS synthesis fails after all retries.
+
+    In addition to the normal exception message, the instance carries the
+    index and text of the cue that could not be synthesized, plus an
+    optional free-form string describing the underlying failure.
+    """
+
+    def __init__(self, message: str, cue_index: int, cue_text: str, api_response_info: str = None) -> None:
+        """
+        Args:
+            message: Human-readable description; becomes ``str(exc)``.
+            cue_index: Index of the cue whose synthesis failed.
+            cue_text: Text of the cue whose synthesis failed.
+            api_response_info: Optional details about the failure;
+                defaults to ``None`` when nothing is available.
+        """
+        super().__init__(message)
+        # Stored as plain attributes so handlers can inspect the failing cue
+        # without parsing the message string.
+        self.cue_index = cue_index
+        self.cue_text = cue_text
+        # NOTE(review): annotated ``str`` but defaults to None, so the
+        # effective type is "str or None".
+        self.api_response_info = api_response_info
+
+
 class GeminiTTSService:
    """Text-to-Speech service using Gemini TTS API"""

@ -87,11 +97,43 @@ class GeminiTTSService:
                )
            )

-            # Extract PCM audio data from response
-            if not response.candidates or not response.candidates[0].content.parts:
-                raise ValueError("No audio data in Gemini TTS response")
+            # Extract PCM audio data from response with proper null-safe checks
+            if not response.candidates:
+                logger.error(
+                    f"Gemini TTS response missing candidates. "
+                    f"Response type: {type(response)}, Response: {response}"
+                )
+                raise ValueError("No candidates in Gemini TTS response")

-            pcm_data = response.candidates[0].content.parts[0].inline_data.data
+            candidate = response.candidates[0]
+
+            if candidate.content is None:
+                logger.error(
+                    f"Gemini TTS candidate has no content. "
+                    f"Finish reason: {getattr(candidate, 'finish_reason', 'unknown')}, "
+                    f"Safety ratings: {getattr(candidate, 'safety_ratings', 'unknown')}"
+                )
+                raise ValueError(
+                    f"Candidate content is None in Gemini TTS response. "
+                    f"Finish reason: {getattr(candidate, 'finish_reason', 'unknown')}"
+                )
+
+            if not candidate.content.parts:
+                logger.error(
+                    f"Gemini TTS content has no parts. "
+                    f"Content role: {getattr(candidate.content, 'role', 'unknown')}"
+                )
+                raise ValueError("No parts in Gemini TTS response content")
+
+            part = candidate.content.parts[0]
+            if not hasattr(part, 'inline_data') or part.inline_data is None:
+                logger.error(
+                    f"Gemini TTS part missing inline_data. "
+                    f"Part type: {type(part)}, Part: {part}"
+                )
+                raise ValueError("No inline_data in Gemini TTS response part")
+
+            pcm_data = part.inline_data.data

            # Convert PCM to MP3
            mp3_data = self._pcm_to_mp3(pcm_data)
@ -99,7 +141,17 @@ class GeminiTTSService:
            return mp3_data

        except Exception as e:
-            logger.error(f"Gemini TTS synthesis failed: {e}")
+            # Log comprehensive error information for debugging
+            error_context = {
+                "text_length": len(text),
+                "text_preview": text[:100] + "..." if len(text) > 100 else text,
+                "voice_name": voice_name,
+                "language": language,
+                "model_id": model_id,
+            }
+            logger.error(
+                f"Gemini TTS synthesis failed: {e}. Context: {error_context}"
+            )
            raise

    async def synthesize_preview(
@ -129,6 +181,80 @@ class GeminiTTSService:
            style_prompt=style_prompt
        )

+    async def _synthesize_cue_with_retry(
+        self,
+        cue_index: int,
+        text: str,
+        voice_name: str,
+        language: str,
+        model: str,
+        speed: float,
+        style_prompt: str,
+        max_attempts: int = 3,
+        base_delay: float = 1.0
+    ) -> bytes:
+        """
+        Synthesize a single cue with exponential backoff retry.
+
+        Args:
+            cue_index: Index of the cue (for error reporting)
+            text: Text to synthesize
+            voice_name: TTS voice name
+            language: Language code
+            model: Model variant
+            speed: Speech rate
+            style_prompt: Style instructions
+            max_attempts: Total attempts (1 initial + retries)
+            base_delay: Base delay in seconds for backoff
+
+        Returns:
+            MP3 audio bytes
+
+        Raises:
+            TTSSynthesisError: If all attempts fail
+        """
+        import asyncio
+        import random
+
+        last_exception = None
+        api_response_info = None
+
+        for attempt in range(max_attempts):
+            try:
+                return await self.synthesize_text(
+                    text,
+                    voice_name,
+                    language,
+                    model=model,
+                    speed=speed,
+                    style_prompt=style_prompt
+                )
+            except Exception as e:
+                last_exception = e
+                api_response_info = str(e)
+
+                if attempt < max_attempts - 1:
+                    # Exponential backoff with jitter
+                    delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
+                    logger.warning(
+                        f"TTS synthesis attempt {attempt + 1}/{max_attempts} failed for cue {cue_index}. "
+                        f"Retrying in {delay:.2f}s. Error: {e}"
+                    )
+                    await asyncio.sleep(delay)
+                else:
+                    logger.error(
+                        f"TTS synthesis FAILED after {max_attempts} attempts for cue {cue_index}. "
+                        f"Text: {text[:50]}{'...' if len(text) > 50 else ''}. Error: {e}"
+                    )
+
+        # All retries exhausted - raise hard failure
+        raise TTSSynthesisError(
+            message=f"TTS synthesis failed after {max_attempts} attempts: {last_exception}",
+            cue_index=cue_index,
+            cue_text=text,
+            api_response_info=api_response_info
+        )
+
    async def synthesize_audio_description(
        self,
        ad_vtt_content: str,
@ -190,31 +316,26 @@ class GeminiTTSService:
                if not text.endswith(('.', '!', '?')):
                    text += "."

-                try:
-                    audio_data = await self.synthesize_text(
-                        text,
-                        voice_name,
-                        language,
-                        model=model,
-                        speed=speed,
-                        style_prompt=style_prompt
-                    )
+                # Use retry helper - will raise TTSSynthesisError on failure after retries
+                audio_data = await self._synthesize_cue_with_retry(
+                    cue_index=i,
+                    text=text,
+                    voice_name=voice_name,
+                    language=language,
+                    model=model,
+                    speed=speed,
+                    style_prompt=style_prompt,
+                    max_attempts=3,
+                    base_delay=1.0
+                )

-                    # Convert to AudioSegment and get actual duration
-                    audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
-                    audio_segments.append(audio_segment)
+                # Convert to AudioSegment and get actual duration
+                audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
+                audio_segments.append(audio_segment)

-                    # Update position based on actual audio duration
-                    actual_audio_duration = len(audio_segment) / 1000.0
-                    current_audio_position += actual_audio_duration
-
-                except Exception as e:
-                    logger.warning(f"Failed to synthesize cue {i}: {e}")
-                    # Add silence for failed cue
-                    cue_duration = cue["end_time"] - cue["start_time"]
-                    silence = AudioSegment.silent(duration=int(cue_duration * 1000))
-                    audio_segments.append(silence)
-                    current_audio_position += cue_duration
+                # Update position based on actual audio duration
+                actual_audio_duration = len(audio_segment) / 1000.0
+                current_audio_position += actual_audio_duration

        # Combine all segments
        if audio_segments:
--- a/frontend/src/components/StatusBadge.tsx
+++ b/frontend/src/components/StatusBadge.tsx
@ -19,6 +19,7 @@ export function StatusBadge({ status }: StatusBadgeProps) {
      case 'approved_source':
        return 'bg-green-100 text-green-800';
      case 'rejected':
+      case 'tts_failed':
        return 'bg-red-100 text-red-800';
      case 'translating':
        return 'bg-blue-100 text-blue-800';
@ -51,6 +52,8 @@ export function StatusBadge({ status }: StatusBadgeProps) {
        return 'Approved for Translation';
      case 'rejected':
        return 'Rejected';
+      case 'tts_failed':
+        return 'TTS Failed';
      case 'translating':
        return 'Translating';
      case 'tts_generating':
--- a/frontend/src/types/api.ts
+++ b/frontend/src/types/api.ts
@ -9,6 +9,7 @@ export type JobStatus =
  | "qc_feedback"
  | "translating"
  | "tts_generating"
+  | "tts_failed"  // TTS synthesis failed after retries, requires reprocessing
  | "rendering_video"  // Accessible video rendering in progress
  | "pending_final_review"
  | "completed";