feat: add TTS_FAILED status and robust error handling for TTS synthesis

Add comprehensive error handling for TTS synthesis failures:

Backend:
- Add TTS_FAILED status to JobStatus enum for failed synthesis jobs
- Add TTSSynthesisError exception with cue index and context tracking
- Improve null-safe error handling in Gemini TTS response parsing
- Add _synthesize_cue_with_retry() with exponential backoff (3 attempts); a cue that still fails now raises TTSSynthesisError instead of being replaced with silence
- Enhance error logging with text preview and model context

Frontend:
- Add TTS_FAILED status styling (red badge) in StatusBadge component
- Add tts_failed to JobStatus TypeScript type

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
michael 2025-12-30 14:26:07 -06:00
parent 6689778be7
commit e8b940aee8
4 changed files with 154 additions and 28 deletions

View file

@ -16,6 +16,7 @@ class JobStatus(str, Enum):
QC_FEEDBACK = "qc_feedback"
TRANSLATING = "translating"
TTS_GENERATING = "tts_generating"
TTS_FAILED = "tts_failed" # TTS synthesis failed after retries, requires reprocessing
RENDERING_VIDEO = "rendering_video" # Accessible video rendering in progress
PENDING_FINAL_REVIEW = "pending_final_review"
COMPLETED = "completed"

View file

@ -11,6 +11,16 @@ from ..core.logging import get_logger
logger = get_logger(__name__)
class TTSSynthesisError(Exception):
    """Raised when TTS synthesis fails after all retries.

    Besides the human-readable message, the exception carries the failing
    cue's index and text plus an optional description of the last API
    error, so callers can report which cue could not be synthesized.
    """

    def __init__(self, message: str, cue_index: int, cue_text: str, api_response_info: str = None):
        super().__init__(message)
        # Index of the cue whose synthesis failed.
        self.cue_index = cue_index
        # Text that was sent for synthesis.
        self.cue_text = cue_text
        # Optional details about the last failure (None when not supplied).
        self.api_response_info = api_response_info

    def __reduce__(self):
        """Rebuild the exception with all constructor arguments.

        The default ``BaseException`` reduction re-creates the instance from
        ``self.args`` only, i.e. ``TTSSynthesisError(message)``. That call
        fails with ``TypeError`` because ``cue_index`` and ``cue_text`` are
        required, which breaks ``copy.copy()`` and pickling. Returning the
        full argument tuple keeps the exception copyable and picklable.
        """
        message = self.args[0] if self.args else ""
        return (
            type(self),
            (message, self.cue_index, self.cue_text, self.api_response_info),
        )
class GeminiTTSService:
"""Text-to-Speech service using Gemini TTS API"""
@ -87,11 +97,43 @@ class GeminiTTSService:
)
)
# Extract PCM audio data from response
if not response.candidates or not response.candidates[0].content.parts:
raise ValueError("No audio data in Gemini TTS response")
# Extract PCM audio data from response with proper null-safe checks
if not response.candidates:
logger.error(
f"Gemini TTS response missing candidates. "
f"Response type: {type(response)}, Response: {response}"
)
raise ValueError("No candidates in Gemini TTS response")
pcm_data = response.candidates[0].content.parts[0].inline_data.data
candidate = response.candidates[0]
if candidate.content is None:
logger.error(
f"Gemini TTS candidate has no content. "
f"Finish reason: {getattr(candidate, 'finish_reason', 'unknown')}, "
f"Safety ratings: {getattr(candidate, 'safety_ratings', 'unknown')}"
)
raise ValueError(
f"Candidate content is None in Gemini TTS response. "
f"Finish reason: {getattr(candidate, 'finish_reason', 'unknown')}"
)
if not candidate.content.parts:
logger.error(
f"Gemini TTS content has no parts. "
f"Content role: {getattr(candidate.content, 'role', 'unknown')}"
)
raise ValueError("No parts in Gemini TTS response content")
part = candidate.content.parts[0]
if not hasattr(part, 'inline_data') or part.inline_data is None:
logger.error(
f"Gemini TTS part missing inline_data. "
f"Part type: {type(part)}, Part: {part}"
)
raise ValueError("No inline_data in Gemini TTS response part")
pcm_data = part.inline_data.data
# Convert PCM to MP3
mp3_data = self._pcm_to_mp3(pcm_data)
@ -99,7 +141,17 @@ class GeminiTTSService:
return mp3_data
except Exception as e:
logger.error(f"Gemini TTS synthesis failed: {e}")
# Log comprehensive error information for debugging
error_context = {
"text_length": len(text),
"text_preview": text[:100] + "..." if len(text) > 100 else text,
"voice_name": voice_name,
"language": language,
"model_id": model_id,
}
logger.error(
f"Gemini TTS synthesis failed: {e}. Context: {error_context}"
)
raise
async def synthesize_preview(
@ -129,6 +181,80 @@ class GeminiTTSService:
style_prompt=style_prompt
)
async def _synthesize_cue_with_retry(
    self,
    cue_index: int,
    text: str,
    voice_name: str,
    language: str,
    model: str,
    speed: float,
    style_prompt: str,
    max_attempts: int = 3,
    base_delay: float = 1.0
) -> bytes:
    """
    Synthesize a single cue with exponential backoff retry.

    Each attempt calls ``self.synthesize_text``. Any ``Exception`` it raises
    is treated as retryable; after a failed attempt that is not the last one
    the coroutine sleeps for ``base_delay * 2 ** attempt`` seconds plus up to
    one second of random jitter before trying again.

    Args:
        cue_index: Index of the cue (for error reporting)
        text: Text to synthesize
        voice_name: TTS voice name
        language: Language code
        model: Model variant
        speed: Speech rate
        style_prompt: Style instructions
        max_attempts: Total attempts (1 initial + retries)
        base_delay: Base delay in seconds for backoff

    Returns:
        MP3 audio bytes

    Raises:
        TTSSynthesisError: If all attempts fail. The last underlying
            exception is attached as ``__cause__``.
    """
    import asyncio
    import random

    last_exception = None
    api_response_info = None

    for attempt in range(max_attempts):
        try:
            return await self.synthesize_text(
                text,
                voice_name,
                language,
                model=model,
                speed=speed,
                style_prompt=style_prompt
            )
        except Exception as e:
            last_exception = e
            api_response_info = str(e)

            if attempt < max_attempts - 1:
                # Exponential backoff with jitter
                delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                logger.warning(
                    f"TTS synthesis attempt {attempt + 1}/{max_attempts} failed for cue {cue_index}. "
                    f"Retrying in {delay:.2f}s. Error: {e}"
                )
                await asyncio.sleep(delay)
            else:
                logger.error(
                    f"TTS synthesis FAILED after {max_attempts} attempts for cue {cue_index}. "
                    f"Text: {text[:50]}{'...' if len(text) > 50 else ''}. Error: {e}"
                )

    # All retries exhausted - raise hard failure.
    # This raise sits outside the ``except`` block, so Python would not attach
    # the original error automatically; chain it explicitly to keep its
    # traceback available to whoever handles the TTSSynthesisError.
    raise TTSSynthesisError(
        message=f"TTS synthesis failed after {max_attempts} attempts: {last_exception}",
        cue_index=cue_index,
        cue_text=text,
        api_response_info=api_response_info
    ) from last_exception
async def synthesize_audio_description(
self,
ad_vtt_content: str,
@ -190,31 +316,26 @@ class GeminiTTSService:
if not text.endswith(('.', '!', '?')):
text += "."
try:
audio_data = await self.synthesize_text(
text,
voice_name,
language,
model=model,
speed=speed,
style_prompt=style_prompt
)
# Use retry helper - will raise TTSSynthesisError on failure after retries
audio_data = await self._synthesize_cue_with_retry(
cue_index=i,
text=text,
voice_name=voice_name,
language=language,
model=model,
speed=speed,
style_prompt=style_prompt,
max_attempts=3,
base_delay=1.0
)
# Convert to AudioSegment and get actual duration
audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
audio_segments.append(audio_segment)
# Convert to AudioSegment and get actual duration
audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
audio_segments.append(audio_segment)
# Update position based on actual audio duration
actual_audio_duration = len(audio_segment) / 1000.0
current_audio_position += actual_audio_duration
except Exception as e:
logger.warning(f"Failed to synthesize cue {i}: {e}")
# Add silence for failed cue
cue_duration = cue["end_time"] - cue["start_time"]
silence = AudioSegment.silent(duration=int(cue_duration * 1000))
audio_segments.append(silence)
current_audio_position += cue_duration
# Update position based on actual audio duration
actual_audio_duration = len(audio_segment) / 1000.0
current_audio_position += actual_audio_duration
# Combine all segments
if audio_segments:

View file

@ -19,6 +19,7 @@ export function StatusBadge({ status }: StatusBadgeProps) {
case 'approved_source':
return 'bg-green-100 text-green-800';
case 'rejected':
case 'tts_failed':
return 'bg-red-100 text-red-800';
case 'translating':
return 'bg-blue-100 text-blue-800';
@ -51,6 +52,8 @@ export function StatusBadge({ status }: StatusBadgeProps) {
return 'Approved for Translation';
case 'rejected':
return 'Rejected';
case 'tts_failed':
return 'TTS Failed';
case 'translating':
return 'Translating';
case 'tts_generating':

View file

@ -9,6 +9,7 @@ export type JobStatus =
| "qc_feedback"
| "translating"
| "tts_generating"
| "tts_failed" // TTS synthesis failed after retries, requires reprocessing
| "rendering_video" // Accessible video rendering in progress
| "pending_final_review"
| "completed";