feat: add TTS_FAILED status and robust error handling for TTS synthesis
Add comprehensive error handling for TTS synthesis failures.

Backend:
- Add TTS_FAILED status to JobStatus enum for failed synthesis jobs
- Add TTSSynthesisError exception with cue index and context tracking
- Improve null-safe error handling in Gemini TTS response parsing
- Add _synthesize_cue_with_retry() with exponential backoff (3 attempts)
- Enhanced error logging with text preview and model context

Frontend:
- Add TTS_FAILED status styling (red badge) in StatusBadge component
- Add tts_failed to JobStatus TypeScript type

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
6689778be7
commit
e8b940aee8
4 changed files with 154 additions and 28 deletions
|
|
@ -16,6 +16,7 @@ class JobStatus(str, Enum):
|
|||
QC_FEEDBACK = "qc_feedback"
|
||||
TRANSLATING = "translating"
|
||||
TTS_GENERATING = "tts_generating"
|
||||
TTS_FAILED = "tts_failed" # TTS synthesis failed after retries, requires reprocessing
|
||||
RENDERING_VIDEO = "rendering_video" # Accessible video rendering in progress
|
||||
PENDING_FINAL_REVIEW = "pending_final_review"
|
||||
COMPLETED = "completed"
|
||||
|
|
|
|||
|
|
@ -11,6 +11,16 @@ from ..core.logging import get_logger
|
|||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class TTSSynthesisError(Exception):
    """Raised when TTS synthesis fails after all retries.

    Attributes are set from the constructor arguments so that callers can
    report which cue failed and why.

    Args:
        message: Human-readable description; becomes ``str(exc)``.
        cue_index: Index of the cue whose synthesis failed.
        cue_text: Text of the cue that could not be synthesized.
        api_response_info: Optional extra detail about the failure
            (``None`` when nothing is available).
    """

    def __init__(self, message: str, cue_index: int, cue_text: str, api_response_info: str = None):
        # Only the message is forwarded to Exception so str(exc) stays a
        # plain message rather than a tuple representation.
        super().__init__(message)
        self.cue_index = cue_index
        self.cue_text = cue_text
        self.api_response_info = api_response_info

    def __reduce__(self):
        """Support pickling and copying.

        The default ``Exception`` reduction rebuilds the object as
        ``cls(*self.args)``, i.e. with the message only, which raises
        ``TypeError`` because ``cue_index`` and ``cue_text`` are required.
        Supplying the full constructor arguments lets the exception cross
        process boundaries (worker pools, task queues) intact.
        """
        message = self.args[0] if self.args else ""
        ctor_args = (message, self.cue_index, self.cue_text, self.api_response_info)
        # The instance dict is passed as state so any attributes attached
        # after construction are preserved as well.
        return (type(self), ctor_args, dict(self.__dict__))
|
||||
|
||||
|
||||
class GeminiTTSService:
|
||||
"""Text-to-Speech service using Gemini TTS API"""
|
||||
|
||||
|
|
@ -87,11 +97,43 @@ class GeminiTTSService:
|
|||
)
|
||||
)
|
||||
|
||||
# Extract PCM audio data from response
|
||||
if not response.candidates or not response.candidates[0].content.parts:
|
||||
raise ValueError("No audio data in Gemini TTS response")
|
||||
# Extract PCM audio data from response with proper null-safe checks
|
||||
if not response.candidates:
|
||||
logger.error(
|
||||
f"Gemini TTS response missing candidates. "
|
||||
f"Response type: {type(response)}, Response: {response}"
|
||||
)
|
||||
raise ValueError("No candidates in Gemini TTS response")
|
||||
|
||||
pcm_data = response.candidates[0].content.parts[0].inline_data.data
|
||||
candidate = response.candidates[0]
|
||||
|
||||
if candidate.content is None:
|
||||
logger.error(
|
||||
f"Gemini TTS candidate has no content. "
|
||||
f"Finish reason: {getattr(candidate, 'finish_reason', 'unknown')}, "
|
||||
f"Safety ratings: {getattr(candidate, 'safety_ratings', 'unknown')}"
|
||||
)
|
||||
raise ValueError(
|
||||
f"Candidate content is None in Gemini TTS response. "
|
||||
f"Finish reason: {getattr(candidate, 'finish_reason', 'unknown')}"
|
||||
)
|
||||
|
||||
if not candidate.content.parts:
|
||||
logger.error(
|
||||
f"Gemini TTS content has no parts. "
|
||||
f"Content role: {getattr(candidate.content, 'role', 'unknown')}"
|
||||
)
|
||||
raise ValueError("No parts in Gemini TTS response content")
|
||||
|
||||
part = candidate.content.parts[0]
|
||||
if not hasattr(part, 'inline_data') or part.inline_data is None:
|
||||
logger.error(
|
||||
f"Gemini TTS part missing inline_data. "
|
||||
f"Part type: {type(part)}, Part: {part}"
|
||||
)
|
||||
raise ValueError("No inline_data in Gemini TTS response part")
|
||||
|
||||
pcm_data = part.inline_data.data
|
||||
|
||||
# Convert PCM to MP3
|
||||
mp3_data = self._pcm_to_mp3(pcm_data)
|
||||
|
|
@ -99,7 +141,17 @@ class GeminiTTSService:
|
|||
return mp3_data
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Gemini TTS synthesis failed: {e}")
|
||||
# Log comprehensive error information for debugging
|
||||
error_context = {
|
||||
"text_length": len(text),
|
||||
"text_preview": text[:100] + "..." if len(text) > 100 else text,
|
||||
"voice_name": voice_name,
|
||||
"language": language,
|
||||
"model_id": model_id,
|
||||
}
|
||||
logger.error(
|
||||
f"Gemini TTS synthesis failed: {e}. Context: {error_context}"
|
||||
)
|
||||
raise
|
||||
|
||||
async def synthesize_preview(
|
||||
|
|
@ -129,6 +181,80 @@ class GeminiTTSService:
|
|||
style_prompt=style_prompt
|
||||
)
|
||||
|
||||
async def _synthesize_cue_with_retry(
    self,
    cue_index: int,
    text: str,
    voice_name: str,
    language: str,
    model: str,
    speed: float,
    style_prompt: str,
    max_attempts: int = 3,
    base_delay: float = 1.0
) -> bytes:
    """
    Synthesize a single cue, retrying failed attempts with exponential backoff.

    Each attempt delegates to ``self.synthesize_text``. After a failed
    attempt that is not the last one, the coroutine sleeps for
    ``base_delay * 2 ** attempt`` seconds plus up to one second of random
    jitter, then tries again.

    Args:
        cue_index: Index of the cue (for error reporting)
        text: Text to synthesize
        voice_name: TTS voice name
        language: Language code
        model: Model variant
        speed: Speech rate
        style_prompt: Style instructions
        max_attempts: Total attempts (1 initial + retries). Values below 1
            are treated as 1 so the synthesis is always tried at least once.
        base_delay: Base delay in seconds for backoff

    Returns:
        MP3 audio bytes as returned by ``self.synthesize_text``

    Raises:
        TTSSynthesisError: If all attempts fail. The last underlying
            exception is attached as ``__cause__``.
    """
    import asyncio
    import random

    # A non-positive value would skip the loop entirely and report a
    # failure "after 0 attempts" without ever calling the API.
    max_attempts = max(1, max_attempts)
    last_exception = None

    for attempt in range(max_attempts):
        try:
            return await self.synthesize_text(
                text,
                voice_name,
                language,
                model=model,
                speed=speed,
                style_prompt=style_prompt
            )
        except Exception as e:
            # Deliberately broad: this is the retry boundary, and the final
            # failure is re-raised below as TTSSynthesisError.
            last_exception = e

            if attempt < max_attempts - 1:
                # Exponential backoff with jitter
                delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                logger.warning(
                    f"TTS synthesis attempt {attempt + 1}/{max_attempts} failed for cue {cue_index}. "
                    f"Retrying in {delay:.2f}s. Error: {e}"
                )
                await asyncio.sleep(delay)
            else:
                logger.error(
                    f"TTS synthesis FAILED after {max_attempts} attempts for cue {cue_index}. "
                    f"Text: {text[:50]}{'...' if len(text) > 50 else ''}. Error: {e}"
                )

    # All retries exhausted - raise hard failure. This statement runs outside
    # the except block, so without "from" the original traceback would be lost.
    raise TTSSynthesisError(
        message=f"TTS synthesis failed after {max_attempts} attempts: {last_exception}",
        cue_index=cue_index,
        cue_text=text,
        api_response_info=str(last_exception)
    ) from last_exception
|
||||
|
||||
async def synthesize_audio_description(
|
||||
self,
|
||||
ad_vtt_content: str,
|
||||
|
|
@ -190,31 +316,26 @@ class GeminiTTSService:
|
|||
if not text.endswith(('.', '!', '?')):
|
||||
text += "."
|
||||
|
||||
try:
|
||||
audio_data = await self.synthesize_text(
|
||||
text,
|
||||
voice_name,
|
||||
language,
|
||||
model=model,
|
||||
speed=speed,
|
||||
style_prompt=style_prompt
|
||||
)
|
||||
# Use retry helper - will raise TTSSynthesisError on failure after retries
|
||||
audio_data = await self._synthesize_cue_with_retry(
|
||||
cue_index=i,
|
||||
text=text,
|
||||
voice_name=voice_name,
|
||||
language=language,
|
||||
model=model,
|
||||
speed=speed,
|
||||
style_prompt=style_prompt,
|
||||
max_attempts=3,
|
||||
base_delay=1.0
|
||||
)
|
||||
|
||||
# Convert to AudioSegment and get actual duration
|
||||
audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
|
||||
audio_segments.append(audio_segment)
|
||||
# Convert to AudioSegment and get actual duration
|
||||
audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
|
||||
audio_segments.append(audio_segment)
|
||||
|
||||
# Update position based on actual audio duration
|
||||
actual_audio_duration = len(audio_segment) / 1000.0
|
||||
current_audio_position += actual_audio_duration
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to synthesize cue {i}: {e}")
|
||||
# Add silence for failed cue
|
||||
cue_duration = cue["end_time"] - cue["start_time"]
|
||||
silence = AudioSegment.silent(duration=int(cue_duration * 1000))
|
||||
audio_segments.append(silence)
|
||||
current_audio_position += cue_duration
|
||||
# Update position based on actual audio duration
|
||||
actual_audio_duration = len(audio_segment) / 1000.0
|
||||
current_audio_position += actual_audio_duration
|
||||
|
||||
# Combine all segments
|
||||
if audio_segments:
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ export function StatusBadge({ status }: StatusBadgeProps) {
|
|||
case 'approved_source':
|
||||
return 'bg-green-100 text-green-800';
|
||||
case 'rejected':
|
||||
case 'tts_failed':
|
||||
return 'bg-red-100 text-red-800';
|
||||
case 'translating':
|
||||
return 'bg-blue-100 text-blue-800';
|
||||
|
|
@ -51,6 +52,8 @@ export function StatusBadge({ status }: StatusBadgeProps) {
|
|||
return 'Approved for Translation';
|
||||
case 'rejected':
|
||||
return 'Rejected';
|
||||
case 'tts_failed':
|
||||
return 'TTS Failed';
|
||||
case 'translating':
|
||||
return 'Translating';
|
||||
case 'tts_generating':
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ export type JobStatus =
|
|||
| "qc_feedback"
|
||||
| "translating"
|
||||
| "tts_generating"
|
||||
| "tts_failed" // TTS synthesis failed after retries, requires reprocessing
|
||||
| "rendering_video" // Accessible video rendering in progress
|
||||
| "pending_final_review"
|
||||
| "completed";
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue