feat: auto-rewrite TTS cues that fail synthesis

When TTS synthesis fails after 3 retries, the system now:
- Sends problematic cue text to Gemini for TTS-safe rewriting
- Updates the VTT file in GCS with rewritten text
- Retries TTS synthesis with the new text
- Records successful rewrites in job.tts_rewrites field

UI changes:
- JobDetail shows amber caution box with original/rewritten text
- JobsList shows warning icon next to jobs with rewrites
- Error display clarifies text shown is "after rewrite attempt"

Files changed:
- backend/app/models/job.py: Add tts_rewrites field
- backend/app/prompts/gemini_tts_rewrite.md: New prompt template
- backend/app/services/gemini.py: Add rewrite_tts_cue method
- backend/app/tasks/tts_synthesis.py: Add VTT update utilities
- backend/app/tasks/translate_and_synthesize.py: Rewrite+retry logic
- frontend/src/types/api.ts: Add TTSRewriteItem type
- frontend/src/routes/jobs/JobDetail.tsx: Caution display
- frontend/src/routes/jobs/JobsList.tsx: Warning indicator

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
michael 2026-01-05 14:42:50 -06:00
parent 76c4c60b0d
commit e44210ea64
8 changed files with 326 additions and 17 deletions

View file

@ -116,6 +116,7 @@ class Job(BaseModel):
accessible_video_progress: Optional[dict[str, AccessibleVideoProgressItem]] = None
ai: Optional[AISection] = None
error: Optional[dict[str, Any]] = None
tts_rewrites: Optional[list[dict[str, Any]]] = None # Track auto-rewritten TTS cues
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None

View file

@ -0,0 +1,19 @@
SYSTEM:
You are an accessibility content editor specializing in text-to-speech optimization.
USER:
The following audio description cue failed text-to-speech synthesis after multiple attempts. Rewrite it to be TTS-friendly while preserving the exact same visual information being described.
Original text: "{ORIGINAL_TEXT}"
Language: {LANGUAGE}
Guidelines for TTS-safe text:
- Avoid special characters, symbols, or unusual punctuation
- Spell out abbreviations and acronyms (e.g., "Hz" becomes "hertz")
- Use simple, common words when possible
- Keep sentences concise (under 100 characters preferred)
- Avoid words that may be difficult to pronounce or sound medical/technical
- Do not use quotation marks within the text
- Use natural spoken phrasing
Return ONLY the rewritten text, nothing else. Do not include quotes around the response.

View file

@ -786,6 +786,64 @@ VTT Content to translate:
logger.error(f"Gemini translation failed for {target_language}: {e}")
raise
async def rewrite_tts_cue(
    self,
    original_text: str,
    language: str = "en"
) -> str:
    """
    Rewrite an audio description cue to be TTS-friendly.

    Called when TTS synthesis fails for a cue after retries. Uses Gemini
    to rephrase the text while preserving the visual information being described.

    Args:
        original_text: The cue text that failed TTS synthesis
        language: Language code for context (default: 'en')

    Returns:
        Rewritten text optimized for TTS synthesis. Never empty.

    Raises:
        ValueError: If Gemini returns no usable text (no text part in the
            response, or nothing left after removing fences/quotes).
        Exception: Any error from the Gemini call is logged and re-raised.
    """
    prompt_template = self._load_prompt("gemini_tts_rewrite.md")
    # Substitute {LANGUAGE} first so that cue text which happens to contain
    # the literal "{LANGUAGE}" is inserted verbatim instead of being expanded.
    prompt = prompt_template.replace(
        "{LANGUAGE}", language
    ).replace(
        "{ORIGINAL_TEXT}", original_text
    )

    try:
        logger.info(f"Rewriting TTS cue for safety: '{original_text[:50]}...'")

        # NOTE(review): `client` is not defined inside this block; it is
        # expected to come from the enclosing scope - confirm.
        response = await asyncio.to_thread(
            client.models.generate_content,
            model=self.model_name,
            contents=[genai.types.Part.from_text(text=prompt)]
        )

        # response.text may be None when the response carries no text part
        # (for example a filtered response); treat that as an empty rewrite.
        result = (response.text or "").strip()

        # Remove any markdown formatting or quotes that Gemini might add
        if result.startswith("```"):
            result = "\n".join(
                line for line in result.split("\n")
                if not line.strip().startswith("```")
            ).strip()

        # Remove surrounding quotes if present
        for quote in ('"', "'"):
            if result.startswith(quote) and result.endswith(quote):
                result = result[1:-1]
        result = result.strip()

        # An empty rewrite would be written to the VTT and sent to TTS as
        # silence, so fail loudly instead of returning it.
        if not result:
            raise ValueError("Gemini returned an empty rewrite for TTS cue")

        logger.info(f"Rewrote TTS cue: '{original_text[:30]}...' -> '{result[:30]}...'")
        return result

    except Exception as e:
        logger.error(f"Failed to rewrite TTS cue: {e}")
        raise
# Global service instance
gemini_service = GeminiService()

View file

@ -573,7 +573,7 @@ async def _generate_language_tts(job_id: str, language: str, lang_output: dict,
import io
from celery.result import allow_join_result
from pydub import AudioSegment
from .tts_synthesis import dispatch_language_tts, parse_ad_cues
from .tts_synthesis import dispatch_language_tts, parse_ad_cues, synthesize_cue_task, update_vtt_in_gcs
if tts_preferences is None:
tts_preferences = {}
@ -651,16 +651,106 @@ async def _generate_language_tts(job_id: str, language: str, lang_output: dict,
cue_results = processed_results
# Check for failures
# Check for failures and attempt automatic rewrite+retry
failed_cues = [r for r in cue_results if not r.get("success", False)]
if failed_cues:
first_failure = failed_cues[0]
raise TTSSynthesisError(
message=f"{len(failed_cues)} cue(s) failed: {first_failure.get('error_message', 'Unknown error')}",
cue_index=first_failure["cue_index"],
cue_text=first_failure.get("text", ""),
api_response_info=first_failure.get("error_message")
)
logger.info(f"TTS failed for {len(failed_cues)} cue(s), attempting automatic rewrite")
# Extract TTS settings for retry (same as dispatch_language_tts)
voices_per_language = tts_preferences.get("voices_per_language", {})
voice_name = voices_per_language.get(language, tts_preferences.get("default_voice"))
provider = tts_preferences.get("provider", "gemini")
model = tts_preferences.get("model", "flash")
speed = tts_preferences.get("speed", 1.0)
style_preset = tts_preferences.get("style_preset", "neutral")
custom_style_prompt = tts_preferences.get("custom_style_prompt")
if style_preset == "custom" and custom_style_prompt:
style_prompt = custom_style_prompt
else:
style_prompt = settings.gemini_tts_style_prompts.get(style_preset, "")
rewrites_made = []
still_failed = []
for failure in failed_cues:
cue_idx = failure["cue_index"]
original_text = failure.get("text", "")
try:
# Step 1: Get Gemini rewrite
logger.info(f"Requesting Gemini rewrite for cue {cue_idx}: '{original_text[:50]}...'")
rewritten_text = await gemini_service.rewrite_tts_cue(original_text, language)
# Step 2: Update VTT in GCS
await update_vtt_in_gcs(job_id, language, cue_idx, rewritten_text)
# Step 3: Retry TTS with rewritten text
logger.info(f"Retrying TTS for cue {cue_idx} with rewritten text")
retry_result = synthesize_cue_task.apply_async(
kwargs={
"job_id": job_id,
"language": language,
"cue_index": cue_idx,
"text": rewritten_text,
"start_time": failure["start_time"],
"end_time": failure["end_time"],
"voice_name": voice_name,
"provider": provider,
"model": model,
"speed": speed,
"style_prompt": style_prompt
},
queue="tts"
)
# Wait for retry result
with allow_join_result():
retry_cue_result = retry_result.get(timeout=120)
if retry_cue_result.get("success"):
# Success! Record rewrite
logger.info(f"Rewrite+retry succeeded for cue {cue_idx}")
rewrites_made.append({
"language": language,
"cue_index": cue_idx,
"original_text": original_text,
"rewritten_text": rewritten_text,
"timestamp": datetime.utcnow().isoformat()
})
# Update cue_results with successful result
for i, r in enumerate(cue_results):
if r["cue_index"] == cue_idx:
cue_results[i] = retry_cue_result
break
else:
logger.warning(f"TTS still failed after rewrite for cue {cue_idx}")
still_failed.append({**failure, "rewritten_text": rewritten_text})
except Exception as e:
logger.error(f"Rewrite attempt failed for cue {cue_idx}: {e}")
still_failed.append(failure)
# Store rewrite history in job for UI caution display
if rewrites_made:
await db.jobs.update_one(
{"_id": job_id},
{
"$push": {"tts_rewrites": {"$each": rewrites_made}},
"$set": {"updated_at": datetime.utcnow()}
}
)
logger.info(f"Recorded {len(rewrites_made)} TTS rewrite(s) for job {job_id}")
# If any cues still failed after rewrite, raise error
if still_failed:
first_failure = still_failed[0]
raise TTSSynthesisError(
message=f"{len(still_failed)} cue(s) failed even after rewrite: {first_failure.get('error_message', 'Unknown error')}",
cue_index=first_failure["cue_index"],
cue_text=first_failure.get("rewritten_text", first_failure.get("text", "")),
api_response_info=first_failure.get("error_message")
)
logger.info(f"All {len(cue_results)} TTS cues synthesized for {language}, assembling combined MP3")

View file

@ -376,3 +376,87 @@ def _parse_timestamp(timestamp: str) -> float:
)
return total_seconds
def update_vtt_cue_text(vtt_content: str, cue_index: int, new_text: str) -> str:
    """
    Update a specific cue's text in VTT content.

    Cues are counted by their timing lines (lines containing " --> ") in
    file order. All text lines of the selected cue, up to the next blank
    line, are replaced. If cue_index matches no cue, the content is returned
    unchanged apart from surrounding whitespace being stripped.

    Args:
        vtt_content: Original VTT file content
        cue_index: Zero-based index of cue to update
        new_text: New text for the cue; blank lines in it are dropped

    Returns:
        Updated VTT content (lines joined with newlines, no trailing newline)
    """
    # A blank line terminates a WebVTT cue, so blank lines (or a trailing
    # newline) inside the replacement would cut the cue short and corrupt
    # the block structure. Keep only the non-blank lines.
    replacement_lines = [
        text_line for text_line in new_text.splitlines() if text_line.strip()
    ] or [""]

    lines = vtt_content.strip().split('\n')
    result_lines = []
    current_cue = -1
    i = 0

    while i < len(lines):
        line = lines[i]

        # Header, blank lines, NOTE lines and any other non-timing line are
        # copied through untouched.
        if line.strip().startswith("NOTE") or " --> " not in line:
            result_lines.append(line)
            i += 1
            continue

        # Timing line: starts the next cue
        current_cue += 1
        result_lines.append(line)
        i += 1

        if current_cue == cue_index:
            # Skip old text lines, then add the new text
            while i < len(lines) and lines[i].strip() != "":
                i += 1
            result_lines.extend(replacement_lines)
        else:
            # Keep existing text lines
            while i < len(lines) and lines[i].strip() != "":
                result_lines.append(lines[i])
                i += 1

    return '\n'.join(result_lines)
async def update_vtt_in_gcs(
    job_id: str,
    language: str,
    cue_index: int,
    new_text: str
) -> str:
    """
    Replace one cue's text in the job's AD VTT file stored in GCS.

    Reads `{job_id}/{language}/ad.vtt` from the bucket, swaps the text of the
    requested cue via update_vtt_cue_text, and writes the result back to the
    same blob.

    Args:
        job_id: Job identifier
        language: Language code
        cue_index: Index of cue to update
        new_text: New text for the cue

    Returns:
        The VTT content that was uploaded
    """
    blob_path = f"{job_id}/{language}/ad.vtt"
    vtt_blob = gcs_service.bucket.blob(blob_path)

    # Read-modify-write: fetch the current file and patch the single cue
    patched_vtt = update_vtt_cue_text(
        vtt_blob.download_as_text(), cue_index, new_text
    )
    vtt_blob.upload_from_string(patched_vtt, content_type="text/vtt")

    logger.info(f"Updated VTT cue {cue_index} in GCS: {blob_path}")
    return patched_vtt

View file

@ -475,12 +475,12 @@ export function JobDetail() {
)}
{typeof job.error.cue_text === 'string' && (
<div className="mt-2 p-2 bg-red-100 rounded text-xs text-red-700">
<span className="font-medium">Cue Text:</span>
<span className="font-medium">Cue Text (after rewrite attempt):</span>
<p className="mt-1 italic">"{job.error.cue_text}"</p>
</div>
)}
<p className="text-xs text-red-600 mt-2">
This may be caused by content safety filters. Try editing the audio description text to rephrase the blocked cue, then retry.
This cue failed even after automatic rewriting. Manual editing may be required.
</p>
</div>
)}
@ -505,6 +505,41 @@ export function JobDetail() {
</div>
)}
{/* TTS Rewrites Caution Display */}
{job.tts_rewrites && job.tts_rewrites.length > 0 && (
<div className="bg-amber-50 border border-amber-200 rounded-lg p-4">
<div className="flex items-start gap-2 mb-2">
<svg className="w-5 h-5 text-amber-600 flex-shrink-0 mt-0.5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-3L13.732 4c-.77-1.333-2.694-1.333-3.464 0L3.34 16c-.77 1.333.192 3 1.732 3z" />
</svg>
<h3 className="text-sm font-medium text-amber-800">Audio Description Rewrites</h3>
</div>
<p className="text-xs text-amber-700 mb-3">
{job.tts_rewrites.length} cue{job.tts_rewrites.length > 1 ? 's were' : ' was'} automatically rewritten to pass TTS synthesis.
Please review for accuracy.
</p>
<div className="space-y-3 max-h-64 overflow-y-auto">
{job.tts_rewrites.map((rewrite, idx) => (
<div key={idx} className="border-t border-amber-200 pt-2">
<p className="text-xs text-amber-800 font-medium">
{rewrite.language.toUpperCase()} - Cue #{rewrite.cue_index + 1}
</p>
<div className="mt-1 grid grid-cols-1 gap-2">
<div className="p-2 bg-amber-100 rounded text-xs">
<span className="font-medium text-amber-700">Original:</span>
<p className="mt-1 text-amber-800 line-through">"{rewrite.original_text}"</p>
</div>
<div className="p-2 bg-green-100 rounded text-xs">
<span className="font-medium text-green-700">Rewritten:</span>
<p className="mt-1 text-green-800">"{rewrite.rewritten_text}"</p>
</div>
</div>
</div>
))}
</div>
</div>
)}
{/* Retry TTS Button for failed jobs */}
{job.status === 'tts_failed' && (
<div className="bg-orange-50 border border-orange-200 rounded-lg p-4">

View file

@ -723,12 +723,25 @@ export function JobsList() {
)}
{/* Job Name */}
<td className="px-4 py-4 whitespace-nowrap">
<Link
to={`/jobs/${job.id}`}
className="text-sm font-medium text-gray-900 hover:text-blue-600"
>
{job.title}
</Link>
<div className="flex items-center gap-2">
<Link
to={`/jobs/${job.id}`}
className="text-sm font-medium text-gray-900 hover:text-blue-600"
>
{job.title}
</Link>
{/* TTS Rewrite Caution Indicator */}
{job.tts_rewrites && job.tts_rewrites.length > 0 && (
<span
title={`${job.tts_rewrites.length} cue${job.tts_rewrites.length > 1 ? 's were' : ' was'} auto-rewritten for TTS`}
className="inline-flex items-center"
>
<svg className="w-4 h-4 text-amber-500" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-3L13.732 4c-.77-1.333-2.694-1.333-3.464 0L3.34 16c-.77 1.333.192 3 1.732 3z" />
</svg>
</span>
)}
</div>
</td>
{/* Created By */}
<td className="px-4 py-4 whitespace-nowrap text-sm text-gray-500">

View file

@ -139,6 +139,14 @@ export interface AccessibleVideoProgressItem {
completed_at?: string;
}
/**
 * One automatically rewritten audio description cue, as stored in a job's
 * `tts_rewrites` list after a rewrite + TTS retry succeeded.
 */
export interface TTSRewriteItem {
// Language code of the track the cue belongs to
language: string;
// Index of the cue within that track; the UI displays it as cue_index + 1
cue_index: number;
// Cue text that failed TTS synthesis
original_text: string;
// Replacement text that was synthesized instead
rewritten_text: string;
// ISO-format time string recorded by the backend when the rewrite succeeded
timestamp: string;
}
export interface Job {
id: string;
client_id: string;
@ -151,6 +159,7 @@ export interface Job {
accessible_video_progress?: Record<string, AccessibleVideoProgressItem>;
ai?: AISection;
error?: Record<string, unknown>;
tts_rewrites?: TTSRewriteItem[]; // Track auto-rewritten TTS cues
created_at: string;
updated_at: string;
created_by_name?: string; // User's full_name who created the job