feat: auto-rewrite TTS cues that fail synthesis

When TTS synthesis fails after 3 retries, the system now:
- Sends problematic cue text to Gemini for TTS-safe rewriting
- Updates the VTT file in GCS with rewritten text
- Retries TTS synthesis with the new text
- Records successful rewrites in job.tts_rewrites field

UI changes:
- JobDetail shows amber caution box with original/rewritten text
- JobsList shows warning icon next to jobs with rewrites
- Error display clarifies text shown is "after rewrite attempt"

Files changed:
- backend/app/models/job.py: Add tts_rewrites field
- backend/app/prompts/gemini_tts_rewrite.md: New prompt template
- backend/app/services/gemini.py: Add rewrite_tts_cue method
- backend/app/tasks/tts_synthesis.py: Add VTT update utilities
- backend/app/tasks/translate_and_synthesize.py: Rewrite+retry logic
- frontend/src/types/api.ts: Add TTSRewriteItem type
- frontend/src/routes/jobs/JobDetail.tsx: Caution display
- frontend/src/routes/jobs/JobsList.tsx: Warning indicator

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-05 14:42:50 -06:00 · 2026-01-05 14:42:50 -06:00 · e44210ea64
commit e44210ea64
parent 76c4c60b0d
8 changed files with 326 additions and 17 deletions
--- a/backend/app/models/job.py
+++ b/backend/app/models/job.py
@ -116,6 +116,7 @@ class Job(BaseModel):
    accessible_video_progress: Optional[dict[str, AccessibleVideoProgressItem]] = None
    ai: Optional[AISection] = None
    error: Optional[dict[str, Any]] = None
+    tts_rewrites: Optional[list[dict[str, Any]]] = None  # Track auto-rewritten TTS cues
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None

--- a/backend/app/prompts/gemini_tts_rewrite.md
+++ b/backend/app/prompts/gemini_tts_rewrite.md
@ -0,0 +1,19 @@
+SYSTEM:
+You are an accessibility content editor specializing in text-to-speech optimization.
+
+USER:
+The following audio description cue failed text-to-speech synthesis after multiple attempts. Rewrite it to be TTS-friendly while preserving the exact same visual information being described.
+
+Original text: "{ORIGINAL_TEXT}"
+Language: {LANGUAGE}
+
+Guidelines for TTS-safe text:
+- Avoid special characters, symbols, or unusual punctuation
+- Spell out abbreviations and acronyms (e.g., "Hz" becomes "hertz")
+- Use simple, common words when possible
+- Keep sentences concise (under 100 characters preferred)
+- Avoid words that may be difficult to pronounce or sound medical/technical
+- Do not use quotation marks within the text
+- Use natural spoken phrasing
+
+Return ONLY the rewritten text, nothing else. Do not include quotes around the response.
--- a/backend/app/services/gemini.py
+++ b/backend/app/services/gemini.py
@ -786,6 +786,64 @@ VTT Content to translate:
            logger.error(f"Gemini translation failed for {target_language}: {e}")
            raise

+    async def rewrite_tts_cue(
+        self,
+        original_text: str,
+        language: str = "en"
+    ) -> str:
+        """
+        Rewrite an audio description cue to be TTS-friendly.
+
+        Called when TTS synthesis fails for a cue after retries. Uses Gemini
+        to rephrase the text while preserving the visual information being described.
+
+        Args:
+            original_text: The cue text that failed TTS synthesis
+            language: Language code for context (default: 'en')
+
+        Returns:
+            Rewritten text optimized for TTS synthesis
+        """
+        prompt_template = self._load_prompt("gemini_tts_rewrite.md")
+        prompt = prompt_template.replace(
+            "{ORIGINAL_TEXT}", original_text
+        ).replace(
+            "{LANGUAGE}", language
+        )
+
+        try:
+            logger.info(f"Rewriting TTS cue for safety: '{original_text[:50]}...'")
+
+            response = await asyncio.to_thread(
+                client.models.generate_content,
+                model=self.model_name,
+                contents=[genai.types.Part.from_text(text=prompt)]
+            )
+
+            result = response.text.strip()
+
+            # Remove any markdown formatting or quotes that Gemini might add
+            if result.startswith("```"):
+                lines = result.split("\n")
+                filtered_lines = [
+                    line for line in lines
+                    if not line.strip().startswith("```")
+                ]
+                result = "\n".join(filtered_lines).strip()
+
+            # Remove surrounding quotes if present
+            if result.startswith('"') and result.endswith('"'):
+                result = result[1:-1]
+            if result.startswith("'") and result.endswith("'"):
+                result = result[1:-1]
+
+            logger.info(f"Rewrote TTS cue: '{original_text[:30]}...' -> '{result[:30]}...'")
+            return result
+
+        except Exception as e:
+            logger.error(f"Failed to rewrite TTS cue: {e}")
+            raise
+

 # Global service instance
 gemini_service = GeminiService()
--- a/backend/app/tasks/translate_and_synthesize.py
+++ b/backend/app/tasks/translate_and_synthesize.py
@ -573,7 +573,7 @@ async def _generate_language_tts(job_id: str, language: str, lang_output: dict,
    import io
    from celery.result import allow_join_result
    from pydub import AudioSegment
-    from .tts_synthesis import dispatch_language_tts, parse_ad_cues
+    from .tts_synthesis import dispatch_language_tts, parse_ad_cues, synthesize_cue_task, update_vtt_in_gcs

    if tts_preferences is None:
        tts_preferences = {}
@ -651,16 +651,106 @@ async def _generate_language_tts(job_id: str, language: str, lang_output: dict,

        cue_results = processed_results

-        # Check for failures
+        # Check for failures and attempt automatic rewrite+retry
        failed_cues = [r for r in cue_results if not r.get("success", False)]
        if failed_cues:
-            first_failure = failed_cues[0]
-            raise TTSSynthesisError(
-                message=f"{len(failed_cues)} cue(s) failed: {first_failure.get('error_message', 'Unknown error')}",
-                cue_index=first_failure["cue_index"],
-                cue_text=first_failure.get("text", ""),
-                api_response_info=first_failure.get("error_message")
-            )
+            logger.info(f"TTS failed for {len(failed_cues)} cue(s), attempting automatic rewrite")
+
+            # Extract TTS settings for retry (same as dispatch_language_tts)
+            voices_per_language = tts_preferences.get("voices_per_language", {})
+            voice_name = voices_per_language.get(language, tts_preferences.get("default_voice"))
+            provider = tts_preferences.get("provider", "gemini")
+            model = tts_preferences.get("model", "flash")
+            speed = tts_preferences.get("speed", 1.0)
+            style_preset = tts_preferences.get("style_preset", "neutral")
+            custom_style_prompt = tts_preferences.get("custom_style_prompt")
+
+            if style_preset == "custom" and custom_style_prompt:
+                style_prompt = custom_style_prompt
+            else:
+                style_prompt = settings.gemini_tts_style_prompts.get(style_preset, "")
+
+            rewrites_made = []
+            still_failed = []
+
+            for failure in failed_cues:
+                cue_idx = failure["cue_index"]
+                original_text = failure.get("text", "")
+
+                try:
+                    # Step 1: Get Gemini rewrite
+                    logger.info(f"Requesting Gemini rewrite for cue {cue_idx}: '{original_text[:50]}...'")
+                    rewritten_text = await gemini_service.rewrite_tts_cue(original_text, language)
+
+                    # Step 2: Update VTT in GCS
+                    await update_vtt_in_gcs(job_id, language, cue_idx, rewritten_text)
+
+                    # Step 3: Retry TTS with rewritten text
+                    logger.info(f"Retrying TTS for cue {cue_idx} with rewritten text")
+                    retry_result = synthesize_cue_task.apply_async(
+                        kwargs={
+                            "job_id": job_id,
+                            "language": language,
+                            "cue_index": cue_idx,
+                            "text": rewritten_text,
+                            "start_time": failure["start_time"],
+                            "end_time": failure["end_time"],
+                            "voice_name": voice_name,
+                            "provider": provider,
+                            "model": model,
+                            "speed": speed,
+                            "style_prompt": style_prompt
+                        },
+                        queue="tts"
+                    )
+
+                    # Wait for retry result
+                    with allow_join_result():
+                        retry_cue_result = retry_result.get(timeout=120)
+
+                    if retry_cue_result.get("success"):
+                        # Success! Record rewrite
+                        logger.info(f"Rewrite+retry succeeded for cue {cue_idx}")
+                        rewrites_made.append({
+                            "language": language,
+                            "cue_index": cue_idx,
+                            "original_text": original_text,
+                            "rewritten_text": rewritten_text,
+                            "timestamp": datetime.utcnow().isoformat()
+                        })
+                        # Update cue_results with successful result
+                        for i, r in enumerate(cue_results):
+                            if r["cue_index"] == cue_idx:
+                                cue_results[i] = retry_cue_result
+                                break
+                    else:
+                        logger.warning(f"TTS still failed after rewrite for cue {cue_idx}")
+                        still_failed.append({**failure, "rewritten_text": rewritten_text})
+
+                except Exception as e:
+                    logger.error(f"Rewrite attempt failed for cue {cue_idx}: {e}")
+                    still_failed.append(failure)
+
+            # Store rewrite history in job for UI caution display
+            if rewrites_made:
+                await db.jobs.update_one(
+                    {"_id": job_id},
+                    {
+                        "$push": {"tts_rewrites": {"$each": rewrites_made}},
+                        "$set": {"updated_at": datetime.utcnow()}
+                    }
+                )
+                logger.info(f"Recorded {len(rewrites_made)} TTS rewrite(s) for job {job_id}")
+
+            # If any cues still failed after rewrite, raise error
+            if still_failed:
+                first_failure = still_failed[0]
+                raise TTSSynthesisError(
+                    message=f"{len(still_failed)} cue(s) failed even after rewrite: {first_failure.get('error_message', 'Unknown error')}",
+                    cue_index=first_failure["cue_index"],
+                    cue_text=first_failure.get("rewritten_text", first_failure.get("text", "")),
+                    api_response_info=first_failure.get("error_message")
+                )

        logger.info(f"All {len(cue_results)} TTS cues synthesized for {language}, assembling combined MP3")

--- a/backend/app/tasks/tts_synthesis.py
+++ b/backend/app/tasks/tts_synthesis.py
@ -376,3 +376,87 @@ def _parse_timestamp(timestamp: str) -> float:
    )

    return total_seconds
+
+
+def update_vtt_cue_text(vtt_content: str, cue_index: int, new_text: str) -> str:
+    """
+    Update a specific cue's text in VTT content.
+
+    Args:
+        vtt_content: Original VTT file content
+        cue_index: Zero-based index of cue to update
+        new_text: New text for the cue
+
+    Returns:
+        Updated VTT content
+    """
+    lines = vtt_content.strip().split('\n')
+    result_lines = []
+    current_cue = -1
+    i = 0
+
+    while i < len(lines):
+        line = lines[i]
+
+        # Skip header and notes
+        if line.strip() == "WEBVTT" or line.strip() == "" or line.strip().startswith("NOTE"):
+            result_lines.append(line)
+            i += 1
+            continue
+
+        # Check for timing line
+        if " --> " in line:
+            current_cue += 1
+            result_lines.append(line)
+            i += 1
+
+            # Process text lines for this cue
+            if current_cue == cue_index:
+                # Skip old text lines
+                while i < len(lines) and lines[i].strip() != "":
+                    i += 1
+                # Add new text
+                result_lines.append(new_text)
+            else:
+                # Keep existing text lines
+                while i < len(lines) and lines[i].strip() != "":
+                    result_lines.append(lines[i])
+                    i += 1
+        else:
+            result_lines.append(line)
+            i += 1
+
+    return '\n'.join(result_lines)
+
+
+async def update_vtt_in_gcs(
+    job_id: str,
+    language: str,
+    cue_index: int,
+    new_text: str
+) -> str:
+    """
+    Update a cue in the AD VTT file stored in GCS.
+
+    Args:
+        job_id: Job identifier
+        language: Language code
+        cue_index: Index of cue to update
+        new_text: New text for the cue
+
+    Returns:
+        Updated VTT content
+    """
+    # Download current VTT
+    vtt_blob_path = f"{job_id}/{language}/ad.vtt"
+    blob = gcs_service.bucket.blob(vtt_blob_path)
+    current_vtt = blob.download_as_text()
+
+    # Update the cue
+    updated_vtt = update_vtt_cue_text(current_vtt, cue_index, new_text)
+
+    # Upload back to GCS
+    blob.upload_from_string(updated_vtt, content_type="text/vtt")
+
+    logger.info(f"Updated VTT cue {cue_index} in GCS: {vtt_blob_path}")
+    return updated_vtt
--- a/frontend/src/routes/jobs/JobDetail.tsx
+++ b/frontend/src/routes/jobs/JobDetail.tsx
@ -475,12 +475,12 @@ export function JobDetail() {
                  )}
                  {typeof job.error.cue_text === 'string' && (
                    <div className="mt-2 p-2 bg-red-100 rounded text-xs text-red-700">
-                      <span className="font-medium">Cue Text:</span>
+                      <span className="font-medium">Cue Text (after rewrite attempt):</span>
                      <p className="mt-1 italic">"{job.error.cue_text}"</p>
                    </div>
                  )}
                  <p className="text-xs text-red-600 mt-2">
-                    This may be caused by content safety filters. Try editing the audio description text to rephrase the blocked cue, then retry.
+                    This cue failed even after automatic rewriting. Manual editing may be required.
                  </p>
                </div>
              )}
@ -505,6 +505,41 @@ export function JobDetail() {
            </div>
          )}

+          {/* TTS Rewrites Caution Display */}
+          {job.tts_rewrites && job.tts_rewrites.length > 0 && (
+            <div className="bg-amber-50 border border-amber-200 rounded-lg p-4">
+              <div className="flex items-start gap-2 mb-2">
+                <svg className="w-5 h-5 text-amber-600 flex-shrink-0 mt-0.5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
+                  <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-3L13.732 4c-.77-1.333-2.694-1.333-3.464 0L3.34 16c-.77 1.333.192 3 1.732 3z" />
+                </svg>
+                <h3 className="text-sm font-medium text-amber-800">Audio Description Rewrites</h3>
+              </div>
+              <p className="text-xs text-amber-700 mb-3">
+                {job.tts_rewrites.length} cue{job.tts_rewrites.length > 1 ? 's were' : ' was'} automatically rewritten to pass TTS synthesis.
+                Please review for accuracy.
+              </p>
+              <div className="space-y-3 max-h-64 overflow-y-auto">
+                {job.tts_rewrites.map((rewrite, idx) => (
+                  <div key={idx} className="border-t border-amber-200 pt-2">
+                    <p className="text-xs text-amber-800 font-medium">
+                      {rewrite.language.toUpperCase()} - Cue #{rewrite.cue_index + 1}
+                    </p>
+                    <div className="mt-1 grid grid-cols-1 gap-2">
+                      <div className="p-2 bg-amber-100 rounded text-xs">
+                        <span className="font-medium text-amber-700">Original:</span>
+                        <p className="mt-1 text-amber-800 line-through">"{rewrite.original_text}"</p>
+                      </div>
+                      <div className="p-2 bg-green-100 rounded text-xs">
+                        <span className="font-medium text-green-700">Rewritten:</span>
+                        <p className="mt-1 text-green-800">"{rewrite.rewritten_text}"</p>
+                      </div>
+                    </div>
+                  </div>
+                ))}
+              </div>
+            </div>
+          )}
+
          {/* Retry TTS Button for failed jobs */}
          {job.status === 'tts_failed' && (
            <div className="bg-orange-50 border border-orange-200 rounded-lg p-4">
--- a/frontend/src/routes/jobs/JobsList.tsx
+++ b/frontend/src/routes/jobs/JobsList.tsx
@ -723,12 +723,25 @@ export function JobsList() {
                    )}
                    {/* Job Name */}
                    <td className="px-4 py-4 whitespace-nowrap">
-                      <Link
-                        to={`/jobs/${job.id}`}
-                        className="text-sm font-medium text-gray-900 hover:text-blue-600"
-                      >
-                        {job.title}
-                      </Link>
+                      <div className="flex items-center gap-2">
+                        <Link
+                          to={`/jobs/${job.id}`}
+                          className="text-sm font-medium text-gray-900 hover:text-blue-600"
+                        >
+                          {job.title}
+                        </Link>
+                        {/* TTS Rewrite Caution Indicator */}
+                        {job.tts_rewrites && job.tts_rewrites.length > 0 && (
+                          <span
+                            title={`${job.tts_rewrites.length} cue${job.tts_rewrites.length > 1 ? 's were' : ' was'} auto-rewritten for TTS`}
+                            className="inline-flex items-center"
+                          >
+                            <svg className="w-4 h-4 text-amber-500" fill="none" stroke="currentColor" viewBox="0 0 24 24">
+                              <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-3L13.732 4c-.77-1.333-2.694-1.333-3.464 0L3.34 16c-.77 1.333.192 3 1.732 3z" />
+                            </svg>
+                          </span>
+                        )}
+                      </div>
                    </td>
                    {/* Created By */}
                    <td className="px-4 py-4 whitespace-nowrap text-sm text-gray-500">
--- a/frontend/src/types/api.ts
+++ b/frontend/src/types/api.ts
@ -139,6 +139,14 @@ export interface AccessibleVideoProgressItem {
  completed_at?: string;
 }

+export interface TTSRewriteItem {  // One cue the backend auto-rewrote after TTS synthesis failed; recorded only when the retry succeeded
+  language: string;  // Language code of the track the cue belongs to
+  cue_index: number;  // Zero-based cue index (JobDetail displays it as cue_index + 1)
+  original_text: string;  // Cue text that failed synthesis
+  rewritten_text: string;  // Replacement text used for the successful retry
+  timestamp: string;  // ISO 8601 string from Python datetime.utcnow().isoformat() - UTC, no offset suffix
+}
+
 export interface Job {
  id: string;
  client_id: string;
@ -151,6 +159,7 @@ export interface Job {
  accessible_video_progress?: Record<string, AccessibleVideoProgressItem>;
  ai?: AISection;
  error?: Record<string, unknown>;
+  tts_rewrites?: TTSRewriteItem[];  // Track auto-rewritten TTS cues
  created_at: string;
  updated_at: string;
  created_by_name?: string;  // User's full_name who created the job