diff --git a/backend/app/models/job.py b/backend/app/models/job.py index 2f2cf94..f7bc1ae 100644 --- a/backend/app/models/job.py +++ b/backend/app/models/job.py @@ -116,6 +116,7 @@ class Job(BaseModel): accessible_video_progress: Optional[dict[str, AccessibleVideoProgressItem]] = None ai: Optional[AISection] = None error: Optional[dict[str, Any]] = None + tts_rewrites: Optional[list[dict[str, Any]]] = None # Track auto-rewritten TTS cues created_at: Optional[datetime] = None updated_at: Optional[datetime] = None diff --git a/backend/app/prompts/gemini_tts_rewrite.md b/backend/app/prompts/gemini_tts_rewrite.md new file mode 100644 index 0000000..5857a0d --- /dev/null +++ b/backend/app/prompts/gemini_tts_rewrite.md @@ -0,0 +1,19 @@ +SYSTEM: +You are an accessibility content editor specializing in text-to-speech optimization. + +USER: +The following audio description cue failed text-to-speech synthesis after multiple attempts. Rewrite it to be TTS-friendly while preserving the exact same visual information being described. + +Original text: "{ORIGINAL_TEXT}" +Language: {LANGUAGE} + +Guidelines for TTS-safe text: +- Avoid special characters, symbols, or unusual punctuation +- Spell out abbreviations and acronyms (e.g., "Hz" becomes "hertz") +- Use simple, common words when possible +- Keep sentences concise (under 100 characters preferred) +- Avoid words that may be difficult to pronounce or sound medical/technical +- Do not use quotation marks within the text +- Use natural spoken phrasing + +Return ONLY the rewritten text, nothing else. Do not include quotes around the response. 
async def rewrite_tts_cue(
    self,
    original_text: str,
    language: str = "en"
) -> str:
    """
    Rewrite an audio description cue to be TTS-friendly.

    Called when TTS synthesis fails for a cue after retries. Uses Gemini
    to rephrase the text while preserving the visual information being described.

    Args:
        original_text: The cue text that failed TTS synthesis
        language: Language code for context (default: 'en')

    Returns:
        Rewritten text optimized for TTS synthesis. Never empty.

    Raises:
        ValueError: If Gemini returns no text, or only markdown fences /
            quotes, so there is nothing usable to synthesize.
        Exception: Any error raised while calling Gemini is logged and
            re-raised unchanged.
    """
    prompt_template = self._load_prompt("gemini_tts_rewrite.md")
    # Fill in the language first and the cue text last, so a cue that happens
    # to contain the literal "{LANGUAGE}" token is passed through untouched.
    prompt = prompt_template.replace(
        "{LANGUAGE}", language
    ).replace(
        "{ORIGINAL_TEXT}", original_text
    )

    try:
        logger.info(f"Rewriting TTS cue for safety: '{original_text[:50]}...'")

        # generate_content is a blocking call; run it off the event loop.
        response = await asyncio.to_thread(
            client.models.generate_content,
            model=self.model_name,
            contents=[genai.types.Part.from_text(text=prompt)]
        )

        # NOTE(review): response.text is expected to be None when the response
        # carries no text part (e.g. a blocked request) - confirm against the
        # SDK. Treat that the same as an empty answer instead of crashing
        # with AttributeError on .strip().
        result = (response.text or "").strip()

        # Remove any markdown formatting or quotes that Gemini might add
        if result.startswith("```"):
            filtered_lines = [
                line for line in result.split("\n")
                if not line.strip().startswith("```")
            ]
            result = "\n".join(filtered_lines).strip()

        # Remove surrounding quotes if present (double first, then single,
        # so a doubly-wrapped answer is fully unwrapped).
        for quote in ('"', "'"):
            if len(result) >= 2 and result.startswith(quote) and result.endswith(quote):
                result = result[1:-1].strip()

        if not result:
            # An empty rewrite would be written to the VTT and sent to TTS,
            # which can only fail again; surface the problem here instead.
            raise ValueError("Gemini returned an empty rewrite for TTS cue")

        logger.info(f"Rewrote TTS cue: '{original_text[:30]}...' -> '{result[:30]}...'")
        return result

    except Exception as e:
        logger.error(f"Failed to rewrite TTS cue: {e}")
        raise
tts_preferences.get("custom_style_prompt") + + if style_preset == "custom" and custom_style_prompt: + style_prompt = custom_style_prompt + else: + style_prompt = settings.gemini_tts_style_prompts.get(style_preset, "") + + rewrites_made = [] + still_failed = [] + + for failure in failed_cues: + cue_idx = failure["cue_index"] + original_text = failure.get("text", "") + + try: + # Step 1: Get Gemini rewrite + logger.info(f"Requesting Gemini rewrite for cue {cue_idx}: '{original_text[:50]}...'") + rewritten_text = await gemini_service.rewrite_tts_cue(original_text, language) + + # Step 2: Update VTT in GCS + await update_vtt_in_gcs(job_id, language, cue_idx, rewritten_text) + + # Step 3: Retry TTS with rewritten text + logger.info(f"Retrying TTS for cue {cue_idx} with rewritten text") + retry_result = synthesize_cue_task.apply_async( + kwargs={ + "job_id": job_id, + "language": language, + "cue_index": cue_idx, + "text": rewritten_text, + "start_time": failure["start_time"], + "end_time": failure["end_time"], + "voice_name": voice_name, + "provider": provider, + "model": model, + "speed": speed, + "style_prompt": style_prompt + }, + queue="tts" + ) + + # Wait for retry result + with allow_join_result(): + retry_cue_result = retry_result.get(timeout=120) + + if retry_cue_result.get("success"): + # Success! 
def update_vtt_cue_text(vtt_content: str, cue_index: int, new_text: str) -> str:
    """
    Update a specific cue's text in VTT content.

    Cues are counted by their timing lines (lines containing " --> "), in
    file order. All text lines of the selected cue, up to the next blank
    line, are replaced by ``new_text``.

    Args:
        vtt_content: Original VTT file content
        cue_index: Zero-based index of cue to update
        new_text: New text for the cue. Blank lines are dropped and each
            remaining line is trimmed, because a blank line inside a cue
            would terminate the cue early.

    Returns:
        Updated VTT content. A trailing newline on the input is preserved.

    Raises:
        ValueError: If ``new_text`` has no non-blank content.
        IndexError: If the content has no cue with index ``cue_index``.
    """
    replacement_lines = [
        text_line.strip()
        for text_line in new_text.splitlines()
        if text_line.strip()
    ]
    if not replacement_lines:
        raise ValueError("new_text must contain at least one non-blank line")

    lines = vtt_content.strip().split('\n')
    total = len(lines)
    result_lines = []
    current_cue = -1
    replaced = False
    i = 0

    while i < total:
        line = lines[i]
        result_lines.append(line)
        i += 1

        # Only timing lines open a cue; the header, blank lines, NOTE lines
        # and cue identifiers are copied through unchanged.
        if line.strip().startswith("NOTE") or " --> " not in line:
            continue

        current_cue += 1

        # The cue's text runs until the next blank line (or end of file).
        text_start = i
        while i < total and lines[i].strip() != "":
            i += 1

        if current_cue == cue_index:
            result_lines.extend(replacement_lines)
            replaced = True
        else:
            result_lines.extend(lines[text_start:i])

    if not replaced:
        # Returning the content unchanged would let callers believe the
        # rewrite was stored when it was not.
        raise IndexError(
            f"cue_index {cue_index} out of range; VTT has {current_cue + 1} cue(s)"
        )

    updated = '\n'.join(result_lines)
    if vtt_content.endswith('\n'):
        # strip() above removed the final newline; put it back.
        updated += '\n'
    return updated


async def update_vtt_in_gcs(
    job_id: str,
    language: str,
    cue_index: int,
    new_text: str
) -> str:
    """
    Update a cue in the AD VTT file stored in GCS.

    Downloads ``{job_id}/{language}/ad.vtt``, replaces the text of one cue
    and uploads the result to the same path. The download and upload are
    blocking calls, so they are run in worker threads to keep the event
    loop responsive.

    Args:
        job_id: Job identifier
        language: Language code
        cue_index: Index of cue to update
        new_text: New text for the cue

    Returns:
        Updated VTT content

    Raises:
        ValueError: If ``new_text`` has no non-blank content.
        IndexError: If the stored VTT has no cue with index ``cue_index``.
    """
    # Local import so this function does not depend on the module's import block.
    import asyncio

    # Download current VTT
    vtt_blob_path = f"{job_id}/{language}/ad.vtt"
    blob = gcs_service.bucket.blob(vtt_blob_path)
    current_vtt = await asyncio.to_thread(blob.download_as_text)

    # Update the cue
    updated_vtt = update_vtt_cue_text(current_vtt, cue_index, new_text)

    # Upload back to GCS
    await asyncio.to_thread(
        blob.upload_from_string, updated_vtt, content_type="text/vtt"
    )

    logger.info(f"Updated VTT cue {cue_index} in GCS: {vtt_blob_path}")
    return updated_vtt
- Cue Text: + Cue Text (after rewrite attempt):

"{job.error.cue_text}"

)}

- This may be caused by content safety filters. Try editing the audio description text to rephrase the blocked cue, then retry. + This cue failed even after automatic rewriting. Manual editing may be required.

)} @@ -505,6 +505,41 @@ export function JobDetail() { )} + {/* TTS Rewrites Caution Display */} + {job.tts_rewrites && job.tts_rewrites.length > 0 && ( +
+
+ + + +

Audio Description Rewrites

+
+

+ {job.tts_rewrites.length} cue{job.tts_rewrites.length > 1 ? 's were' : ' was'} automatically rewritten to pass TTS synthesis. + Please review for accuracy. +

+
+ {job.tts_rewrites.map((rewrite, idx) => ( +
+

+ {rewrite.language.toUpperCase()} - Cue #{rewrite.cue_index + 1} +

+
+
+ Original: +

"{rewrite.original_text}"

+
+
+ Rewritten: +

"{rewrite.rewritten_text}"

+
+
+
+ ))} +
+
+ )} + {/* Retry TTS Button for failed jobs */} {job.status === 'tts_failed' && (
diff --git a/frontend/src/routes/jobs/JobsList.tsx b/frontend/src/routes/jobs/JobsList.tsx index c325dc8..8f9ec53 100644 --- a/frontend/src/routes/jobs/JobsList.tsx +++ b/frontend/src/routes/jobs/JobsList.tsx @@ -723,12 +723,25 @@ export function JobsList() { )} {/* Job Name */} - - {job.title} - +
+ + {job.title} + + {/* TTS Rewrite Caution Indicator */} + {job.tts_rewrites && job.tts_rewrites.length > 0 && ( + 1 ? 's were' : ' was'} auto-rewritten for TTS`} + className="inline-flex items-center" + > + + + + + )} +
{/* Created By */} diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts index 706d6b4..459dadc 100644 --- a/frontend/src/types/api.ts +++ b/frontend/src/types/api.ts @@ -139,6 +139,14 @@ export interface AccessibleVideoProgressItem { completed_at?: string; } +export interface TTSRewriteItem { + language: string; + cue_index: number; + original_text: string; + rewritten_text: string; + timestamp: string; +} + export interface Job { id: string; client_id: string; @@ -151,6 +159,7 @@ export interface Job { accessible_video_progress?: Record; ai?: AISection; error?: Record; + tts_rewrites?: TTSRewriteItem[]; // Track auto-rewritten TTS cues created_at: string; updated_at: string; created_by_name?: string; // User's full_name who created the job