feat: auto-rewrite TTS cues that fail synthesis
When TTS synthesis fails after 3 retries, the system now:
- Sends the problematic cue text to Gemini for TTS-safe rewriting
- Updates the VTT file in GCS with the rewritten text
- Retries TTS synthesis with the new text
- Records successful rewrites in the job.tts_rewrites field

UI changes:
- JobDetail shows an amber caution box with the original/rewritten text
- JobsList shows a warning icon next to jobs with rewrites
- Error display clarifies that the text shown is "after rewrite attempt"

Files changed:
- backend/app/models/job.py: Add tts_rewrites field
- backend/app/prompts/gemini_tts_rewrite.md: New prompt template
- backend/app/services/gemini.py: Add rewrite_tts_cue method
- backend/app/tasks/tts_synthesis.py: Add VTT update utilities
- backend/app/tasks/translate_and_synthesize.py: Rewrite+retry logic
- frontend/src/types/api.ts: Add TTSRewriteItem type
- frontend/src/routes/jobs/JobDetail.tsx: Caution display
- frontend/src/routes/jobs/JobsList.tsx: Warning indicator

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
76c4c60b0d
commit
e44210ea64
8 changed files with 326 additions and 17 deletions
|
|
@ -116,6 +116,7 @@ class Job(BaseModel):
|
|||
accessible_video_progress: Optional[dict[str, AccessibleVideoProgressItem]] = None
|
||||
ai: Optional[AISection] = None
|
||||
error: Optional[dict[str, Any]] = None
|
||||
tts_rewrites: Optional[list[dict[str, Any]]] = None # Track auto-rewritten TTS cues
|
||||
created_at: Optional[datetime] = None
|
||||
updated_at: Optional[datetime] = None
|
||||
|
||||
|
|
|
|||
19
backend/app/prompts/gemini_tts_rewrite.md
Normal file
19
backend/app/prompts/gemini_tts_rewrite.md
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
SYSTEM:
|
||||
You are an accessibility content editor specializing in text-to-speech optimization.
|
||||
|
||||
USER:
|
||||
The following audio description cue failed text-to-speech synthesis after multiple attempts. Rewrite it to be TTS-friendly while preserving the exact same visual information being described.
|
||||
|
||||
Original text: "{ORIGINAL_TEXT}"
|
||||
Language: {LANGUAGE}
|
||||
|
||||
Guidelines for TTS-safe text:
|
||||
- Avoid special characters, symbols, or unusual punctuation
|
||||
- Spell out abbreviations and acronyms (e.g., "Hz" becomes "hertz")
|
||||
- Use simple, common words when possible
|
||||
- Keep sentences concise (under 100 characters preferred)
|
||||
- Avoid words that may be difficult to pronounce or that sound medical or technical
|
||||
- Do not use quotation marks within the text
|
||||
- Use natural spoken phrasing
|
||||
|
||||
Return ONLY the rewritten text, nothing else. Do not include quotes around the response.
|
||||
|
|
@ -786,6 +786,64 @@ VTT Content to translate:
|
|||
logger.error(f"Gemini translation failed for {target_language}: {e}")
|
||||
raise
|
||||
|
||||
async def rewrite_tts_cue(
    self,
    original_text: str,
    language: str = "en"
) -> str:
    """
    Rewrite an audio description cue to be TTS-friendly.

    Called when TTS synthesis fails for a cue after retries. Uses Gemini
    to rephrase the text while preserving the visual information being described.

    Args:
        original_text: The cue text that failed TTS synthesis
        language: Language code for context (default: 'en')

    Returns:
        Rewritten text optimized for TTS synthesis (never empty)

    Raises:
        ValueError: If Gemini returns no usable text for the rewrite
        Exception: Any error from loading the prompt or calling Gemini is
            logged and re-raised unchanged
    """
    prompt_template = self._load_prompt("gemini_tts_rewrite.md")
    # Fill in {LANGUAGE} before {ORIGINAL_TEXT}: the cue text is arbitrary
    # content, and substituting it last guarantees that a cue containing the
    # literal "{LANGUAGE}" is passed to the model untouched.
    prompt = prompt_template.replace(
        "{LANGUAGE}", language
    ).replace(
        "{ORIGINAL_TEXT}", original_text
    )

    try:
        logger.info(f"Rewriting TTS cue for safety: '{original_text[:50]}...'")

        # generate_content is a blocking call; run it off the event loop.
        response = await asyncio.to_thread(
            client.models.generate_content,
            model=self.model_name,
            contents=[genai.types.Part.from_text(text=prompt)]
        )

        # NOTE(review): response.text is presumably None when the model
        # returns no text part (e.g. a blocked response) - treat as empty.
        result = (response.text or "").strip()

        # Remove any markdown formatting or quotes that Gemini might add
        if result.startswith("```"):
            result = "\n".join(
                line for line in result.split("\n")
                if not line.strip().startswith("```")
            ).strip()

        # Remove surrounding quotes if present
        if result.startswith('"') and result.endswith('"'):
            result = result[1:-1]
        if result.startswith("'") and result.endswith("'"):
            result = result[1:-1]
        result = result.strip()

        # An empty rewrite must not be written into the VTT or sent to TTS;
        # fail here so the caller records the cue as still failing.
        if not result:
            raise ValueError("Gemini returned an empty rewrite for TTS cue")

        logger.info(f"Rewrote TTS cue: '{original_text[:30]}...' -> '{result[:30]}...'")
        return result

    except Exception as e:
        logger.error(f"Failed to rewrite TTS cue: {e}")
        raise
|
||||
|
||||
|
||||
# Global service instance
|
||||
gemini_service = GeminiService()
|
||||
|
|
|
|||
|
|
@ -573,7 +573,7 @@ async def _generate_language_tts(job_id: str, language: str, lang_output: dict,
|
|||
import io
|
||||
from celery.result import allow_join_result
|
||||
from pydub import AudioSegment
|
||||
from .tts_synthesis import dispatch_language_tts, parse_ad_cues
|
||||
from .tts_synthesis import dispatch_language_tts, parse_ad_cues, synthesize_cue_task, update_vtt_in_gcs
|
||||
|
||||
if tts_preferences is None:
|
||||
tts_preferences = {}
|
||||
|
|
@ -651,16 +651,106 @@ async def _generate_language_tts(job_id: str, language: str, lang_output: dict,
|
|||
|
||||
cue_results = processed_results
|
||||
|
||||
# Check for failures
|
||||
# Check for failures and attempt automatic rewrite+retry
|
||||
failed_cues = [r for r in cue_results if not r.get("success", False)]
|
||||
if failed_cues:
|
||||
first_failure = failed_cues[0]
|
||||
raise TTSSynthesisError(
|
||||
message=f"{len(failed_cues)} cue(s) failed: {first_failure.get('error_message', 'Unknown error')}",
|
||||
cue_index=first_failure["cue_index"],
|
||||
cue_text=first_failure.get("text", ""),
|
||||
api_response_info=first_failure.get("error_message")
|
||||
)
|
||||
logger.info(f"TTS failed for {len(failed_cues)} cue(s), attempting automatic rewrite")
|
||||
|
||||
# Extract TTS settings for retry (same as dispatch_language_tts)
|
||||
voices_per_language = tts_preferences.get("voices_per_language", {})
|
||||
voice_name = voices_per_language.get(language, tts_preferences.get("default_voice"))
|
||||
provider = tts_preferences.get("provider", "gemini")
|
||||
model = tts_preferences.get("model", "flash")
|
||||
speed = tts_preferences.get("speed", 1.0)
|
||||
style_preset = tts_preferences.get("style_preset", "neutral")
|
||||
custom_style_prompt = tts_preferences.get("custom_style_prompt")
|
||||
|
||||
if style_preset == "custom" and custom_style_prompt:
|
||||
style_prompt = custom_style_prompt
|
||||
else:
|
||||
style_prompt = settings.gemini_tts_style_prompts.get(style_preset, "")
|
||||
|
||||
rewrites_made = []
|
||||
still_failed = []
|
||||
|
||||
for failure in failed_cues:
|
||||
cue_idx = failure["cue_index"]
|
||||
original_text = failure.get("text", "")
|
||||
|
||||
try:
|
||||
# Step 1: Get Gemini rewrite
|
||||
logger.info(f"Requesting Gemini rewrite for cue {cue_idx}: '{original_text[:50]}...'")
|
||||
rewritten_text = await gemini_service.rewrite_tts_cue(original_text, language)
|
||||
|
||||
# Step 2: Update VTT in GCS
|
||||
await update_vtt_in_gcs(job_id, language, cue_idx, rewritten_text)
|
||||
|
||||
# Step 3: Retry TTS with rewritten text
|
||||
logger.info(f"Retrying TTS for cue {cue_idx} with rewritten text")
|
||||
retry_result = synthesize_cue_task.apply_async(
|
||||
kwargs={
|
||||
"job_id": job_id,
|
||||
"language": language,
|
||||
"cue_index": cue_idx,
|
||||
"text": rewritten_text,
|
||||
"start_time": failure["start_time"],
|
||||
"end_time": failure["end_time"],
|
||||
"voice_name": voice_name,
|
||||
"provider": provider,
|
||||
"model": model,
|
||||
"speed": speed,
|
||||
"style_prompt": style_prompt
|
||||
},
|
||||
queue="tts"
|
||||
)
|
||||
|
||||
# Wait for retry result
|
||||
with allow_join_result():
|
||||
retry_cue_result = retry_result.get(timeout=120)
|
||||
|
||||
if retry_cue_result.get("success"):
|
||||
# Success! Record rewrite
|
||||
logger.info(f"Rewrite+retry succeeded for cue {cue_idx}")
|
||||
rewrites_made.append({
|
||||
"language": language,
|
||||
"cue_index": cue_idx,
|
||||
"original_text": original_text,
|
||||
"rewritten_text": rewritten_text,
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
})
|
||||
# Update cue_results with successful result
|
||||
for i, r in enumerate(cue_results):
|
||||
if r["cue_index"] == cue_idx:
|
||||
cue_results[i] = retry_cue_result
|
||||
break
|
||||
else:
|
||||
logger.warning(f"TTS still failed after rewrite for cue {cue_idx}")
|
||||
still_failed.append({**failure, "rewritten_text": rewritten_text})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Rewrite attempt failed for cue {cue_idx}: {e}")
|
||||
still_failed.append(failure)
|
||||
|
||||
# Store rewrite history in job for UI caution display
|
||||
if rewrites_made:
|
||||
await db.jobs.update_one(
|
||||
{"_id": job_id},
|
||||
{
|
||||
"$push": {"tts_rewrites": {"$each": rewrites_made}},
|
||||
"$set": {"updated_at": datetime.utcnow()}
|
||||
}
|
||||
)
|
||||
logger.info(f"Recorded {len(rewrites_made)} TTS rewrite(s) for job {job_id}")
|
||||
|
||||
# If any cues still failed after rewrite, raise error
|
||||
if still_failed:
|
||||
first_failure = still_failed[0]
|
||||
raise TTSSynthesisError(
|
||||
message=f"{len(still_failed)} cue(s) failed even after rewrite: {first_failure.get('error_message', 'Unknown error')}",
|
||||
cue_index=first_failure["cue_index"],
|
||||
cue_text=first_failure.get("rewritten_text", first_failure.get("text", "")),
|
||||
api_response_info=first_failure.get("error_message")
|
||||
)
|
||||
|
||||
logger.info(f"All {len(cue_results)} TTS cues synthesized for {language}, assembling combined MP3")
|
||||
|
||||
|
|
|
|||
|
|
@ -376,3 +376,87 @@ def _parse_timestamp(timestamp: str) -> float:
|
|||
)
|
||||
|
||||
return total_seconds
|
||||
|
||||
|
||||
def update_vtt_cue_text(vtt_content: str, cue_index: int, new_text: str) -> str:
    """
    Update a specific cue's text in VTT content.

    Cues are counted by their timing lines (lines containing " --> "),
    starting at zero. The "WEBVTT" header, blank lines, NOTE lines and any
    other line outside a cue payload (e.g. cue identifiers) are copied
    through unchanged. If no cue has the requested index, the content is
    returned without any cue being modified.

    Args:
        vtt_content: Original VTT file content
        cue_index: Zero-based index of cue to update
        new_text: New text for the cue. Blank lines inside it are dropped,
            because a blank line terminates a cue in WebVTT and would
            otherwise split the replacement into a cue plus stray text.

    Returns:
        Updated VTT content (leading/trailing whitespace of the input is
        stripped, lines are joined with "\\n")
    """
    lines = vtt_content.strip().split('\n')

    # Keep only the non-blank lines of the replacement; fall back to the raw
    # text so an all-blank/empty replacement behaves exactly as before.
    replacement_lines = [
        part for part in new_text.split('\n') if part.strip()
    ] or [new_text]

    result_lines = []
    current_cue = -1
    total = len(lines)
    i = 0

    while i < total:
        line = lines[i]
        stripped = line.strip()
        # Every line that starts an iteration is copied through as-is.
        result_lines.append(line)
        i += 1

        # Header, blank and NOTE lines are never timing lines, even if they
        # happen to contain an arrow; anything else without an arrow (cue
        # identifiers, STYLE/REGION content) is likewise just copied.
        if stripped in ("WEBVTT", "") or stripped.startswith("NOTE"):
            continue
        if " --> " not in line:
            continue

        current_cue += 1

        # The cue payload runs up to (not including) the next blank line.
        payload_end = i
        while payload_end < total and lines[payload_end].strip() != "":
            payload_end += 1

        if current_cue == cue_index:
            result_lines.extend(replacement_lines)
        else:
            result_lines.extend(lines[i:payload_end])
        i = payload_end

    return '\n'.join(result_lines)
|
||||
|
||||
|
||||
async def update_vtt_in_gcs(
    job_id: str,
    language: str,
    cue_index: int,
    new_text: str
) -> str:
    """
    Update a cue in the AD VTT file stored in GCS.

    Downloads "{job_id}/{language}/ad.vtt" from the bucket exposed by
    gcs_service, replaces the text of one cue via update_vtt_cue_text, and
    uploads the result back to the same blob path.

    Args:
        job_id: Job identifier
        language: Language code
        cue_index: Index of cue to update
        new_text: New text for the cue

    Returns:
        Updated VTT content
    """
    # Local import keeps this block self-contained regardless of what the
    # module already imports at the top.
    import asyncio

    # Download current VTT
    vtt_blob_path = f"{job_id}/{language}/ad.vtt"
    blob = gcs_service.bucket.blob(vtt_blob_path)
    # The blob download/upload calls are synchronous network I/O; run them in
    # a worker thread so this coroutine does not stall the event loop.
    current_vtt = await asyncio.to_thread(blob.download_as_text)

    # Update the cue
    updated_vtt = update_vtt_cue_text(current_vtt, cue_index, new_text)

    # Upload back to GCS
    await asyncio.to_thread(
        blob.upload_from_string, updated_vtt, content_type="text/vtt"
    )

    logger.info(f"Updated VTT cue {cue_index} in GCS: {vtt_blob_path}")
    return updated_vtt
|
||||
|
|
|
|||
|
|
@ -475,12 +475,12 @@ export function JobDetail() {
|
|||
)}
|
||||
{typeof job.error.cue_text === 'string' && (
|
||||
<div className="mt-2 p-2 bg-red-100 rounded text-xs text-red-700">
|
||||
<span className="font-medium">Cue Text:</span>
|
||||
<span className="font-medium">Cue Text (after rewrite attempt):</span>
|
||||
<p className="mt-1 italic">"{job.error.cue_text}"</p>
|
||||
</div>
|
||||
)}
|
||||
<p className="text-xs text-red-600 mt-2">
|
||||
This may be caused by content safety filters. Try editing the audio description text to rephrase the blocked cue, then retry.
|
||||
This cue failed even after automatic rewriting. Manual editing may be required.
|
||||
</p>
|
||||
</div>
|
||||
)}
|
||||
|
|
@ -505,6 +505,41 @@ export function JobDetail() {
|
|||
</div>
|
||||
)}
|
||||
|
||||
{/* TTS Rewrites Caution Display */}
|
||||
{job.tts_rewrites && job.tts_rewrites.length > 0 && (
|
||||
<div className="bg-amber-50 border border-amber-200 rounded-lg p-4">
|
||||
<div className="flex items-start gap-2 mb-2">
|
||||
<svg className="w-5 h-5 text-amber-600 flex-shrink-0 mt-0.5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-3L13.732 4c-.77-1.333-2.694-1.333-3.464 0L3.34 16c-.77 1.333.192 3 1.732 3z" />
|
||||
</svg>
|
||||
<h3 className="text-sm font-medium text-amber-800">Audio Description Rewrites</h3>
|
||||
</div>
|
||||
<p className="text-xs text-amber-700 mb-3">
|
||||
{job.tts_rewrites.length} cue{job.tts_rewrites.length > 1 ? 's were' : ' was'} automatically rewritten to pass TTS synthesis.
|
||||
Please review for accuracy.
|
||||
</p>
|
||||
<div className="space-y-3 max-h-64 overflow-y-auto">
|
||||
{job.tts_rewrites.map((rewrite, idx) => (
|
||||
<div key={idx} className="border-t border-amber-200 pt-2">
|
||||
<p className="text-xs text-amber-800 font-medium">
|
||||
{rewrite.language.toUpperCase()} - Cue #{rewrite.cue_index + 1}
|
||||
</p>
|
||||
<div className="mt-1 grid grid-cols-1 gap-2">
|
||||
<div className="p-2 bg-amber-100 rounded text-xs">
|
||||
<span className="font-medium text-amber-700">Original:</span>
|
||||
<p className="mt-1 text-amber-800 line-through">"{rewrite.original_text}"</p>
|
||||
</div>
|
||||
<div className="p-2 bg-green-100 rounded text-xs">
|
||||
<span className="font-medium text-green-700">Rewritten:</span>
|
||||
<p className="mt-1 text-green-800">"{rewrite.rewritten_text}"</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Retry TTS Button for failed jobs */}
|
||||
{job.status === 'tts_failed' && (
|
||||
<div className="bg-orange-50 border border-orange-200 rounded-lg p-4">
|
||||
|
|
|
|||
|
|
@ -723,12 +723,25 @@ export function JobsList() {
|
|||
)}
|
||||
{/* Job Name */}
|
||||
<td className="px-4 py-4 whitespace-nowrap">
|
||||
<Link
|
||||
to={`/jobs/${job.id}`}
|
||||
className="text-sm font-medium text-gray-900 hover:text-blue-600"
|
||||
>
|
||||
{job.title}
|
||||
</Link>
|
||||
<div className="flex items-center gap-2">
|
||||
<Link
|
||||
to={`/jobs/${job.id}`}
|
||||
className="text-sm font-medium text-gray-900 hover:text-blue-600"
|
||||
>
|
||||
{job.title}
|
||||
</Link>
|
||||
{/* TTS Rewrite Caution Indicator */}
|
||||
{job.tts_rewrites && job.tts_rewrites.length > 0 && (
|
||||
<span
|
||||
title={`${job.tts_rewrites.length} cue${job.tts_rewrites.length > 1 ? 's were' : ' was'} auto-rewritten for TTS`}
|
||||
className="inline-flex items-center"
|
||||
>
|
||||
<svg className="w-4 h-4 text-amber-500" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-3L13.732 4c-.77-1.333-2.694-1.333-3.464 0L3.34 16c-.77 1.333.192 3 1.732 3z" />
|
||||
</svg>
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
</td>
|
||||
{/* Created By */}
|
||||
<td className="px-4 py-4 whitespace-nowrap text-sm text-gray-500">
|
||||
|
|
|
|||
|
|
@ -139,6 +139,14 @@ export interface AccessibleVideoProgressItem {
|
|||
completed_at?: string;
|
||||
}
|
||||
|
||||
/**
 * One automatically rewritten TTS cue, as carried in `Job.tts_rewrites`.
 */
export interface TTSRewriteItem {
  /** Language code of the audio description track the cue belongs to. */
  language: string;
  /** Index of the rewritten cue; the UI shows it as `cue_index + 1`, so presumably zero-based. */
  cue_index: number;
  /** Cue text before the automatic rewrite. */
  original_text: string;
  /** Cue text after the automatic rewrite. */
  rewritten_text: string;
  /** When the rewrite was recorded; presumably an ISO 8601 string produced by the backend - confirm. */
  timestamp: string;
}
|
||||
|
||||
export interface Job {
|
||||
id: string;
|
||||
client_id: string;
|
||||
|
|
@ -151,6 +159,7 @@ export interface Job {
|
|||
accessible_video_progress?: Record<string, AccessibleVideoProgressItem>;
|
||||
ai?: AISection;
|
||||
error?: Record<string, unknown>;
|
||||
tts_rewrites?: TTSRewriteItem[]; // Track auto-rewritten TTS cues
|
||||
created_at: string;
|
||||
updated_at: string;
|
||||
created_by_name?: string; // User's full_name who created the job
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue