feat: DCMP compliance, descriptive transcript, new languages, QA bug fixes

- Rewrote VTT translation to two-step (text-only → Gemini → apply to original timestamps) preventing caption timing desync
- Added polling fallback for all processing states and Safari visibilitychange WebSocket reconnect
- Added 10 new TTS languages (cs, da, fi, hu, no, sk, sv, es-419, pt-BR, fr-CA)
- Updated caption/AD prompts to DCMP Captioning Key & Description Key standards (line splitting, ♪ music notation, italic tags, caption positioning, ethics guidelines)
- Added descriptive transcript generation (WCAG 2.1 §1.2.1) combining captions + AD into plain text
- Fixed amix normalize=0 to prevent audio loss in rendered videos
- Fixed AD re-timing double-count when source_ms is None
- Fixed cue block numbering to be 1-based in VttEditor and Timeline Preview

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-03-27 11:50:43 +00:00
parent f4ddcce066
commit 6f963ff7c4
22 changed files with 522 additions and 97 deletions

View file

@ -1091,6 +1091,15 @@ async def get_job_downloads(
except Exception as e:
logger.warning(f"Failed to generate signed URL for retimed captions {language}: {e}")
# Descriptive Transcript TXT (WCAG 2.1 1.2.1)
if "descriptive_transcript_gcs" in lang_output:
blob_path = lang_output["descriptive_transcript_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
try:
signed_url = await get_signed_download_url(blob_path, 24)
lang_downloads["descriptive_transcript"] = signed_url
except Exception as e:
logger.warning(f"Failed to generate signed URL for descriptive transcript {language}: {e}")
if lang_downloads:
downloads[language] = lang_downloads

View file

@ -83,7 +83,17 @@ class Settings(BaseSettings):
"mr": "mr-IN",
"ta": "ta-IN",
"te": "te-IN",
"zh": "zh-CN"
"zh": "zh-CN",
"cs": "cs-CZ",
"da": "da-DK",
"fi": "fi-FI",
"hu": "hu-HU",
"no": "nb-NO",
"sk": "sk-SK",
"sv": "sv-SE",
"es-419": "es-US",
"pt-BR": "pt-BR",
"fr-CA": "fr-CA"
}
gemini_tts_language_names: dict[str, str] = {
"en": "English",
@ -109,7 +119,17 @@ class Settings(BaseSettings):
"mr": "Marathi",
"ta": "Tamil",
"te": "Telugu",
"zh": "Chinese"
"zh": "Chinese",
"cs": "Czech",
"da": "Danish",
"fi": "Finnish",
"hu": "Hungarian",
"no": "Norwegian",
"sk": "Slovak",
"sv": "Swedish",
"es-419": "Spanish (Latin America)",
"pt-BR": "Portuguese (Brazil)",
"fr-CA": "French (Canada)"
}
gemini_tts_preview_samples: dict[str, str] = {
"en": "This is a preview of the audio description voice.",
@ -135,7 +155,17 @@ class Settings(BaseSettings):
"mr": "हे ऑडिओ वर्णन आवाजाचे पूर्वावलोकन आहे.",
"ta": "இது ஆடியோ விளக்க குரலின் முன்னோட்டம்.",
"te": "ఇది ఆడియో వివరణ స్వరం యొక్క ప్రివ్యూ.",
"zh": "这是音频描述语音的预览。"
"zh": "这是音频描述语音的预览。",
"cs": "Toto je náhled hlasu zvukového popisu.",
"da": "Dette er en forhåndsvisning af lydbeskrivelsesstemmen.",
"fi": "Tämä on äänikuvauksen äänen esikatselu.",
"hu": "Ez a hangos leírás hangjának előnézete.",
"no": "Dette er en forhåndsvisning av lydbeskrivelsesstemmen.",
"sk": "Toto je náhľad hlasu zvukového popisu.",
"sv": "Det här är en förhandsgranskning av ljudbeskrivningsrösten.",
"es-419": "Esta es una vista previa de la voz de audiodescripción.",
"pt-BR": "Esta é uma prévia da voz da audiodescrição.",
"fr-CA": "Ceci est un aperçu de la voix de l'audiodescription."
}
# Gemini TTS Model Options

View file

@ -206,6 +206,37 @@ class VTTEditor:
except Exception:
return 0.0
@staticmethod
def validate_translation_timing(source_vtt: str, translated_vtt: str) -> tuple[bool, list[str]]:
    """
    Check that a translated VTT keeps the cue timing of its source VTT.

    Both documents are parsed with VTTParser.parse and compared cue by cue.
    A start or end time counts as changed when it differs from the source
    by more than 0.001 seconds.

    Args:
        source_vtt: WebVTT content the translation was produced from
        translated_vtt: WebVTT content of the translation

    Returns:
        (is_valid, errors) where is_valid is True only when errors is empty.
        A cue-count mismatch is reported on its own and ends the comparison
        early. Any exception raised while parsing or comparing is reported
        as a "Validation error: ..." entry instead of propagating.
    """
    problems: list[str] = []
    try:
        original = VTTParser.parse(source_vtt)
        translated = VTTParser.parse(translated_vtt)

        if len(original) != len(translated):
            # Cues cannot be paired up reliably, so per-cue checks are skipped.
            problems.append(
                f"Cue count mismatch: source has {len(original)}, translation has {len(translated)}"
            )
            return False, problems

        # Cue numbers in messages are 1-based.
        for number, (before, after) in enumerate(zip(original, translated), start=1):
            for label, attribute in (("start", "start_time"), ("end", "end_time")):
                old_value = getattr(before, attribute)
                new_value = getattr(after, attribute)
                if abs(old_value - new_value) > 0.001:
                    problems.append(
                        f"Cue {number}: {label} time changed ({old_value:.3f}s -> {new_value:.3f}s)"
                    )
    except Exception as exc:
        problems.append(f"Validation error: {str(exc)}")

    return not problems, problems
@staticmethod
def adjust_timing_offset(vtt_content: str, offset_seconds: float) -> str:
"""

View file

@ -124,6 +124,7 @@ class LangOutput(BaseModel):
accessible_video_edit_state: Optional[AccessibleVideoEditState] = None
origin: Optional[Literal["translate", "transcreate", "gemini_translate", "video_native"]] = None
qa_notes: Optional[str] = None
descriptive_transcript_gcs: Optional[str] = None # WCAG-compliant combined speech+description transcript
class ReviewHistoryItem(BaseModel):

View file

@ -42,30 +42,65 @@ CRITICAL TIMING REQUIREMENTS:
BRAND NAMES AND PRODUCTS:
{BRAND_CONTEXT}
- When you can clearly identify a product that matches a brand in the provided list, use the brand name rather than a generic descriptor (e.g., "Sellotape" not "sticky tape", "Post-it notes" not "sticky notes")
- Only use brand names when you are confident of the identification from visible labels, logos, or distinctive design
- If a product is not on the list or is unclear, use a generic descriptor — do not guess
- You MUST use the exact brand names from the list whenever those products are visible on screen
- Always prefer the brand name over a generic description — e.g., "Sellotape" not "sticky tape", "Post-it notes" not "sticky notes", "3M VetBond" not "tissue adhesive"
- If a product is on the brand list, use the brand name even if the label is partially obscured — use your best confident identification
- If a product is NOT on the list or is completely unclear, use a generic descriptor — do not invent brand names
ETHICAL GUIDELINES FOR DESCRIBING PEOPLE:
- Describe people objectively and factually based on what is clearly visible — do not interpret, assume, or editorialize
- Use person-first, inclusive language (e.g., "a person using a wheelchair" not "a wheelchair-bound person"; "officer" not "policeman")
- Describe race, ethnicity, gender, age, or other personal characteristics ONLY when they are relevant to the narrative or plot. When you describe these characteristics for one person, be consistent and describe them for all relevant people in the same scene
- Do NOT guess at racial, ethnic, gender, or religious identity if it is not clearly confirmed by visual context or dialogue — use general descriptors instead (e.g., "a middle-aged person" rather than specifying ethnicity when uncertain)
- For disabilities or medical conditions: describe observable facts only (e.g., "a person with a prosthetic leg" — do not interpret emotional state or capability)
CAPTION FORMATTING (DCMP standard):
- Maximum TWO lines per caption. Never use three or more lines.
- Each line should be no longer than ~37 characters where possible (42 absolute max)
- Do NOT split lines in the middle of: a modifier and its word, a prepositional phrase, a person's name and title, after a conjunction (and/or/but/because), or between an auxiliary verb and the word it modifies
- Do NOT start a new sentence on the same line where a previous sentence ends (unless both sentences are very short and closely related)
- Minimum caption duration: approximately 1.3 seconds. Maximum: 6 seconds
- Use mixed case. Use ALL CAPS only for screaming or shouting
SOUND AND MUSIC FORMATTING (DCMP standard):
- Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching]
- Use present participle for sustained sounds: [dog barking]; use third person for abrupt sounds: [dog barks]
- Background music with lyrics: use the ♪ music symbol, e.g., ♪ Here comes the sun ♪
- End of a song: use double music symbols, e.g., ♪ last lyric line ♪♪
- Offscreen sounds and offscreen music: wrap in VTT italic tags, e.g., <i>[phone ringing]</i>, <i>♪ upbeat jazz ♪</i>
- Do not caption background music shorter than 5 seconds
- Describe music mood objectively using brackets, e.g., [suspenseful orchestral music] — avoid subjective words like "beautiful" or "haunting"
- For standard captions (non-SDH): include significant non-speech audio events in brackets when they are essential for understanding the content
CAPTION PLACEMENT:
- Captions are normally positioned at the bottom of the screen
- When visible text, graphics, logos, or on-screen information appear at the bottom of the frame during a caption cue, add the VTT cue setting "line:0%" to move that caption to the top — format: "00:00:01.000 --> 00:00:03.000 line:0%"
ETHICAL GUIDELINES FOR DESCRIBING PEOPLE (DCMP standard):
- Consistently identify people/characters by name. When a name is not yet known, identify by the most obvious visible attribute (e.g., "the person in the red jacket") until the name is established, then switch to the name and use it consistently
- Describe discernable physical attributes and observable gestures — do NOT infer or state emotional states
- CORRECT: "She furrows her brow and clasps her hands."
- INCORRECT: "She looks worried."
- CORRECT: "He raises his fist."
- INCORRECT: "He appears angry."
- Race/ethnicity: use currently-accepted, respectful terminology; if you describe one person's racial or ethnic background, be consistent and describe all people in the same scene
- Do NOT guess at racial, ethnic, gender, or religious identity if not clearly confirmed — use neutral descriptors (e.g., "a middle-aged person") rather than specifying when uncertain
- For disabilities: describe observable facts only (e.g., "a person using a motorized wheelchair") — do not interpret capability or emotional state
- CORRECT: "A person using a motorized wheelchair enters the room."
- INCORRECT: "A disabled person in a wheelchair rolls in."
- Avoid language that stereotypes, sensationalises, or assigns motivation based on appearance
- For animals: use "it" or "the [animal]" — do NOT use gendered pronouns (he/she) unless the animal's sex is explicitly established in the dialogue or on-screen content
- CORRECT: "The cat stretches on the windowsill."
- INCORRECT: "He stretches his paw across the windowsill." (when sex is not established)
AUDIO DESCRIPTION GUIDELINES:
AUDIO DESCRIPTION GUIDELINES (DCMP standard):
Priority order for what to describe (use available time wisely):
1. ESSENTIAL: Actions and details critical for following the narrative; information that would cause confusion if omitted; scene context and setting
2. HIGH PRIORITY: Significant character appearance relevant to the story; visual details supporting understanding; scene changes and time passages
3. TIME-PERMITTING: Additional aesthetic or contextual details
Rules:
- Start generally (establish the context), then move to specific details
- Place descriptions BEFORE the visual content they refer to when possible (pre-teaching), not after
- Use present tense, active voice, and third-person narrative
- Describe actions and observable gestures; do NOT infer or state emotions unless clearly displayed (e.g., "She covers her face with her hands" not "She looks devastated")
- Use present tense, active voice, and third-person narrative (e.g., "Ted breaks the window" not "The window was broken by Ted")
- Use clear, concise, complete sentences; sentence fragments only if time is severely limited
- Describe objectively without personal interpretation, censorship, or comment
- Do NOT use cinematic terminology such as "close-up", "pan", "cut to", "flashback", or "montage" unless absolutely necessary for comprehension
- ON-SCREEN TEXT (MANDATORY): You MUST describe ALL visible text — titles, lower-thirds, signs, captions, graphics, URLs, phone numbers, product labels. Use the format: "Text on screen reads: [exact text]". Include it even if similar content is spoken; if the spoken dialogue covers it exactly, use "Title card: [text]" to note its presence. Do NOT silently skip any on-screen text. If text appears for less than one second, note it in the nearest available gap.
- ON-SCREEN TEXT (MANDATORY): You MUST describe ALL visible text — titles, lower-thirds, signs, captions, graphics, URLs, phone numbers, product labels. Use the format: "Text on screen reads: [exact text]". If text appears for less than one second, note it in the nearest available gap. Do NOT silently skip any on-screen text.
- Treat song lyrics as dialogue — describe the song, do not describe over the lyrics while they are playing
- Describe colors, clothing, setting, and spatial relationships when relevant to understanding
- Be succinct — omit redundant or self-evident details
- Do NOT duplicate information already in the spoken dialogue
@ -81,9 +116,9 @@ Example output format:
"confidence": 0.95,
"summary": "A tutorial video showing how to use a web application dashboard.",
"transcript_plaintext": "Hello everyone, welcome to this tutorial. Today we'll be exploring the dashboard interface. First, let's log in to the system.",
"captions_vtt": "WEBVTT\n\n00:00:01.000 --> 00:00:03.500\nHello everyone, welcome to this tutorial.\n\n00:00:04.000 --> 00:00:07.200\nToday we'll be exploring the dashboard interface.\n\n00:00:08.000 --> 00:00:10.500\nFirst, let's log in to the system.",
"audio_description_vtt": "WEBVTT\n\n00:00:00.500 --> 00:00:02.000\nA bright computer monitor displays a clean, modern login page with blue and white corporate branding. The interface features prominently positioned username and password fields.\n\n00:00:05.000 --> 00:00:07.000\nA cursor arrow hovers over the rectangular username input field, which highlights with a subtle blue border as the user prepares to type.\n\n00:00:10.000 --> 00:00:12.000\nThe screen transitions to reveal a comprehensive dashboard filled with colorful charts, data widgets, and navigation panels arranged in an organized grid layout."
"captions_vtt": "WEBVTT\n\n00:00:01.000 --> 00:00:03.500\nHello everyone, welcome\nto this tutorial.\n\n00:00:04.000 --> 00:00:07.200\nToday we'll be exploring\nthe dashboard interface.\n\n00:00:08.000 --> 00:00:10.500\nFirst, let's log in to the system.",
"audio_description_vtt": "WEBVTT\n\n00:00:00.500 --> 00:00:02.000\nA bright computer monitor displays a clean, modern login page with blue and white corporate branding.\n\n00:00:05.000 --> 00:00:07.000\nA cursor hovers over the username input field, which highlights with a subtle blue border.\n\n00:00:10.000 --> 00:00:12.000\nThe screen reveals a comprehensive dashboard filled with colorful charts and data widgets."
}
```
Follow this exact structure and formatting.
Follow this exact structure and formatting.

View file

@ -46,30 +46,65 @@ CRITICAL TIMING REQUIREMENTS:
BRAND NAMES AND PRODUCTS:
{BRAND_CONTEXT}
- When you can clearly identify a product that matches a brand in the provided list, use the brand name rather than a generic descriptor (e.g., "Sellotape" not "sticky tape", "Post-it notes" not "sticky notes")
- Only use brand names when you are confident of the identification from visible labels, logos, or distinctive design
- If a product is not on the list or is unclear, use a generic descriptor — do not guess
- You MUST use the exact brand names from the list whenever those products are visible on screen
- Always prefer the brand name over a generic description — e.g., "Sellotape" not "sticky tape", "Post-it notes" not "sticky notes", "3M VetBond" not "tissue adhesive"
- If a product is on the brand list, use the brand name even if the label is partially obscured — use your best confident identification
- If a product is NOT on the list or is completely unclear, use a generic descriptor — do not invent brand names
ETHICAL GUIDELINES FOR DESCRIBING PEOPLE:
- Describe people objectively and factually based on what is clearly visible — do not interpret, assume, or editorialize
- Use person-first, inclusive language (e.g., "a person using a wheelchair" not "a wheelchair-bound person"; "officer" not "policeman")
- Describe race, ethnicity, gender, age, or other personal characteristics ONLY when they are relevant to the narrative or plot. When you describe these characteristics for one person, be consistent and describe them for all relevant people in the same scene
- Do NOT guess at racial, ethnic, gender, or religious identity if it is not clearly confirmed by visual context or dialogue — use general descriptors instead
- For disabilities or medical conditions: describe observable facts only — do not interpret emotional state or capability
CAPTION FORMATTING (DCMP standard):
- Maximum TWO lines per caption. Never use three or more lines.
- Each line should be no longer than ~37 characters where possible (42 absolute max)
- Do NOT split lines in the middle of: a modifier and its word, a prepositional phrase, a person's name and title, after a conjunction (and/or/but/because), or between an auxiliary verb and the word it modifies
- Do NOT start a new sentence on the same line where a previous sentence ends (unless both sentences are very short and closely related)
- Minimum caption duration: approximately 1.3 seconds. Maximum: 6 seconds
- Use mixed case. Use ALL CAPS only for screaming or shouting
SOUND AND MUSIC FORMATTING (DCMP standard):
- Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching]
- Use present participle for sustained sounds: [dog barking]; use third person for abrupt sounds: [dog barks]
- Background music with lyrics: use the ♪ music symbol, e.g., ♪ Here comes the sun ♪
- End of a song: use double music symbols, e.g., ♪ last lyric line ♪♪
- Offscreen sounds and offscreen music: wrap in VTT italic tags, e.g., <i>[phone ringing]</i>, <i>♪ upbeat jazz ♪</i>
- Do not caption background music shorter than 5 seconds
- Describe music mood objectively using brackets, e.g., [suspenseful orchestral music] — avoid subjective words like "beautiful" or "haunting"
- For standard captions (non-SDH): include significant non-speech audio events in brackets when they are essential for understanding the content
CAPTION PLACEMENT:
- Captions are normally positioned at the bottom of the screen
- When visible text, graphics, logos, or on-screen information appear at the bottom of the frame during a caption cue, add the VTT cue setting "line:0%" to move that caption to the top — format: "00:00:01.000 --> 00:00:03.000 line:0%"
ETHICAL GUIDELINES FOR DESCRIBING PEOPLE (DCMP standard):
- Consistently identify people/characters by name. When a name is not yet known, identify by the most obvious visible attribute (e.g., "the person in the red jacket") until the name is established, then switch to the name and use it consistently
- Describe discernable physical attributes and observable gestures — do NOT infer or state emotional states
- CORRECT: "She furrows her brow and clasps her hands."
- INCORRECT: "She looks worried."
- CORRECT: "He raises his fist."
- INCORRECT: "He appears angry."
- Race/ethnicity: use currently-accepted, respectful terminology; if you describe one person's racial or ethnic background, be consistent and describe all people in the same scene
- Do NOT guess at racial, ethnic, gender, or religious identity if not clearly confirmed — use neutral descriptors (e.g., "a middle-aged person") rather than specifying when uncertain
- For disabilities: describe observable facts only (e.g., "a person using a motorized wheelchair") — do not interpret capability or emotional state
- CORRECT: "A person using a motorized wheelchair enters the room."
- INCORRECT: "A disabled person in a wheelchair rolls in."
- Avoid language that stereotypes, sensationalises, or assigns motivation based on appearance
- For animals: use "it" or "the [animal]" — do NOT use gendered pronouns (he/she) unless the animal's sex is explicitly established in the dialogue or on-screen content
- CORRECT: "The cat stretches on the windowsill."
- INCORRECT: "He stretches his paw across the windowsill." (when sex is not established)
AUDIO DESCRIPTION GUIDELINES:
AUDIO DESCRIPTION GUIDELINES (DCMP standard):
Priority order for what to describe (use available time wisely):
1. ESSENTIAL: Actions and details critical for following the narrative; information that would cause confusion if omitted; scene context and setting
2. HIGH PRIORITY: Significant character appearance relevant to the story; visual details supporting understanding; scene changes and time passages
3. TIME-PERMITTING: Additional aesthetic or contextual details
Rules:
- Start generally (establish the context), then move to specific details
- Place descriptions BEFORE the visual content they refer to when possible (pre-teaching), not after
- Use present tense, active voice, and third-person narrative
- Describe actions and observable gestures; do NOT infer or state emotions unless clearly displayed (e.g., describe the gesture, not the inferred feeling)
- Use present tense, active voice, and third-person narrative (e.g., "Ted breaks the window" not "The window was broken by Ted")
- Use clear, concise, complete sentences; sentence fragments only if time is severely limited
- Describe objectively without personal interpretation, censorship, or comment
- Do NOT use cinematic terminology such as "close-up", "pan", "cut to", "flashback", or "montage" unless absolutely necessary for comprehension
- ON-SCREEN TEXT (MANDATORY): You MUST describe ALL visible text — titles, lower-thirds, signs, captions, graphics, URLs, phone numbers, product labels. Use the format: "Text on screen reads: [exact text]". Include it even if similar content is spoken; if the spoken dialogue covers it exactly, use "Title card: [text]" to note its presence. Do NOT silently skip any on-screen text. If text appears for less than one second, note it in the nearest available gap.
- ON-SCREEN TEXT (MANDATORY): You MUST describe ALL visible text — titles, lower-thirds, signs, captions, graphics, URLs, phone numbers, product labels. Use the format: "Text on screen reads: [exact text]". If text appears for less than one second, note it in the nearest available gap. Do NOT silently skip any on-screen text.
- Treat song lyrics as dialogue — describe the song, do not describe over the lyrics while they are playing
- Describe colors, clothing, setting, and spatial relationships when relevant to understanding
- Be succinct — omit redundant or self-evident details
- Do NOT duplicate information already in the spoken dialogue
@ -86,8 +121,8 @@ Example output format (if TARGET_LANGUAGE were Spanish):
"confidence": 0.95,
"summary": "Un video tutorial que muestra como usar una aplicacion web de panel de control.",
"transcript_plaintext": "Hola a todos, bienvenidos a este tutorial. Hoy vamos a explorar la interfaz del panel de control. Primero, iniciemos sesion en el sistema.",
"captions_vtt": "WEBVTT\n\n00:00:01.000 --> 00:00:03.500\nHola a todos, bienvenidos a este tutorial.\n\n00:00:04.000 --> 00:00:07.200\nHoy vamos a explorar la interfaz del panel de control.\n\n00:00:08.000 --> 00:00:10.500\nPrimero, iniciemos sesion en el sistema.",
"audio_description_vtt": "WEBVTT\n\n00:00:00.500 --> 00:00:02.000\nUna pantalla de computadora brillante muestra una pagina de inicio de sesion moderna y limpia con marca corporativa azul y blanca.\n\n00:00:05.000 --> 00:00:07.000\nUn cursor se desplaza sobre el campo de entrada de nombre de usuario, que se resalta con un borde azul sutil.\n\n00:00:10.000 --> 00:00:12.000\nLa pantalla cambia para revelar un panel completo lleno de graficos coloridos y widgets de datos."
"captions_vtt": "WEBVTT\n\n00:00:01.000 --> 00:00:03.500\nHola a todos, bienvenidos\na este tutorial.\n\n00:00:04.000 --> 00:00:07.200\nHoy vamos a explorar la interfaz\ndel panel de control.\n\n00:00:08.000 --> 00:00:10.500\nPrimero, iniciemos sesion en el sistema.",
"audio_description_vtt": "WEBVTT\n\n00:00:00.500 --> 00:00:02.000\nUna pantalla de computadora muestra una pagina de inicio de sesion moderna con marca corporativa azul y blanca.\n\n00:00:05.000 --> 00:00:07.000\nUn cursor se desplaza sobre el campo de nombre de usuario, que se resalta con un borde azul sutil.\n\n00:00:10.000 --> 00:00:12.000\nLa pantalla revela un panel completo con graficos coloridos y widgets de datos."
}
```

View file

@ -0,0 +1,89 @@
"""
Descriptive Transcript Service
Generates a WCAG-compliant descriptive transcript by merging captions (speech)
and audio descriptions (visuals) into a single chronological plain text document.
Format:
(SPEECH)
[MUSIC PLAYING]
(DESCRIPTION)
Jennifer runs across the stage.
(SPEECH)
In sports, there is no substitute for sweat.
Reference: WCAG 2.1 Success Criterion 1.2.1
"""
from ..lib.vtt import VTTCue, VTTParser
from ..core.logging import get_logger
logger = get_logger(__name__)
def generate_descriptive_transcript(captions_vtt: str, ad_vtt: str) -> str:
    """
    Merge captions VTT and audio description VTT into a descriptive transcript.

    Cues from both tracks are interleaved by start time (a description sorts
    ahead of a caption that starts at the same instant). Runs of consecutive
    cues of the same kind are grouped under a single "(SPEECH)" or
    "(DESCRIPTION)" header, and sections are separated by one blank line.

    Inline WebVTT formatting tags (<i>, <b>, <u>, <c>, <lang>, <ruby>, <rt>)
    and inline timestamps are removed from cue text so the result is plain
    text. Voice tags (<v ...>) are left untouched.

    Args:
        captions_vtt: WebVTT content for captions (speech + non-speech audio)
        ad_vtt: WebVTT content for audio descriptions (visual descriptions)

    Returns:
        Plain text descriptive transcript with (SPEECH) and (DESCRIPTION)
        sections, or an empty string when neither track yields any text.
        A track that fails to parse is logged and treated as empty.
    """
    # Function-scope imports keep this block self-contained.
    import re
    from itertools import groupby

    # Inline formatting spans and karaoke-style timestamps. The caption prompt
    # asks for <i>...</i> around offscreen sounds, which must not appear in a
    # plain-text transcript.
    # NOTE(review): if VTTParser already strips these, this is a no-op - confirm.
    markup = re.compile(
        r"</?(?:c|i|b|u|ruby|rt|lang)(?:[.\s][^>]*)?>"
        r"|<(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}>"
    )

    def _parse(vtt: str, label: str) -> list[VTTCue]:
        """Parse one track; a missing or unparseable track yields no cues."""
        if not vtt:
            return []
        try:
            return VTTParser.parse(vtt)
        except Exception as e:
            logger.warning(f"Failed to parse {label} VTT for transcript: {e}")
            return []

    caption_cues = _parse(captions_vtt, "captions")
    ad_cues = _parse(ad_vtt, "AD")

    if not caption_cues and not ad_cues:
        return ""

    # Tag each cue with its type
    tagged: list[tuple[str, VTTCue]] = (
        [("speech", cue) for cue in caption_cues] +
        [("description", cue) for cue in ad_cues]
    )

    # Sort chronologically by start time; descriptions before captions at the same time
    tagged.sort(key=lambda item: (item[1].start_time, 0 if item[0] == "description" else 1))

    # Drop markup and discard cues that carry no text, so an empty cue never
    # splits a run of same-type cues into two sections.
    cleaned: list[tuple[str, str]] = []
    for cue_type, cue in tagged:
        text = markup.sub("", cue.text).strip()
        if text:
            cleaned.append((cue_type, text))

    # Group consecutive same-type cues under one header
    sections: list[str] = []
    for cue_type, run in groupby(cleaned, key=lambda item: item[0]):
        header = "(SPEECH)" if cue_type == "speech" else "(DESCRIPTION)"
        body = "\n".join(text for _, text in run)
        sections.append(f"{header}\n{body}")

    # One blank line between sections
    return "\n\n".join(sections).strip()

View file

@ -779,63 +779,101 @@ JSON:
source_language: str = "en"
) -> str:
"""
Translate VTT content using Gemini, preserving timing and structure.
More cost-effective alternative to Google Translate API (6-36x cheaper).
Translate VTT content using Gemini, preserving timing programmatically.
Args:
vtt_content: The VTT file content to translate
target_language: The language code to translate to (e.g., 'es', 'fr')
source_language: The source language code (default: 'en')
Uses a two-step approach to guarantee timestamp integrity:
1. Send only the text cues (no timestamps) to Gemini as a numbered list
2. Apply translated texts back onto the original VTT using translate_preserving_timing()
Returns:
Translated VTT content with preserved timestamps
This avoids any possibility of Gemini drifting or altering timestamps.
"""
prompt = f"""Translate this WebVTT subtitle file from {source_language} to {target_language}.
from ..lib.vtt import VTTParser, VTTEditor
CRITICAL REQUIREMENTS:
- Preserve ALL timestamps exactly as-is (do not modify any timing)
- Keep the WEBVTT header line
- Translate ONLY the text content between timestamps
- Maintain readable line lengths (~32-40 characters per line)
- Handle idioms and slang naturally in {target_language}
- Preserve any speaker labels (e.g., "[Speaker 1]:")
- Do NOT add any explanation or markdown - return ONLY the translated VTT
source_cues = VTTParser.parse(vtt_content)
if not source_cues:
logger.warning(f"No cues found in VTT for {target_language} translation")
return vtt_content
VTT Content to translate:
{vtt_content}"""
cue_count = len(source_cues)
async def _attempt_translation(extra_instruction: str = "") -> list[str]:
numbered_texts = "\n".join(
f"{i + 1}. {cue.text.replace(chr(10), ' ')}"
for i, cue in enumerate(source_cues)
)
prompt = f"""Translate the following {cue_count} numbered text segments from {source_language} to {target_language}.
REQUIREMENTS:
- Return EXACTLY {cue_count} numbered lines, one translation per line
- Format: "1. translated text", "2. translated text", etc.
- Preserve speaker labels like [Speaker 1]: unchanged
- Use natural, idiomatic {target_language}
- Do NOT add any explanation, preamble, or extra lines{extra_instruction}
Segments to translate:
{numbered_texts}"""
try:
response = await asyncio.to_thread(
client.models.generate_content,
model=self.model_name,
contents=[genai.types.Part.from_text(text=prompt)]
)
return self._parse_numbered_translation(response.text.strip(), cue_count)
result = response.text.strip()
try:
translated_texts = await _attempt_translation()
# Handle potential markdown formatting
if result.startswith("```"):
# Remove markdown code blocks
lines = result.split("\n")
# Filter out lines that are just ``` or ```vtt or ```webvtt
filtered_lines = [
line for line in lines
if not line.strip().startswith("```")
]
result = "\n".join(filtered_lines).strip()
if len(translated_texts) != cue_count:
logger.warning(
f"Translation cue count mismatch for {target_language}: "
f"expected {cue_count}, got {len(translated_texts)}. Retrying."
)
translated_texts = await _attempt_translation(
extra_instruction=f"\n- You MUST return exactly {cue_count} lines, no more, no less"
)
# Validate VTT format
if not result.startswith("WEBVTT"):
logger.warning("Gemini translation missing WEBVTT header, adding it")
result = "WEBVTT\n\n" + result
if len(translated_texts) != cue_count:
# Pad or truncate as last resort to avoid breaking downstream
logger.warning(
f"Retried translation still mismatched ({len(translated_texts)} vs {cue_count}). "
f"Padding/truncating to match."
)
if len(translated_texts) < cue_count:
translated_texts.extend(
source_cues[i].text
for i in range(len(translated_texts), cue_count)
)
else:
translated_texts = translated_texts[:cue_count]
logger.info(f"Successfully translated VTT to {target_language} using Gemini")
result = VTTEditor.translate_preserving_timing(vtt_content, translated_texts)
logger.info(f"Successfully translated VTT to {target_language} ({cue_count} cues)")
return result
except Exception as e:
logger.error(f"Gemini translation failed for {target_language}: {e}")
raise
@staticmethod
def _parse_numbered_translation(response_text: str, expected_count: int) -> list[str]:
"""Parse a numbered list response from Gemini into a list of translated texts."""
import re
lines = response_text.strip().split("\n")
results = []
for line in lines:
line = line.strip()
if not line:
continue
# Match "1. text", "1) text", or just text if already stripped
match = re.match(r"^\d+[.)]\s+(.+)$", line)
if match:
results.append(match.group(1).strip())
elif results or re.match(r"^\d+[.)]", line) is None:
# Non-numbered continuation line — append to last result or skip
if results:
results[-1] += " " + line
return results
async def rewrite_tts_cue(
self,
original_text: str,

View file

@ -446,7 +446,7 @@ class VideoRendererService:
all_audio = base_audio + "".join(ad_labels)
num_inputs = 1 + len(ad_labels)
filter_parts.append(
f"{all_audio}amix=inputs={num_inputs}:duration=first:dropout_transition=0[mixed]"
f"{all_audio}amix=inputs={num_inputs}:duration=first:dropout_transition=0:normalize=0[mixed]"
)
audio_output = "[mixed]"
else:

View file

@ -48,6 +48,7 @@ FILE_TYPE_MAPPING = {
"ad_mp3_gcs": "ad.mp3",
"accessible_video_gcs": "accessible_video.mp4",
"retimed_captions_vtt_gcs": "accessible_captions.vtt",
"descriptive_transcript_gcs": "descriptive_transcript.txt",
}

View file

@ -262,12 +262,31 @@ async def ingest_and_ai_task_impl(job_id: str):
f"{job_id}/{source_language}/sdh_captions.vtt"
)
# Generate descriptive transcript (WCAG 2.1 1.2.1)
transcript_gcs_uri = None
try:
from ..services.descriptive_transcript import generate_descriptive_transcript
transcript_text = generate_descriptive_transcript(
ai_result["captions_vtt"],
ai_result["audio_description_vtt"]
)
if transcript_text:
transcript_gcs_uri = await upload_vtt_to_gcs(
transcript_text,
f"{job_id}/{source_language}/descriptive_transcript.txt"
)
logger.info(f"Generated descriptive transcript for job {job_id}, language {source_language}")
except Exception as e:
logger.warning(f"Failed to generate descriptive transcript for job {job_id}: {e}")
source_lang_output = {
"captions_vtt_gcs": captions_gcs_uri,
"ad_vtt_gcs": ad_gcs_uri,
}
if sdh_gcs_uri:
source_lang_output["sdh_captions_vtt_gcs"] = sdh_gcs_uri
if transcript_gcs_uri:
source_lang_output["descriptive_transcript_gcs"] = transcript_gcs_uri
# Update job with AI results, detected language, and outputs
# Set status to TRANSLATING to trigger translation pipeline before QC

View file

@ -514,23 +514,30 @@ def _build_placements_with_adjustments(
adjusted_ms = pp.get("adjusted_ms")
# Fallback for data without source_ms (backward compatibility)
# When source_ms is missing we cannot reliably map rendered adjustments
# back to source coordinates, so we skip the delta and use original_ms as-is.
if source_ms is None:
logger.warning(
f"Cue {cue_idx}: No source_ms found, falling back to original_ms. "
"Job may need to be re-processed from initial render."
f"Cue {cue_idx}: No source_ms found, falling back to original_ms "
"without applying adjustment delta. "
"Job may need to be re-processed from initial render for timing adjustments to work."
)
source_ms = original_ms
# Use original_ms directly; skip adjustment to avoid double-counting
pause_time_s = original_ms / 1000.0 if original_ms is not None else cues[cue_idx].start_time if cue_idx < len(cues) else 0.0
adjusted_pause_by_cue[cue_idx] = pause_time_s
continue
# Apply user adjustment as relative offset
# Apply user adjustment as relative offset in source coordinates
if adjusted_ms is not None and original_ms is not None:
# User adjusted in rendered timeline - apply same delta to source
adjustment_delta = adjusted_ms - original_ms
source_ms = source_ms + adjustment_delta
adjusted_source_ms = source_ms + adjustment_delta
logger.info(
f"Cue {cue_idx}: Applying adjustment delta {adjustment_delta:.1f}ms "
f"(rendered: {original_ms:.1f} -> {adjusted_ms:.1f}, "
f"source: {source_ms - adjustment_delta:.1f} -> {source_ms:.1f})"
f"source: {source_ms:.1f} -> {adjusted_source_ms:.1f})"
)
source_ms = adjusted_source_ms
# Convert to seconds for placement
pause_time_s = source_ms / 1000.0

View file

@ -7,6 +7,7 @@ import time
import random
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from motor.motor_asyncio import AsyncIOMotorClient
from ..core.config import settings
@ -96,7 +97,39 @@ async def retry_with_backoff(func, max_retries=3, base_delay=1):
raise last_exception
@celery_app.task(bind=True)
async def _mark_task_timed_out(job_id: str) -> None:
    """Mark a job as failed due to task timeout and notify via WebSocket.

    Looks the job up by ``_id``; if it does not exist nothing is changed.
    Jobs currently in ``tts_generating`` or ``translating`` are moved to
    ``JobStatus.TTS_FAILED``; any other status is left unchanged, but the
    timeout error message and ``updated_at`` are still recorded. A status
    update is then broadcast for the job.

    This is best-effort: any exception is logged and swallowed so that the
    caller can still re-raise the original timeout. The MongoDB client opened
    here is always closed before returning, so repeated timeouts do not leak
    connections.

    Args:
        job_id: Identifier of the job document in the ``jobs`` collection.
    """
    client = None
    try:
        client = AsyncIOMotorClient(settings.mongodb_uri)
        db = client[settings.mongodb_db]
        job_doc = await db.jobs.find_one({"_id": job_id}, {"status": 1, "title": 1})
        if not job_doc:
            return
        current_status = job_doc.get("status", "")
        # Map current processing state to appropriate failure status
        if current_status in ("tts_generating", "translating"):
            fail_status = JobStatus.TTS_FAILED.value
        else:
            fail_status = current_status  # Leave as-is for other states
        await db.jobs.update_one(
            {"_id": job_id},
            {"$set": {
                "status": fail_status,
                "updated_at": datetime.utcnow(),
                "error": "Processing timed out. Please retry."
            }}
        )
        broadcast_status_update(
            job_id=job_id,
            status=fail_status,
            job_title=job_doc.get("title"),
            message="Processing timed out. Please retry."
        )
    except Exception as e:
        logger.error(f"Failed to mark job {job_id} as timed out: {e}")
    finally:
        # The client is created per call; release its connection pool.
        if client is not None:
            client.close()
@celery_app.task(bind=True, time_limit=3600, soft_time_limit=3400)
def translate_and_synthesize_task(self, job_id: str):
"""
Pipeline 2: Translation & MP3 Generation
@ -109,6 +142,11 @@ def translate_and_synthesize_task(self, job_id: str):
result = asyncio.run(_async_translate_and_synthesize(job_id))
logger.info(f"✅ CELERY TASK COMPLETED successfully for job {job_id}")
return result
except SoftTimeLimitExceeded:
logger.error(f"⏰ translate_and_synthesize_task soft time limit exceeded for job {job_id}")
import asyncio as _asyncio
_asyncio.run(_mark_task_timed_out(job_id))
raise
except Exception as e:
logger.error(f"❌ CELERY TASK FAILED for job {job_id}: {str(e)}")
logger.error(f"❌ Exception type: {type(e).__name__}")
@ -274,12 +312,25 @@ async def _async_translate_and_synthesize(job_id: str):
f"{job_id}/{lang}/sdh_captions.vtt"
)
# Generate descriptive transcript (WCAG 2.1 1.2.1)
transcript_gcs_uri = None
try:
from ..services.descriptive_transcript import generate_descriptive_transcript
transcript_text = generate_descriptive_transcript(translated_captions, translated_ad)
if transcript_text:
transcript_gcs_uri = await upload_vtt_to_gcs(
transcript_text,
f"{job_id}/{lang}/descriptive_transcript.txt"
)
except Exception as transcript_err:
logger.warning(f"Failed to generate descriptive transcript for {lang}: {transcript_err}")
logger.info(f"Completed video-native translation for {lang}")
return (lang, captions_gcs_uri, ad_gcs_uri, sdh_gcs_uri, None)
return (lang, captions_gcs_uri, ad_gcs_uri, sdh_gcs_uri, transcript_gcs_uri, None)
except Exception as e:
logger.error(f"Video-native translation failed for {lang}: {e}")
return (lang, None, None, None, str(e))
return (lang, None, None, None, None, str(e))
# Run all translations in parallel (limited by semaphore)
if target_languages:
@ -298,7 +349,7 @@ async def _async_translate_and_synthesize(job_id: str):
"qa_notes": f"Translation failed: {str(result)}"
}
else:
lang, captions_uri, ad_uri, sdh_uri, error_msg = result
lang, captions_uri, ad_uri, sdh_uri, transcript_uri, error_msg = result
if error_msg:
updated_outputs[lang] = {
"origin": "video_native",
@ -312,6 +363,8 @@ async def _async_translate_and_synthesize(job_id: str):
}
if sdh_uri:
lang_out["sdh_captions_vtt_gcs"] = sdh_uri
if transcript_uri:
lang_out["descriptive_transcript_gcs"] = transcript_uri
updated_outputs[lang] = lang_out
logger.info(f"Successfully processed VTT files for language: {lang} (origin: video_native)")
@ -381,6 +434,19 @@ async def _async_translate_and_synthesize(job_id: str):
)
lang_out["sdh_captions_vtt_gcs"] = sdh_gcs_uri
# Generate descriptive transcript (WCAG 2.1 1.2.1)
try:
from ..services.descriptive_transcript import generate_descriptive_transcript
transcript_text = generate_descriptive_transcript(translated_captions, translated_ad)
if transcript_text:
transcript_gcs_uri = await upload_vtt_to_gcs(
transcript_text,
f"{job_id}/{language}/descriptive_transcript.txt"
)
lang_out["descriptive_transcript_gcs"] = transcript_gcs_uri
except Exception as transcript_err:
logger.warning(f"Failed to generate descriptive transcript for {language}: {transcript_err}")
# Store language outputs
updated_outputs[language] = lang_out
@ -653,7 +719,8 @@ async def _generate_language_tts(job_id: str, language: str, lang_output: dict,
logger.warning(f"No cues to synthesize for job {job_id}, language {language}")
return
# Poll for group completion with async sleep
# Poll for group completion with async sleep (max 15 minutes per language)
TTS_POLL_TIMEOUT_SECONDS = 900
poll_count = 0
while not group_result.ready():
await asyncio.sleep(1.0)
@ -663,6 +730,21 @@ async def _generate_language_tts(job_id: str, language: str, lang_output: dict,
f"Waiting for TTS group ({language}): {poll_count}s elapsed, "
f"completed={group_result.completed_count()}/{len(cues)}"
)
if poll_count >= TTS_POLL_TIMEOUT_SECONDS:
logger.error(
f"TTS group timed out for job {job_id}, language {language} "
f"after {TTS_POLL_TIMEOUT_SECONDS}s — "
f"{group_result.completed_count()}/{len(cues)} cues completed"
)
raise TTSSynthesisError(
message=(
f"TTS synthesis timed out after {TTS_POLL_TIMEOUT_SECONDS}s "
f"({group_result.completed_count()}/{len(cues)} cues completed)"
),
cue_index=group_result.completed_count(),
cue_text="",
api_response_info="timeout"
)
# Get results from all cue tasks
# Use propagate=False to get results even if some tasks failed

View file

@ -116,7 +116,7 @@ export function TimelinePreview({
onClick={() => handleSegmentClick(segment)}
title={
segment.is_freeze_frame
? `AD Cue ${segment.cue_index}${isRegenerationQueued ? ' (Regenerate queued)' : ''}`
? `AD Cue ${segment.cue_index !== null ? segment.cue_index + 1 : ''}${isRegenerationQueued ? ' (Regenerate queued)' : ''}`
: `Video segment ${segment.segment_index}`
}
>
@ -124,7 +124,7 @@ export function TimelinePreview({
{segment.is_freeze_frame && segment.cue_index !== null && widthPercent > 2 && (
<div className="absolute inset-0 flex items-center justify-center">
<span className="text-xs font-bold text-white drop-shadow">
{segment.cue_index}
{segment.cue_index !== null ? segment.cue_index + 1 : ''}
</span>
</div>
)}

View file

@ -305,9 +305,9 @@ export function VideoReviewPlayer({ job, downloads }: VideoReviewPlayerProps) {
</div>
)}
{/* Caption Overlay */}
{/* Caption Overlay — respects VTT line:0% positioning to avoid obscuring on-screen text */}
{showCaptions && currentCaption && (
<div className="absolute bottom-16 left-1/2 transform -translate-x-1/2 bg-black bg-opacity-80 text-white px-4 py-2 rounded max-w-[90%]">
<div className={`absolute ${currentCaption.positionTop ? 'top-4' : 'bottom-16'} left-1/2 transform -translate-x-1/2 bg-black bg-opacity-80 text-white px-4 py-2 rounded max-w-[90%]`}>
<div className="text-center whitespace-pre-wrap">
{currentCaption.text}
</div>

View file

@ -211,9 +211,9 @@ export function VideoWithCaptions({
Your browser does not support the video tag.
</video>
{/* Caption Overlay */}
{/* Caption Overlay — respects VTT line:0% positioning to avoid obscuring on-screen text */}
{showCaptions && currentCaption && (
<div className="absolute bottom-16 left-1/2 transform -translate-x-1/2 bg-black bg-opacity-80 text-white px-4 py-2 rounded max-w-[90%]">
<div className={`absolute ${currentCaption.positionTop ? 'top-4' : 'bottom-16'} left-1/2 transform -translate-x-1/2 bg-black bg-opacity-80 text-white px-4 py-2 rounded max-w-[90%]`}>
<div className="text-center whitespace-pre-wrap">
{currentCaption.text}
</div>

View file

@ -239,7 +239,7 @@ export function VttEditor({ vttContent, onChange, onCueSave, onCueInserted, onCu
<div className="flex items-center gap-1">
{/* Cue Number */}
<span className="inline-flex items-center justify-center w-6 h-6 text-xs font-bold text-white bg-orange-400 rounded-full mr-2">
{index}
{index + 1}
</span>
{readOnly ? (
<span className="text-sm text-gray-500 font-mono">

View file

@ -31,7 +31,11 @@ export function useJob(jobId: string) {
refetchOnWindowFocus: false,
refetchInterval: (query) => {
const status = query.state.data?.status;
return status === 'rendering_qc' ? 5000 : false;
const processingStatuses = new Set([
'created', 'ingesting', 'ai_processing', 'translating',
'tts_generating', 'rendering_video', 'rendering_qc'
]);
return status && processingStatuses.has(status) ? 10000 : false;
},
});
}

View file

@ -380,7 +380,7 @@ export function useJobStatusWebSocket(
if (currentToken && isAuthenticated) {
connect();
}
return () => {
mountedRef.current = false;
disconnect();
@ -388,6 +388,35 @@ export function useJobStatusWebSocket(
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [isAuthenticated, jobId]); // Reconnect when auth state or jobId changes
// Reconnect WebSocket when tab becomes visible again (Safari suspends background tabs)
useEffect(() => {
const handleVisibilityChange = () => {
if (document.visibilityState === 'visible' && isAuthenticated) {
const ws = wsRef.current;
const isDisconnected = !ws ||
ws.readyState === WebSocket.CLOSED ||
ws.readyState === WebSocket.CLOSING;
if (isDisconnected) {
log('Tab became visible, reconnecting WebSocket');
reconnectAttemptsRef.current = 0;
connect();
// Immediately refresh job data in case we missed status updates
if (jobId) {
queryClient.invalidateQueries({ queryKey: ['jobs', jobId] });
} else {
queryClient.invalidateQueries({ queryKey: ['jobs'] });
}
}
}
};
document.addEventListener('visibilitychange', handleVisibilityChange);
return () => {
document.removeEventListener('visibilitychange', handleVisibilityChange);
};
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [isAuthenticated, jobId]);
// Cleanup on unmount
useEffect(() => {
return () => {

View file

@ -3,6 +3,8 @@ export interface VTTCue {
endTime: number; // seconds
text: string;
identifier?: string;
/** When true, caption should be rendered at the top of the video (line:0% cue setting) */
positionTop?: boolean;
}
export class VTTParser {
@ -30,11 +32,14 @@ export class VTTParser {
// Parse timing line
const currentLine = lines[i].trim();
if (currentLine.includes(' --> ')) {
const timingMatch = currentLine.match(/([\d:.,]+)\s+-->\s+([\d:.,]+)/);
const timingMatch = currentLine.match(/([\d:.,]+)\s+-->\s+([\d:.,]+)(.*)/);
if (timingMatch) {
const startTime = this.parseTimestamp(timingMatch[1]);
const endTime = this.parseTimestamp(timingMatch[2]);
// Parse optional cue settings (e.g., "line:0%" means position at top)
const cueSettings = timingMatch[3] || '';
const positionTop = cueSettings.includes('line:0%');
// Collect text lines until empty line or next cue
i++;
const textLines: string[] = [];
@ -42,13 +47,14 @@ export class VTTParser {
textLines.push(lines[i].trim());
i++;
}
if (textLines.length > 0) {
cues.push({
startTime,
endTime,
text: textLines.join('\n'),
identifier
identifier,
...(positionTop ? { positionTop: true } : {})
});
}
}

View file

@ -52,7 +52,8 @@ const DownloadCard = ({
'audio_description_vtt': 'audio_descriptions.vtt',
'audio_description_mp3': 'audio_descriptions.mp3',
'accessible_video_mp4': 'accessible_video.mp4',
'accessible_captions_vtt': 'accessible_captions.vtt'
'accessible_captions_vtt': 'accessible_captions.vtt',
'descriptive_transcript': 'descriptive_transcript.txt'
};
return `${sanitizedTitle}_${language}_${extensions[type] || type}`;
};
@ -68,7 +69,8 @@ const DownloadCard = ({
'audio_description_vtt': 'Audio Descriptions (VTT)',
'audio_description_mp3': 'Audio Descriptions (MP3)',
'accessible_video_mp4': 'Accessible Video (MP4)',
'accessible_captions_vtt': 'Re-timed Captions (VTT)'
'accessible_captions_vtt': 'Re-timed Captions (VTT)',
'descriptive_transcript': 'Descriptive Transcript (TXT)'
};
return (

View file

@ -42,6 +42,13 @@ const LANGUAGE_NAMES: Record<string, string> = {
et: 'Estonian',
lv: 'Latvian',
lt: 'Lithuanian',
bn: 'Bengali',
mr: 'Marathi',
ta: 'Tamil',
te: 'Telugu',
'es-419': 'Spanish (Latin America)',
'pt-BR': 'Portuguese (Brazil)',
'fr-CA': 'French (Canada)',
};
/**