diff --git a/backend/app/api/v1/routes_jobs.py b/backend/app/api/v1/routes_jobs.py index aa2deeb..bdb8ca3 100644 --- a/backend/app/api/v1/routes_jobs.py +++ b/backend/app/api/v1/routes_jobs.py @@ -1091,6 +1091,15 @@ async def get_job_downloads( except Exception as e: logger.warning(f"Failed to generate signed URL for retimed captions {language}: {e}") + # Descriptive Transcript TXT (WCAG 2.1 1.2.1) + if "descriptive_transcript_gcs" in lang_output: + blob_path = lang_output["descriptive_transcript_gcs"].replace(f"gs://{settings.gcs_bucket}/", "") + try: + signed_url = await get_signed_download_url(blob_path, 24) + lang_downloads["descriptive_transcript"] = signed_url + except Exception as e: + logger.warning(f"Failed to generate signed URL for descriptive transcript {language}: {e}") + if lang_downloads: downloads[language] = lang_downloads diff --git a/backend/app/core/config.py b/backend/app/core/config.py index e82befc..2434e58 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -83,7 +83,17 @@ class Settings(BaseSettings): "mr": "mr-IN", "ta": "ta-IN", "te": "te-IN", - "zh": "zh-CN" + "zh": "zh-CN", + "cs": "cs-CZ", + "da": "da-DK", + "fi": "fi-FI", + "hu": "hu-HU", + "no": "nb-NO", + "sk": "sk-SK", + "sv": "sv-SE", + "es-419": "es-US", + "pt-BR": "pt-BR", + "fr-CA": "fr-CA" } gemini_tts_language_names: dict[str, str] = { "en": "English", @@ -109,7 +119,17 @@ class Settings(BaseSettings): "mr": "Marathi", "ta": "Tamil", "te": "Telugu", - "zh": "Chinese" + "zh": "Chinese", + "cs": "Czech", + "da": "Danish", + "fi": "Finnish", + "hu": "Hungarian", + "no": "Norwegian", + "sk": "Slovak", + "sv": "Swedish", + "es-419": "Spanish (Latin America)", + "pt-BR": "Portuguese (Brazil)", + "fr-CA": "French (Canada)" } gemini_tts_preview_samples: dict[str, str] = { "en": "This is a preview of the audio description voice.", @@ -135,7 +155,17 @@ class Settings(BaseSettings): "mr": "हे ऑडिओ वर्णन आवाजाचे पूर्वावलोकन आहे.", "ta": "இது ஆடியோ விளக்க குரலின் 
@staticmethod
def validate_translation_timing(source_vtt: str, translated_vtt: str) -> tuple[bool, list[str]]:
    """Check that a translated VTT keeps the cue timing of its source.

    Both documents are parsed with ``VTTParser.parse``. If the number of cues
    differs, that single problem is reported and no per-cue comparison is made.
    Otherwise cues are compared pairwise, and every start or end time that
    moved by more than one millisecond produces one message. Any exception
    raised while parsing or comparing is recorded as a "Validation error"
    message instead of propagating.

    Returns:
        ``(ok, problems)``; ``ok`` is True only when ``problems`` is empty.
    """
    problems = []
    try:
        originals = VTTParser.parse(source_vtt)
        translations = VTTParser.parse(translated_vtt)

        n_source, n_translated = len(originals), len(translations)
        if n_source != n_translated:
            problems.append(
                f"Cue count mismatch: source has {n_source}, "
                f"translation has {n_translated}"
            )
            return False, problems

        # Cue numbers in messages are 1-based to match how editors show them.
        for cue_no, pair in enumerate(zip(originals, translations), start=1):
            before, after = pair

            start_drift = abs(before.start_time - after.start_time)
            if start_drift > 0.001:
                problems.append(
                    f"Cue {cue_no}: start time changed "
                    f"({before.start_time:.3f}s -> {after.start_time:.3f}s)"
                )

            end_drift = abs(before.end_time - after.end_time)
            if end_drift > 0.001:
                problems.append(
                    f"Cue {cue_no}: end time changed "
                    f"({before.end_time:.3f}s -> {after.end_time:.3f}s)"
                )
    except Exception as exc:
        problems.append(f"Validation error: {str(exc)}")

    return not problems, problems
adjust_timing_offset(vtt_content: str, offset_seconds: float) -> str: """ diff --git a/backend/app/models/job.py b/backend/app/models/job.py index f5f5b0f..38b806b 100644 --- a/backend/app/models/job.py +++ b/backend/app/models/job.py @@ -124,6 +124,7 @@ class LangOutput(BaseModel): accessible_video_edit_state: Optional[AccessibleVideoEditState] = None origin: Optional[Literal["translate", "transcreate", "gemini_translate", "video_native"]] = None qa_notes: Optional[str] = None + descriptive_transcript_gcs: Optional[str] = None # WCAG-compliant combined speech+description transcript class ReviewHistoryItem(BaseModel): diff --git a/backend/app/prompts/gemini_ingestion.md b/backend/app/prompts/gemini_ingestion.md index be8dccb..f52a745 100644 --- a/backend/app/prompts/gemini_ingestion.md +++ b/backend/app/prompts/gemini_ingestion.md @@ -42,30 +42,65 @@ CRITICAL TIMING REQUIREMENTS: BRAND NAMES AND PRODUCTS: {BRAND_CONTEXT} -- When you can clearly identify a product that matches a brand in the provided list, use the brand name rather than a generic descriptor (e.g., "Sellotape" not "sticky tape", "Post-it notes" not "sticky notes") -- Only use brand names when you are confident of the identification from visible labels, logos, or distinctive design -- If a product is not on the list or is unclear, use a generic descriptor — do not guess +- You MUST use the exact brand names from the list whenever those products are visible on screen +- Always prefer the brand name over a generic description — e.g., "Sellotape" not "sticky tape", "Post-it notes" not "sticky notes", "3M VetBond" not "tissue adhesive" +- If a product is on the brand list, use the brand name even if the label is partially obscured — use your best confident identification +- If a product is NOT on the list or is completely unclear, use a generic descriptor — do not invent brand names -ETHICAL GUIDELINES FOR DESCRIBING PEOPLE: -- Describe people objectively and factually based on what is clearly visible — 
do not interpret, assume, or editorialize -- Use person-first, inclusive language (e.g., "a person using a wheelchair" not "a wheelchair-bound person"; "officer" not "policeman") -- Describe race, ethnicity, gender, age, or other personal characteristics ONLY when they are relevant to the narrative or plot. When you describe these characteristics for one person, be consistent and describe them for all relevant people in the same scene -- Do NOT guess at racial, ethnic, gender, or religious identity if it is not clearly confirmed by visual context or dialogue — use general descriptors instead (e.g., "a middle-aged person" rather than specifying ethnicity when uncertain) -- For disabilities or medical conditions: describe observable facts only (e.g., "a person with a prosthetic leg" — do not interpret emotional state or capability) +CAPTION FORMATTING (DCMP standard): +- Maximum TWO lines per caption. Never use three or more lines. +- Each line should be no longer than ~37 characters where possible (42 absolute max) +- Do NOT split lines in the middle of: a modifier and its word, a prepositional phrase, a person's name and title, after a conjunction (and/or/but/because), or between an auxiliary verb and the word it modifies +- Do NOT start a new sentence on the same line where a previous sentence ends (unless both sentences are very short and closely related) +- Minimum caption duration: approximately 1.3 seconds. Maximum: 6 seconds +- Use mixed case. 
Use ALL CAPS only for screaming or shouting + +SOUND AND MUSIC FORMATTING (DCMP standard): +- Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching] +- Use present participle for sustained sounds: [dog barking]; use third person for abrupt sounds: [dog barks] +- Background music with lyrics: use the ♪ music symbol, e.g., ♪ Here comes the sun ♪ +- End of a song: use double music symbols, e.g., ♪ last lyric line ♪♪ +- Offscreen sounds and offscreen music: wrap in VTT italic tags, e.g., <i>[phone ringing]</i>, <i>♪ upbeat jazz ♪</i> +- Do not caption background music shorter than 5 seconds +- Describe music mood objectively using brackets, e.g., [suspenseful orchestral music] — avoid subjective words like "beautiful" or "haunting" +- For standard captions (non-SDH): include significant non-speech audio events in brackets when they are essential for understanding the content + +CAPTION PLACEMENT: +- Captions are normally positioned at the bottom of the screen +- When visible text, graphics, logos, or on-screen information appear at the bottom of the frame during a caption cue, add the VTT cue setting "line:0%" to move that caption to the top — format: "00:00:01.000 --> 00:00:03.000 line:0%" + +ETHICAL GUIDELINES FOR DESCRIBING PEOPLE (DCMP standard): +- Consistently identify people/characters by name. When a name is not yet known, identify by the most obvious visible attribute (e.g., "the person in the red jacket") until the name is established, then switch to the name and use it consistently +- Describe discernable physical attributes and observable gestures — do NOT infer or state emotional states + - CORRECT: "She furrows her brow and clasps her hands." + - INCORRECT: "She looks worried." + - CORRECT: "He raises his fist." + - INCORRECT: "He appears angry." 
+- Race/ethnicity: use currently-accepted, respectful terminology; if you describe one person's racial or ethnic background, be consistent and describe all people in the same scene +- Do NOT guess at racial, ethnic, gender, or religious identity if not clearly confirmed — use neutral descriptors (e.g., "a middle-aged person") rather than specifying when uncertain +- For disabilities: describe observable facts only (e.g., "a person using a motorized wheelchair") — do not interpret capability or emotional state + - CORRECT: "A person using a motorized wheelchair enters the room." + - INCORRECT: "A disabled person in a wheelchair rolls in." - Avoid language that stereotypes, sensationalises, or assigns motivation based on appearance +- For animals: use "it" or "the [animal]" — do NOT use gendered pronouns (he/she) unless the animal's sex is explicitly established in the dialogue or on-screen content + - CORRECT: "The cat stretches on the windowsill." + - INCORRECT: "He stretches his paw across the windowsill." (when sex is not established) -AUDIO DESCRIPTION GUIDELINES: +AUDIO DESCRIPTION GUIDELINES (DCMP standard): Priority order for what to describe (use available time wisely): 1. ESSENTIAL: Actions and details critical for following the narrative; information that would cause confusion if omitted; scene context and setting 2. HIGH PRIORITY: Significant character appearance relevant to the story; visual details supporting understanding; scene changes and time passages 3. 
TIME-PERMITTING: Additional aesthetic or contextual details Rules: +- Start generally (establish the context), then move to specific details - Place descriptions BEFORE the visual content they refer to when possible (pre-teaching), not after -- Use present tense, active voice, and third-person narrative -- Describe actions and observable gestures; do NOT infer or state emotions unless clearly displayed (e.g., "She covers her face with her hands" not "She looks devastated") +- Use present tense, active voice, and third-person narrative (e.g., "Ted breaks the window" not "The window was broken by Ted") +- Use clear, concise, complete sentences; sentence fragments only if time is severely limited +- Describe objectively without personal interpretation, censorship, or comment - Do NOT use cinematic terminology such as "close-up", "pan", "cut to", "flashback", or "montage" unless absolutely necessary for comprehension -- ON-SCREEN TEXT (MANDATORY): You MUST describe ALL visible text — titles, lower-thirds, signs, captions, graphics, URLs, phone numbers, product labels. Use the format: "Text on screen reads: [exact text]". Include it even if similar content is spoken; if the spoken dialogue covers it exactly, use "Title card: [text]" to note its presence. Do NOT silently skip any on-screen text. If text appears for less than one second, note it in the nearest available gap. +- ON-SCREEN TEXT (MANDATORY): You MUST describe ALL visible text — titles, lower-thirds, signs, captions, graphics, URLs, phone numbers, product labels. Use the format: "Text on screen reads: [exact text]". If text appears for less than one second, note it in the nearest available gap. Do NOT silently skip any on-screen text. 
+- Treat song lyrics as dialogue — describe the song, do not describe over the lyrics while they are playing - Describe colors, clothing, setting, and spatial relationships when relevant to understanding - Be succinct — omit redundant or self-evident details - Do NOT duplicate information already in the spoken dialogue @@ -81,9 +116,9 @@ Example output format: "confidence": 0.95, "summary": "A tutorial video showing how to use a web application dashboard.", "transcript_plaintext": "Hello everyone, welcome to this tutorial. Today we'll be exploring the dashboard interface. First, let's log in to the system.", - "captions_vtt": "WEBVTT\n\n00:00:01.000 --> 00:00:03.500\nHello everyone, welcome to this tutorial.\n\n00:00:04.000 --> 00:00:07.200\nToday we'll be exploring the dashboard interface.\n\n00:00:08.000 --> 00:00:10.500\nFirst, let's log in to the system.", - "audio_description_vtt": "WEBVTT\n\n00:00:00.500 --> 00:00:02.000\nA bright computer monitor displays a clean, modern login page with blue and white corporate branding. The interface features prominently positioned username and password fields.\n\n00:00:05.000 --> 00:00:07.000\nA cursor arrow hovers over the rectangular username input field, which highlights with a subtle blue border as the user prepares to type.\n\n00:00:10.000 --> 00:00:12.000\nThe screen transitions to reveal a comprehensive dashboard filled with colorful charts, data widgets, and navigation panels arranged in an organized grid layout." 
+ "captions_vtt": "WEBVTT\n\n00:00:01.000 --> 00:00:03.500\nHello everyone, welcome\nto this tutorial.\n\n00:00:04.000 --> 00:00:07.200\nToday we'll be exploring\nthe dashboard interface.\n\n00:00:08.000 --> 00:00:10.500\nFirst, let's log in to the system.", + "audio_description_vtt": "WEBVTT\n\n00:00:00.500 --> 00:00:02.000\nA bright computer monitor displays a clean, modern login page with blue and white corporate branding.\n\n00:00:05.000 --> 00:00:07.000\nA cursor hovers over the username input field, which highlights with a subtle blue border.\n\n00:00:10.000 --> 00:00:12.000\nThe screen reveals a comprehensive dashboard filled with colorful charts and data widgets." } ``` -Follow this exact structure and formatting. \ No newline at end of file +Follow this exact structure and formatting. diff --git a/backend/app/prompts/gemini_ingestion_targeted.md b/backend/app/prompts/gemini_ingestion_targeted.md index dd528c7..4af276b 100644 --- a/backend/app/prompts/gemini_ingestion_targeted.md +++ b/backend/app/prompts/gemini_ingestion_targeted.md @@ -46,30 +46,65 @@ CRITICAL TIMING REQUIREMENTS: BRAND NAMES AND PRODUCTS: {BRAND_CONTEXT} -- When you can clearly identify a product that matches a brand in the provided list, use the brand name rather than a generic descriptor (e.g., "Sellotape" not "sticky tape", "Post-it notes" not "sticky notes") -- Only use brand names when you are confident of the identification from visible labels, logos, or distinctive design -- If a product is not on the list or is unclear, use a generic descriptor — do not guess +- You MUST use the exact brand names from the list whenever those products are visible on screen +- Always prefer the brand name over a generic description — e.g., "Sellotape" not "sticky tape", "Post-it notes" not "sticky notes", "3M VetBond" not "tissue adhesive" +- If a product is on the brand list, use the brand name even if the label is partially obscured — use your best confident identification +- If a product is NOT 
on the list or is completely unclear, use a generic descriptor — do not invent brand names -ETHICAL GUIDELINES FOR DESCRIBING PEOPLE: -- Describe people objectively and factually based on what is clearly visible — do not interpret, assume, or editorialize -- Use person-first, inclusive language (e.g., "a person using a wheelchair" not "a wheelchair-bound person"; "officer" not "policeman") -- Describe race, ethnicity, gender, age, or other personal characteristics ONLY when they are relevant to the narrative or plot. When you describe these characteristics for one person, be consistent and describe them for all relevant people in the same scene -- Do NOT guess at racial, ethnic, gender, or religious identity if it is not clearly confirmed by visual context or dialogue — use general descriptors instead -- For disabilities or medical conditions: describe observable facts only — do not interpret emotional state or capability +CAPTION FORMATTING (DCMP standard): +- Maximum TWO lines per caption. Never use three or more lines. +- Each line should be no longer than ~37 characters where possible (42 absolute max) +- Do NOT split lines in the middle of: a modifier and its word, a prepositional phrase, a person's name and title, after a conjunction (and/or/but/because), or between an auxiliary verb and the word it modifies +- Do NOT start a new sentence on the same line where a previous sentence ends (unless both sentences are very short and closely related) +- Minimum caption duration: approximately 1.3 seconds. Maximum: 6 seconds +- Use mixed case. 
Use ALL CAPS only for screaming or shouting + +SOUND AND MUSIC FORMATTING (DCMP standard): +- Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching] +- Use present participle for sustained sounds: [dog barking]; use third person for abrupt sounds: [dog barks] +- Background music with lyrics: use the ♪ music symbol, e.g., ♪ Here comes the sun ♪ +- End of a song: use double music symbols, e.g., ♪ last lyric line ♪♪ +- Offscreen sounds and offscreen music: wrap in VTT italic tags, e.g., <i>[phone ringing]</i>, <i>♪ upbeat jazz ♪</i> +- Do not caption background music shorter than 5 seconds +- Describe music mood objectively using brackets, e.g., [suspenseful orchestral music] — avoid subjective words like "beautiful" or "haunting" +- For standard captions (non-SDH): include significant non-speech audio events in brackets when they are essential for understanding the content + +CAPTION PLACEMENT: +- Captions are normally positioned at the bottom of the screen +- When visible text, graphics, logos, or on-screen information appear at the bottom of the frame during a caption cue, add the VTT cue setting "line:0%" to move that caption to the top — format: "00:00:01.000 --> 00:00:03.000 line:0%" + +ETHICAL GUIDELINES FOR DESCRIBING PEOPLE (DCMP standard): +- Consistently identify people/characters by name. When a name is not yet known, identify by the most obvious visible attribute (e.g., "the person in the red jacket") until the name is established, then switch to the name and use it consistently +- Describe discernable physical attributes and observable gestures — do NOT infer or state emotional states + - CORRECT: "She furrows her brow and clasps her hands." + - INCORRECT: "She looks worried." + - CORRECT: "He raises his fist." + - INCORRECT: "He appears angry." 
+- Race/ethnicity: use currently-accepted, respectful terminology; if you describe one person's racial or ethnic background, be consistent and describe all people in the same scene +- Do NOT guess at racial, ethnic, gender, or religious identity if not clearly confirmed — use neutral descriptors (e.g., "a middle-aged person") rather than specifying when uncertain +- For disabilities: describe observable facts only (e.g., "a person using a motorized wheelchair") — do not interpret capability or emotional state + - CORRECT: "A person using a motorized wheelchair enters the room." + - INCORRECT: "A disabled person in a wheelchair rolls in." - Avoid language that stereotypes, sensationalises, or assigns motivation based on appearance +- For animals: use "it" or "the [animal]" — do NOT use gendered pronouns (he/she) unless the animal's sex is explicitly established in the dialogue or on-screen content + - CORRECT: "The cat stretches on the windowsill." + - INCORRECT: "He stretches his paw across the windowsill." (when sex is not established) -AUDIO DESCRIPTION GUIDELINES: +AUDIO DESCRIPTION GUIDELINES (DCMP standard): Priority order for what to describe (use available time wisely): 1. ESSENTIAL: Actions and details critical for following the narrative; information that would cause confusion if omitted; scene context and setting 2. HIGH PRIORITY: Significant character appearance relevant to the story; visual details supporting understanding; scene changes and time passages 3. 
TIME-PERMITTING: Additional aesthetic or contextual details Rules: +- Start generally (establish the context), then move to specific details - Place descriptions BEFORE the visual content they refer to when possible (pre-teaching), not after -- Use present tense, active voice, and third-person narrative -- Describe actions and observable gestures; do NOT infer or state emotions unless clearly displayed (e.g., describe the gesture, not the inferred feeling) +- Use present tense, active voice, and third-person narrative (e.g., "Ted breaks the window" not "The window was broken by Ted") +- Use clear, concise, complete sentences; sentence fragments only if time is severely limited +- Describe objectively without personal interpretation, censorship, or comment - Do NOT use cinematic terminology such as "close-up", "pan", "cut to", "flashback", or "montage" unless absolutely necessary for comprehension -- ON-SCREEN TEXT (MANDATORY): You MUST describe ALL visible text — titles, lower-thirds, signs, captions, graphics, URLs, phone numbers, product labels. Use the format: "Text on screen reads: [exact text]". Include it even if similar content is spoken; if the spoken dialogue covers it exactly, use "Title card: [text]" to note its presence. Do NOT silently skip any on-screen text. If text appears for less than one second, note it in the nearest available gap. +- ON-SCREEN TEXT (MANDATORY): You MUST describe ALL visible text — titles, lower-thirds, signs, captions, graphics, URLs, phone numbers, product labels. Use the format: "Text on screen reads: [exact text]". If text appears for less than one second, note it in the nearest available gap. Do NOT silently skip any on-screen text. 
+- Treat song lyrics as dialogue — describe the song, do not describe over the lyrics while they are playing - Describe colors, clothing, setting, and spatial relationships when relevant to understanding - Be succinct — omit redundant or self-evident details - Do NOT duplicate information already in the spoken dialogue @@ -86,8 +121,8 @@ Example output format (if TARGET_LANGUAGE were Spanish): "confidence": 0.95, "summary": "Un video tutorial que muestra como usar una aplicacion web de panel de control.", "transcript_plaintext": "Hola a todos, bienvenidos a este tutorial. Hoy vamos a explorar la interfaz del panel de control. Primero, iniciemos sesion en el sistema.", - "captions_vtt": "WEBVTT\n\n00:00:01.000 --> 00:00:03.500\nHola a todos, bienvenidos a este tutorial.\n\n00:00:04.000 --> 00:00:07.200\nHoy vamos a explorar la interfaz del panel de control.\n\n00:00:08.000 --> 00:00:10.500\nPrimero, iniciemos sesion en el sistema.", - "audio_description_vtt": "WEBVTT\n\n00:00:00.500 --> 00:00:02.000\nUna pantalla de computadora brillante muestra una pagina de inicio de sesion moderna y limpia con marca corporativa azul y blanca.\n\n00:00:05.000 --> 00:00:07.000\nUn cursor se desplaza sobre el campo de entrada de nombre de usuario, que se resalta con un borde azul sutil.\n\n00:00:10.000 --> 00:00:12.000\nLa pantalla cambia para revelar un panel completo lleno de graficos coloridos y widgets de datos." 
def generate_descriptive_transcript(captions_vtt: str, ad_vtt: str) -> str:
    """
    Interleave caption cues and audio-description cues into one plain-text transcript.

    Each VTT document is parsed on its own; a document that is empty, or that
    fails to parse (a warning is logged), contributes no cues. Cues are ordered
    by start time, with descriptions ahead of captions that start at the same
    instant. Runs of consecutive cues of the same kind are collapsed under a
    single "(SPEECH)" or "(DESCRIPTION)" heading, and cues whose text is blank
    are ignored.

    Args:
        captions_vtt: WebVTT content for captions (speech + non-speech audio)
        ad_vtt: WebVTT content for audio descriptions (visual descriptions)

    Returns:
        The transcript text, or "" when neither input yields any cue.
    """
    caption_cues = []
    if captions_vtt:
        try:
            caption_cues = VTTParser.parse(captions_vtt)
        except Exception as e:
            logger.warning(f"Failed to parse captions VTT for transcript: {e}")

    ad_cues = []
    if ad_vtt:
        try:
            ad_cues = VTTParser.parse(ad_vtt)
        except Exception as e:
            logger.warning(f"Failed to parse AD VTT for transcript: {e}")

    if not (caption_cues or ad_cues):
        return ""

    # Descriptions are listed first so the stable sort keeps them ahead of
    # captions that share the same start time.
    timeline = sorted(
        [("description", cue) for cue in ad_cues]
        + [("speech", cue) for cue in caption_cues],
        key=lambda entry: entry[1].start_time,
    )

    # Collapse neighbouring cues of the same kind into one section.
    sections = []
    for kind, cue in timeline:
        body = cue.text.strip()
        if not body:
            continue
        if sections and sections[-1][0] == kind:
            sections[-1][1].append(body)
        else:
            sections.append((kind, [body]))

    # One heading per section, its texts on following lines, then a blank line.
    rendered = []
    for kind, texts in sections:
        heading = "(SPEECH)" if kind == "speech" else "(DESCRIPTION)"
        rendered.append(heading + "\n" + "\n".join(texts) + "\n")

    return "\n".join(rendered).strip()
100644 --- a/backend/app/services/gemini.py +++ b/backend/app/services/gemini.py @@ -779,63 +779,101 @@ JSON: source_language: str = "en" ) -> str: """ - Translate VTT content using Gemini, preserving timing and structure. - More cost-effective alternative to Google Translate API (6-36x cheaper). + Translate VTT content using Gemini, preserving timing programmatically. - Args: - vtt_content: The VTT file content to translate - target_language: The language code to translate to (e.g., 'es', 'fr') - source_language: The source language code (default: 'en') + Uses a two-step approach to guarantee timestamp integrity: + 1. Send only the text cues (no timestamps) to Gemini as a numbered list + 2. Apply translated texts back onto the original VTT using translate_preserving_timing() - Returns: - Translated VTT content with preserved timestamps + This avoids any possibility of Gemini drifting or altering timestamps. """ - prompt = f"""Translate this WebVTT subtitle file from {source_language} to {target_language}. + from ..lib.vtt import VTTParser, VTTEditor -CRITICAL REQUIREMENTS: -- Preserve ALL timestamps exactly as-is (do not modify any timing) -- Keep the WEBVTT header line -- Translate ONLY the text content between timestamps -- Maintain readable line lengths (~32-40 characters per line) -- Handle idioms and slang naturally in {target_language} -- Preserve any speaker labels (e.g., "[Speaker 1]:") -- Do NOT add any explanation or markdown - return ONLY the translated VTT + source_cues = VTTParser.parse(vtt_content) + if not source_cues: + logger.warning(f"No cues found in VTT for {target_language} translation") + return vtt_content -VTT Content to translate: -{vtt_content}""" + cue_count = len(source_cues) + + async def _attempt_translation(extra_instruction: str = "") -> list[str]: + numbered_texts = "\n".join( + f"{i + 1}. 
{cue.text.replace(chr(10), ' ')}" + for i, cue in enumerate(source_cues) + ) + prompt = f"""Translate the following {cue_count} numbered text segments from {source_language} to {target_language}. + +REQUIREMENTS: +- Return EXACTLY {cue_count} numbered lines, one translation per line +- Format: "1. translated text", "2. translated text", etc. +- Preserve speaker labels like [Speaker 1]: unchanged +- Use natural, idiomatic {target_language} +- Do NOT add any explanation, preamble, or extra lines{extra_instruction} + +Segments to translate: +{numbered_texts}""" - try: response = await asyncio.to_thread( client.models.generate_content, model=self.model_name, contents=[genai.types.Part.from_text(text=prompt)] ) + return self._parse_numbered_translation(response.text.strip(), cue_count) - result = response.text.strip() + try: + translated_texts = await _attempt_translation() - # Handle potential markdown formatting - if result.startswith("```"): - # Remove markdown code blocks - lines = result.split("\n") - # Filter out lines that are just ``` or ```vtt or ```webvtt - filtered_lines = [ - line for line in lines - if not line.strip().startswith("```") - ] - result = "\n".join(filtered_lines).strip() + if len(translated_texts) != cue_count: + logger.warning( + f"Translation cue count mismatch for {target_language}: " + f"expected {cue_count}, got {len(translated_texts)}. Retrying." + ) + translated_texts = await _attempt_translation( + extra_instruction=f"\n- You MUST return exactly {cue_count} lines, no more, no less" + ) - # Validate VTT format - if not result.startswith("WEBVTT"): - logger.warning("Gemini translation missing WEBVTT header, adding it") - result = "WEBVTT\n\n" + result + if len(translated_texts) != cue_count: + # Pad or truncate as last resort to avoid breaking downstream + logger.warning( + f"Retried translation still mismatched ({len(translated_texts)} vs {cue_count}). " + f"Padding/truncating to match." 
+ ) + if len(translated_texts) < cue_count: + translated_texts.extend( + source_cues[i].text + for i in range(len(translated_texts), cue_count) + ) + else: + translated_texts = translated_texts[:cue_count] - logger.info(f"Successfully translated VTT to {target_language} using Gemini") + result = VTTEditor.translate_preserving_timing(vtt_content, translated_texts) + logger.info(f"Successfully translated VTT to {target_language} ({cue_count} cues)") return result except Exception as e: logger.error(f"Gemini translation failed for {target_language}: {e}") raise + @staticmethod + def _parse_numbered_translation(response_text: str, expected_count: int) -> list[str]: + """Parse a numbered list response from Gemini into a list of translated texts.""" + import re + lines = response_text.strip().split("\n") + results = [] + for line in lines: + line = line.strip() + if not line: + continue + # Match "1. text", "1) text", or just text if already stripped + match = re.match(r"^\d+[.)]\s+(.+)$", line) + if match: + results.append(match.group(1).strip()) + elif results or re.match(r"^\d+[.)]", line) is None: + # Non-numbered continuation line — append to last result or skip + if results: + results[-1] += " " + line + return results + async def rewrite_tts_cue( self, original_text: str, diff --git a/backend/app/services/video_renderer.py b/backend/app/services/video_renderer.py index d24226b..744fc2a 100644 --- a/backend/app/services/video_renderer.py +++ b/backend/app/services/video_renderer.py @@ -446,7 +446,7 @@ class VideoRendererService: all_audio = base_audio + "".join(ad_labels) num_inputs = 1 + len(ad_labels) filter_parts.append( - f"{all_audio}amix=inputs={num_inputs}:duration=first:dropout_transition=0[mixed]" + f"{all_audio}amix=inputs={num_inputs}:duration=first:dropout_transition=0:normalize=0[mixed]" ) audio_output = "[mixed]" else: diff --git a/backend/app/services/zip_download.py b/backend/app/services/zip_download.py index fa373e9..4e280b8 100644 --- 
a/backend/app/services/zip_download.py +++ b/backend/app/services/zip_download.py @@ -48,6 +48,7 @@ FILE_TYPE_MAPPING = { "ad_mp3_gcs": "ad.mp3", "accessible_video_gcs": "accessible_video.mp4", "retimed_captions_vtt_gcs": "accessible_captions.vtt", + "descriptive_transcript_gcs": "descriptive_transcript.txt", } diff --git a/backend/app/tasks/ingest_and_ai.py b/backend/app/tasks/ingest_and_ai.py index a728eb6..a96fc6a 100644 --- a/backend/app/tasks/ingest_and_ai.py +++ b/backend/app/tasks/ingest_and_ai.py @@ -262,12 +262,31 @@ async def ingest_and_ai_task_impl(job_id: str): f"{job_id}/{source_language}/sdh_captions.vtt" ) + # Generate descriptive transcript (WCAG 2.1 1.2.1) + transcript_gcs_uri = None + try: + from ..services.descriptive_transcript import generate_descriptive_transcript + transcript_text = generate_descriptive_transcript( + ai_result["captions_vtt"], + ai_result["audio_description_vtt"] + ) + if transcript_text: + transcript_gcs_uri = await upload_vtt_to_gcs( + transcript_text, + f"{job_id}/{source_language}/descriptive_transcript.txt" + ) + logger.info(f"Generated descriptive transcript for job {job_id}, language {source_language}") + except Exception as e: + logger.warning(f"Failed to generate descriptive transcript for job {job_id}: {e}") + source_lang_output = { "captions_vtt_gcs": captions_gcs_uri, "ad_vtt_gcs": ad_gcs_uri, } if sdh_gcs_uri: source_lang_output["sdh_captions_vtt_gcs"] = sdh_gcs_uri + if transcript_gcs_uri: + source_lang_output["descriptive_transcript_gcs"] = transcript_gcs_uri # Update job with AI results, detected language, and outputs # Set status to TRANSLATING to trigger translation pipeline before QC diff --git a/backend/app/tasks/rerender_accessible_video.py b/backend/app/tasks/rerender_accessible_video.py index f823f21..0ff7077 100644 --- a/backend/app/tasks/rerender_accessible_video.py +++ b/backend/app/tasks/rerender_accessible_video.py @@ -514,23 +514,30 @@ def _build_placements_with_adjustments( adjusted_ms = 
pp.get("adjusted_ms") # Fallback for data without source_ms (backward compatibility) + # When source_ms is missing we cannot reliably map rendered adjustments + # back to source coordinates, so we skip the delta and use original_ms as-is. if source_ms is None: logger.warning( - f"Cue {cue_idx}: No source_ms found, falling back to original_ms. " - "Job may need to be re-processed from initial render." + f"Cue {cue_idx}: No source_ms found, falling back to original_ms " + "without applying adjustment delta. " + "Job may need to be re-processed from initial render for timing adjustments to work." ) - source_ms = original_ms + # Use original_ms directly; skip adjustment to avoid double-counting + pause_time_s = original_ms / 1000.0 if original_ms is not None else cues[cue_idx].start_time if cue_idx < len(cues) else 0.0 + adjusted_pause_by_cue[cue_idx] = pause_time_s + continue - # Apply user adjustment as relative offset + # Apply user adjustment as relative offset in source coordinates if adjusted_ms is not None and original_ms is not None: # User adjusted in rendered timeline - apply same delta to source adjustment_delta = adjusted_ms - original_ms - source_ms = source_ms + adjustment_delta + adjusted_source_ms = source_ms + adjustment_delta logger.info( f"Cue {cue_idx}: Applying adjustment delta {adjustment_delta:.1f}ms " f"(rendered: {original_ms:.1f} -> {adjusted_ms:.1f}, " - f"source: {source_ms - adjustment_delta:.1f} -> {source_ms:.1f})" + f"source: {source_ms:.1f} -> {adjusted_source_ms:.1f})" ) + source_ms = adjusted_source_ms # Convert to seconds for placement pause_time_s = source_ms / 1000.0 diff --git a/backend/app/tasks/translate_and_synthesize.py b/backend/app/tasks/translate_and_synthesize.py index 2242c40..09ce1ea 100644 --- a/backend/app/tasks/translate_and_synthesize.py +++ b/backend/app/tasks/translate_and_synthesize.py @@ -7,6 +7,7 @@ import time import random from celery import Task +from celery.exceptions import SoftTimeLimitExceeded from 
motor.motor_asyncio import AsyncIOMotorClient from ..core.config import settings @@ -96,7 +97,39 @@ async def retry_with_backoff(func, max_retries=3, base_delay=1): raise last_exception -@celery_app.task(bind=True) +async def _mark_task_timed_out(job_id: str) -> None: + """Mark a job as failed due to task timeout and notify via WebSocket""" + try: + client = AsyncIOMotorClient(settings.mongodb_uri) + db = client[settings.mongodb_db] + job_doc = await db.jobs.find_one({"_id": job_id}, {"status": 1, "title": 1}) + if not job_doc: + return + current_status = job_doc.get("status", "") + # Map current processing state to appropriate failure status + if current_status in ("tts_generating", "translating"): + fail_status = JobStatus.TTS_FAILED.value + else: + fail_status = current_status # Leave as-is for other states + await db.jobs.update_one( + {"_id": job_id}, + {"$set": { + "status": fail_status, + "updated_at": datetime.utcnow(), + "error": "Processing timed out. Please retry." + }} + ) + broadcast_status_update( + job_id=job_id, + status=fail_status, + job_title=job_doc.get("title"), + message="Processing timed out. Please retry." 
+ ) + except Exception as e: + logger.error(f"Failed to mark job {job_id} as timed out: {e}") + + +@celery_app.task(bind=True, time_limit=3600, soft_time_limit=3400) def translate_and_synthesize_task(self, job_id: str): """ Pipeline 2: Translation & MP3 Generation @@ -109,6 +142,11 @@ def translate_and_synthesize_task(self, job_id: str): result = asyncio.run(_async_translate_and_synthesize(job_id)) logger.info(f"✅ CELERY TASK COMPLETED successfully for job {job_id}") return result + except SoftTimeLimitExceeded: + logger.error(f"⏰ translate_and_synthesize_task soft time limit exceeded for job {job_id}") + import asyncio as _asyncio + _asyncio.run(_mark_task_timed_out(job_id)) + raise except Exception as e: logger.error(f"❌ CELERY TASK FAILED for job {job_id}: {str(e)}") logger.error(f"❌ Exception type: {type(e).__name__}") @@ -274,12 +312,25 @@ async def _async_translate_and_synthesize(job_id: str): f"{job_id}/{lang}/sdh_captions.vtt" ) + # Generate descriptive transcript (WCAG 2.1 1.2.1) + transcript_gcs_uri = None + try: + from ..services.descriptive_transcript import generate_descriptive_transcript + transcript_text = generate_descriptive_transcript(translated_captions, translated_ad) + if transcript_text: + transcript_gcs_uri = await upload_vtt_to_gcs( + transcript_text, + f"{job_id}/{lang}/descriptive_transcript.txt" + ) + except Exception as transcript_err: + logger.warning(f"Failed to generate descriptive transcript for {lang}: {transcript_err}") + logger.info(f"Completed video-native translation for {lang}") - return (lang, captions_gcs_uri, ad_gcs_uri, sdh_gcs_uri, None) + return (lang, captions_gcs_uri, ad_gcs_uri, sdh_gcs_uri, transcript_gcs_uri, None) except Exception as e: logger.error(f"Video-native translation failed for {lang}: {e}") - return (lang, None, None, None, str(e)) + return (lang, None, None, None, None, str(e)) # Run all translations in parallel (limited by semaphore) if target_languages: @@ -298,7 +349,7 @@ async def 
_async_translate_and_synthesize(job_id: str): "qa_notes": f"Translation failed: {str(result)}" } else: - lang, captions_uri, ad_uri, sdh_uri, error_msg = result + lang, captions_uri, ad_uri, sdh_uri, transcript_uri, error_msg = result if error_msg: updated_outputs[lang] = { "origin": "video_native", @@ -312,6 +363,8 @@ async def _async_translate_and_synthesize(job_id: str): } if sdh_uri: lang_out["sdh_captions_vtt_gcs"] = sdh_uri + if transcript_uri: + lang_out["descriptive_transcript_gcs"] = transcript_uri updated_outputs[lang] = lang_out logger.info(f"Successfully processed VTT files for language: {lang} (origin: video_native)") @@ -381,6 +434,19 @@ async def _async_translate_and_synthesize(job_id: str): ) lang_out["sdh_captions_vtt_gcs"] = sdh_gcs_uri + # Generate descriptive transcript (WCAG 2.1 1.2.1) + try: + from ..services.descriptive_transcript import generate_descriptive_transcript + transcript_text = generate_descriptive_transcript(translated_captions, translated_ad) + if transcript_text: + transcript_gcs_uri = await upload_vtt_to_gcs( + transcript_text, + f"{job_id}/{language}/descriptive_transcript.txt" + ) + lang_out["descriptive_transcript_gcs"] = transcript_gcs_uri + except Exception as transcript_err: + logger.warning(f"Failed to generate descriptive transcript for {language}: {transcript_err}") + # Store language outputs updated_outputs[language] = lang_out @@ -653,7 +719,8 @@ async def _generate_language_tts(job_id: str, language: str, lang_output: dict, logger.warning(f"No cues to synthesize for job {job_id}, language {language}") return - # Poll for group completion with async sleep + # Poll for group completion with async sleep (max 15 minutes per language) + TTS_POLL_TIMEOUT_SECONDS = 900 poll_count = 0 while not group_result.ready(): await asyncio.sleep(1.0) @@ -663,6 +730,21 @@ async def _generate_language_tts(job_id: str, language: str, lang_output: dict, f"Waiting for TTS group ({language}): {poll_count}s elapsed, " 
f"completed={group_result.completed_count()}/{len(cues)}" ) + if poll_count >= TTS_POLL_TIMEOUT_SECONDS: + logger.error( + f"TTS group timed out for job {job_id}, language {language} " + f"after {TTS_POLL_TIMEOUT_SECONDS}s — " + f"{group_result.completed_count()}/{len(cues)} cues completed" + ) + raise TTSSynthesisError( + message=( + f"TTS synthesis timed out after {TTS_POLL_TIMEOUT_SECONDS}s " + f"({group_result.completed_count()}/{len(cues)} cues completed)" + ), + cue_index=group_result.completed_count(), + cue_text="", + api_response_info="timeout" + ) # Get results from all cue tasks # Use propagate=False to get results even if some tasks failed diff --git a/frontend/src/components/TimelinePreview/TimelinePreview.tsx b/frontend/src/components/TimelinePreview/TimelinePreview.tsx index 4a7779c..7df433e 100644 --- a/frontend/src/components/TimelinePreview/TimelinePreview.tsx +++ b/frontend/src/components/TimelinePreview/TimelinePreview.tsx @@ -116,7 +116,7 @@ export function TimelinePreview({ onClick={() => handleSegmentClick(segment)} title={ segment.is_freeze_frame - ? `AD Cue ${segment.cue_index}${isRegenerationQueued ? ' (Regenerate queued)' : ''}` + ? `AD Cue ${segment.cue_index !== null ? segment.cue_index + 1 : ''}${isRegenerationQueued ? ' (Regenerate queued)' : ''}` : `Video segment ${segment.segment_index}` } > @@ -124,7 +124,7 @@ export function TimelinePreview({ {segment.is_freeze_frame && segment.cue_index !== null && widthPercent > 2 && (