fix(pipeline): fix 5 QA tickets — caption alignment, glossary, source_has_ad render, filler words, NL error surfacing

- caption_aligner: lower match ratio 0.5→0.35, widen search window 60→150, add time-based cursor fallback on miss
- gemini.py: explicit 'MUST use glossary terms' requirement in translate_vtt prompt; source_has_ad prompt now instructs not to include AD narration in captions
- ingest_and_ai: load glossary for source language and pass to extract_accessibility
- render_accessible_video: handle source_has_ad=True via caption-embed path (ffmpeg subtitle inject, no AD pipeline)
- translate_and_synthesize: track failed languages, write translation_errors to DB, add exc_info to error log
- vtt.py: expand _FILLER_PATTERNS to nl/pt/pl/uk/ru, widen EN/ES/FR/DE/IT lists
- gemini_ingestion.md: strengthen line:0% placement rule, expand disfluency examples per language

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-08 18:36:59 +01:00 · 2026-05-08 18:36:59 +01:00 · 76bee82119
commit 76bee82119
parent f7708f0214
7 changed files with 148 additions and 25 deletions
--- a/backend/app/lib/vtt.py
+++ b/backend/app/lib/vtt.py
@ -288,11 +288,16 @@ class VTTEditor:

    # DCMP §6.01 filler patterns per language (whole-word, case-insensitive)
    _FILLER_PATTERNS: dict[str, str] = {
-        "en": r'\b(um+|uh+|ah+|er+|hmm+|like|you know|i mean)\b',
-        "es": r'\b(eh+|este|o sea|pues)\b',
-        "fr": r'\b(euh+|beh|ben|donc|quoi)\b',
-        "de": r'\b(äh+|ähm+|halt|ne)\b',
-        "it": r'\b(ehm+|allora|cioè|tipo)\b',
+        "en": r'\b(um+|uh+|ah+|er+|hmm+|you know|i mean|sort of|kind of|basically|literally|honestly|actually|right\?|so yeah)\b',
+        "es": r'\b(eh+|este|o sea|pues|bueno|o sea que|mmm+)\b',
+        "fr": r'\b(euh+|beh|ben|donc|quoi|enfin|voilà|genre)\b',
+        "de": r'\b(äh+|ähm+|halt|ne|also|naja|sozusagen|quasi)\b',
+        "it": r'\b(ehm+|allora|cioè|tipo|praticamente|insomma|ecco)\b',
+        "nl": r'\b(eh+|nou|zeg|eigenlijk|gewoon|toch|zo van|hè)\b',
+        "pt": r'\b(ahn+|hã+|né|sabe|tipo|então|assim)\b',
+        "pl": r'\b(no|że|bo|znaczy|właśnie|jakby|wiesz)\b',
+        "uk": r'\b(ну+|ем+|типу|знаєш|значить|власне|от)\b',
+        "ru": r'\b(ну+|эм+|типа|знаешь|значит|вот|собственно)\b',
    }

    @staticmethod
--- a/backend/app/prompts/gemini_ingestion.md
+++ b/backend/app/prompts/gemini_ingestion.md
@ -59,11 +59,12 @@ CAPTION FORMATTING (DCMP standard):
 - Use mixed case. Use ALL CAPS only for screaming or shouting

 DISFLUENCY REMOVAL (DCMP §6.01):
- Do NOT include filler words, false starts, or hesitations in captions
- Remove: "um", "uh", "ah", "er", "hmm", "like" (as filler), "you know" (as filler), "I mean" (as filler)
- Also remove language-specific fillers (e.g., "euh"/"beh" in French, "äh"/"ähm" in German, "eh"/"este" in Spanish, "ehm"/"allora" in Italian)
+- MANDATORY: Never include filler words, false starts, or hesitations in captions — remove them silently
+- English fillers to remove: "um", "uh", "ah", "er", "hmm", "you know", "I mean", "sort of", "kind of", "basically", "literally", "honestly"
+- Language-specific fillers: French "euh"/"beh"/"ben"/"genre", German "äh"/"ähm"/"halt"/"also", Spanish "eh"/"este"/"o sea"/"pues", Italian "ehm"/"allora"/"cioè"/"tipo", Dutch "eh"/"nou"/"zeg"/"eigenlijk", Portuguese "ahn"/"né"/"sabe"/"tipo"
 - Remove false starts when the speaker self-corrects immediately (e.g., "I was — I went to the store" → "I went to the store")
 - Do NOT remove meaningful repetition, emphasis, or intentional stylistic choices
+- When in doubt whether a word is a filler or content: omit it — clean captions are preferred over over-inclusive ones

 SOUND AND MUSIC FORMATTING (DCMP standard):
 - Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching]
@ -77,7 +78,9 @@ SOUND AND MUSIC FORMATTING (DCMP standard):

 CAPTION PLACEMENT:
 - Captions are normally positioned at the bottom of the screen
- When visible text, graphics, logos, or on-screen information appear at the bottom of the frame during a caption cue, add the VTT cue setting "line:0%" to move that caption to the top — format: "00:00:01.000 --> 00:00:03.000 line:0%"
+- CRITICAL: When ANY of the following are visible at the BOTTOM of the frame during a caption cue — on-screen text, lower-thirds, name plates, location titles, graphics, logos, product labels, URLs, or any visual information — you MUST add the VTT cue setting "line:0%" to move that cue to the top of the screen. Format: "00:00:01.000 --> 00:00:03.000 line:0%"
+- When in doubt whether bottom content conflicts with captions, use "line:0%" — it is better to be at the top than to obstruct important on-screen information
+- Example: if a lower-third name plate is visible at seconds 0:05–0:08, all caption cues overlapping that range must have "line:0%"

 ETHICAL GUIDELINES FOR DESCRIBING PEOPLE (DCMP standard):
 - Consistently identify people/characters by name. When a name is not yet known, identify by the most obvious visible attribute (e.g., "the person in the red jacket") until the name is established, then switch to the name and use it consistently
--- a/backend/app/services/caption_aligner.py
+++ b/backend/app/services/caption_aligner.py
@ -10,6 +10,7 @@ Algorithm:
  graceful fallbacks where Whisper didn't capture the audio.
 """

+import bisect
 import re
 from dataclasses import dataclass

@ -23,10 +24,12 @@ logger = get_logger(__name__)
 _PUNCT = re.compile(r"[^\w']", re.UNICODE)
 # Tokens shorter than this are considered stop-words and excluded from matching
 _MIN_TOKEN_LEN = 2
-# Minimum fraction of cue tokens that must match Whisper words for alignment
-_MIN_MATCH_RATIO = 0.5
-# How many Whisper words ahead of the cursor to search for a cue's tokens
-_SEARCH_WINDOW = 60
+# Minimum fraction of cue tokens that must match Whisper words for alignment.
+# Lowered from 0.5 → 0.35 to handle Gemini paraphrasing and short cues.
+_MIN_MATCH_RATIO = 0.35
+# How many Whisper words ahead of the cursor to search for a cue's tokens.
+# Widened from 60 → 150 so the window stays valid even after several failed cues.
+_SEARCH_WINDOW = 150


 def _tokenise(text: str) -> list[str]:
@ -80,6 +83,13 @@ def _find_match(
    return best


+def _cursor_for_time(whisper_words: list[WordTimestamp], t: float, from_idx: int) -> int:
+    """Return the index of the first Whisper word at or after time t, starting from from_idx."""
+    starts = [w.start for w in whisper_words]
+    idx = bisect.bisect_left(starts, t, from_idx)
+    return min(idx, len(whisper_words) - 1)
+
+
 def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
    """Replace VTT cue timings with Whisper-accurate timestamps where possible.

@ -97,23 +107,23 @@ def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
    for cue in cues:
        tokens = _tokenise(cue.text)
        if not tokens:
-            # Sound-effect or music cue — nothing to align
            continue

        match = _find_match(tokens, whisper_words, cursor)
        if match is None:
+            # Advance cursor to the Whisper word closest to this cue's start time
+            # so subsequent cues don't search from a stale position.
+            cursor = _cursor_for_time(whisper_words, cue.start_time, cursor)
            continue

        new_start = whisper_words[match.first_word_idx].start
        new_end = whisper_words[match.last_word_idx].end

-        # Sanity: don't create zero-duration or backwards cues
        if new_end > new_start:
            cue.start_time = new_start
            cue.end_time = new_end
            aligned += 1

-        # Advance cursor to just past the last matched word
        cursor = match.last_word_idx + 1

    logger.info(
--- a/backend/app/services/gemini.py
+++ b/backend/app/services/gemini.py
@ -146,8 +146,11 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
        if source_has_ad:
            return (
                "SOURCE AUDIO DESCRIPTION NOTICE: This video already has professional audio descriptions "
-                "embedded in its audio track. Return an empty audio_description_vtt containing only "
-                "the WEBVTT header (\"WEBVTT\\n\") — do NOT generate new audio descriptions."
+                "embedded in its audio track. "
+                "1) Return an empty audio_description_vtt containing only the WEBVTT header (\"WEBVTT\\n\") — do NOT generate new audio descriptions. "
+                "2) For captions_vtt: transcribe ONLY the original program dialogue and relevant sound effects. "
+                "Do NOT caption the audio description narration — AD narration is spoken during natural pauses "
+                "and describes visual scenes rather than being part of the original dialogue."
            )
        return ""

@ -891,6 +894,10 @@ JSON:
            _tgt_label = locale_lib.get_gemini_label(target_language)
            _glossary_section = self._build_glossary_block(glossary_block)
            _glossary_line = f"\n\n{_glossary_section}" if _glossary_section else ""
+            _glossary_req = (
+                "\n- MUST use the exact approved terms from the glossary below — these override natural translation choices, even for English terms"
+                if _glossary_section else ""
+            )
            _adapt_line = _style_instruction.format(tgt=_tgt_label) if style == "transcreate" else ""
            prompt = f"""Translate the following {cue_count} numbered text segments from {_src_label} to {_tgt_label}.

@ -899,7 +906,7 @@ REQUIREMENTS:
 - Format: "1. translated text", "2. translated text", etc.
 - Preserve speaker labels like [Speaker 1]: unchanged
 - {_adapt_line}Use natural, idiomatic {_tgt_label}
- Do NOT add any explanation, preamble, or extra lines{extra_instruction}{_glossary_line}
+- Do NOT add any explanation, preamble, or extra lines{extra_instruction}{_glossary_req}{_glossary_line}

 Segments to translate:
 {numbered_texts}"""
--- a/backend/app/tasks/ingest_and_ai.py
+++ b/backend/app/tasks/ingest_and_ai.py
@ -169,11 +169,17 @@ async def ingest_and_ai_task_impl(job_id: str):
                    user_external_id=_cost_ctx["user_id"],
                    project_id=_cost_ctx["project_id"],
                )
+                # Load glossary for source language — use brand context as vocabulary hint
+                from ..services.glossary_service import get_glossary_block_for_job
+                _source_lang = job_doc.get("source", {}).get("language", "en")
+                _job_for_glossary = {**job_doc, "_glossary_source_text": brand_context or ""}
+                glossary_block = await get_glossary_block_for_job(_job_for_glossary, _source_lang, db)
                ai_result = await gemini_service.extract_accessibility(
                    temp_path,
                    brand_context=brand_context,
                    sdh_requested=sdh_requested,
                    source_has_ad=source_has_ad,
+                    glossary_block=glossary_block,
                    _cost_ctx=_cost_ctx,
                )
                # Enforce: if source already has AD, discard any AI-generated AD
--- a/backend/app/tasks/render_accessible_video.py
+++ b/backend/app/tasks/render_accessible_video.py
@ -135,6 +135,15 @@ async def _async_render_accessible_video(job_id: str, language: str):
            if not lang_output:
                raise ValueError(f"No outputs found for language {language}")

+            # When source already has professional AD, render captions-only accessible video
+            source_has_ad = job_doc.get("source", {}).get("source_has_ad", False)
+            if source_has_ad:
+                await _render_source_has_ad_video(
+                    job_id, job_doc, language, lang_output,
+                    source_video_path, temp_dir, db, job_title
+                )
+                return
+
            # 3. Download AD VTT content
            ad_vtt_gcs = lang_output.get("ad_vtt_gcs")
            if not ad_vtt_gcs:
@ -367,6 +376,83 @@ async def _async_render_accessible_video(job_id: str, language: str):
        client.close()


+async def _render_source_has_ad_video(
+    job_id: str,
+    job_doc: dict,
+    language: str,
+    lang_output: dict,
+    source_video_path: str,
+    temp_dir: str,
+    db,
+    job_title: str,
+) -> None:
+    """Render accessible video for jobs where the source already has professional AD.
+
+    Embeds the captions VTT as a soft subtitle track — no AD audio injection needed
+    since the original audio track already contains the AD narration.
+    """
+    captions_vtt_gcs = lang_output.get("captions_vtt_gcs")
+    if not captions_vtt_gcs:
+        raise ValueError(f"No captions VTT found for language {language}")
+
+    # Download captions VTT
+    captions_blob_path = captions_vtt_gcs.replace(f"gs://{settings.gcs_bucket}/", "")
+    captions_vtt_content = gcs_service.bucket.blob(captions_blob_path).download_as_text()
+
+    # Write VTT to temp file
+    vtt_path = os.path.join(temp_dir, "captions.vtt")
+    with open(vtt_path, "w", encoding="utf-8") as f:
+        f.write(captions_vtt_content)
+
+    # Embed captions as soft subtitle track — no re-encode needed
+    output_video_path = os.path.join(temp_dir, "accessible_video.mp4")
+    cmd = [
+        "ffmpeg", "-y",
+        "-i", source_video_path,
+        "-i", vtt_path,
+        "-c", "copy",
+        "-c:s", "webvtt",
+        "-metadata:s:s:0", f"language={language}",
+        output_video_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"ffmpeg caption embed failed: {result.stderr[-500:]}")
+
+    # Upload rendered video
+    video_blob_path = gcs_path(job_doc, language, "accessible_video.mp4")
+    video_blob = gcs_service.bucket.blob(video_blob_path)
+    video_blob.content_type = "video/mp4"
+    video_blob.upload_from_filename(output_video_path)
+    video_gcs_uri = f"gs://{settings.gcs_bucket}/{video_blob_path}"
+    logger.info(f"Uploaded source-has-ad accessible video to {video_gcs_uri}")
+
+    # Update job document
+    await db.jobs.update_one(
+        {"_id": job_id},
+        {
+            "$set": {
+                f"outputs.{language}.accessible_video_gcs": video_gcs_uri,
+                f"outputs.{language}.accessible_video_method": "caption_embed",
+                f"accessible_video_progress.{language}": {
+                    "status": "completed",
+                    "method": "caption_embed",
+                    "started_at": job_doc.get("accessible_video_progress", {}).get(language, {}).get("started_at"),
+                    "completed_at": datetime.utcnow(),
+                },
+                "updated_at": datetime.utcnow(),
+            }
+        },
+    )
+    broadcast_status_update(
+        job_id,
+        "asset_ready",
+        job_title=job_title,
+        message=f"Accessible video ready for {language.upper()} (caption embed)",
+    )
+    await _check_accessible_video_completion(job_id, db)
+
+
 def _build_placements_from_ad_vtt(ad_vtt_content: str, cue_durations: list[float]) -> list[dict]:
    """
    Build placement instructions from AD VTT cues and TTS durations.
--- a/backend/app/tasks/translate_and_synthesize.py
+++ b/backend/app/tasks/translate_and_synthesize.py
@ -189,6 +189,7 @@ async def _async_translate_and_synthesize(job_id: str, languages: list[str] | No

        updated_outputs = job_doc.get("outputs", {})
        _source_text_for_glossary = " ".join(filter(None, [source_captions_vtt, source_ad_vtt]))
+        _failed_languages: list[str] = []

        try:
            target_languages = [lang for lang in requested_languages if lang != source_language]
@ -268,7 +269,8 @@ async def _async_translate_and_synthesize(job_id: str, languages: list[str] | No
                    logger.info(f"Processed language: {language} (origin: {origin})")

                except Exception as e:
-                    logger.error(f"Failed to process language {language}: {e}")
+                    logger.error(f"Failed to process language {language}: {e}", exc_info=True)
+                    _failed_languages.append(language)
                    # Preserve existing GCS URIs and origin so retranslation failure
                    # doesn't destroy captions the user can still view
                    existing = updated_outputs.get(language, {})
@ -288,14 +290,18 @@ async def _async_translate_and_synthesize(job_id: str, languages: list[str] | No
            for lang in target_languages
            if lang in updated_outputs
        }
+        _status_update: dict = {
+            "status": JobStatus.TTS_GENERATING.value,
+            "updated_at": datetime.utcnow(),
+            **per_lang_updates,
+        }
+        if _failed_languages:
+            _status_update["translation_errors"] = _failed_languages
+            logger.warning(f"Job {job_id}: translation failed for languages: {_failed_languages}")
        await db.jobs.update_one(
            {"_id": job_id},
            {
-                "$set": {
-                    "status": JobStatus.TTS_GENERATING.value,
-                    "updated_at": datetime.utcnow(),
-                    **per_lang_updates,
-                },
+                "$set": _status_update,
                "$push": {
                    "review.history": {
                        "at": datetime.utcnow(),