fix: 7 caption/AD quality bugs + retranslation error handling

Bug fixes:
- Bug 1a: source_has_ad flag prevents the AI from generating AD over existing professional AD; JobBrief/Job models, gemini service prompt conditional, NewBrief UI checkbox
- Bug 1b: disable native textTracks on video element to prevent double captions
- Bug 2: caption ALL audible speech including off-screen narrators (prompt fix)
- Bug 3: DCMP §6.01 disfluency removal for EN/ES/FR/DE/IT (prompt + post-pass)
- Bug 4: VTT cue settings (line:0%, position:) preserved through parser round-trip
- Bug 5: Whisper word-level timestamp alignment via new caption_aligner service
- Bug 6: assert_cue_alignment used .start/.end; renamed to .start_time/.end_time
- New migration: backfill source_has_ad=False on existing jobs and job_briefs

Also fix retranslation error handling to preserve existing GCS URIs on failure so video_native captions remain accessible if retranslation fails.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 15:38:20 +01:00 · 2026-05-07 15:38:20 +01:00 · 290d5e32e6
commit 290d5e32e6
parent 00dd1643f5
13 changed files with 325 additions and 16 deletions
--- a/backend/app/lib/vtt.py
+++ b/backend/app/lib/vtt.py
@ -8,6 +8,7 @@ class VTTCue:
    end_time: float    # seconds
    text: str
    identifier: str | None = None
+    settings: str = ""


 class VTTParser:
@ -37,10 +38,11 @@ class VTTParser:

            # Parse timing line
            if " --> " in line:
-                timing_match = re.match(r'([\d:.,]+)\s+-->\s+([\d:.,]+)', line)
+                timing_match = re.match(r'([\d:.,]+)\s+-->\s+([\d:.,]+)\s*(.*)', line)
                if timing_match:
                    start_time = VTTParser._parse_timestamp(timing_match.group(1))
                    end_time = VTTParser._parse_timestamp(timing_match.group(2))
+                    settings = timing_match.group(3).strip()

                    # Collect text lines until empty line or next cue
                    i += 1
@ -53,7 +55,8 @@ class VTTParser:
                        start_time=start_time,
                        end_time=end_time,
                        text="\n".join(text_lines),
-                        identifier=identifier
+                        identifier=identifier,
+                        settings=settings,
                    ))
            else:
                i += 1
@ -70,10 +73,13 @@ class VTTParser:
            if cue.identifier:
                lines.append(cue.identifier)

-            # Add timing line
+            # Add timing line (preserve cue settings like line:0%)
            start_timestamp = VTTParser._format_timestamp(cue.start_time)
            end_timestamp = VTTParser._format_timestamp(cue.end_time)
-            lines.append(f"{start_timestamp} --> {end_timestamp}")
+            timing_line = f"{start_timestamp} --> {end_timestamp}"
+            if cue.settings:
+                timing_line += f" {cue.settings}"
+            lines.append(timing_line)

            # Add text (can be multi-line)
            lines.append(cue.text)
@ -156,11 +162,11 @@ class VTTEditor:
            raise ValueError(
                f"Cue count mismatch for {lang}: EN has {len(en_cues)}, target has {len(tgt_cues)}"
            )
-        for i, (en, tgt) in enumerate(zip(en_cues, tgt_cues)):
-            if en.start != tgt.start or en.end != tgt.end:
+        for i, (en, tgt) in enumerate(zip(en_cues, tgt_cues, strict=True)):
+            if en.start_time != tgt.start_time or en.end_time != tgt.end_time:
                raise ValueError(
                    f"Timestamp mismatch for {lang} cue {i}: "
-                    f"EN {en.start}-->{en.end}, target {tgt.start}-->{tgt.end}"
+                    f"EN {en.start_time}-->{en.end_time}, target {tgt.start_time}-->{tgt.end_time}"
                )

    @staticmethod
@ -236,7 +242,7 @@ class VTTEditor:
                )
                return False, errors

-            for i, (src, tgt) in enumerate(zip(source_cues, translated_cues)):
+            for i, (src, tgt) in enumerate(zip(source_cues, translated_cues, strict=False)):
                if abs(src.start_time - tgt.start_time) > 0.001:
                    errors.append(
                        f"Cue {i + 1}: start time changed "
@ -266,3 +272,28 @@ class VTTEditor:

        return VTTParser.build(cues)

+    # DCMP §6.01 hesitation sounds per language (whole-word, case-insensitive).
+    # Only non-lexical hesitations are listed: fillers that are also ordinary
+    # words ("like", "este", "donc", "halt", "tipo", ...) cannot be told apart
+    # from their meaningful uses by a regex ("I like cats" would become
+    # "I cats"), so removing those is left to the transcription prompt.
+    _FILLER_PATTERNS: dict[str, str] = {
+        "en": r'\b(um+|uh+|ah+|er+|hmm+)\b',
+        "es": r'\b(eh+)\b',
+        "fr": r'\b(euh+|beh)\b',
+        "de": r'\b(äh+|ähm+)\b',
+        "it": r'\b(ehm+)\b',
+    }
+
+    @staticmethod
+    def clean_disfluencies(vtt_content: str, lang: str) -> str:
+        """Remove hesitation sounds per DCMP §6.01 for supported languages.
+
+        Args:
+            vtt_content: WebVTT document whose cue text should be cleaned.
+            lang: Language tag; only the part before the first "-" is used,
+                lower-cased ("en-GB" -> "en").
+
+        Returns:
+            ``vtt_content`` unchanged when the language has no filler pattern;
+            otherwise the document rebuilt from the cleaned cues. Cues are
+            never dropped: a cue that would be left without any word
+            character keeps its original text.
+        """
+        pattern = VTTEditor._FILLER_PATTERNS.get(lang.split("-")[0].lower())
+        if not pattern:
+            return vtt_content
+        # Swallow the comma/space that introduces the filler ("I, um, went" ->
+        # "I, went") and reject hyphenated or apostrophised compounds so that
+        # words such as "uh-huh" or "uh-oh" survive.
+        compiled = re.compile(
+            r"[ \t]*,?[ \t]*(?<![\w'-])(?:" + pattern + r")(?![\w'-])",
+            re.IGNORECASE,
+        )
+        cues = VTTParser.parse(vtt_content)
+        for cue in cues:
+            kept_lines = []
+            for line in cue.text.split("\n"):
+                cleaned = compiled.sub("", line)
+                if cleaned != line:
+                    cleaned = re.sub(r'[ \t]{2,}', ' ', cleaned)
+                    # A filler at the start of a line leaves ", rest" behind.
+                    cleaned = cleaned.strip().lstrip(", \t")
+                # Never emit an empty line: inside cue text it would terminate
+                # the cue when the document is serialised again.
+                if cleaned:
+                    kept_lines.append(cleaned)
+            new_text = "\n".join(kept_lines)
+            # Keep the original text rather than reduce a cue to stray
+            # punctuation (e.g. "Um." -> ".").
+            if re.search(r'\w', new_text):
+                cue.text = new_text
+        return VTTParser.build(cues)
+
--- a/backend/app/migrations/scripts/migration_2026-05-08-000000_add_source_has_ad.py
+++ b/backend/app/migrations/scripts/migration_2026-05-08-000000_add_source_has_ad.py
@ -0,0 +1,26 @@
+"""Backfill source_has_ad=False on existing jobs and job_briefs."""
+from app.migrations.migrator import Migration
+
+
+class Migration(Migration):
+    """Give every existing job and job brief an explicit source_has_ad flag."""
+
+    version = "2026-05-08-000000"
+    description = "Add source_has_ad field to jobs.source and job_briefs"
+
+    async def up(self) -> None:
+        """Set the flag to False wherever it is missing and print the counts."""
+        database = self.db
+
+        jobs_update = await database.jobs.update_many(
+            {"source.source_has_ad": {"$exists": False}},
+            {"$set": {"source.source_has_ad": False}},
+        )
+        briefs_update = await database.job_briefs.update_many(
+            {"source_has_ad": {"$exists": False}},
+            {"$set": {"source_has_ad": False}},
+        )
+
+        jobs_count = jobs_update.modified_count
+        briefs_count = briefs_update.modified_count
+        print(f"✅ Backfilled source_has_ad on {jobs_count} jobs, {briefs_count} job_briefs")
+
+    async def down(self) -> None:
+        """Remove the flag from every job and job brief document."""
+        database = self.db
+        drop_from_jobs = {"$unset": {"source.source_has_ad": ""}}
+        drop_from_briefs = {"$unset": {"source_has_ad": ""}}
+        await database.jobs.update_many({}, drop_from_jobs)
+        await database.job_briefs.update_many({}, drop_from_briefs)
--- a/backend/app/models/job.py
+++ b/backend/app/models/job.py
@ -50,6 +50,7 @@ class Source(BaseModel):
    language: constr(min_length=2, max_length=10) = "en"  # Final source language (from detection or explicit)
    language_hint: str | None = None  # User-provided hint for non-English videos
    detected_language: str | None = None  # AI-detected language from Gemini
+    source_has_ad: bool = False  # Source video already contains professional audio descriptions


 class TTSPreferences(BaseModel):
@ -281,6 +282,7 @@ class JobCreate(BaseModel):
    language_hint: str | None = None  # Optional hint when source_is_english=False
    requested_outputs: RequestedOutputs
    brand_context: str | None = None  # Comma-separated brand names present in the video (e.g. "Sellotape, Coca-Cola")
+    source_has_ad: bool = False  # Source video already contains professional audio descriptions


 class JobUpdate(BaseModel):
--- a/backend/app/models/job_brief.py
+++ b/backend/app/models/job_brief.py
@ -45,6 +45,7 @@ class JobBriefCreate(BaseModel):
    deadline: datetime | None = None
    project_id: str | None = None
    assignee_id: str | None = None
+    source_has_ad: bool = False  # Source video already contains professional audio descriptions


 class JobBriefUpdate(BaseModel):
--- a/backend/app/prompts/gemini_ingestion.md
+++ b/backend/app/prompts/gemini_ingestion.md
@ -10,6 +10,7 @@ You are given a video. Return a JSON object with:
 - captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (in the detected language)
 - audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (MUST be written in the detected language)
 {SDH_FIELD}
+{SOURCE_HAS_AD}

 CRITICAL LANGUAGE REQUIREMENT:
 - First, detect the language spoken in the video
@ -36,7 +37,7 @@ CRITICAL TIMING REQUIREMENTS:
 - Each caption cue should end exactly when the speaker finishes that phrase/sentence
 - Listen carefully to detect natural speech pauses and word boundaries
 - Avoid starting captions too early or ending them too late
- Ensure captions align with lip movement and speech rhythm
+- Caption ALL audible speech — include off-screen narrators, voiceover, and any speaker not visible on screen. Do NOT omit speech because the speaker is not visible or because it plays over non-dialogue segments.
 - For audio descriptions, time them during natural speech gaps or over non-dialogue audio
 - Validate that all timestamps are monotonically increasing (each cue starts after the previous one ends)

@ -57,6 +58,13 @@ CAPTION FORMATTING (DCMP standard):
 - Minimum caption duration: approximately 1.3 seconds. Maximum: 6 seconds
 - Use mixed case. Use ALL CAPS only for screaming or shouting

+DISFLUENCY REMOVAL (DCMP §6.01):
+- Do NOT include filler words, false starts, or hesitations in captions
+- Remove: "um", "uh", "ah", "er", "hmm", "like" (as filler), "you know" (as filler), "I mean" (as filler)
+- Also remove language-specific fillers (e.g., "euh"/"beh" in French, "äh"/"ähm" in German, "eh"/"este" in Spanish, "ehm"/"allora" in Italian)
+- Remove false starts when the speaker self-corrects immediately (e.g., "I was — I went to the store" → "I went to the store")
+- Do NOT remove meaningful repetition, emphasis, or intentional stylistic choices
+
 SOUND AND MUSIC FORMATTING (DCMP standard):
 - Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching]
 - Use present participle for sustained sounds: [dog barking]; use third person for abrupt sounds: [dog barks]
--- a/backend/app/prompts/gemini_ingestion_targeted.md
+++ b/backend/app/prompts/gemini_ingestion_targeted.md
@ -10,6 +10,7 @@ You are given a video. Return a JSON object with:
 - captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (written in {TARGET_LANGUAGE})
 - audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (written in {TARGET_LANGUAGE})
 {SDH_FIELD}
+{SOURCE_HAS_AD}

 TARGET LANGUAGE: {TARGET_LANGUAGE}

@ -40,7 +41,7 @@ CRITICAL TIMING REQUIREMENTS:
 - Each caption cue should end exactly when the speaker finishes that phrase/sentence
 - Listen carefully to detect natural speech pauses and word boundaries
 - Avoid starting captions too early or ending them too late
- Ensure captions align with lip movement and speech rhythm
+- Caption ALL audible speech — include off-screen narrators, voiceover, and any speaker not visible on screen. Do NOT omit speech because the speaker is not visible or because it plays over non-dialogue segments.
 - For audio descriptions, time them during natural speech gaps or over non-dialogue audio
 - Validate that all timestamps are monotonically increasing (each cue starts after the previous one ends)

@ -61,6 +62,13 @@ CAPTION FORMATTING (DCMP standard):
 - Minimum caption duration: approximately 1.3 seconds. Maximum: 6 seconds
 - Use mixed case. Use ALL CAPS only for screaming or shouting

+DISFLUENCY REMOVAL (DCMP §6.01):
+- Do NOT include filler words, false starts, or hesitations in captions
+- Remove: "um", "uh", "ah", "er", "hmm", "like" (as filler), "you know" (as filler), "I mean" (as filler)
+- Also remove language-specific fillers (e.g., "euh"/"beh" in French, "äh"/"ähm" in German, "eh"/"este" in Spanish, "ehm"/"allora" in Italian)
+- Remove false starts when the speaker self-corrects immediately (e.g., "I was — I went to the store" → "I went to the store")
+- Do NOT remove meaningful repetition, emphasis, or intentional stylistic choices
+
 SOUND AND MUSIC FORMATTING (DCMP standard):
 - Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching]
 - Use present participle for sustained sounds: [dog barking]; use third person for abrupt sounds: [dog barks]
--- a/backend/app/services/caption_aligner.py
+++ b/backend/app/services/caption_aligner.py
@ -0,0 +1,125 @@
+"""Align Gemini caption VTT timings against Whisper word-level timestamps.
+
+Algorithm:
+  For each VTT cue, tokenise its text and search for the token sequence in the
+  Whisper word stream starting from the cursor position (with a look-ahead window).
+  When a match of sufficient confidence is found the cue's start/end timestamps
+  are replaced with the matched Whisper words' start/end.  Cues that cannot be
+  matched (music notation, sound effects, empty cues) keep their original Gemini
+  timestamps.  The result has Whisper-accurate timings early in the video and
+  graceful fallbacks where Whisper didn't capture the audio.
+"""
+
+import re
+from dataclasses import dataclass
+
+from ..core.logging import get_logger
+from ..lib.vtt import VTTEditor, VTTParser
+from ..services.whisper_service import WordTimestamp
+
+logger = get_logger(__name__)
+
+# Characters to strip when comparing tokens
+_PUNCT = re.compile(r"[^\w']", re.UNICODE)
+# Tokens shorter than this are considered stop-words and excluded from matching
+_MIN_TOKEN_LEN = 2
+# Minimum fraction of cue tokens that must match Whisper words for alignment
+_MIN_MATCH_RATIO = 0.5
+# How many Whisper words ahead of the cursor to search for a cue's tokens
+_SEARCH_WINDOW = 60
+
+
+def _tokenise(text: str) -> list[str]:
+    """Split on whitespace, strip punctuation, lower-case, and drop short tokens."""
+    tokens: list[str] = []
+    for raw_word in text.split():
+        token = _PUNCT.sub("", raw_word).lower()
+        # Tokens below the minimum length are not used for matching.
+        if len(token) >= _MIN_TOKEN_LEN:
+            tokens.append(token)
+    return tokens
+
+
+@dataclass
+class _Match:
+    """Result of locating one cue's tokens in the Whisper word list."""
+
+    # Index into the Whisper word list; align() takes the cue start from this word.
+    first_word_idx: int
+    # Index into the Whisper word list; align() takes the cue end from this word
+    # and resumes its search at the following index.
+    last_word_idx: int
+    ratio: float  # matched_tokens / cue_tokens
+
+
+def _find_match(
+    cue_tokens: list[str],
+    whisper_words: list[WordTimestamp],
+    cursor: int,
+) -> _Match | None:
+    """Return the best in-order match for cue_tokens in the look-ahead window.
+
+    Only ``whisper_words[cursor : cursor + _SEARCH_WINDOW]`` is searched (the
+    search is forward-only). Cue tokens are matched greedily and in order
+    against the normalised Whisper words. Whisper words that do not match are
+    skipped, but a cue token with no equal Whisper word left in the window
+    stops all further matching for that candidate.
+
+    Args:
+        cue_tokens: Normalised tokens of one cue, as produced by ``_tokenise``.
+        whisper_words: Full Whisper word stream.
+        cursor: Index of the first Whisper word that may be used.
+
+    Returns:
+        A ``_Match`` whose ``first_word_idx`` and ``last_word_idx`` are the
+        indices of the first and last Whisper words that actually matched, or
+        ``None`` when ``cue_tokens`` is empty or fewer than
+        ``_MIN_MATCH_RATIO`` of the tokens matched. The highest ratio wins;
+        ties go to the tightest word span, then to the earliest candidate.
+    """
+    if not cue_tokens:
+        return None
+
+    end = min(cursor + _SEARCH_WINDOW, len(whisper_words))
+    # Normalise each Whisper word in the window once, not once per candidate.
+    window = [
+        _PUNCT.sub("", whisper_words[idx].word).lower()
+        for idx in range(cursor, end)
+    ]
+
+    best: _Match | None = None
+    best_span = 0
+    first_token = cue_tokens[0]
+
+    for offset, word_token in enumerate(window):
+        # In-order matching always begins with the first cue token, so only
+        # its occurrences are valid candidate starts. Reporting the scan start
+        # instead would pin every cue's start time to the word at the cursor,
+        # whether or not that word belongs to the cue.
+        if word_token != first_token:
+            continue
+
+        matched = 0
+        last_offset = offset
+        for pos in range(offset, len(window)):
+            if matched >= len(cue_tokens):
+                break
+            if window[pos] == cue_tokens[matched]:
+                last_offset = pos
+                matched += 1
+
+        ratio = matched / len(cue_tokens)
+        if ratio < _MIN_MATCH_RATIO:
+            continue
+
+        span = last_offset - offset
+        if (
+            best is None
+            or ratio > best.ratio
+            or (ratio == best.ratio and span < best_span)
+        ):
+            best = _Match(cursor + offset, cursor + last_offset, ratio)
+            best_span = span
+
+    return best
+
+
+def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
+    """Replace VTT cue timings with Whisper word timestamps where possible.
+
+    Cues are processed in order. Each cue's tokens are searched for in the
+    Whisper word stream starting just after the previous cue's last matched
+    word; a matched cue takes the start of its first matched word and the end
+    of its last matched word. Cues without a match keep their timings.
+
+    Args:
+        captions_vtt: WebVTT document to re-time.
+        whisper_words: Whisper words with ``word``, ``start`` and ``end``.
+
+    Returns:
+        ``captions_vtt`` itself when no words were supplied or no cue could be
+        re-timed; otherwise the document rebuilt from the updated cues.
+    """
+    if not whisper_words:
+        logger.warning("caption_aligner: no Whisper words supplied — returning original VTT")
+        return captions_vtt
+
+    cues = VTTParser.parse(captions_vtt)
+    cursor = 0
+    aligned = 0
+
+    for cue in cues:
+        tokens = _tokenise(cue.text)
+        if not tokens:
+            # No matchable tokens (e.g. a cue made only of symbols such as ♪).
+            continue
+
+        match = _find_match(tokens, whisper_words, cursor)
+        if match is None:
+            continue
+
+        new_start = whisper_words[match.first_word_idx].start
+        new_end = whisper_words[match.last_word_idx].end
+
+        # Sanity: don't create zero-duration or backwards cues
+        if new_end > new_start:
+            cue.start_time = new_start
+            cue.end_time = new_end
+            aligned += 1
+
+        # Advance cursor to just past the last matched word, even when the
+        # timing was rejected, so those words are not reused by a later cue.
+        cursor = match.last_word_idx + 1
+
+    logger.info(
+        f"caption_aligner: aligned {aligned}/{len(cues)} cues "
+        f"against {len(whisper_words)} Whisper words"
+    )
+    if aligned == 0:
+        # Nothing was modified: hand back the input untouched, exactly as the
+        # no-words early exit does, instead of re-serialising it.
+        return captions_vtt
+    return VTTParser.build(cues)
--- a/backend/app/services/gemini.py
+++ b/backend/app/services/gemini.py
@ -113,6 +113,15 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
            return glossary_block.strip()
        return ""

+    def _build_source_has_ad_block(self, source_has_ad: bool) -> str:
+        """Build the text substituted for the {SOURCE_HAS_AD} prompt placeholder.
+
+        Returns an instruction telling the model to emit a header-only
+        audio_description_vtt when ``source_has_ad`` is true, and an empty
+        string otherwise.
+        """
+        if not source_has_ad:
+            return ""
+        return (
+            "SOURCE AUDIO DESCRIPTION NOTICE: This video already has professional audio descriptions "
+            "embedded in its audio track. Return an empty audio_description_vtt containing only "
+            "the WEBVTT header (\"WEBVTT\\n\") — do NOT generate new audio descriptions."
+        )
+
    def _build_brand_context_block(self, brand_context: str | None) -> str:
        """Build the brand context instruction block for injection into prompts."""
        if brand_context and brand_context.strip():
@ -125,7 +134,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
                )
        return "No specific brand names have been provided for this video."

-    async def extract_accessibility(self, video_file_path: str, brand_context: str | None = None, sdh_requested: bool = False, glossary_block: str | None = None, _cost_ctx: dict | None = None) -> dict[str, Any]:
+    async def extract_accessibility(self, video_file_path: str, brand_context: str | None = None, sdh_requested: bool = False, glossary_block: str | None = None, source_has_ad: bool = False, _cost_ctx: dict | None = None) -> dict[str, Any]:
        """
        Extract captions and audio descriptions from video using Gemini 2.0
        Returns structured JSON with transcript, captions VTT, and audio description VTT
@ -137,6 +146,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
            .replace("{GLOSSARY}", self._build_glossary_block(glossary_block))
            .replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
            .replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
+            .replace("{SOURCE_HAS_AD}", self._build_source_has_ad_block(source_has_ad))
        )
        uploaded_file = None

--- a/backend/app/tasks/ingest_and_ai.py
+++ b/backend/app/tasks/ingest_and_ai.py
@ -1,20 +1,25 @@
 import asyncio
 import os
+import subprocess
 import tempfile
 from datetime import datetime

 import ffmpeg
 from celery import Task
+from celery.result import allow_join_result
 from motor.motor_asyncio import AsyncIOMotorClient

 from ..core.config import settings
 from ..core.logging import get_logger
+from ..lib.vtt import VTTEditor
 from ..models.job import JobStatus
-from ..services import cost_tracker
+from ..services import caption_aligner, cost_tracker
 from ..services.gcs import gcs_path, gcs_service, upload_vtt_to_gcs
 from ..services.gemini import gemini_service
+from ..services.whisper_service import WordTimestamp
 from . import celery_app
 from ._websocket_bridge import broadcast_status_update
+from .whisper_transcribe import transcribe_video_audio_task

 logger = get_logger(__name__)

@ -153,6 +158,7 @@ async def ingest_and_ai_task_impl(job_id: str):
                # Process with Gemini
                brand_context = job_doc.get("brand_context")
                sdh_requested = job_doc.get("requested_outputs", {}).get("sdh_vtt", False)
+                source_has_ad = job_doc.get("source", {}).get("source_has_ad", False)
                _cost_ctx = {
                    "user_id": job_doc.get("client_id", "system"),
                    "job_id": job_id,
@ -167,8 +173,13 @@ async def ingest_and_ai_task_impl(job_id: str):
                    temp_path,
                    brand_context=brand_context,
                    sdh_requested=sdh_requested,
+                    source_has_ad=source_has_ad,
                    _cost_ctx=_cost_ctx,
                )
+                # Enforce: if source already has AD, discard any AI-generated AD
+                if source_has_ad:
+                    ai_result["audio_description_vtt"] = "WEBVTT\n"
+                    logger.info(f"source_has_ad=True for job {job_id}: skipping AD generation")

                # Final safety check for required fields
                required_fields = ["captions_vtt", "audio_description_vtt"]
@ -202,6 +213,13 @@ async def ingest_and_ai_task_impl(job_id: str):
                source_language = detected_language
                logger.info(f"Using detected language '{source_language}' for job {job_id}")

+                # Post-process: remove filler words per DCMP §6.01
+                captions_vtt = VTTEditor.clean_disfluencies(ai_result["captions_vtt"], source_language)
+
+                # Align caption timings with Whisper word-level timestamps (Bug 5)
+                captions_vtt = await _align_captions_with_whisper(captions_vtt, temp_path, job_id)
+                ai_result["captions_vtt"] = captions_vtt
+
                # Upload VTT files to GCS using detected language
                captions_gcs_uri = await upload_vtt_to_gcs(
                    ai_result["captions_vtt"],
@ -333,3 +351,47 @@ async def _get_video_duration(video_path: str) -> float:
    except Exception as e:
        logger.warning(f"Could not determine video duration: {e}")
        return 0.0
+
+
+async def _align_captions_with_whisper(captions_vtt: str, video_path: str, job_id: str) -> str:
+    """Align caption VTT timings with Whisper word timestamps.
+
+    Extracts a 16 kHz mono MP3 next to ``video_path``, submits it to the
+    Whisper transcription task on the "whisper" queue, and passes the returned
+    words to ``caption_aligner.align``. The step is best-effort: any failure,
+    or a wait of more than 600 one-second polls, returns ``captions_vtt``
+    unchanged. The extracted audio file is removed before returning.
+    """
+    # Derive the scratch file from the path stem. str.replace(".mp4", ...)
+    # returned the path unchanged for any other extension, which made ffmpeg
+    # write over the source video and the cleanup below delete it; it also
+    # rewrote ".mp4" occurring inside directory names.
+    audio_path = os.path.splitext(video_path)[0] + "_captions_align.mp3"
+    try:
+        # Extract audio at 16kHz mono (optimal for Whisper)
+        def _extract():
+            result = subprocess.run(
+                ["ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "libmp3lame",
+                 "-ar", "16000", "-ac", "1", "-q:a", "5", audio_path],
+                capture_output=True, text=True
+            )
+            if result.returncode != 0:
+                raise RuntimeError(f"FFmpeg failed: {result.stderr}")
+
+        # ffmpeg blocks, so run it off the event loop.
+        await asyncio.to_thread(_extract)
+
+        task_result = transcribe_video_audio_task.apply_async(
+            args=[job_id, audio_path], queue="whisper"
+        )
+        poll_count = 0
+        while not task_result.ready():
+            await asyncio.sleep(1.0)
+            poll_count += 1
+            if poll_count > 600:
+                logger.warning(f"Whisper timeout for job {job_id}, skipping alignment")
+                # The audio file is deleted below; stop a still-queued task
+                # from starting against a file that no longer exists.
+                task_result.revoke()
+                return captions_vtt
+
+        with allow_join_result():
+            result_data = task_result.get(timeout=10)
+
+        words = [
+            WordTimestamp(word=w["word"], start=w["start"], end=w["end"])
+            for w in result_data.get("words", [])
+        ]
+        return caption_aligner.align(captions_vtt, words)
+
+    except Exception as e:
+        # Deliberate catch-all: alignment must never fail the ingest job.
+        logger.warning(f"Whisper caption alignment failed for job {job_id}: {e} — using Gemini timestamps")
+        return captions_vtt
+    finally:
+        try:
+            os.unlink(audio_path)
+        except FileNotFoundError:
+            pass
--- a/backend/app/tasks/translate_and_synthesize.py
+++ b/backend/app/tasks/translate_and_synthesize.py
@ -269,9 +269,12 @@ async def _async_translate_and_synthesize(job_id: str, languages: list[str] | No

                except Exception as e:
                    logger.error(f"Failed to process language {language}: {e}")
+                    # Preserve existing GCS URIs and origin so retranslation failure
+                    # doesn't destroy captions the user can still view
+                    existing = updated_outputs.get(language, {})
                    updated_outputs[language] = {
-                        "origin": "transcreate" if _style == "transcreate" else "gemini_translate",
-                        "qa_notes": f"Translation failed: {str(e)}",
+                        **existing,
+                        "qa_notes": f"Translation failed: {str(e)[:200]}",
                    }

        finally:
--- a/frontend/src/components/VideoReview/VideoReviewPlayer.tsx
+++ b/frontend/src/components/VideoReview/VideoReviewPlayer.tsx
@ -53,6 +53,20 @@ export function VideoReviewPlayer({ job, downloads }: VideoReviewPlayerProps) {
    }
  }, [assetTabs, activeTabKey]);

+  // Disable browser-native text tracks so they don't compete with our React overlay
+  useEffect(() => {
+    const video = videoRef.current;
+    if (!video) return;
+    const disableTracks = () => {
+      for (let i = 0; i < video.textTracks.length; i++) {
+        video.textTracks[i].mode = 'disabled';
+      }
+    };
+    disableTracks();
+    video.addEventListener('loadedmetadata', disableTracks);
+    return () => video.removeEventListener('loadedmetadata', disableTracks);
+  }, [videoRef.current]);
+
  // Get current tab
  const activeTab = assetTabs.find((t) => t.key === activeTabKey);

@ -305,9 +319,9 @@ export function VideoReviewPlayer({ job, downloads }: VideoReviewPlayerProps) {
              </div>
            )}

-            {/* Caption Overlay — always at the bottom, above native controls */}
+            {/* Caption Overlay — position at top when cue has line:0% setting */}
            {showCaptions && currentCaption && (
-              <div className="absolute bottom-14 left-1/2 transform -translate-x-1/2 bg-black bg-opacity-80 text-white px-4 py-2 rounded max-w-[90%]">
+              <div className={`absolute ${currentCaption.positionTop ? 'top-4' : 'bottom-14'} left-1/2 transform -translate-x-1/2 bg-black bg-opacity-80 text-white px-4 py-2 rounded max-w-[90%]`}>
                <div className="text-center whitespace-pre-wrap">
                  {currentCaption.text}
                </div>
--- a/frontend/src/routes/briefs/NewBrief.tsx
+++ b/frontend/src/routes/briefs/NewBrief.tsx
@ -77,6 +77,7 @@ export function NewBrief() {
  const [accessibleMethod, setAccessibleMethod] = useState<'overlay' | 'pause_insert'>('pause_insert');
  const [sdhVtt, setSdhVtt] = useState(false);
  const [descriptiveTranscript, setDescriptiveTranscript] = useState(false);
+  const [sourceHasAd, setSourceHasAd] = useState(false);

  const { data: projects = [] } = useAllProjects();
  const { data: assignees = [] } = useBriefAssignees();
@ -113,6 +114,7 @@ export function NewBrief() {
        deadline: deadline || undefined,
        project_id: projectId || undefined,
        assignee_id: assigneeId || undefined,
+        source_has_ad: sourceHasAd,
      });
      toast.toastOnly.success('Brief created');
      navigate(`/briefs/${brief.id}`);
@ -240,6 +242,22 @@ export function NewBrief() {
          </div>
        </div>

+        <div>
+          <label className="block text-sm font-medium text-gray-700 mb-2">Source Video</label>
+          <label className="flex items-start gap-2 text-sm text-gray-700 cursor-pointer">
+            <input
+              type="checkbox"
+              checked={sourceHasAd}
+              onChange={e => setSourceHasAd(e.target.checked)}
+              className="rounded mt-0.5 flex-shrink-0"
+            />
+            <span>
+              <span className="font-medium">Source video already contains audio descriptions</span>
+              <span className="text-gray-400 ml-1">— AI will not generate new AD for this job</span>
+            </span>
+          </label>
+        </div>
+
        <div>
          <label className="block text-sm font-medium text-gray-700 mb-2">
            Languages
--- a/frontend/src/types/api.ts
+++ b/frontend/src/types/api.ts
@ -880,4 +880,5 @@ export interface JobBriefCreate {
  deadline?: string;
  project_id?: string;
  assignee_id?: string;
+  source_has_ad?: boolean;
 }