video-accessibility/backend/app/services/caption_aligner.py
Vadym Samoilenko 290d5e32e6 fix: 7 caption/AD quality bugs + retranslation error handling
Bug fixes:
- Bug 1a: source_has_ad flag prevents AI generating AD over existing professional AD;
  JobBrief/Job models, gemini service prompt conditional, NewBrief UI checkbox
- Bug 1b: disable native textTracks on video element to prevent double captions
- Bug 2: caption ALL audible speech including off-screen narrators (prompt fix)
- Bug 3: DCMP §6.01 disfluency removal for EN/ES/FR/DE/IT (prompt + post-pass)
- Bug 4: VTT cue settings (line:0%, position:) preserved through parser round-trip
- Bug 5: Whisper word-level timestamp alignment via new caption_aligner service
- Bug 6: assert_cue_alignment used .start/.end; renamed to .start_time/.end_time
- New migration: backfill source_has_ad=False on existing jobs and job_briefs

Also fix retranslation error handling to preserve existing GCS URIs on failure
so video_native captions remain accessible if retranslation fails.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 15:38:20 +01:00

125 lines
4.1 KiB
Python

"""Align Gemini caption VTT timings against Whisper word-level timestamps.
Algorithm:
For each VTT cue, tokenise its text and search for the token sequence in the
Whisper word stream starting from the cursor position (with a look-ahead window).
When a match of sufficient confidence is found the cue's start/end timestamps
are replaced with the matched Whisper words' start/end. Cues that cannot be
matched (music notation, sound effects, empty cues) keep their original Gemini
timestamps. The result has Whisper-accurate timings early in the video and
graceful fallbacks where Whisper didn't capture the audio.
"""
import re
from dataclasses import dataclass
from ..core.logging import get_logger
from ..lib.vtt import VTTEditor, VTTParser
from ..services.whisper_service import WordTimestamp
logger = get_logger(__name__)
# Matches every character that is neither a word character nor an ASCII
# apostrophe; substituting "" with it strips punctuation (and any whitespace)
# from a word before tokens are compared.
_PUNCT = re.compile(r"[^\w']", re.UNICODE)
# Cue tokens shorter than this many characters are dropped by _tokenise and so
# never take part in matching. Whisper words are not length-filtered.
_MIN_TOKEN_LEN = 2
# Minimum fraction of a cue's tokens (matched / total) that must be found in
# the Whisper words for _find_match to report a match.
_MIN_MATCH_RATIO = 0.5
# Number of Whisper words, counted forward from the cursor, that _find_match
# searches for a cue's tokens.
_SEARCH_WINDOW = 60
def _tokenise(text: str) -> list[str]:
    """Split *text* into normalised tokens suitable for comparison.

    Each whitespace-separated word has every character matched by ``_PUNCT``
    removed and is lower-cased. Results shorter than ``_MIN_TOKEN_LEN``
    (which includes words that were pure punctuation) are discarded.
    """
    tokens: list[str] = []
    for raw_word in text.split():
        cleaned = _PUNCT.sub("", raw_word).lower()
        if len(cleaned) < _MIN_TOKEN_LEN:
            # Too short to be a useful anchor for matching.
            continue
        tokens.append(cleaned)
    return tokens
@dataclass
class _Match:
first_word_idx: int
last_word_idx: int
ratio: float # matched_tokens / cue_tokens
def _find_match(
    cue_tokens: list[str],
    whisper_words: list[WordTimestamp],
    cursor: int,
) -> _Match | None:
    """Return the best in-order match for cue_tokens at or after cursor.

    Only the ``_SEARCH_WINDOW`` Whisper words starting at ``cursor`` are
    examined; the search never looks behind the cursor. For every candidate
    start position the cue tokens are consumed greedily and in order, skipping
    Whisper words that do not equal the next expected token. The candidate
    with the highest match ratio wins; on ties the earliest candidate is kept.

    Args:
        cue_tokens: Normalised tokens of a single cue.
        whisper_words: Whisper word stream; each item exposes ``.word``.
        cursor: Index of the first Whisper word eligible for matching.

    Returns:
        A ``_Match`` running from the first Whisper word that actually matched
        a cue token through the last one that did, or ``None`` when the cue
        has no tokens or no candidate reaches ``_MIN_MATCH_RATIO``.
    """
    if not cue_tokens:
        return None

    window_start = max(cursor, 0)
    window_end = min(window_start + _SEARCH_WINDOW, len(whisper_words))

    # Normalise each Whisper word once, not once per candidate start position.
    window = [
        _PUNCT.sub("", whisper_words[idx].word).lower()
        for idx in range(window_start, window_end)
    ]

    n_tokens = len(cue_tokens)
    best: _Match | None = None

    for offset in range(len(window)):
        matched = 0  # doubles as the index of the next cue token to look for
        first_idx = last_idx = -1
        for pos in range(offset, len(window)):
            if matched == n_tokens:
                break
            if window[pos] == cue_tokens[matched]:
                if matched == 0:
                    # Anchor the span on the first word that really matched,
                    # not on the position where scanning started; otherwise
                    # skipped words would drag the cue's start time backwards.
                    first_idx = window_start + pos
                last_idx = window_start + pos
                matched += 1

        if matched == 0:
            continue

        ratio = matched / n_tokens
        if ratio >= _MIN_MATCH_RATIO and (best is None or ratio > best.ratio):
            best = _Match(first_idx, last_idx, ratio)
            if ratio == 1.0:
                break  # perfect match — no need to search further

    return best
def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
    """Re-time VTT cues from Whisper word timestamps wherever a match is found.

    Cues are processed in order. Each cue's text is tokenised and searched for
    in ``whisper_words`` from a cursor that moves past the last matched word,
    so later cues only match later words. A matched cue takes the start of its
    first matched word and the end of its last matched word; cues without
    tokens or without a match keep their parsed timings.

    Args:
        captions_vtt: Caption track as a VTT string.
        whisper_words: Whisper words exposing ``.word``, ``.start`` and ``.end``.

    Returns:
        ``captions_vtt`` unchanged when ``whisper_words`` is empty. Otherwise
        the VTT rebuilt from the cues via ``VTTParser.build``, or, if no cue
        was re-timed, via ``VTTEditor.translate_preserving_timing`` applied to
        the original VTT and the parsed cue texts.
    """
    if not whisper_words:
        logger.warning("caption_aligner: no Whisper words supplied — returning original VTT")
        return captions_vtt

    cues = VTTParser.parse(captions_vtt)
    aligned = 0
    next_word = 0

    for cue in cues:
        cue_tokens = _tokenise(cue.text)
        # Sound-effect or music cues yield no tokens, so there is nothing to align.
        hit = _find_match(cue_tokens, whisper_words, next_word) if cue_tokens else None
        if hit is None:
            continue

        start_ts = whisper_words[hit.first_word_idx].start
        end_ts = whisper_words[hit.last_word_idx].end

        # Only accept timings that give the cue a positive duration.
        if start_ts < end_ts:
            cue.start_time, cue.end_time = start_ts, end_ts
            aligned += 1

        # The cursor moves past the matched words even if the timing was rejected.
        next_word = hit.last_word_idx + 1

    logger.info(
        f"caption_aligner: aligned {aligned}/{len(cues)} cues "
        f"against {len(whisper_words)} Whisper words"
    )

    if aligned == 0:
        return VTTEditor.translate_preserving_timing(
            captions_vtt, [c.text for c in cues]
        )
    return VTTParser.build(cues)