video-accessibility/backend/app/services/caption_aligner.py
Vadym Samoilenko 76bee82119 fix(pipeline): fix 5 QA tickets — caption alignment, glossary, source_has_ad render, filler words, NL error surfacing
- caption_aligner: lower match ratio 0.5→0.35, widen search window 60→150, add time-based cursor fallback on miss
- gemini.py: explicit 'MUST use glossary terms' requirement in translate_vtt prompt; source_has_ad prompt now instructs not to include AD narration in captions
- ingest_and_ai: load glossary for source language and pass to extract_accessibility
- render_accessible_video: handle source_has_ad=True via caption-embed path (ffmpeg subtitle inject, no AD pipeline)
- translate_and_synthesize: track failed languages, write translation_errors to DB, add exc_info to error log
- vtt.py: expand _FILLER_PATTERNS to nl/pt/pl/uk/ru, widen EN/ES/FR/DE/IT lists
- gemini_ingestion.md: strengthen line:0% placement rule, expand disfluency examples per language

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-08 18:36:59 +01:00

135 lines
4.6 KiB
Python

"""Align Gemini caption VTT timings against Whisper word-level timestamps.
Algorithm:
    For each VTT cue, tokenise its text and search for the token sequence in the
    Whisper word stream, starting at the cursor position and looking ahead by at
    most a fixed window of words (the search never looks behind the cursor).
    When a match of sufficient confidence is found, the cue's start/end
    timestamps are replaced with the matched Whisper words' start/end. Cues that
    cannot be matched (music notation, sound effects, empty cues) keep their
    original Gemini timestamps. The result has Whisper-accurate timings wherever
    a cue could be matched, and graceful fallbacks where Whisper didn't capture
    the audio.
"""
import bisect
import re
from dataclasses import dataclass
from ..core.logging import get_logger
from ..lib.vtt import VTTEditor, VTTParser
from ..services.whisper_service import WordTimestamp
# Module-level logger, named after this module.
logger = get_logger(__name__)
# Characters to strip when comparing tokens: everything that is neither a word
# character (\w) nor an ASCII apostrophe. Apostrophes are kept so contractions
# such as "don't" survive normalisation as a single token.
_PUNCT = re.compile(r"[^\w']", re.UNICODE)
# Cue tokens shorter than this (after punctuation stripping) are treated as
# stop-words and excluded from matching. Only cue tokens are filtered this way;
# Whisper words are normalised with _PUNCT but never dropped by length.
_MIN_TOKEN_LEN = 2
# Minimum fraction of cue tokens that must match Whisper words for alignment.
# Lowered from 0.5 → 0.35 to handle Gemini paraphrasing and short cues.
_MIN_MATCH_RATIO = 0.35
# How many Whisper words ahead of the cursor to search for a cue's tokens.
# Widened from 60 → 150 so the window stays valid even after several failed cues.
_SEARCH_WINDOW = 150
def _tokenise(text: str) -> list[str]:
    """Split *text* into normalised comparison tokens.

    Each whitespace-separated word has every character matched by ``_PUNCT``
    removed and is lower-cased. Words that end up shorter than
    ``_MIN_TOKEN_LEN`` characters (including words that become empty) are
    discarded.

    Args:
        text: Raw cue text.

    Returns:
        The surviving tokens, in their original order.
    """
    tokens: list[str] = []
    for raw_word in text.split():
        cleaned = _PUNCT.sub("", raw_word).lower()
        if len(cleaned) < _MIN_TOKEN_LEN:
            # Too short to be a useful anchor for matching.
            continue
        tokens.append(cleaned)
    return tokens
@dataclass
class _Match:
first_word_idx: int
last_word_idx: int
ratio: float # matched_tokens / cue_tokens
def _find_match(
    cue_tokens: list[str],
    whisper_words: list[WordTimestamp],
    cursor: int,
) -> _Match | None:
    """Locate cue_tokens in the Whisper words at or after ``cursor``.

    Only the ``_SEARCH_WINDOW`` words starting at ``cursor`` are examined; the
    search never looks behind the cursor. Cue tokens are matched in order as a
    subsequence of the window: Whisper words that do not match the next
    expected token are skipped, but cue tokens themselves are never skipped.
    A cue token that does not occur in the window therefore blocks every token
    after it, so a ratio below 1.0 means only a leading part of the cue matched.

    Args:
        cue_tokens: Normalised tokens of one cue (see ``_tokenise``).
        whisper_words: Whisper word list; each item's ``word`` is normalised
            with ``_PUNCT`` and lower-cased before comparison.
        cursor: Index of the first Whisper word that may be used.

    Returns:
        A ``_Match`` spanning the first to the last matched Whisper word
        (inclusive) with ``ratio = matched_tokens / len(cue_tokens)``, or
        ``None`` when there are no cue tokens, nothing matched, or the ratio is
        below ``_MIN_MATCH_RATIO``.
    """
    if not cue_tokens:
        return None

    end = min(cursor + _SEARCH_WINDOW, len(whisper_words))
    # Normalise every word in the window once instead of once per candidate
    # start position.
    window = [
        _PUNCT.sub("", whisper_words[idx].word).lower()
        for idx in range(cursor, end)
    ]

    # Forward pass: greedily match cue tokens in order. A search that starts
    # later in the window only sees a suffix of these words and so can never
    # match more tokens; one pass from the cursor already gives the best ratio
    # and the earliest possible end of the match.
    matched = 0
    last_offset = -1
    for offset, w_tok in enumerate(window):
        if w_tok == cue_tokens[matched]:
            last_offset = offset
            matched += 1
            if matched == len(cue_tokens):
                break  # every cue token found — no need to search further

    ratio = matched / len(cue_tokens)
    if matched == 0 or ratio < _MIN_MATCH_RATIO:
        return None

    # Backward pass: walk back from the last matched word, matching the same
    # tokens in reverse, to find the latest start that still covers them. This
    # stops an earlier stray occurrence of the first token (or simply the word
    # sitting at the cursor) from dragging the cue's start time too early.
    remaining = matched - 1
    first_offset = last_offset
    for offset in range(last_offset, -1, -1):
        if window[offset] == cue_tokens[remaining]:
            first_offset = offset
            remaining -= 1
            if remaining < 0:
                break

    return _Match(cursor + first_offset, cursor + last_offset, ratio)
def _cursor_for_time(whisper_words: list[WordTimestamp], t: float, from_idx: int) -> int:
"""Return the index of the first Whisper word at or after time t, starting from from_idx."""
starts = [w.start for w in whisper_words]
idx = bisect.bisect_left(starts, t, from_idx)
return min(idx, len(whisper_words) - 1)
def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
    """Replace VTT cue timings with Whisper-accurate timestamps where possible.

    Cues are processed in order while a cursor walks forward through
    ``whisper_words``. A cue whose tokens are found ahead of the cursor takes
    the start of the first matched word and the end of the last matched word.
    Cues with no usable tokens, or with no sufficiently good match, keep their
    original timestamps.

    Args:
        captions_vtt: Caption VTT text to re-time.
        whisper_words: Whisper word-level timestamps, in playback order.

    Returns:
        ``captions_vtt`` unchanged when ``whisper_words`` is empty. Otherwise a
        VTT string intended to contain the same cues as the input, with
        improved timing on the cues that could be matched.
        NOTE(review): when no cue was aligned the result comes from
        ``VTTEditor.translate_preserving_timing`` fed the unchanged cue texts —
        presumably equivalent to the input; confirm against ``lib.vtt``.
    """
    if not whisper_words:
        logger.warning(
            "caption_aligner: no Whisper words supplied — returning original VTT"
        )
        return captions_vtt

    cues = VTTParser.parse(captions_vtt)
    cursor = 0
    aligned = 0
    for cue in cues:
        tokens = _tokenise(cue.text)
        if not tokens:
            # Nothing to anchor on (e.g. music notation or an empty cue).
            continue

        match = _find_match(tokens, whisper_words, cursor)
        if match is None:
            # Advance cursor to the Whisper word closest to this cue's start time
            # so subsequent cues don't search from a stale position. The cursor
            # must never move backwards: once every Whisper word has been
            # consumed, stepping back onto the final word would let later cues
            # re-match it and inherit its timings.
            cursor = max(
                cursor,
                _cursor_for_time(whisper_words, cue.start_time, cursor),
            )
            continue

        new_start = whisper_words[match.first_word_idx].start
        new_end = whisper_words[match.last_word_idx].end
        # Only accept spans with positive duration; otherwise keep the cue's
        # original timing but still consume the matched words.
        if new_end > new_start:
            cue.start_time = new_start
            cue.end_time = new_end
            aligned += 1
        cursor = match.last_word_idx + 1

    logger.info(
        f"caption_aligner: aligned {aligned}/{len(cues)} cues "
        f"against {len(whisper_words)} Whisper words"
    )
    if aligned == 0:
        return VTTEditor.translate_preserving_timing(
            captions_vtt, [c.text for c in cues]
        )
    return VTTParser.build(cues)