"""Align Gemini caption VTT timings against Whisper word-level timestamps. Algorithm: For each VTT cue, tokenise its text and search for the token sequence in the Whisper word stream starting from the cursor position (with a look-ahead window). When a match of sufficient confidence is found the cue's start/end timestamps are replaced with the matched Whisper words' start/end. Cues that cannot be matched (music notation, sound effects, empty cues) keep their original Gemini timestamps. The result has Whisper-accurate timings early in the video and graceful fallbacks where Whisper didn't capture the audio. """ import bisect import re from dataclasses import dataclass from ..core.logging import get_logger from ..lib.vtt import VTTEditor, VTTParser from ..services.whisper_service import WordTimestamp logger = get_logger(__name__) # Characters to strip when comparing tokens _PUNCT = re.compile(r"[^\w']", re.UNICODE) # Tokens shorter than this are considered stop-words and excluded from matching _MIN_TOKEN_LEN = 2 # Minimum fraction of cue tokens that must match Whisper words for alignment. # Lowered from 0.5 → 0.35 to handle Gemini paraphrasing and short cues. _MIN_MATCH_RATIO = 0.35 # How many Whisper words ahead of the cursor to search for a cue's tokens. # Widened from 60 → 150 so the window stays valid even after several failed cues. 
_SEARCH_WINDOW = 150


def _normalise_word(word: str) -> str:
    """Remove every character matched by ``_PUNCT`` from *word* and lower-case it."""
    return _PUNCT.sub("", word).lower()


def _tokenise(text: str) -> list[str]:
    """Split *text* on whitespace into normalised comparison tokens.

    Each piece is normalised with :func:`_normalise_word`; tokens shorter
    than ``_MIN_TOKEN_LEN`` characters are dropped.
    """
    return [
        token
        for token in (_normalise_word(piece) for piece in text.split())
        if len(token) >= _MIN_TOKEN_LEN
    ]


@dataclass
class _Match:
    """Location and quality of a cue match inside the Whisper word list."""

    first_word_idx: int  # index of the first Whisper word belonging to the match
    last_word_idx: int  # index of the last Whisper word belonging to the match
    ratio: float  # matched_tokens / cue_tokens


def _find_match(
    cue_tokens: list[str],
    whisper_words: list[WordTimestamp],
    cursor: int,
) -> _Match | None:
    """Match *cue_tokens* against the Whisper words in the look-ahead window.

    Only the words in ``[cursor, cursor + _SEARCH_WINDOW)`` are examined; the
    search never looks behind the cursor.  Cue tokens are matched in order as
    a subsequence of the window (non-matching Whisper words are skipped), and
    a token that cannot be found stops the match at the tokens found so far.

    Args:
        cue_tokens: Normalised cue tokens, as produced by ``_tokenise``.
        whisper_words: Whisper words; each word's ``.word`` text is normalised
            the same way as the cue tokens before comparison.
        cursor: Index of the first Whisper word that may be used.

    Returns:
        A ``_Match`` whose ``first_word_idx``/``last_word_idx`` both point at
        words that actually matched a cue token, or ``None`` when fewer than
        ``_MIN_MATCH_RATIO`` of the cue tokens were found (or there were no
        tokens / no words left to search).
    """
    if not cue_tokens:
        return None

    window_start = max(cursor, 0)
    window_end = min(window_start + _SEARCH_WINDOW, len(whisper_words))
    # Normalise every window word exactly once instead of once per comparison.
    window = [_normalise_word(w.word) for w in whisper_words[window_start:window_end]]

    # Forward pass: greedily match cue tokens in order.  Greedy matching from
    # the cursor finds the longest matchable token prefix and the earliest
    # possible position for its last token.
    matched = 0
    last_off = -1
    for off, word_tok in enumerate(window):
        if word_tok == cue_tokens[matched]:
            matched += 1
            last_off = off
            if matched == len(cue_tokens):
                break

    ratio = matched / len(cue_tokens)
    if matched == 0 or ratio < _MIN_MATCH_RATIO:
        return None

    # Backward pass: walk back from the last matched word, matching the same
    # tokens in reverse.  This yields the latest start that still ends at
    # last_off, so the reported start is a real matched word rather than
    # whatever unrelated word happened to sit at the cursor.
    remaining = matched - 1
    first_off = last_off
    for off in range(last_off, -1, -1):
        if window[off] == cue_tokens[remaining]:
            first_off = off
            if remaining == 0:
                break
            remaining -= 1

    return _Match(window_start + first_off, window_start + last_off, ratio)


def _cursor_for_time(whisper_words: list[WordTimestamp], t: float, from_idx: int) -> int:
    """Return the index of the first Whisper word starting at or after *t*.

    The search begins at *from_idx*, so the result is never smaller than it
    (the cursor cannot move backwards onto already-consumed words).  When no
    word at or after *from_idx* starts at or after *t*, ``len(whisper_words)``
    is returned, i.e. a past-the-end cursor.

    Assumes ``whisper_words`` is ordered by ``.start`` (required by bisect).
    """
    lo = min(max(from_idx, 0), len(whisper_words))
    # key= (Python 3.10+) avoids materialising a list of start times per call.
    return bisect.bisect_left(whisper_words, t, lo, key=lambda w: w.start)


def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
    """Replace VTT cue timings with Whisper-accurate timestamps where possible.

    Cues are processed in order with a cursor into *whisper_words*.  A cue
    that matches takes the start of its first matched word and the end of its
    last matched word, and the cursor moves just past the match.  A cue with
    no usable tokens or no match keeps its original timing.

    Args:
        captions_vtt: The caption track as a VTT string.
        whisper_words: Word-level Whisper timestamps for the same audio.

    Returns:
        A VTT string with the same cue count as the input, with improved
        timing accuracy on cues that could be matched to Whisper word output.
        If *whisper_words* is empty the input string is returned unchanged.
    """
    if not whisper_words:
        logger.warning("caption_aligner: no Whisper words supplied — returning original VTT")
        return captions_vtt

    cues = VTTParser.parse(captions_vtt)
    cursor = 0
    aligned = 0

    for cue in cues:
        tokens = _tokenise(cue.text)
        if not tokens:
            # Nothing comparable in this cue (e.g. only symbols or very short words).
            continue

        match = _find_match(tokens, whisper_words, cursor)
        if match is None:
            # Advance cursor to the Whisper word closest to this cue's start time
            # so subsequent cues don't search from a stale position.
            cursor = _cursor_for_time(whisper_words, cue.start_time, cursor)
            continue

        new_start = whisper_words[match.first_word_idx].start
        new_end = whisper_words[match.last_word_idx].end
        # Only re-time the cue when the matched span has a positive duration.
        if new_end > new_start:
            cue.start_time = new_start
            cue.end_time = new_end
            aligned += 1

        # The matched words are consumed either way.
        cursor = match.last_word_idx + 1

    logger.info(
        f"caption_aligner: aligned {aligned}/{len(cues)} cues "
        f"against {len(whisper_words)} Whisper words"
    )

    if aligned == 0:
        # No timing changed: rebuild from the original VTT, keeping its timing.
        return VTTEditor.translate_preserving_timing(
            captions_vtt, [c.text for c in cues]
        )
    return VTTParser.build(cues)