# video-accessibility/backend/app/services/caption_aligner.py

"""Align Gemini caption VTT timings against Whisper word-level timestamps.

Algorithm:
  For each VTT cue, tokenise its text and search for the token sequence in the
  Whisper word stream starting from the cursor position (with a look-ahead window).
  When a match of sufficient confidence is found the cue's start/end timestamps
  are replaced with the matched Whisper words' start/end.  Cues that cannot be
  matched (music notation, sound effects, empty cues) keep their original Gemini
  timestamps.  The result has Whisper-accurate timings early in the video and
  graceful fallbacks where Whisper didn't capture the audio.
"""

import re
from dataclasses import dataclass

from ..core.logging import get_logger
from ..lib.vtt import VTTEditor, VTTParser
from ..services.whisper_service import WordTimestamp

logger = get_logger(__name__)

# Matches every character that is neither a word character (\w) nor an
# apostrophe.  Substituting "" for it strips punctuation and whitespace from a
# token while keeping contractions such as "don't" intact.
_PUNCT = re.compile(r"[^\w']", re.UNICODE)
# Minimum length of a normalised token kept by _tokenise; shorter tokens are
# treated as stop-words and excluded from matching
_MIN_TOKEN_LEN = 2
# Minimum matched_tokens / cue_tokens ratio for _find_match to accept a match
_MIN_MATCH_RATIO = 0.5
# How many Whisper words, counted from the cursor, _find_match will scan
_SEARCH_WINDOW = 60


def _tokenise(text: str) -> list[str]:
    """Split text into normalised tokens suitable for matching.

    The text is split on whitespace; each piece has every character matched by
    ``_PUNCT`` removed and is lower-cased.  Pieces that end up shorter than
    ``_MIN_TOKEN_LEN`` characters are discarded.

    Args:
        text: Raw cue text.

    Returns:
        The surviving tokens in their original order (possibly empty).
    """
    tokens: list[str] = []
    for piece in text.split():
        cleaned = _PUNCT.sub("", piece).lower()
        if len(cleaned) < _MIN_TOKEN_LEN:
            # Too short to be a useful anchor (also covers pure punctuation)
            continue
        tokens.append(cleaned)
    return tokens


@dataclass
class _Match:
    """Location and quality of a cue match within the Whisper word list."""

    # Index of the Whisper word whose ``start`` becomes the cue's start time
    first_word_idx: int
    # Index of the last Whisper word that matched a cue token; its ``end``
    # becomes the cue's end time and the search cursor resumes just after it
    last_word_idx: int
    ratio: float  # matched_tokens / cue_tokens


def _find_match(
    cue_tokens: list[str],
    whisper_words: list[WordTimestamp],
    cursor: int,
) -> _Match | None:
    """Locate cue_tokens in the Whisper word stream at or after cursor.

    Only the ``_SEARCH_WINDOW`` words starting at ``cursor`` are examined; this
    is a forward look-ahead, words before the cursor are never revisited.

    Cue tokens are matched in order.  Whisper words that differ from the token
    currently being looked for are skipped, and matching stops at the first cue
    token that cannot be found, so a partial match always covers a leading
    prefix of the cue.

    Args:
        cue_tokens: Normalised tokens of one cue (lower-case, punctuation
            stripped).
        whisper_words: Word-level timestamps; only each item's ``word``
            attribute is read here.
        cursor: Index of the first Whisper word that may be used.

    Returns:
        A ``_Match`` whose ``first_word_idx`` / ``last_word_idx`` are the
        indices of the first and last Whisper words that actually matched cue
        tokens, or ``None`` when ``cue_tokens`` is empty or fewer than
        ``_MIN_MATCH_RATIO`` of the tokens were found inside the window.
    """
    if not cue_tokens:
        return None

    end = min(cursor + _SEARCH_WINDOW, len(whisper_words))
    # Normalise every word in the window once rather than on each comparison.
    window = [
        _PUNCT.sub("", whisper_words[idx].word).lower()
        for idx in range(cursor, end)
    ]

    # Forward pass: greedily consume cue tokens in order.  Scanning from the
    # cursor always yields the highest achievable ratio (any later starting
    # point sees a subset of these words), so a single pass is sufficient.
    matched = 0
    last_idx = cursor
    for offset, w_tok in enumerate(window):
        if w_tok == cue_tokens[matched]:
            matched += 1
            last_idx = cursor + offset
            if matched == len(cue_tokens):
                break

    ratio = matched / len(cue_tokens)
    if matched == 0 or ratio < _MIN_MATCH_RATIO:
        return None

    # Backward pass: walk back from the last matched word to find the tightest
    # span that still contains the matched tokens in order.  This anchors the
    # start on a word that really belongs to the cue instead of on whatever
    # filler word happened to sit at the cursor, or on an earlier stray
    # occurrence of the first token.
    first_idx = last_idx
    token_pos = matched - 1
    for idx in range(last_idx, cursor - 1, -1):
        if window[idx - cursor] == cue_tokens[token_pos]:
            first_idx = idx
            token_pos -= 1
            if token_pos < 0:
                break

    return _Match(first_idx, last_idx, ratio)


def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
    """Re-time VTT cues from Whisper word timestamps wherever a match exists.

    Cues are visited in order.  Each cue's text is tokenised and searched for
    in ``whisper_words`` from a cursor that moves forward past the last word of
    every successful match.  A matched cue takes the start of its first matched
    word and the end of its last matched word, provided that yields a strictly
    positive duration; all other cues keep the timings they were parsed with.

    Args:
        captions_vtt: VTT document whose cue timings should be refined.
        whisper_words: Word-level timestamps, in spoken order.

    Returns:
        ``captions_vtt`` unchanged when ``whisper_words`` is empty.  Otherwise
        a VTT string: built by ``VTTParser.build`` from the updated cues when
        at least one cue was re-timed, or produced by
        ``VTTEditor.translate_preserving_timing`` from the original document
        and the parsed cue texts when none was.
    """
    if not whisper_words:
        logger.warning("caption_aligner: no Whisper words supplied — returning original VTT")
        return captions_vtt

    cues = VTTParser.parse(captions_vtt)
    aligned = 0
    cursor = 0

    for cue in cues:
        cue_tokens = _tokenise(cue.text)
        # A cue with no usable tokens (sound effect, music) is never searched
        hit = _find_match(cue_tokens, whisper_words, cursor) if cue_tokens else None
        if hit is None:
            continue

        start = whisper_words[hit.first_word_idx].start
        finish = whisper_words[hit.last_word_idx].end

        # Reject zero-length or reversed spans; the cue keeps its old timing
        if finish > start:
            cue.start_time = start
            cue.end_time = finish
            aligned += 1

        # Later cues may only use words after this match, even when the
        # timing itself was rejected above
        cursor = hit.last_word_idx + 1

    logger.info(
        f"caption_aligner: aligned {aligned}/{len(cues)} cues "
        f"against {len(whisper_words)} Whisper words"
    )

    if aligned == 0:
        return VTTEditor.translate_preserving_timing(
            captions_vtt, [c.text for c in cues]
        )
    return VTTParser.build(cues)