- caption_aligner: lower match ratio 0.5→0.35, widen search window 60→150, add time-based cursor fallback on miss
- gemini.py: explicit 'MUST use glossary terms' requirement in translate_vtt prompt; source_has_ad prompt now instructs not to include AD narration in captions
- ingest_and_ai: load glossary for source language and pass to extract_accessibility
- render_accessible_video: handle source_has_ad=True via caption-embed path (ffmpeg subtitle inject, no AD pipeline)
- translate_and_synthesize: track failed languages, write translation_errors to DB, add exc_info to error log
- vtt.py: expand _FILLER_PATTERNS to nl/pt/pl/uk/ru, widen EN/ES/FR/DE/IT lists
- gemini_ingestion.md: strengthen line:0% placement rule, expand disfluency examples per language

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
135 lines
4.6 KiB
Python
135 lines
4.6 KiB
Python
"""Align Gemini caption VTT timings against Whisper word-level timestamps.
|
|
|
|
Algorithm:
|
|
For each VTT cue, tokenise its text and search for the token sequence in the
|
|
Whisper word stream starting from the cursor position (with a look-ahead window).
|
|
When a match of sufficient confidence is found the cue's start/end timestamps
|
|
are replaced with the matched Whisper words' start/end. Cues that cannot be
|
|
matched (music notation, sound effects, empty cues) keep their original Gemini
|
|
timestamps. The result has Whisper-accurate timings early in the video and
|
|
graceful fallbacks where Whisper didn't capture the audio.
|
|
"""
|
|
|
|
import bisect
|
|
import re
|
|
from dataclasses import dataclass
|
|
|
|
from ..core.logging import get_logger
|
|
from ..lib.vtt import VTTEditor, VTTParser
|
|
from ..services.whisper_service import WordTimestamp
|
|
|
|
logger = get_logger(__name__)

# Characters to strip when comparing tokens: everything except word characters
# (\w) and the straight apostrophe, so contractions such as "don't" stay intact.
_PUNCT = re.compile(r"[^\w']", re.UNICODE)
# Tokens shorter than this are considered stop-words and excluded from matching.
# Only cue text is filtered this way (see _tokenise); Whisper words are compared
# unfiltered, so short Whisper words are simply skipped over during a scan.
_MIN_TOKEN_LEN = 2
# Minimum fraction of cue tokens that must match Whisper words for alignment.
# Lowered from 0.5 → 0.35 to handle Gemini paraphrasing and short cues.
_MIN_MATCH_RATIO = 0.35
# How many Whisper words ahead of the cursor to search for a cue's tokens.
# Widened from 60 → 150 so the window stays valid even after several failed cues.
_SEARCH_WINDOW = 150
|
|
|
|
|
|
def _tokenise(text: str) -> list[str]:
    """Split *text* into the normalised tokens used for matching.

    Each whitespace-separated word has every character matched by ``_PUNCT``
    removed and is lower-cased. Words that end up shorter than
    ``_MIN_TOKEN_LEN`` (including words that were pure punctuation) are
    discarded. Token order follows the order of the words in *text*.
    """
    tokens: list[str] = []
    for raw_word in text.split():
        cleaned = _PUNCT.sub("", raw_word).lower()
        if len(cleaned) < _MIN_TOKEN_LEN:
            # Too short to be a useful anchor (or nothing left after stripping).
            continue
        tokens.append(cleaned)
    return tokens
|
|
|
|
|
|
@dataclass
|
|
class _Match:
|
|
first_word_idx: int
|
|
last_word_idx: int
|
|
ratio: float # matched_tokens / cue_tokens
|
|
|
|
|
|
def _find_match(
    cue_tokens: list[str],
    whisper_words: list[WordTimestamp],
    cursor: int,
) -> _Match | None:
    """Locate *cue_tokens* in the Whisper word stream, scanning forward from *cursor*.

    At most ``_SEARCH_WINDOW`` words starting at *cursor* are examined (the
    search never looks behind the cursor). Cue tokens are matched greedily and
    in order: a Whisper word that does not equal the next expected token is
    skipped, but a cue token is never skipped. The result is therefore the
    longest prefix of *cue_tokens* that appears, in order, inside the window.

    Args:
        cue_tokens: Normalised cue tokens (as produced by ``_tokenise``).
        whisper_words: Whisper words in stream order; each ``.word`` is
            normalised with ``_PUNCT`` + lower-casing before comparison.
        cursor: Index of the first Whisper word eligible for matching.

    Returns:
        A ``_Match`` whose ``first_word_idx`` / ``last_word_idx`` are the first
        and last Whisper words that actually matched a cue token, or ``None``
        when *cue_tokens* is empty, nothing matched, or the matched fraction
        is below ``_MIN_MATCH_RATIO``.

    Note:
        A single scan from *cursor* is sufficient. Starting the same greedy
        scan later in the window can only see a subset of the words, so it can
        never match more tokens than the scan from *cursor* does.

        Whisper words shorter than ``_MIN_TOKEN_LEN`` never equal a cue token,
        so a short word that opens or closes the spoken phrase (e.g. "I") is
        not part of the returned span.
    """
    if not cue_tokens:
        return None

    total = len(cue_tokens)
    window_end = min(cursor + _SEARCH_WINDOW, len(whisper_words))

    matched = 0
    first_idx = cursor
    last_idx = cursor

    for w_idx in range(cursor, window_end):
        w_tok = _PUNCT.sub("", whisper_words[w_idx].word).lower()
        if w_tok != cue_tokens[matched]:
            continue
        if matched == 0:
            # Anchor the span on the first word that really matched, not on
            # the cursor, so skipped leading words don't pull the start early.
            first_idx = w_idx
        last_idx = w_idx
        matched += 1
        if matched == total:
            break  # perfect match — no need to search further

    ratio = matched / total
    if matched == 0 or ratio < _MIN_MATCH_RATIO:
        return None
    return _Match(first_idx, last_idx, ratio)
|
|
|
|
|
|
def _cursor_for_time(whisper_words: list[WordTimestamp], t: float, from_idx: int) -> int:
|
|
"""Return the index of the first Whisper word at or after time t, starting from from_idx."""
|
|
starts = [w.start for w in whisper_words]
|
|
idx = bisect.bisect_left(starts, t, from_idx)
|
|
return min(idx, len(whisper_words) - 1)
|
|
|
|
|
|
def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
    """Re-time the cues of *captions_vtt* using Whisper word timestamps.

    Cues are processed in order while a cursor tracks the next unconsumed
    Whisper word. A cue whose tokens are found by ``_find_match`` takes the
    start of the first matched word and the end of the last matched word,
    provided that span has positive duration. Cues with no tokens, no match,
    or a non-positive span keep their original timestamps.

    Args:
        captions_vtt: Caption track as a VTT string.
        whisper_words: Whisper word-level timestamps in stream order.

    Returns:
        *captions_vtt* unchanged when *whisper_words* is empty. Otherwise a VTT
        string rebuilt from the parsed cues, or — when no cue could be
        aligned — the result of ``VTTEditor.translate_preserving_timing``
        called with the original VTT and the unchanged cue texts.
    """
    if not whisper_words:
        logger.warning("caption_aligner: no Whisper words supplied — returning original VTT")
        return captions_vtt

    cues = VTTParser.parse(captions_vtt)
    aligned = 0
    position = 0  # index of the next Whisper word eligible for matching

    for cue in cues:
        cue_tokens = _tokenise(cue.text)
        if not cue_tokens:
            # Nothing to match on (e.g. music notation or an empty cue).
            continue

        hit = _find_match(cue_tokens, whisper_words, position)
        if hit is not None:
            start = whisper_words[hit.first_word_idx].start
            finish = whisper_words[hit.last_word_idx].end
            if finish > start:
                cue.start_time = start
                cue.end_time = finish
                aligned += 1
            # The matched words are consumed even if the span was rejected.
            position = hit.last_word_idx + 1
        else:
            # Advance to the Whisper word closest to this cue's start time so
            # subsequent cues don't search from a stale position.
            position = _cursor_for_time(whisper_words, cue.start_time, position)

    logger.info(
        f"caption_aligner: aligned {aligned}/{len(cues)} cues "
        f"against {len(whisper_words)} Whisper words"
    )

    if aligned == 0:
        return VTTEditor.translate_preserving_timing(
            captions_vtt, [c.text for c in cues]
        )
    return VTTParser.build(cues)
|