Bug fixes:
- Bug 1a: source_has_ad flag prevents the AI from generating AD over existing professional AD; JobBrief/Job models, gemini service prompt conditional, NewBrief UI checkbox
- Bug 1b: disable native textTracks on video element to prevent double captions
- Bug 2: caption ALL audible speech including off-screen narrators (prompt fix)
- Bug 3: DCMP §6.01 disfluency removal for EN/ES/FR/DE/IT (prompt + post-pass)
- Bug 4: VTT cue settings (line:0%, position:) preserved through parser round-trip
- Bug 5: Whisper word-level timestamp alignment via new caption_aligner service
- Bug 6: assert_cue_alignment used .start/.end; renamed to .start_time/.end_time
- New migration: backfill source_has_ad=False on existing jobs and job_briefs

Also fix retranslation error handling to preserve existing GCS URIs on failure so video_native captions remain accessible if retranslation fails.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
125 lines
4.1 KiB
Python
125 lines
4.1 KiB
Python
"""Align Gemini caption VTT timings against Whisper word-level timestamps.
|
|
|
|
Algorithm:
  For each VTT cue, tokenise its text and search for the token sequence in the
  Whisper word stream starting from the cursor position (with a look-ahead window).
  When a match of sufficient confidence is found the cue's start/end timestamps
  are replaced with the matched Whisper words' start/end.  Cues that cannot be
  matched (music notation, sound effects, empty cues) keep their original Gemini
  timestamps.  The result has Whisper-accurate timings wherever a cue could be
  matched and graceful fallbacks where Whisper didn't capture the audio.
|
|
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
|
|
from ..core.logging import get_logger
|
|
from ..lib.vtt import VTTEditor, VTTParser
|
|
from ..services.whisper_service import WordTimestamp
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
# Matches every character that is neither a word character nor an apostrophe;
# substituting "" for it strips punctuation (and whitespace) from a token.
# str patterns are Unicode-aware by default in Python 3, so no flag is needed.
_PUNCT = re.compile(r"[^\w']")

# Cue tokens with fewer characters than this are dropped by _tokenise().
_MIN_TOKEN_LEN = 2

# A candidate alignment is accepted only when at least this fraction of the
# cue's tokens matched Whisper words.
_MIN_MATCH_RATIO = 0.5

# Number of Whisper words, counted from the cursor, that are scanned per cue.
_SEARCH_WINDOW = 60
|
|
|
|
|
|
def _tokenise(text: str) -> list[str]:
    """Split *text* on whitespace and normalise each piece for matching.

    Every character matched by ``_PUNCT`` is removed from a piece and the
    remainder is lower-cased.  Pieces left with fewer than ``_MIN_TOKEN_LEN``
    characters are discarded, so the result may be empty.
    """
    tokens: list[str] = []
    for raw in text.split():
        cleaned = _PUNCT.sub("", raw).lower()
        if len(cleaned) < _MIN_TOKEN_LEN:
            # Too short to be a useful anchor (also covers pieces that were
            # pure punctuation and are now empty).
            continue
        tokens.append(cleaned)
    return tokens
|
|
|
|
|
|
@dataclass
class _Match:
    """Outcome of locating one cue's tokens in the Whisper word list."""

    # Index of the Whisper word whose start time becomes the cue's start.
    first_word_idx: int
    # Index of the last Whisper word that matched a cue token; its end time
    # becomes the cue's end.
    last_word_idx: int
    # Fraction of the cue's tokens that were matched: matched_tokens / cue_tokens.
    ratio: float
|
|
|
|
|
|
def _find_match(
    cue_tokens: list[str],
    whisper_words: list[WordTimestamp],
    cursor: int,
) -> _Match | None:
    """Locate ``cue_tokens`` in the Whisper word stream at or after ``cursor``.

    Only the ``_SEARCH_WINDOW`` words starting at ``cursor`` are examined; the
    search is forward-only.  Cue tokens are matched in order as a subsequence
    of that window: Whisper words that differ from the next expected token are
    skipped, and matching stops at the first cue token that cannot be found,
    so the matched tokens are always a leading run of ``cue_tokens``.

    Args:
        cue_tokens: Normalised tokens of one cue (lower-cased, punctuation
            stripped), compared by equality against normalised Whisper words.
        whisper_words: Whisper word list; each item's ``word`` attribute is
            read and normalised with ``_PUNCT`` + ``lower()``.
        cursor: Index of the first Whisper word that may be used.

    Returns:
        A ``_Match`` whose ``first_word_idx`` and ``last_word_idx`` are the
        indices of the first and last Whisper words that actually matched a
        cue token, or ``None`` when ``cue_tokens`` is empty or fewer than
        ``_MIN_MATCH_RATIO`` of the tokens matched.  Among candidates that
        match the same number of tokens the tightest span wins, then the
        earliest.
    """
    if not cue_tokens:
        return None

    end = min(cursor + _SEARCH_WINDOW, len(whisper_words))
    # Normalise each Whisper word once instead of on every comparison.
    window = [
        _PUNCT.sub("", whisper_words[idx].word).lower()
        for idx in range(cursor, end)
    ]
    first_token = cue_tokens[0]
    n_tokens = len(cue_tokens)

    best: _Match | None = None
    best_matched = 0
    best_span = 0

    for anchor, anchor_tok in enumerate(window):
        # A match always begins with the cue's first token, so only positions
        # holding that token can start one.  Anchoring here (rather than at
        # the raw scan position) makes first_word_idx a word that matched.
        if anchor_tok != first_token:
            continue

        matched = 1  # also the index of the next cue token to look for
        last = anchor
        for pos in range(anchor + 1, len(window)):
            if matched == n_tokens:
                break
            if window[pos] == cue_tokens[matched]:
                matched += 1
                last = pos

        ratio = matched / n_tokens
        # Greedy in-order matching can only find fewer tokens from a later
        # anchor, so once a candidate is rejected or worse than the best one,
        # no later anchor can improve on it.
        if ratio < _MIN_MATCH_RATIO or matched < best_matched:
            break

        span = last - anchor
        if best is None or span < best_span:
            # Same token count as the current best (or first candidate):
            # keep the tighter span; strict "<" keeps the earliest on ties.
            best = _Match(cursor + anchor, cursor + last, ratio)
            best_matched = matched
            best_span = span

    return best
|
|
|
|
|
|
def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
    """Re-time VTT cues from Whisper word timestamps wherever a match is found.

    Cues are processed in order.  Each cue's text is tokenised and searched
    for in ``whisper_words`` from a cursor that moves forward past every
    match.  A matched cue takes the ``start`` of its first matched word and
    the ``end`` of its last matched word; cues with no tokens or no match
    keep their parsed timings.

    Args:
        captions_vtt: The caption track as a VTT string.
        whisper_words: Whisper words; ``start`` and ``end`` are read from the
            matched items.

    Returns:
        ``captions_vtt`` unchanged when ``whisper_words`` is empty; otherwise
        a VTT string produced by ``VTTParser.build`` when at least one cue was
        re-timed, or by ``VTTEditor.translate_preserving_timing`` with the
        unchanged cue texts when none was.
    """
    if not whisper_words:
        logger.warning("caption_aligner: no Whisper words supplied — returning original VTT")
        return captions_vtt

    cues = VTTParser.parse(captions_vtt)
    next_word = 0  # first Whisper word still available for matching
    aligned = 0  # number of cues whose timings were replaced

    for cue in cues:
        cue_tokens = _tokenise(cue.text)
        # A cue without usable tokens (sound effect, music) is never searched.
        hit = _find_match(cue_tokens, whisper_words, next_word) if cue_tokens else None
        if hit is None:
            continue

        span_start = whisper_words[hit.first_word_idx].start
        span_end = whisper_words[hit.last_word_idx].end

        if span_end > span_start:
            # Only a strictly positive duration is accepted, so a cue is
            # never made zero-length or backwards.
            cue.start_time, cue.end_time = span_start, span_end
            aligned += 1

        # The matched words are consumed even if the timing was rejected.
        next_word = hit.last_word_idx + 1

    logger.info(
        f"caption_aligner: aligned {aligned}/{len(cues)} cues "
        f"against {len(whisper_words)} Whisper words"
    )

    if aligned == 0:
        # NOTE(review): presumably re-emits the input VTT with its original
        # timings since the texts are unchanged — confirm against VTTEditor.
        return VTTEditor.translate_preserving_timing(
            captions_vtt, [c.text for c in cues]
        )
    return VTTParser.build(cues)
|