refactor: rewrite pause point refinement algorithm with ordered logic

Completely rewrites the Whisper-based pause point refinement to use
a two-phase approach with explicit ordering:

Phase 1 - Individual refinement:
1. Check if pause point is "during speaking" (words within ±2s)
   - If NOT during speaking → use Gemini's exact point, no overlap
2. If during speaking, find nearest sentence boundary
3. Apply appropriate buffering based on context:
   - Case A: First sentence → pause 500ms before sentence starts
   - Case B: Last sentence → pause 500ms after sentence ends
   - Case C: Between sentences → full double buffer (overlap)

Phase 2 - Consolidation (after all refinements):
- Consolidate cues within 5s of each other to play back-to-back

Key changes:
- Add SentenceBoundary dataclass for tracking boundaries with context
- Add _is_during_speaking() helper to detect speech proximity
- Add _find_sentence_boundaries() with longest-gap fallback
- Rewrite snap_pause_point() with new ordered algorithm
- Update refine_all_pause_points() to pass words and use two phases

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
michael 2025-12-29 08:19:03 -06:00
parent d092800676
commit 3588d3fa14
2 changed files with 267 additions and 97 deletions

View file

@ -45,6 +45,22 @@ class SpeechGap:
return {"sentence": 1, "phrase": 2, "word": 3}.get(self.gap_type, 4)
@dataclass
class SentenceBoundary:
    """One edge of a sentence, used as a snapping target for pause points.

    Two kinds of boundary are represented:
    - sentence_end: the end time of a word that ends with .!?
    - sentence_start: the start time of the first word that follows a
      sentence-ending word
    """

    # Timestamp of the boundary itself.
    time: float
    # Either "sentence_start" or "sentence_end".
    boundary_type: str
    # Position of the word this boundary was derived from in the words list.
    word_index: int
    # Whether a sentence exists before this boundary.
    has_previous_sentence: bool
    # Whether a sentence exists after this boundary.
    has_next_sentence: bool
    # Gap this boundary belongs to (needed for the double-buffer case);
    # None when no gap is associated with it.
    gap: SpeechGap | None
class WhisperService:
"""Service for speech analysis using faster-whisper."""
@ -175,105 +191,283 @@ class WhisperService:
return sorted(gaps, key=lambda g: g.start)
def _is_during_speaking(
self,
pause_point: float,
words: list[WordTimestamp],
threshold: float = 2.0
) -> bool:
"""
Check if a pause point is "during speaking" (words nearby).
Args:
pause_point: The timestamp to check
words: List of word timestamps from Whisper
threshold: Max distance in seconds to consider "nearby" (default: 2.0s)
Returns:
True if any word is within ±threshold seconds of the pause point
"""
for word in words:
# Check if pause point is near word start or end
if abs(word.start - pause_point) <= threshold or abs(word.end - pause_point) <= threshold:
return True
return False
def _find_sentence_boundaries(
    self,
    words: list[WordTimestamp],
    gaps: list[SpeechGap]
) -> list[SentenceBoundary]:
    """
    Find all sentence boundaries (starts and ends) from the transcript.

    Boundaries are identified from:
    1. Words ending with sentence punctuation - these mark sentence ends
    2. Words following sentence-ending words - these mark sentence starts
       (only emitted when a gap is found right after the ending word)
    3. Fallback: If no punctuation found, use the longest gap as a boundary

    A sentence_start boundary for the first word and a sentence_end
    boundary for the last word are always present in the result.

    Args:
        words: List of word timestamps from Whisper
        gaps: List of speech gaps between words

    Returns:
        List of SentenceBoundary objects sorted by time (empty if there
        are no words)
    """
    if not words:
        return []

    # NOTE(review): the three non-ASCII entries reached this file as empty
    # strings. str.endswith("") is True for every string, so every word was
    # being classified as sentence-ending. They are restored here as the
    # full-width ideographic stop / exclamation / question mark, written as
    # escapes so they survive re-encoding - confirm against the original.
    # '...' is not listed separately because '.' already matches it.
    sentence_end_punctuation = ('.', '!', '?', '\u3002', '\uff01', '\uff1f')

    match_tolerance = 0.01  # word/gap edges are matched within 10ms
    last_idx = len(words) - 1

    def gap_after(word):
        """Return the gap starting where `word` ends, or None."""
        return next(
            (g for g in gaps if abs(g.start - word.end) < match_tolerance),
            None,
        )

    # Indices of words that end a sentence
    sentence_ending_indices: set[int] = {
        i for i, word in enumerate(words)
        if word.word.rstrip().endswith(sentence_end_punctuation)
    }

    # If no sentence punctuation found, use the longest gap as a fallback
    if not sentence_ending_indices and gaps:
        longest_gap = max(gaps, key=lambda g: g.duration)
        # Find the word that ends at this gap (the last word cannot
        # precede a gap, so it is excluded)
        for i, word in enumerate(words[:-1]):
            if abs(word.end - longest_gap.start) < match_tolerance:
                sentence_ending_indices.add(i)
                logger.info(
                    f"No sentence punctuation found, using longest gap "
                    f"({longest_gap.duration:.2f}s) at {longest_gap.start:.2f}s as boundary"
                )
                break

    boundaries: list[SentenceBoundary] = []

    # Create boundaries from sentence-ending words
    for i in sorted(sentence_ending_indices):
        word = words[i]
        associated_gap = gap_after(word)
        has_next = i < last_idx

        # Sentence END boundary. has_previous_sentence is True whenever
        # any word precedes this one.
        boundaries.append(SentenceBoundary(
            time=word.end,
            boundary_type="sentence_end",
            word_index=i,
            has_previous_sentence=i > 0,
            has_next_sentence=has_next,
            gap=associated_gap,
        ))

        # Sentence START boundary at the next word, sharing the same gap
        if has_next and associated_gap is not None:
            boundaries.append(SentenceBoundary(
                time=words[i + 1].start,
                boundary_type="sentence_start",
                word_index=i + 1,
                has_previous_sentence=True,  # The sentence that just ended
                has_next_sentence=any(j > i for j in sentence_ending_indices),
                gap=associated_gap,
            ))

    # First word boundary. The sentence_start boundaries created above
    # always sit at index >= 1, so the first word never has one yet.
    boundaries.append(SentenceBoundary(
        time=words[0].start,
        boundary_type="sentence_start",
        word_index=0,
        has_previous_sentence=False,  # Nothing before first word
        has_next_sentence=bool(sentence_ending_indices) or len(words) > 1,
        gap=None,
    ))

    # Last word boundary (only if it was not already a sentence end)
    if last_idx not in sentence_ending_indices:
        boundaries.append(SentenceBoundary(
            time=words[last_idx].end,
            boundary_type="sentence_end",
            word_index=last_idx,
            has_previous_sentence=bool(sentence_ending_indices) or last_idx > 0,
            has_next_sentence=False,  # Nothing after last word
            gap=None,
        ))

    return sorted(boundaries, key=lambda b: b.time)
def snap_pause_point(
    self,
    gemini_pause: float,
    words: list[WordTimestamp],
    gaps: list[SpeechGap],
    boundaries: list[SentenceBoundary],
    speaking_threshold: float = 2.0
) -> tuple[float, float, str | None]:
    """
    Snap a Gemini pause point to the nearest sentence boundary using ordered logic.

    Algorithm (in order):
    1. Check if "during speaking" (words within ±threshold)
       - If NO → Use Gemini's exact pause point, no overlap
    2. If during speaking, find nearest sentence boundary
    3. Apply appropriate buffering based on context:
       - Case A: Beginning of sentence, no previous → pause 500ms before sentence starts
       - Case B: End of sentence, no next → pause 500ms after sentence ends
       - Case C: Gap between sentences → full double buffer
       - Fallback: boundary without a gap → 50ms away from the boundary

    Args:
        gemini_pause: Original pause point from Gemini (seconds)
        words: List of word timestamps from Whisper
        gaps: List of speech gaps from identify_speech_gaps(). Not read
            directly here; the gap used in Case C comes from the boundary.
        boundaries: List of sentence boundaries from _find_sentence_boundaries()
        speaking_threshold: Max distance to consider "during speaking" (default: 2.0s)

    Returns:
        Tuple of (pause_point, resume_from, warning_message_or_none).
        pause_point and resume_from differ only in Case C, where the gap
        between the two sentences is played on both sides of the cue.
    """
    # Step 1: Check if "during speaking" (words within ±threshold)
    if not self._is_during_speaking(gemini_pause, words, speaking_threshold):
        # Not during speaking - use Gemini's exact pause point
        logger.info(
            f"Pause point {gemini_pause:.2f}s is NOT during speaking "
            f"(no words within ±{speaking_threshold}s), using Gemini's exact point"
        )
        return gemini_pause, gemini_pause, None

    # Step 2: During speaking - find nearest sentence boundary
    if not boundaries:
        # No boundaries found at all - use Gemini's point with warning
        logger.warning(f"No sentence boundaries found, using Gemini's exact point {gemini_pause:.2f}s")
        return gemini_pause, gemini_pause, "No sentence boundaries found in transcript"

    # Find the boundary closest to the Gemini pause point
    closest_boundary = min(boundaries, key=lambda b: abs(b.time - gemini_pause))

    logger.debug(
        f"Nearest boundary to {gemini_pause:.2f}s: {closest_boundary.boundary_type} "
        f"at {closest_boundary.time:.2f}s (distance: {abs(closest_boundary.time - gemini_pause):.2f}s)"
    )

    # Step 3: Apply appropriate buffering based on context
    edge_buffer = 0.5  # 500ms buffer for edge cases

    # Case A: Beginning of sentence with no previous sentence (first sentence in video)
    if closest_boundary.boundary_type == "sentence_start" and not closest_boundary.has_previous_sentence:
        # Pause 500ms BEFORE the sentence starts (away from speech),
        # clamped so it never goes before the start of the video
        pause_point = max(0.0, closest_boundary.time - edge_buffer)
        resume_from = pause_point  # No overlap
        logger.info(
            f"Case A (first sentence): pause_point={pause_point:.2f}s "
            f"(500ms before sentence start at {closest_boundary.time:.2f}s)"
        )
        return pause_point, resume_from, None

    # Case B: End of sentence with no next sentence (last sentence in video)
    if closest_boundary.boundary_type == "sentence_end" and not closest_boundary.has_next_sentence:
        # Pause 500ms AFTER the sentence ends (away from speech)
        pause_point = closest_boundary.time + edge_buffer
        resume_from = pause_point  # No overlap
        logger.info(
            f"Case B (last sentence): pause_point={pause_point:.2f}s "
            f"(500ms after sentence end at {closest_boundary.time:.2f}s)"
        )
        return pause_point, resume_from, None

    # Case C: Gap between two sentences (normal case with double buffer)
    gap = closest_boundary.gap
    if gap is not None:
        # Small edge buffer to avoid cutting exactly at speech boundaries:
        # 50ms, capped at 10% of the gap so short gaps are not inverted
        small_buffer = min(0.05, gap.duration * 0.1)

        # Full double buffer:
        # - pause_point: Just BEFORE next sentence (gap.end - small_buffer)
        # - resume_from: Just AFTER previous sentence (gap.start + small_buffer)
        pause_point = gap.end - small_buffer
        resume_from = gap.start + small_buffer

        logger.info(
            f"Case C (between sentences): gap={gap.start:.2f}s-{gap.end:.2f}s, "
            f"pause_point={pause_point:.2f}s, resume_from={resume_from:.2f}s, "
            f"overlap={pause_point - resume_from:.2f}s"
        )
        return pause_point, resume_from, None

    # Fallback: No gap associated with boundary - use the boundary time with small buffer
    # This shouldn't normally happen but handles edge cases
    if closest_boundary.boundary_type == "sentence_end":
        pause_point = closest_boundary.time + 0.05  # Small buffer after end
    else:
        pause_point = max(0.0, closest_boundary.time - 0.05)  # Small buffer before start

    resume_from = pause_point
    logger.info(
        f"Fallback: Using boundary at {closest_boundary.time:.2f}s, "
        f"pause_point={pause_point:.2f}s (no gap available)"
    )
    return pause_point, resume_from, None
def refine_all_pause_points(
self,
placements: list[dict],
words: list[WordTimestamp],
gaps: list[SpeechGap],
consolidation_threshold: float = 5.0
) -> tuple[list[dict], list[str]]:
"""
Refine all pause points in a Gemini analysis result.
Uses the "full gap overlap" algorithm where:
- pause_point: Where to pause video (just before next sentence)
- resume_from: Where to resume video (just after previous sentence)
Two-phase algorithm:
Phase 1: Refine each pause point individually using ordered logic:
1. Check if "during speaking" (words within ±2s)
2. If not during speaking → use Gemini's exact point
3. If during speaking → snap to nearest boundary with appropriate buffering
This creates a small overlap where a tiny bit of video plays twice,
but provides maximum natural buffer around the audio description.
Phase 2: Consolidate cues that are within 5s of each other (after all refinements)
Args:
placements: List of placement dicts from Gemini analysis
words: Word timestamps from Whisper transcription
gaps: Speech gaps from Whisper analysis
consolidation_threshold: If consecutive cues have pause points within
this many seconds, combine them to play back-to-back (default: 5.0s)
@ -284,51 +478,31 @@ class WhisperService:
refined_placements = []
warnings = []
# Pre-compute sentence boundaries once for all placements
boundaries = self._find_sentence_boundaries(words, gaps)
logger.info(f"Found {len(boundaries)} sentence boundaries for pause point refinement")
# Phase 1: Refine each pause point individually
for placement in placements:
refined = placement.copy()
if placement.get("pause_point") is not None:
original = placement["pause_point"]
pause_point, resume_from, warning = self.snap_pause_point(original, gaps)
pause_point, resume_from, warning = self.snap_pause_point(
original, words, gaps, boundaries
)
refined["pause_point"] = pause_point
refined["resume_from"] = resume_from
refined["original_pause_point"] = original # Preserve for debugging
if warning:
# Special handling for first AD cue: if no sentence break found (but speech exists),
# insert at the very beginning of the video to avoid mid-sentence insertion
if placement["ad_cue_index"] == 0:
refined["pause_point"] = 0.0
refined["resume_from"] = 0.0
warnings.append(
f"Cue 0: No sentence break found within search window of {original:.2f}s, "
"inserting AD at video start (0.0s)"
)
logger.info(
f"First AD cue: No sentence break found near {original:.2f}s, "
"using video start (0.0s) to avoid mid-sentence insertion"
)
else:
warnings.append(f"Cue {placement['ad_cue_index']}: {warning}")
logger.warning(f"Pause point refinement warning for cue {placement['ad_cue_index']}: {warning}")
elif pause_point == resume_from:
# No overlap - either no speech nearby, or some other reason
# Log this as info since it's a valid scenario (non-dialogue section)
logger.info(
f"Cue {placement['ad_cue_index']}: Using exact pause point {pause_point:.2f}s "
f"(no overlap - likely non-dialogue section)"
)
elif abs(pause_point - original) > 0.1:
logger.info(
f"Refined pause point for cue {placement['ad_cue_index']}: "
f"{original:.2f}s -> pause_at={pause_point:.2f}s, resume_from={resume_from:.2f}s "
f"(overlap={pause_point - resume_from:.2f}s)"
)
warnings.append(f"Cue {placement['ad_cue_index']}: {warning}")
logger.warning(f"Pause point refinement warning for cue {placement['ad_cue_index']}: {warning}")
refined_placements.append(refined)
# Consolidate cues that are close together to avoid mid-sentence insertions
# Phase 2: Consolidate cues that are close together (AFTER all individual refinements)
refined_placements = self._consolidate_close_cues(
refined_placements, consolidation_threshold, warnings
)
@ -365,7 +539,6 @@ class WhisperService:
# First pass: identify consolidated groups and assign same pause_point
consolidated = [placements[0].copy()]
group_resume_from = placements[0].get("resume_from") # Track the back buffer for the group
for i in range(1, len(placements)):
current = placements[i].copy()
@ -396,10 +569,6 @@ class WhisperService:
f"Cue {current['ad_cue_index']}: Consolidated with previous cue "
f"(pause points were {gap:.2f}s apart, playing back-to-back)"
)
else:
# New group starts - update group_resume_from for the new group
group_resume_from = current.get("resume_from")
consolidated.append(current)
# Second pass: fix resume_from values for consolidated groups

View file

@ -417,9 +417,10 @@ async def _refine_pause_points_with_whisper(
gaps = whisper_service.identify_speech_gaps(words)
logger.info(f"Found {len(gaps)} speech gaps in video for job {job_id}")
# Refine pause points
# Refine pause points (Phase 1: individual refinement, Phase 2: consolidation)
refined_placements, warnings = whisper_service.refine_all_pause_points(
analysis.get("placements", []),
words,
gaps
)