refactor: rewrite pause point refinement algorithm with ordered logic
Completely rewrites the Whisper-based pause point refinement to use a two-phase approach with explicit ordering:

Phase 1 - Individual refinement:
1. Check if pause point is "during speaking" (words within ±2s)
   - If NOT during speaking → use Gemini's exact point, no overlap
2. If during speaking, find nearest sentence boundary
3. Apply appropriate buffering based on context:
   - Case A: First sentence → pause 500ms before sentence starts
   - Case B: Last sentence → pause 500ms after sentence ends
   - Case C: Between sentences → full double buffer (overlap)

Phase 2 - Consolidation (after all refinements):
- Consolidate cues within 5s of each other to play back-to-back

Key changes:
- Add SentenceBoundary dataclass for tracking boundaries with context
- Add _is_during_speaking() helper to detect speech proximity
- Add _find_sentence_boundaries() with longest-gap fallback
- Rewrite snap_pause_point() with new ordered algorithm
- Update refine_all_pause_points() to pass words and use two phases

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
d092800676
commit
3588d3fa14
2 changed files with 267 additions and 97 deletions
|
|
@ -45,6 +45,22 @@ class SpeechGap:
|
|||
return {"sentence": 1, "phrase": 2, "word": 3}.get(self.gap_type, 4)
|
||||
|
||||
|
||||
@dataclass
class SentenceBoundary:
    """One edge of a sentence (its start or its end) that a pause point can snap to.

    Two kinds of boundary are distinguished via ``boundary_type``:
    - sentence_end: the end time of a word ending with .!?
    - sentence_start: the start time of the first word after a sentence-ending word
    """

    # Timestamp of the boundary itself
    time: float
    # Either "sentence_start" or "sentence_end"
    boundary_type: str
    # Position of the word this boundary is attached to, within the words list
    word_index: int
    # Whether a sentence exists before this boundary
    has_previous_sentence: bool
    # Whether a sentence exists after this boundary
    has_next_sentence: bool
    # Gap this boundary belongs to (used for the double-buffer case); None if no gap was matched
    gap: SpeechGap | None
|
||||
|
||||
|
||||
class WhisperService:
|
||||
"""Service for speech analysis using faster-whisper."""
|
||||
|
||||
|
|
@ -175,105 +191,283 @@ class WhisperService:
|
|||
|
||||
return sorted(gaps, key=lambda g: g.start)
|
||||
|
||||
def _is_during_speaking(
|
||||
self,
|
||||
pause_point: float,
|
||||
words: list[WordTimestamp],
|
||||
threshold: float = 2.0
|
||||
) -> bool:
|
||||
"""
|
||||
Check if a pause point is "during speaking" (words nearby).
|
||||
|
||||
Args:
|
||||
pause_point: The timestamp to check
|
||||
words: List of word timestamps from Whisper
|
||||
threshold: Max distance in seconds to consider "nearby" (default: 2.0s)
|
||||
|
||||
Returns:
|
||||
True if any word is within ±threshold seconds of the pause point
|
||||
"""
|
||||
for word in words:
|
||||
# Check if pause point is near word start or end
|
||||
if abs(word.start - pause_point) <= threshold or abs(word.end - pause_point) <= threshold:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _find_sentence_boundaries(
    self,
    words: list[WordTimestamp],
    gaps: list[SpeechGap]
) -> list[SentenceBoundary]:
    """
    Find all sentence boundaries (starts and ends) from the transcript.

    Boundaries are identified from:
    1. Words ending with sentence punctuation (.!? or the full-width
       。!?), optionally followed by closing quotes/brackets - these
       mark sentence ends
    2. Words following sentence-ending words - these mark sentence starts
       (only emitted when a gap in ``gaps`` starts where the ending word ends)
    3. Fallback: If no punctuation found, use the longest gap as a boundary

    In addition, the start of the first word is always emitted as a
    sentence_start, and the end of the last word is emitted as a
    sentence_end when it was not already detected as one.

    Args:
        words: List of word timestamps from Whisper
        gaps: List of speech gaps between words

    Returns:
        List of SentenceBoundary objects sorted by time (empty if ``words`` is empty)
    """
    if not words:
        return []

    # ASCII and full-width sentence terminators. An ellipsis ("...") is
    # already matched by the "." entry, so it needs no entry of its own.
    sentence_end_punctuation = ('.', '!', '?', '。', '!', '?')
    # Closing quotes/brackets that may follow the terminator, e.g. `done."`
    trailing_closers = '"\'”’)]}»」』'
    # A gap "belongs" to a word when it starts within 10ms of the word's end
    match_tolerance = 0.01

    def ends_sentence(text: str) -> bool:
        stripped = text.rstrip().rstrip(trailing_closers)
        return stripped.endswith(sentence_end_punctuation)

    # Track which word indices end sentences
    sentence_ending_indices: set[int] = {
        i for i, word in enumerate(words) if ends_sentence(word.word)
    }

    # If no sentence punctuation found, use the longest gap as a fallback
    if not sentence_ending_indices and gaps:
        longest_gap = max(gaps, key=lambda g: g.duration)
        # Find the word that ends at this gap; the last word cannot precede a gap
        for i, word in enumerate(words[:-1]):
            if abs(word.end - longest_gap.start) < match_tolerance:
                sentence_ending_indices.add(i)
                logger.info(
                    f"No sentence punctuation found, using longest gap "
                    f"({longest_gap.duration:.2f}s) at {longest_gap.start:.2f}s as boundary"
                )
                break

    boundaries: list[SentenceBoundary] = []
    last_idx = len(words) - 1

    # Create boundaries from sentence-ending words
    for i in sorted(sentence_ending_indices):
        word = words[i]

        # Find the gap that starts right after this word (if any)
        associated_gap = next(
            (gap for gap in gaps if abs(gap.start - word.end) < match_tolerance),
            None,
        )

        # Any speech after this word counts as a following sentence
        has_next = i < last_idx

        # Sentence END boundary
        boundaries.append(SentenceBoundary(
            time=word.end,
            boundary_type="sentence_end",
            word_index=i,
            # True whenever this is not the very first word; an earlier
            # sentence-ending index necessarily implies i > 0 as well
            has_previous_sentence=i > 0,
            has_next_sentence=has_next,
            gap=associated_gap
        ))

        # Sentence START boundary at the next word, only when a gap separates the two
        if has_next and associated_gap:
            boundaries.append(SentenceBoundary(
                time=words[i + 1].start,
                boundary_type="sentence_start",
                word_index=i + 1,
                has_previous_sentence=True,  # The sentence that just ended
                has_next_sentence=any(j > i for j in sentence_ending_indices),
                gap=associated_gap
            ))

    # The first word always opens a sentence. Sentence starts created above
    # sit at index i + 1 >= 1, so index 0 is never covered already.
    boundaries.append(SentenceBoundary(
        time=words[0].start,
        boundary_type="sentence_start",
        word_index=0,
        has_previous_sentence=False,  # Nothing before first word
        has_next_sentence=len(sentence_ending_indices) > 0 or len(words) > 1,
        gap=None
    ))

    # The last word closes a sentence even without terminal punctuation
    if last_idx not in sentence_ending_indices:
        boundaries.append(SentenceBoundary(
            time=words[last_idx].end,
            boundary_type="sentence_end",
            word_index=last_idx,
            has_previous_sentence=len(sentence_ending_indices) > 0 or last_idx > 0,
            has_next_sentence=False,  # Nothing after last word
            gap=None
        ))

    return sorted(boundaries, key=lambda b: b.time)
|
||||
|
||||
def snap_pause_point(
|
||||
self,
|
||||
gemini_pause: float,
|
||||
words: list[WordTimestamp],
|
||||
gaps: list[SpeechGap],
|
||||
max_search_window: float | None = None
|
||||
boundaries: list[SentenceBoundary],
|
||||
speaking_threshold: float = 2.0
|
||||
) -> tuple[float, float, str | None]:
|
||||
"""
|
||||
Snap a Gemini pause point to the nearest sentence break, or use exact point if no speech nearby.
|
||||
Snap a Gemini pause point to the nearest sentence boundary using ordered logic.
|
||||
|
||||
Three possible outcomes:
|
||||
1. No speech nearby: Use Gemini's exact pause point (no overlap needed)
|
||||
2. Speech nearby but no sentence break: Return warning, use Gemini's point
|
||||
3. Sentence break found: Apply "full gap overlap" algorithm
|
||||
|
||||
The "full gap overlap" algorithm (case 3):
|
||||
- pause_point: Just BEFORE the next sentence starts (gap.end - buffer)
|
||||
- resume_from: Just AFTER the previous sentence ends (gap.start + buffer)
|
||||
This uses the entire gap as buffer on BOTH sides of the AD.
|
||||
Algorithm (in order):
|
||||
1. Check if "during speaking" (words within ±threshold)
|
||||
- If NO → Use Gemini's exact pause point, no overlap
|
||||
2. If during speaking, find nearest sentence boundary
|
||||
3. Apply appropriate buffering based on context:
|
||||
- Case A: Beginning of sentence, no previous → pause 500ms before sentence starts
|
||||
- Case B: End of sentence, no next → pause 500ms after sentence ends
|
||||
- Case C: Gap between sentences → full double buffer
|
||||
|
||||
Args:
|
||||
gemini_pause: Original pause point from Gemini (seconds)
|
||||
words: List of word timestamps from Whisper
|
||||
gaps: List of speech gaps from identify_speech_gaps()
|
||||
max_search_window: Max seconds to search in each direction (default: self.max_search_window)
|
||||
boundaries: List of sentence boundaries from _find_sentence_boundaries()
|
||||
speaking_threshold: Max distance to consider "during speaking" (default: 2.0s)
|
||||
|
||||
Returns:
|
||||
Tuple of (pause_point, resume_from, warning_message_or_none)
|
||||
"""
|
||||
search_window = max_search_window or self.max_search_window
|
||||
|
||||
# First, check if there's ANY speech (any gap type) within the search window
|
||||
# Gaps only exist between words, so if there are no gaps nearby, there's no speech
|
||||
any_gaps_nearby = [
|
||||
g for g in gaps
|
||||
if g.start >= gemini_pause - search_window
|
||||
and g.start <= gemini_pause + search_window
|
||||
]
|
||||
|
||||
if not any_gaps_nearby:
|
||||
# No speech detected near this pause point - use Gemini's exact recommendation
|
||||
# No overlap needed since there's no dialogue to buffer around
|
||||
# Step 1: Check if "during speaking" (words within ±threshold)
|
||||
if not self._is_during_speaking(gemini_pause, words, speaking_threshold):
|
||||
# Not during speaking - use Gemini's exact pause point
|
||||
logger.info(
|
||||
f"No speech detected within +/-{search_window}s of {gemini_pause:.2f}s, "
|
||||
"using Gemini's exact pause point (no overlap)"
|
||||
f"Pause point {gemini_pause:.2f}s is NOT during speaking "
|
||||
f"(no words within ±{speaking_threshold}s), using Gemini's exact point"
|
||||
)
|
||||
# Return None for warning - this is not a problem, just a different scenario
|
||||
return gemini_pause, gemini_pause, None
|
||||
|
||||
# There IS speech nearby - now look for sentence breaks specifically
|
||||
sentence_gaps = [g for g in any_gaps_nearby if g.gap_type == "sentence"]
|
||||
# Step 2: During speaking - find nearest sentence boundary
|
||||
if not boundaries:
|
||||
# No boundaries found at all - use Gemini's point with warning
|
||||
logger.warning(f"No sentence boundaries found, using Gemini's exact point {gemini_pause:.2f}s")
|
||||
return gemini_pause, gemini_pause, "No sentence boundaries found in transcript"
|
||||
|
||||
if not sentence_gaps:
|
||||
# Speech exists but no sentence break found - return warning
|
||||
# The caller may apply special handling (e.g., move first AD to video start)
|
||||
return gemini_pause, gemini_pause, f"No sentence break found within +/-{search_window}s of {gemini_pause:.2f}s"
|
||||
|
||||
# Sort by distance from gemini_pause (closest first)
|
||||
sentence_gaps.sort(key=lambda g: abs(g.start - gemini_pause))
|
||||
|
||||
best_gap = sentence_gaps[0]
|
||||
|
||||
# Small edge buffer to avoid cutting exactly at speech boundaries (prevents clicks/pops)
|
||||
# Use 50ms as minimum, but cap at 10% of gap duration for very short gaps
|
||||
edge_buffer = min(0.05, best_gap.duration * 0.1)
|
||||
|
||||
# "Full gap overlap" algorithm:
|
||||
# - pause_point: Play video until just BEFORE next sentence (gap.end - edge_buffer)
|
||||
# - resume_from: Resume just AFTER previous sentence (gap.start + edge_buffer)
|
||||
# This means the gap portion (minus 2x edge_buffer) gets played twice,
|
||||
# providing maximum natural buffer on both sides of the AD.
|
||||
pause_point = best_gap.end - edge_buffer
|
||||
resume_from = best_gap.start + edge_buffer
|
||||
# Find the boundary closest to the Gemini pause point
|
||||
closest_boundary = min(boundaries, key=lambda b: abs(b.time - gemini_pause))
|
||||
|
||||
logger.debug(
|
||||
f"Full-gap-overlap: gap={best_gap.start:.3f}s-{best_gap.end:.3f}s "
|
||||
f"(duration={best_gap.duration:.3f}s), edge_buffer={edge_buffer:.3f}s, "
|
||||
f"pause_point={pause_point:.3f}s, resume_from={resume_from:.3f}s, "
|
||||
f"overlap_duration={pause_point - resume_from:.3f}s"
|
||||
f"Nearest boundary to {gemini_pause:.2f}s: {closest_boundary.boundary_type} "
|
||||
f"at {closest_boundary.time:.2f}s (distance: {abs(closest_boundary.time - gemini_pause):.2f}s)"
|
||||
)
|
||||
|
||||
# Step 3: Apply appropriate buffering based on context
|
||||
edge_buffer = 0.5 # 500ms buffer for edge cases
|
||||
|
||||
# Case A: Beginning of sentence with no previous sentence (first sentence in video)
|
||||
if closest_boundary.boundary_type == "sentence_start" and not closest_boundary.has_previous_sentence:
|
||||
# Pause 500ms BEFORE the sentence starts (away from speech)
|
||||
pause_point = max(0.0, closest_boundary.time - edge_buffer)
|
||||
resume_from = pause_point # No overlap
|
||||
logger.info(
|
||||
f"Case A (first sentence): pause_point={pause_point:.2f}s "
|
||||
f"(500ms before sentence start at {closest_boundary.time:.2f}s)"
|
||||
)
|
||||
return pause_point, resume_from, None
|
||||
|
||||
# Case B: End of sentence with no next sentence (last sentence in video)
|
||||
if closest_boundary.boundary_type == "sentence_end" and not closest_boundary.has_next_sentence:
|
||||
# Pause 500ms AFTER the sentence ends (away from speech)
|
||||
pause_point = closest_boundary.time + edge_buffer
|
||||
resume_from = pause_point # No overlap
|
||||
logger.info(
|
||||
f"Case B (last sentence): pause_point={pause_point:.2f}s "
|
||||
f"(500ms after sentence end at {closest_boundary.time:.2f}s)"
|
||||
)
|
||||
return pause_point, resume_from, None
|
||||
|
||||
# Case C: Gap between two sentences (normal case with double buffer)
|
||||
if closest_boundary.gap:
|
||||
gap = closest_boundary.gap
|
||||
# Small edge buffer to avoid cutting exactly at speech boundaries
|
||||
small_buffer = min(0.05, gap.duration * 0.1)
|
||||
|
||||
# Full double buffer:
|
||||
# - pause_point: Just BEFORE next sentence (gap.end - small_buffer)
|
||||
# - resume_from: Just AFTER previous sentence (gap.start + small_buffer)
|
||||
pause_point = gap.end - small_buffer
|
||||
resume_from = gap.start + small_buffer
|
||||
|
||||
logger.info(
|
||||
f"Case C (between sentences): gap={gap.start:.2f}s-{gap.end:.2f}s, "
|
||||
f"pause_point={pause_point:.2f}s, resume_from={resume_from:.2f}s, "
|
||||
f"overlap={pause_point - resume_from:.2f}s"
|
||||
)
|
||||
return pause_point, resume_from, None
|
||||
|
||||
# Fallback: No gap associated with boundary - use the boundary time with small buffer
|
||||
# This shouldn't normally happen but handles edge cases
|
||||
if closest_boundary.boundary_type == "sentence_end":
|
||||
pause_point = closest_boundary.time + 0.05 # Small buffer after end
|
||||
else:
|
||||
pause_point = max(0.0, closest_boundary.time - 0.05) # Small buffer before start
|
||||
resume_from = pause_point
|
||||
|
||||
logger.info(
|
||||
f"Fallback: Using boundary at {closest_boundary.time:.2f}s, "
|
||||
f"pause_point={pause_point:.2f}s (no gap available)"
|
||||
)
|
||||
return pause_point, resume_from, None
|
||||
|
||||
def refine_all_pause_points(
|
||||
self,
|
||||
placements: list[dict],
|
||||
words: list[WordTimestamp],
|
||||
gaps: list[SpeechGap],
|
||||
consolidation_threshold: float = 5.0
|
||||
) -> tuple[list[dict], list[str]]:
|
||||
"""
|
||||
Refine all pause points in a Gemini analysis result.
|
||||
|
||||
Uses the "full gap overlap" algorithm where:
|
||||
- pause_point: Where to pause video (just before next sentence)
|
||||
- resume_from: Where to resume video (just after previous sentence)
|
||||
Two-phase algorithm:
|
||||
Phase 1: Refine each pause point individually using ordered logic:
|
||||
1. Check if "during speaking" (words within ±2s)
|
||||
2. If not during speaking → use Gemini's exact point
|
||||
3. If during speaking → snap to nearest boundary with appropriate buffering
|
||||
|
||||
This creates a small overlap where a tiny bit of video plays twice,
|
||||
but provides maximum natural buffer around the audio description.
|
||||
Phase 2: Consolidate cues that are within 5s of each other (after all refinements)
|
||||
|
||||
Args:
|
||||
placements: List of placement dicts from Gemini analysis
|
||||
words: Word timestamps from Whisper transcription
|
||||
gaps: Speech gaps from Whisper analysis
|
||||
consolidation_threshold: If consecutive cues have pause points within
|
||||
this many seconds, combine them to play back-to-back (default: 5.0s)
|
||||
|
|
@ -284,51 +478,31 @@ class WhisperService:
|
|||
refined_placements = []
|
||||
warnings = []
|
||||
|
||||
# Pre-compute sentence boundaries once for all placements
|
||||
boundaries = self._find_sentence_boundaries(words, gaps)
|
||||
logger.info(f"Found {len(boundaries)} sentence boundaries for pause point refinement")
|
||||
|
||||
# Phase 1: Refine each pause point individually
|
||||
for placement in placements:
|
||||
refined = placement.copy()
|
||||
|
||||
if placement.get("pause_point") is not None:
|
||||
original = placement["pause_point"]
|
||||
pause_point, resume_from, warning = self.snap_pause_point(original, gaps)
|
||||
pause_point, resume_from, warning = self.snap_pause_point(
|
||||
original, words, gaps, boundaries
|
||||
)
|
||||
|
||||
refined["pause_point"] = pause_point
|
||||
refined["resume_from"] = resume_from
|
||||
refined["original_pause_point"] = original # Preserve for debugging
|
||||
|
||||
if warning:
|
||||
# Special handling for first AD cue: if no sentence break found (but speech exists),
|
||||
# insert at the very beginning of the video to avoid mid-sentence insertion
|
||||
if placement["ad_cue_index"] == 0:
|
||||
refined["pause_point"] = 0.0
|
||||
refined["resume_from"] = 0.0
|
||||
warnings.append(
|
||||
f"Cue 0: No sentence break found within search window of {original:.2f}s, "
|
||||
"inserting AD at video start (0.0s)"
|
||||
)
|
||||
logger.info(
|
||||
f"First AD cue: No sentence break found near {original:.2f}s, "
|
||||
"using video start (0.0s) to avoid mid-sentence insertion"
|
||||
)
|
||||
else:
|
||||
warnings.append(f"Cue {placement['ad_cue_index']}: {warning}")
|
||||
logger.warning(f"Pause point refinement warning for cue {placement['ad_cue_index']}: {warning}")
|
||||
elif pause_point == resume_from:
|
||||
# No overlap - either no speech nearby, or some other reason
|
||||
# Log this as info since it's a valid scenario (non-dialogue section)
|
||||
logger.info(
|
||||
f"Cue {placement['ad_cue_index']}: Using exact pause point {pause_point:.2f}s "
|
||||
f"(no overlap - likely non-dialogue section)"
|
||||
)
|
||||
elif abs(pause_point - original) > 0.1:
|
||||
logger.info(
|
||||
f"Refined pause point for cue {placement['ad_cue_index']}: "
|
||||
f"{original:.2f}s -> pause_at={pause_point:.2f}s, resume_from={resume_from:.2f}s "
|
||||
f"(overlap={pause_point - resume_from:.2f}s)"
|
||||
)
|
||||
warnings.append(f"Cue {placement['ad_cue_index']}: {warning}")
|
||||
logger.warning(f"Pause point refinement warning for cue {placement['ad_cue_index']}: {warning}")
|
||||
|
||||
refined_placements.append(refined)
|
||||
|
||||
# Consolidate cues that are close together to avoid mid-sentence insertions
|
||||
# Phase 2: Consolidate cues that are close together (AFTER all individual refinements)
|
||||
refined_placements = self._consolidate_close_cues(
|
||||
refined_placements, consolidation_threshold, warnings
|
||||
)
|
||||
|
|
@ -365,7 +539,6 @@ class WhisperService:
|
|||
|
||||
# First pass: identify consolidated groups and assign same pause_point
|
||||
consolidated = [placements[0].copy()]
|
||||
group_resume_from = placements[0].get("resume_from") # Track the back buffer for the group
|
||||
|
||||
for i in range(1, len(placements)):
|
||||
current = placements[i].copy()
|
||||
|
|
@ -396,10 +569,6 @@ class WhisperService:
|
|||
f"Cue {current['ad_cue_index']}: Consolidated with previous cue "
|
||||
f"(pause points were {gap:.2f}s apart, playing back-to-back)"
|
||||
)
|
||||
else:
|
||||
# New group starts - update group_resume_from for the new group
|
||||
group_resume_from = current.get("resume_from")
|
||||
|
||||
consolidated.append(current)
|
||||
|
||||
# Second pass: fix resume_from values for consolidated groups
|
||||
|
|
|
|||
|
|
@ -417,9 +417,10 @@ async def _refine_pause_points_with_whisper(
|
|||
gaps = whisper_service.identify_speech_gaps(words)
|
||||
logger.info(f"Found {len(gaps)} speech gaps in video for job {job_id}")
|
||||
|
||||
# Refine pause points
|
||||
# Refine pause points (Phase 1: individual refinement, Phase 2: consolidation)
|
||||
refined_placements, warnings = whisper_service.refine_all_pause_points(
|
||||
analysis.get("placements", []),
|
||||
words,
|
||||
gaps
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue