refactor: simplify pause point algorithm with midpoint snapping and silence buffers

Replace complex overlap/catch-up logic with simpler approach:
- Snap pause points to midpoint between sentences (not sentence boundaries)
- Add 500ms silence before AND after AD audio during freeze frame
- Resume playback from same midpoint (no overlap, no visual jump-back)

This eliminates audio/visual anomalies caused by the previous algorithm's
complexity around sentence boundary snapping and audio catch-up.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
michael 2025-12-29 09:55:40 -06:00
parent 37f5e8d1b0
commit 37593dd4bc
3 changed files with 106 additions and 142 deletions

View file

@ -285,15 +285,12 @@ class VideoRendererService:
key=lambda p: p["pause_point"]
)
# Debug logging for pause points (full-gap-overlap algorithm)
logger.info(f"Pause-insert (full-gap-overlap): {len(sorted_placements)} placements with pause points")
# Debug logging for pause points (midpoint algorithm with silence buffers)
logger.info(f"Pause-insert (midpoint + 500ms silence buffers): {len(sorted_placements)} placements")
for i, p in enumerate(sorted_placements):
resume_from = p.get('resume_from', p.get('pause_point')) # Fallback for backwards compat
overlap = p.get('pause_point', 0) - resume_from if resume_from else 0
logger.info(
f" Placement {i}: cue_index={p.get('ad_cue_index')}, "
f"pause_at={p.get('pause_point'):.2f}s, resume_from={resume_from:.2f}s, "
f"overlap={overlap:.2f}s, ad_duration={p.get('ad_duration'):.2f}s, "
f"pause_at={p.get('pause_point'):.2f}s, ad_duration={p.get('ad_duration'):.2f}s, "
f"consolidated={p.get('consolidated_with_previous', False)}"
)
@ -320,7 +317,6 @@ class VideoRendererService:
pause_point = placement["pause_point"]
cue_index = placement["ad_cue_index"]
ad_duration = placement["ad_duration"]
resume_from = placement.get("resume_from", pause_point) # Fallback for backwards compat
# Validate pause_point is within video bounds
if pause_point >= source_duration:
@ -330,11 +326,6 @@ class VideoRendererService:
)
pause_point = max(0, source_duration - 0.1) # Clamp to 100ms before end
if resume_from >= source_duration:
resume_from = pause_point
overlap = max(0, pause_point - resume_from)
# Get the AD audio for this cue
ad_mp3_path = cue_to_mp3.get(cue_index)
if not ad_mp3_path:
@ -362,40 +353,30 @@ class VideoRendererService:
)
# 3. Prepare audio for freeze segment
# If there's overlap (Case C - between sentences), we need catch-up audio
# The freeze frame stays visible while source audio "catches up" to pause_point
# This avoids visual jump-back artifacts
if overlap > 0.01: # Only if meaningful overlap (> 10ms)
# Extract catch-up audio from source video [resume_from, pause_point]
catchup_audio_path = temp_dir_path / f"catchup_{i}.m4a"
await self._extract_audio_segment(
source_video_path,
resume_from,
overlap,
str(catchup_audio_path),
video_props
)
# Add 500ms silence before AND after the AD audio for smooth transitions
silence_duration = 0.5 # 500ms
silence_path = temp_dir_path / f"silence_{i}.m4a"
await self._generate_silence(
silence_duration,
str(silence_path),
video_props
)
# Concatenate AD audio + catch-up audio
# Order: AD plays first, then source audio catches up
combined_audio_path = temp_dir_path / f"combined_audio_{i}.m4a"
await self._concatenate_audio(
[ad_mp3_path, str(catchup_audio_path)],
str(combined_audio_path),
video_props
)
# Concatenate: 500ms silence + AD audio + 500ms silence
combined_audio_path = temp_dir_path / f"combined_audio_{i}.m4a"
await self._concatenate_audio(
[str(silence_path), ad_mp3_path, str(silence_path)],
str(combined_audio_path),
video_props
)
freeze_audio_path = str(combined_audio_path)
total_freeze_duration = ad_duration + overlap
freeze_audio_path = str(combined_audio_path)
total_freeze_duration = ad_duration + (2 * silence_duration) # AD + 1.0s total silence
logger.info(
f"Cue {cue_index}: Audio catch-up enabled - "
f"AD={ad_duration:.2f}s + catchup={overlap:.2f}s = {total_freeze_duration:.2f}s"
)
else:
# No overlap (Case A/B) - just use AD audio
freeze_audio_path = ad_mp3_path
total_freeze_duration = ad_duration
logger.info(
f"Cue {cue_index}: Freeze segment with silence buffers - "
f"500ms + AD={ad_duration:.2f}s + 500ms = {total_freeze_duration:.2f}s"
)
# 4. Create freeze segment with prepared audio
freeze_segment_path = temp_dir_path / f"freeze_segment_{i}.mp4"
@ -774,6 +755,43 @@ class VideoRendererService:
]
await self._run_ffmpeg(cmd)
async def _generate_silence(
self,
duration: float,
output_path: str,
props: dict[str, Any]
):
"""
Generate a silent audio file of specified duration.
Used to create 500ms silence buffers before/after AD audio.
Args:
duration: Duration of silence in seconds
output_path: Path to output audio file
props: Video properties (for sample_rate, channels)
"""
if duration <= 0:
raise ValueError(f"Invalid silence duration: {duration}")
logger.debug(
f"Generating {duration:.2f}s silence: output={output_path}"
)
cmd = [
self.ffmpeg_path,
"-y",
"-f", "lavfi",
"-i", f"anullsrc=r={props['sample_rate']}:cl={'stereo' if props['channels'] == '2' else 'mono'}",
"-t", str(duration),
"-c:a", "aac",
"-ar", props["sample_rate"],
"-ac", props["channels"],
"-b:a", "192k",
output_path
]
await self._run_ffmpeg(cmd)
async def _concatenate_segments(
self,
segment_paths: list[str],

View file

@ -18,41 +18,37 @@ class VTTRetimerService:
"""
Generate new VTT with adjusted timings for pause-insert accessible video.
Uses the "full gap overlap" algorithm where:
- Video plays until pause_point (just before next sentence)
- Freeze frame with AD audio plays for ad_duration
- Video resumes from resume_from (just after previous sentence)
Uses the simplified midpoint algorithm with silence buffers:
- Video plays until pause_point (midpoint between sentences)
- Freeze frame shows with: 500ms silence + AD audio + 500ms silence
- Video resumes from the same pause_point
This means the effective time offset is: ad_duration + (pause_point - resume_from)
because after the AD, we jump BACK in the source timeline.
Captions in the overlap zone [resume_from, pause_point] only show once
(during the first playback, before the AD).
The effective time offset is: ad_duration + 1.0 seconds (for the silence buffers)
Args:
original_vtt: Original VTT content
analysis: Gemini analysis with placements containing pause_point, resume_from, and ad_duration
analysis: Gemini analysis with placements containing pause_point and ad_duration
Returns:
Re-timed VTT content
"""
placements = analysis.get("placements", [])
# Silence buffer duration (500ms before + 500ms after AD)
silence_buffer_total = 1.0
# Build list of (pause_point, effective_offset) sorted by time
# effective_offset = ad_duration + overlap (where overlap = pause_point - resume_from)
# effective_offset = ad_duration + 1.0s (for silence buffers)
pauses = []
for placement in placements:
pause_point = placement.get("pause_point")
resume_from = placement.get("resume_from", pause_point) # Fallback for backwards compat
ad_duration = placement.get("ad_duration", 0)
if pause_point is not None and ad_duration > 0:
# Overlap is the video portion that plays twice (between resume_from and pause_point)
overlap = max(0, pause_point - resume_from) if resume_from else 0
effective_offset = ad_duration + overlap
effective_offset = ad_duration + silence_buffer_total
pauses.append((pause_point, effective_offset))
logger.debug(
f"Pause at {pause_point:.2f}s: ad_duration={ad_duration:.2f}s, "
f"overlap={overlap:.2f}s, effective_offset={effective_offset:.2f}s"
f"silence_buffers=1.0s, effective_offset={effective_offset:.2f}s"
)
pauses.sort(key=lambda x: x[0])
@ -61,7 +57,7 @@ class VTTRetimerService:
logger.info("No pauses to apply, returning original VTT")
return original_vtt
logger.info(f"Re-timing VTT with {len(pauses)} pause insertions (full-gap-overlap algorithm)")
logger.info(f"Re-timing VTT with {len(pauses)} pause insertions (midpoint + silence buffers)")
# Parse and retime cues
cues = self._parse_vtt(original_vtt)
@ -69,8 +65,6 @@ class VTTRetimerService:
for cue in cues:
# Calculate cumulative offset from all pauses that occur before this cue's start
# Captions in overlap zone [resume_from, pause_point] only shift on pause_point,
# so they show during the first playback (before AD), not the second
cumulative_offset = sum(
effective_offset for pause_point, effective_offset in pauses
if pause_point <= cue["start_time"]

View file

@ -345,16 +345,18 @@ class WhisperService:
speaking_threshold: float = 2.0
) -> tuple[float, float, str | None]:
"""
Snap a Gemini pause point to the nearest sentence boundary using ordered logic.
Snap a Gemini pause point to the nearest sentence boundary.
Algorithm (in order):
Simplified algorithm:
1. Check if "during speaking" (words within ±threshold)
- If NO Use Gemini's exact pause point, no overlap
2. If during speaking, find nearest sentence boundary
3. Apply appropriate buffering based on context:
- Case A: Beginning of sentence, no previous pause 500ms before sentence starts
- Case B: End of sentence, no next pause 500ms after sentence ends
- Case C: Gap between sentences full double buffer
- If NO → Use Gemini's exact pause point
2. If during speaking, find nearest sentence gap and snap to MIDPOINT
3. Edge cases:
- Case A: First sentence in video → pause at video start (0.0)
- Case B: Last sentence in video → pause at that sentence's end boundary
The video renderer adds 500ms silence buffers before/after AD audio,
so no overlap or catch-up logic is needed here.
Args:
gemini_pause: Original pause point from Gemini (seconds)
@ -365,6 +367,7 @@ class WhisperService:
Returns:
Tuple of (pause_point, resume_from, warning_message_or_none)
Note: resume_from always equals pause_point with the simplified algorithm
"""
# Step 1: Check if "during speaking" (words within ±threshold)
if not self._is_during_speaking(gemini_pause, words, speaking_threshold):
@ -389,63 +392,44 @@ class WhisperService:
f"at {closest_boundary.time:.2f}s (distance: {abs(closest_boundary.time - gemini_pause):.2f}s)"
)
# Step 3: Apply appropriate buffering based on context
edge_buffer = 0.5 # 500ms buffer for edge cases
# Case A: Beginning of sentence with no previous sentence (first sentence in video)
# Case A: First sentence in video (no previous sentence) → snap to video start
if closest_boundary.boundary_type == "sentence_start" and not closest_boundary.has_previous_sentence:
# Pause 500ms BEFORE the sentence starts (away from speech)
pause_point = max(0.0, closest_boundary.time - edge_buffer)
resume_from = pause_point # No overlap
pause_point = 0.0
logger.info(
f"Case A (first sentence): pause_point={pause_point:.2f}s "
f"(500ms before sentence start at {closest_boundary.time:.2f}s)"
f"(snapped to video start)"
)
return pause_point, resume_from, None
return pause_point, pause_point, None
# Case B: End of sentence with no next sentence (last sentence in video)
# Case B: Last sentence in video (no next sentence) → snap to boundary time
if closest_boundary.boundary_type == "sentence_end" and not closest_boundary.has_next_sentence:
# Pause 500ms AFTER the sentence ends (away from speech)
pause_point = closest_boundary.time + edge_buffer
resume_from = pause_point # No overlap
pause_point = closest_boundary.time
logger.info(
f"Case B (last sentence): pause_point={pause_point:.2f}s "
f"(500ms after sentence end at {closest_boundary.time:.2f}s)"
f"(snapped to video end at sentence boundary)"
)
return pause_point, resume_from, None
return pause_point, pause_point, None
# Case C: Gap between two sentences (normal case with double buffer)
# Case C: Gap between two sentences → snap to MIDPOINT of the gap
if closest_boundary.gap:
gap = closest_boundary.gap
# Small edge buffer to avoid cutting exactly at speech boundaries
small_buffer = min(0.05, gap.duration * 0.1)
# Full double buffer:
# - pause_point: Just BEFORE next sentence (gap.end - small_buffer)
# - resume_from: Just AFTER previous sentence (gap.start + small_buffer)
pause_point = gap.end - small_buffer
resume_from = gap.start + small_buffer
# Calculate midpoint between end of previous sentence and start of next
midpoint = (gap.start + gap.end) / 2.0
logger.info(
f"Case C (between sentences): gap={gap.start:.2f}s-{gap.end:.2f}s, "
f"pause_point={pause_point:.2f}s, resume_from={resume_from:.2f}s, "
f"overlap={pause_point - resume_from:.2f}s"
f"midpoint={midpoint:.2f}s (resume from same point)"
)
return pause_point, resume_from, None
return midpoint, midpoint, None
# Fallback: No gap associated with boundary - use the boundary time with small buffer
# Fallback: No gap associated with boundary - use the boundary time directly
# This shouldn't normally happen but handles edge cases
if closest_boundary.boundary_type == "sentence_end":
pause_point = closest_boundary.time + 0.05 # Small buffer after end
else:
pause_point = max(0.0, closest_boundary.time - 0.05) # Small buffer before start
resume_from = pause_point
pause_point = closest_boundary.time
logger.info(
f"Fallback: Using boundary at {closest_boundary.time:.2f}s, "
f"pause_point={pause_point:.2f}s (no gap available)"
)
return pause_point, resume_from, None
return pause_point, pause_point, None
def refine_all_pause_points(
self,
@ -518,13 +502,9 @@ class WhisperService:
"""
Consolidate AD cues whose pause points are within threshold seconds of each other.
Consolidated cues are treated as a single AD segment:
- All cues in a group share the same pause_point (front buffer applied once)
- Only the LAST cue in the group keeps resume_from (back buffer applied once)
- Middle cues have resume_from = pause_point (no video between ADs)
This ensures consolidated ADs play seamlessly back-to-back without
repeating the overlap video segment between each one.
Consolidated cues share the same pause_point and play back-to-back during
the freeze frame. With the simplified midpoint algorithm, resume_from always
equals pause_point, so no complex buffer logic is needed.
Args:
placements: List of refined placement dicts
@ -532,12 +512,11 @@ class WhisperService:
warnings: List to append warning messages to
Returns:
Updated placements with consolidated pause points and resume_from values
Updated placements with consolidated pause points
"""
if len(placements) < 2:
return placements
# First pass: identify consolidated groups and assign same pause_point
consolidated = [placements[0].copy()]
for i in range(1, len(placements)):
@ -553,12 +532,10 @@ class WhisperService:
if 0 < gap <= threshold:
# Consolidate: set current cue to use same pause point as previous
original_pause = current_pause
original_resume = current.get("resume_from")
current["pause_point"] = previous_pause
current["resume_from"] = previous_pause # Always same as pause_point
current["consolidated_with_previous"] = True
current["original_pause_point_before_consolidation"] = original_pause
current["original_resume_from_before_consolidation"] = original_resume
# Keep tracking the group's resume_from (we'll assign it to the last cue later)
logger.info(
f"Consolidated cue {current['ad_cue_index']} with previous cue: "
@ -571,31 +548,6 @@ class WhisperService:
)
consolidated.append(current)
# Second pass: fix resume_from values for consolidated groups
# Only the LAST cue in each group should have the back buffer (resume_from < pause_point)
# All other cues should have resume_from = pause_point (no video between ADs)
for i in range(len(consolidated)):
current = consolidated[i]
current_pause = current.get("pause_point")
if current_pause is None:
continue
# Check if next cue has the same pause_point (meaning current is NOT last in group)
if i < len(consolidated) - 1:
next_pause = consolidated[i + 1].get("pause_point")
if next_pause == current_pause:
# Current is NOT the last in the group - remove back buffer
# Set resume_from = pause_point so no video plays between this AD and the next
original_resume = current.get("resume_from")
if original_resume != current_pause:
current["resume_from"] = current_pause
current["resume_from_removed_for_consolidation"] = original_resume
logger.debug(
f"Cue {current.get('ad_cue_index')}: Removed back buffer for seamless "
f"consolidated playback (resume_from {original_resume:.2f}s -> {current_pause:.2f}s)"
)
# Log the final consolidated groups
self._log_consolidated_groups(consolidated)