refactor: simplify pause point algorithm with midpoint snapping and silence buffers
Replace the complex overlap/catch-up logic with a simpler approach:

- Snap pause points to the midpoint between sentences (not to sentence boundaries)
- Add 500ms of silence before AND after the AD audio during the freeze frame
- Resume playback from the same midpoint (no overlap, no visual jump-back)

This eliminates audio/visual anomalies caused by the previous algorithm's complexity around sentence-boundary snapping and audio catch-up.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
37f5e8d1b0
commit
37593dd4bc
3 changed files with 106 additions and 142 deletions
|
|
@ -285,15 +285,12 @@ class VideoRendererService:
|
|||
key=lambda p: p["pause_point"]
|
||||
)
|
||||
|
||||
# Debug logging for pause points (full-gap-overlap algorithm)
|
||||
logger.info(f"Pause-insert (full-gap-overlap): {len(sorted_placements)} placements with pause points")
|
||||
# Debug logging for pause points (midpoint algorithm with silence buffers)
|
||||
logger.info(f"Pause-insert (midpoint + 500ms silence buffers): {len(sorted_placements)} placements")
|
||||
for i, p in enumerate(sorted_placements):
|
||||
resume_from = p.get('resume_from', p.get('pause_point')) # Fallback for backwards compat
|
||||
overlap = p.get('pause_point', 0) - resume_from if resume_from else 0
|
||||
logger.info(
|
||||
f" Placement {i}: cue_index={p.get('ad_cue_index')}, "
|
||||
f"pause_at={p.get('pause_point'):.2f}s, resume_from={resume_from:.2f}s, "
|
||||
f"overlap={overlap:.2f}s, ad_duration={p.get('ad_duration'):.2f}s, "
|
||||
f"pause_at={p.get('pause_point'):.2f}s, ad_duration={p.get('ad_duration'):.2f}s, "
|
||||
f"consolidated={p.get('consolidated_with_previous', False)}"
|
||||
)
|
||||
|
||||
|
|
@ -320,7 +317,6 @@ class VideoRendererService:
|
|||
pause_point = placement["pause_point"]
|
||||
cue_index = placement["ad_cue_index"]
|
||||
ad_duration = placement["ad_duration"]
|
||||
resume_from = placement.get("resume_from", pause_point) # Fallback for backwards compat
|
||||
|
||||
# Validate pause_point is within video bounds
|
||||
if pause_point >= source_duration:
|
||||
|
|
@ -330,11 +326,6 @@ class VideoRendererService:
|
|||
)
|
||||
pause_point = max(0, source_duration - 0.1) # Clamp to 100ms before end
|
||||
|
||||
if resume_from >= source_duration:
|
||||
resume_from = pause_point
|
||||
|
||||
overlap = max(0, pause_point - resume_from)
|
||||
|
||||
# Get the AD audio for this cue
|
||||
ad_mp3_path = cue_to_mp3.get(cue_index)
|
||||
if not ad_mp3_path:
|
||||
|
|
@ -362,40 +353,30 @@ class VideoRendererService:
|
|||
)
|
||||
|
||||
# 3. Prepare audio for freeze segment
|
||||
# If there's overlap (Case C - between sentences), we need catch-up audio
|
||||
# The freeze frame stays visible while source audio "catches up" to pause_point
|
||||
# This avoids visual jump-back artifacts
|
||||
if overlap > 0.01: # Only if meaningful overlap (> 10ms)
|
||||
# Extract catch-up audio from source video [resume_from, pause_point]
|
||||
catchup_audio_path = temp_dir_path / f"catchup_{i}.m4a"
|
||||
await self._extract_audio_segment(
|
||||
source_video_path,
|
||||
resume_from,
|
||||
overlap,
|
||||
str(catchup_audio_path),
|
||||
video_props
|
||||
)
|
||||
# Add 500ms silence before AND after the AD audio for smooth transitions
|
||||
silence_duration = 0.5 # 500ms
|
||||
silence_path = temp_dir_path / f"silence_{i}.m4a"
|
||||
await self._generate_silence(
|
||||
silence_duration,
|
||||
str(silence_path),
|
||||
video_props
|
||||
)
|
||||
|
||||
# Concatenate AD audio + catch-up audio
|
||||
# Order: AD plays first, then source audio catches up
|
||||
combined_audio_path = temp_dir_path / f"combined_audio_{i}.m4a"
|
||||
await self._concatenate_audio(
|
||||
[ad_mp3_path, str(catchup_audio_path)],
|
||||
str(combined_audio_path),
|
||||
video_props
|
||||
)
|
||||
# Concatenate: 500ms silence + AD audio + 500ms silence
|
||||
combined_audio_path = temp_dir_path / f"combined_audio_{i}.m4a"
|
||||
await self._concatenate_audio(
|
||||
[str(silence_path), ad_mp3_path, str(silence_path)],
|
||||
str(combined_audio_path),
|
||||
video_props
|
||||
)
|
||||
|
||||
freeze_audio_path = str(combined_audio_path)
|
||||
total_freeze_duration = ad_duration + overlap
|
||||
freeze_audio_path = str(combined_audio_path)
|
||||
total_freeze_duration = ad_duration + (2 * silence_duration) # AD + 1.0s total silence
|
||||
|
||||
logger.info(
|
||||
f"Cue {cue_index}: Audio catch-up enabled - "
|
||||
f"AD={ad_duration:.2f}s + catchup={overlap:.2f}s = {total_freeze_duration:.2f}s"
|
||||
)
|
||||
else:
|
||||
# No overlap (Case A/B) - just use AD audio
|
||||
freeze_audio_path = ad_mp3_path
|
||||
total_freeze_duration = ad_duration
|
||||
logger.info(
|
||||
f"Cue {cue_index}: Freeze segment with silence buffers - "
|
||||
f"500ms + AD={ad_duration:.2f}s + 500ms = {total_freeze_duration:.2f}s"
|
||||
)
|
||||
|
||||
# 4. Create freeze segment with prepared audio
|
||||
freeze_segment_path = temp_dir_path / f"freeze_segment_{i}.mp4"
|
||||
|
|
@ -774,6 +755,43 @@ class VideoRendererService:
|
|||
]
|
||||
await self._run_ffmpeg(cmd)
|
||||
|
||||
async def _generate_silence(
    self,
    duration: float,
    output_path: str,
    props: dict[str, Any]
) -> None:
    """
    Generate a silent audio file of specified duration.

    Used to create 500ms silence buffers before/after AD audio.

    Args:
        duration: Duration of silence in seconds (must be > 0)
        output_path: Path to output audio file
        props: Video properties (for sample_rate, channels)

    Raises:
        ValueError: If duration is zero or negative
    """
    if duration <= 0:
        raise ValueError(f"Invalid silence duration: {duration}")

    logger.debug(
        f"Generating {duration:.2f}s silence: output={output_path}"
    )

    # Normalise to strings once: every argv entry must be a string, and the
    # layout check below would otherwise silently pick "mono" when the
    # channel count is supplied as the integer 2 instead of "2".
    sample_rate = str(props["sample_rate"])
    channels = str(props["channels"])

    # Only stereo is special-cased; any other channel count uses a mono
    # source and relies on "-ac" to produce the requested channel count.
    channel_layout = "stereo" if channels == "2" else "mono"

    cmd = [
        self.ffmpeg_path,
        "-y",
        "-f", "lavfi",
        "-i", f"anullsrc=r={sample_rate}:cl={channel_layout}",
        # "-t" follows "-i", so it limits the output duration
        # (anullsrc itself never ends).
        "-t", str(duration),
        "-c:a", "aac",
        "-ar", sample_rate,
        "-ac", channels,
        "-b:a", "192k",
        output_path
    ]
    await self._run_ffmpeg(cmd)
|
||||
|
||||
async def _concatenate_segments(
|
||||
self,
|
||||
segment_paths: list[str],
|
||||
|
|
|
|||
|
|
@ -18,41 +18,37 @@ class VTTRetimerService:
|
|||
"""
|
||||
Generate new VTT with adjusted timings for pause-insert accessible video.
|
||||
|
||||
Uses the "full gap overlap" algorithm where:
|
||||
- Video plays until pause_point (just before next sentence)
|
||||
- Freeze frame with AD audio plays for ad_duration
|
||||
- Video resumes from resume_from (just after previous sentence)
|
||||
Uses the simplified midpoint algorithm with silence buffers:
|
||||
- Video plays until pause_point (midpoint between sentences)
|
||||
- Freeze frame shows with: 500ms silence + AD audio + 500ms silence
|
||||
- Video resumes from the same pause_point
|
||||
|
||||
This means the effective time offset is: ad_duration + (pause_point - resume_from)
|
||||
because after the AD, we jump BACK in the source timeline.
|
||||
|
||||
Captions in the overlap zone [resume_from, pause_point] only show once
|
||||
(during the first playback, before the AD).
|
||||
The effective time offset is: ad_duration + 1.0 seconds (for the silence buffers)
|
||||
|
||||
Args:
|
||||
original_vtt: Original VTT content
|
||||
analysis: Gemini analysis with placements containing pause_point, resume_from, and ad_duration
|
||||
analysis: Gemini analysis with placements containing pause_point and ad_duration
|
||||
|
||||
Returns:
|
||||
Re-timed VTT content
|
||||
"""
|
||||
placements = analysis.get("placements", [])
|
||||
|
||||
# Silence buffer duration (500ms before + 500ms after AD)
|
||||
silence_buffer_total = 1.0
|
||||
|
||||
# Build list of (pause_point, effective_offset) sorted by time
|
||||
# effective_offset = ad_duration + overlap (where overlap = pause_point - resume_from)
|
||||
# effective_offset = ad_duration + 1.0s (for silence buffers)
|
||||
pauses = []
|
||||
for placement in placements:
|
||||
pause_point = placement.get("pause_point")
|
||||
resume_from = placement.get("resume_from", pause_point) # Fallback for backwards compat
|
||||
ad_duration = placement.get("ad_duration", 0)
|
||||
if pause_point is not None and ad_duration > 0:
|
||||
# Overlap is the video portion that plays twice (between resume_from and pause_point)
|
||||
overlap = max(0, pause_point - resume_from) if resume_from else 0
|
||||
effective_offset = ad_duration + overlap
|
||||
effective_offset = ad_duration + silence_buffer_total
|
||||
pauses.append((pause_point, effective_offset))
|
||||
logger.debug(
|
||||
f"Pause at {pause_point:.2f}s: ad_duration={ad_duration:.2f}s, "
|
||||
f"overlap={overlap:.2f}s, effective_offset={effective_offset:.2f}s"
|
||||
f"silence_buffers=1.0s, effective_offset={effective_offset:.2f}s"
|
||||
)
|
||||
|
||||
pauses.sort(key=lambda x: x[0])
|
||||
|
|
@ -61,7 +57,7 @@ class VTTRetimerService:
|
|||
logger.info("No pauses to apply, returning original VTT")
|
||||
return original_vtt
|
||||
|
||||
logger.info(f"Re-timing VTT with {len(pauses)} pause insertions (full-gap-overlap algorithm)")
|
||||
logger.info(f"Re-timing VTT with {len(pauses)} pause insertions (midpoint + silence buffers)")
|
||||
|
||||
# Parse and retime cues
|
||||
cues = self._parse_vtt(original_vtt)
|
||||
|
|
@ -69,8 +65,6 @@ class VTTRetimerService:
|
|||
|
||||
for cue in cues:
|
||||
# Calculate cumulative offset from all pauses that occur before this cue's start
|
||||
# Captions in overlap zone [resume_from, pause_point] only shift on pause_point,
|
||||
# so they show during the first playback (before AD), not the second
|
||||
cumulative_offset = sum(
|
||||
effective_offset for pause_point, effective_offset in pauses
|
||||
if pause_point <= cue["start_time"]
|
||||
|
|
|
|||
|
|
@ -345,16 +345,18 @@ class WhisperService:
|
|||
speaking_threshold: float = 2.0
|
||||
) -> tuple[float, float, str | None]:
|
||||
"""
|
||||
Snap a Gemini pause point to the nearest sentence boundary using ordered logic.
|
||||
Snap a Gemini pause point to the nearest sentence boundary.
|
||||
|
||||
Algorithm (in order):
|
||||
Simplified algorithm:
|
||||
1. Check if "during speaking" (words within ±threshold)
|
||||
- If NO → Use Gemini's exact pause point, no overlap
|
||||
2. If during speaking, find nearest sentence boundary
|
||||
3. Apply appropriate buffering based on context:
|
||||
- Case A: Beginning of sentence, no previous → pause 500ms before sentence starts
|
||||
- Case B: End of sentence, no next → pause 500ms after sentence ends
|
||||
- Case C: Gap between sentences → full double buffer
|
||||
- If NO → Use Gemini's exact pause point
|
||||
2. If during speaking, find nearest sentence gap and snap to MIDPOINT
|
||||
3. Edge cases:
|
||||
- Case A: First sentence in video → pause at video start (0.0)
|
||||
- Case B: Last sentence in video → pause at video end
|
||||
|
||||
The video renderer adds 500ms silence buffers before/after AD audio,
|
||||
so no overlap or catch-up logic is needed here.
|
||||
|
||||
Args:
|
||||
gemini_pause: Original pause point from Gemini (seconds)
|
||||
|
|
@ -365,6 +367,7 @@ class WhisperService:
|
|||
|
||||
Returns:
|
||||
Tuple of (pause_point, resume_from, warning_message_or_none)
|
||||
Note: resume_from always equals pause_point with the simplified algorithm
|
||||
"""
|
||||
# Step 1: Check if "during speaking" (words within ±threshold)
|
||||
if not self._is_during_speaking(gemini_pause, words, speaking_threshold):
|
||||
|
|
@ -389,63 +392,44 @@ class WhisperService:
|
|||
f"at {closest_boundary.time:.2f}s (distance: {abs(closest_boundary.time - gemini_pause):.2f}s)"
|
||||
)
|
||||
|
||||
# Step 3: Apply appropriate buffering based on context
|
||||
edge_buffer = 0.5 # 500ms buffer for edge cases
|
||||
|
||||
# Case A: Beginning of sentence with no previous sentence (first sentence in video)
|
||||
# Case A: First sentence in video (no previous sentence) → snap to video start
|
||||
if closest_boundary.boundary_type == "sentence_start" and not closest_boundary.has_previous_sentence:
|
||||
# Pause 500ms BEFORE the sentence starts (away from speech)
|
||||
pause_point = max(0.0, closest_boundary.time - edge_buffer)
|
||||
resume_from = pause_point # No overlap
|
||||
pause_point = 0.0
|
||||
logger.info(
|
||||
f"Case A (first sentence): pause_point={pause_point:.2f}s "
|
||||
f"(500ms before sentence start at {closest_boundary.time:.2f}s)"
|
||||
f"(snapped to video start)"
|
||||
)
|
||||
return pause_point, resume_from, None
|
||||
return pause_point, pause_point, None
|
||||
|
||||
# Case B: End of sentence with no next sentence (last sentence in video)
|
||||
# Case B: Last sentence in video (no next sentence) → snap to boundary time
|
||||
if closest_boundary.boundary_type == "sentence_end" and not closest_boundary.has_next_sentence:
|
||||
# Pause 500ms AFTER the sentence ends (away from speech)
|
||||
pause_point = closest_boundary.time + edge_buffer
|
||||
resume_from = pause_point # No overlap
|
||||
pause_point = closest_boundary.time
|
||||
logger.info(
|
||||
f"Case B (last sentence): pause_point={pause_point:.2f}s "
|
||||
f"(500ms after sentence end at {closest_boundary.time:.2f}s)"
|
||||
f"(snapped to video end at sentence boundary)"
|
||||
)
|
||||
return pause_point, resume_from, None
|
||||
return pause_point, pause_point, None
|
||||
|
||||
# Case C: Gap between two sentences (normal case with double buffer)
|
||||
# Case C: Gap between two sentences → snap to MIDPOINT of the gap
|
||||
if closest_boundary.gap:
|
||||
gap = closest_boundary.gap
|
||||
# Small edge buffer to avoid cutting exactly at speech boundaries
|
||||
small_buffer = min(0.05, gap.duration * 0.1)
|
||||
|
||||
# Full double buffer:
|
||||
# - pause_point: Just BEFORE next sentence (gap.end - small_buffer)
|
||||
# - resume_from: Just AFTER previous sentence (gap.start + small_buffer)
|
||||
pause_point = gap.end - small_buffer
|
||||
resume_from = gap.start + small_buffer
|
||||
# Calculate midpoint between end of previous sentence and start of next
|
||||
midpoint = (gap.start + gap.end) / 2.0
|
||||
|
||||
logger.info(
|
||||
f"Case C (between sentences): gap={gap.start:.2f}s-{gap.end:.2f}s, "
|
||||
f"pause_point={pause_point:.2f}s, resume_from={resume_from:.2f}s, "
|
||||
f"overlap={pause_point - resume_from:.2f}s"
|
||||
f"midpoint={midpoint:.2f}s (resume from same point)"
|
||||
)
|
||||
return pause_point, resume_from, None
|
||||
return midpoint, midpoint, None
|
||||
|
||||
# Fallback: No gap associated with boundary - use the boundary time with small buffer
|
||||
# Fallback: No gap associated with boundary - use the boundary time directly
|
||||
# This shouldn't normally happen but handles edge cases
|
||||
if closest_boundary.boundary_type == "sentence_end":
|
||||
pause_point = closest_boundary.time + 0.05 # Small buffer after end
|
||||
else:
|
||||
pause_point = max(0.0, closest_boundary.time - 0.05) # Small buffer before start
|
||||
resume_from = pause_point
|
||||
|
||||
pause_point = closest_boundary.time
|
||||
logger.info(
|
||||
f"Fallback: Using boundary at {closest_boundary.time:.2f}s, "
|
||||
f"pause_point={pause_point:.2f}s (no gap available)"
|
||||
)
|
||||
return pause_point, resume_from, None
|
||||
return pause_point, pause_point, None
|
||||
|
||||
def refine_all_pause_points(
|
||||
self,
|
||||
|
|
@ -518,13 +502,9 @@ class WhisperService:
|
|||
"""
|
||||
Consolidate AD cues whose pause points are within threshold seconds of each other.
|
||||
|
||||
Consolidated cues are treated as a single AD segment:
|
||||
- All cues in a group share the same pause_point (front buffer applied once)
|
||||
- Only the LAST cue in the group keeps resume_from (back buffer applied once)
|
||||
- Middle cues have resume_from = pause_point (no video between ADs)
|
||||
|
||||
This ensures consolidated ADs play seamlessly back-to-back without
|
||||
repeating the overlap video segment between each one.
|
||||
Consolidated cues share the same pause_point and play back-to-back during
|
||||
the freeze frame. With the simplified midpoint algorithm, resume_from always
|
||||
equals pause_point, so no complex buffer logic is needed.
|
||||
|
||||
Args:
|
||||
placements: List of refined placement dicts
|
||||
|
|
@ -532,12 +512,11 @@ class WhisperService:
|
|||
warnings: List to append warning messages to
|
||||
|
||||
Returns:
|
||||
Updated placements with consolidated pause points and resume_from values
|
||||
Updated placements with consolidated pause points
|
||||
"""
|
||||
if len(placements) < 2:
|
||||
return placements
|
||||
|
||||
# First pass: identify consolidated groups and assign same pause_point
|
||||
consolidated = [placements[0].copy()]
|
||||
|
||||
for i in range(1, len(placements)):
|
||||
|
|
@ -553,12 +532,10 @@ class WhisperService:
|
|||
if 0 < gap <= threshold:
|
||||
# Consolidate: set current cue to use same pause point as previous
|
||||
original_pause = current_pause
|
||||
original_resume = current.get("resume_from")
|
||||
current["pause_point"] = previous_pause
|
||||
current["resume_from"] = previous_pause # Always same as pause_point
|
||||
current["consolidated_with_previous"] = True
|
||||
current["original_pause_point_before_consolidation"] = original_pause
|
||||
current["original_resume_from_before_consolidation"] = original_resume
|
||||
# Keep tracking the group's resume_from (we'll assign it to the last cue later)
|
||||
|
||||
logger.info(
|
||||
f"Consolidated cue {current['ad_cue_index']} with previous cue: "
|
||||
|
|
@ -571,31 +548,6 @@ class WhisperService:
|
|||
)
|
||||
consolidated.append(current)
|
||||
|
||||
# Second pass: fix resume_from values for consolidated groups
|
||||
# Only the LAST cue in each group should have the back buffer (resume_from < pause_point)
|
||||
# All other cues should have resume_from = pause_point (no video between ADs)
|
||||
for i in range(len(consolidated)):
|
||||
current = consolidated[i]
|
||||
current_pause = current.get("pause_point")
|
||||
|
||||
if current_pause is None:
|
||||
continue
|
||||
|
||||
# Check if next cue has the same pause_point (meaning current is NOT last in group)
|
||||
if i < len(consolidated) - 1:
|
||||
next_pause = consolidated[i + 1].get("pause_point")
|
||||
if next_pause == current_pause:
|
||||
# Current is NOT the last in the group - remove back buffer
|
||||
# Set resume_from = pause_point so no video plays between this AD and the next
|
||||
original_resume = current.get("resume_from")
|
||||
if original_resume != current_pause:
|
||||
current["resume_from"] = current_pause
|
||||
current["resume_from_removed_for_consolidation"] = original_resume
|
||||
logger.debug(
|
||||
f"Cue {current.get('ad_cue_index')}: Removed back buffer for seamless "
|
||||
f"consolidated playback (resume_from {original_resume:.2f}s -> {current_pause:.2f}s)"
|
||||
)
|
||||
|
||||
# Log the final consolidated groups
|
||||
self._log_consolidated_groups(consolidated)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue