refactor: simplify pause point algorithm with midpoint snapping and silence buffers

Replace complex overlap/catch-up logic with simpler approach:
- Snap pause points to midpoint between sentences (not sentence boundaries)
- Add 500ms silence before AND after AD audio during freeze frame
- Resume playback from same midpoint (no overlap, no visual jump-back)

This eliminates audio/visual anomalies caused by the previous algorithm's complexity around sentence boundary snapping and audio catch-up.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 09:55:40 -06:00 · 2025-12-29 09:55:40 -06:00 · 37593dd4bc
commit 37593dd4bc
parent 37f5e8d1b0
3 changed files with 106 additions and 142 deletions
--- a/backend/app/services/video_renderer.py
+++ b/backend/app/services/video_renderer.py
@ -285,15 +285,12 @@ class VideoRendererService:
            key=lambda p: p["pause_point"]
        )

-        # Debug logging for pause points (full-gap-overlap algorithm)
-        logger.info(f"Pause-insert (full-gap-overlap): {len(sorted_placements)} placements with pause points")
+        # Debug logging for pause points (midpoint algorithm with silence buffers)
+        logger.info(f"Pause-insert (midpoint + 500ms silence buffers): {len(sorted_placements)} placements")
        for i, p in enumerate(sorted_placements):
-            resume_from = p.get('resume_from', p.get('pause_point'))  # Fallback for backwards compat
-            overlap = p.get('pause_point', 0) - resume_from if resume_from else 0
            logger.info(
                f"  Placement {i}: cue_index={p.get('ad_cue_index')}, "
-                f"pause_at={p.get('pause_point'):.2f}s, resume_from={resume_from:.2f}s, "
-                f"overlap={overlap:.2f}s, ad_duration={p.get('ad_duration'):.2f}s, "
+                f"pause_at={p.get('pause_point'):.2f}s, ad_duration={p.get('ad_duration'):.2f}s, "
                f"consolidated={p.get('consolidated_with_previous', False)}"
            )

@ -320,7 +317,6 @@ class VideoRendererService:
                pause_point = placement["pause_point"]
                cue_index = placement["ad_cue_index"]
                ad_duration = placement["ad_duration"]
-                resume_from = placement.get("resume_from", pause_point)  # Fallback for backwards compat

                # Validate pause_point is within video bounds
                if pause_point >= source_duration:
@ -330,11 +326,6 @@ class VideoRendererService:
                    )
                    pause_point = max(0, source_duration - 0.1)  # Clamp to 100ms before end

-                if resume_from >= source_duration:
-                    resume_from = pause_point
-
-                overlap = max(0, pause_point - resume_from)
-
                # Get the AD audio for this cue
                ad_mp3_path = cue_to_mp3.get(cue_index)
                if not ad_mp3_path:
@ -362,40 +353,30 @@ class VideoRendererService:
                )

                # 3. Prepare audio for freeze segment
-                # If there's overlap (Case C - between sentences), we need catch-up audio
-                # The freeze frame stays visible while source audio "catches up" to pause_point
-                # This avoids visual jump-back artifacts
-                if overlap > 0.01:  # Only if meaningful overlap (> 10ms)
-                    # Extract catch-up audio from source video [resume_from, pause_point]
-                    catchup_audio_path = temp_dir_path / f"catchup_{i}.m4a"
-                    await self._extract_audio_segment(
-                        source_video_path,
-                        resume_from,
-                        overlap,
-                        str(catchup_audio_path),
-                        video_props
-                    )
+                # Add 500ms silence before AND after the AD audio for smooth transitions
+                silence_duration = 0.5  # 500ms
+                silence_path = temp_dir_path / f"silence_{i}.m4a"
+                await self._generate_silence(
+                    silence_duration,
+                    str(silence_path),
+                    video_props
+                )

-                    # Concatenate AD audio + catch-up audio
-                    # Order: AD plays first, then source audio catches up
-                    combined_audio_path = temp_dir_path / f"combined_audio_{i}.m4a"
-                    await self._concatenate_audio(
-                        [ad_mp3_path, str(catchup_audio_path)],
-                        str(combined_audio_path),
-                        video_props
-                    )
+                # Concatenate: 500ms silence + AD audio + 500ms silence
+                combined_audio_path = temp_dir_path / f"combined_audio_{i}.m4a"
+                await self._concatenate_audio(
+                    [str(silence_path), ad_mp3_path, str(silence_path)],
+                    str(combined_audio_path),
+                    video_props
+                )

-                    freeze_audio_path = str(combined_audio_path)
-                    total_freeze_duration = ad_duration + overlap
+                freeze_audio_path = str(combined_audio_path)
+                total_freeze_duration = ad_duration + (2 * silence_duration)  # AD + 1.0s total silence

-                    logger.info(
-                        f"Cue {cue_index}: Audio catch-up enabled - "
-                        f"AD={ad_duration:.2f}s + catchup={overlap:.2f}s = {total_freeze_duration:.2f}s"
-                    )
-                else:
-                    # No overlap (Case A/B) - just use AD audio
-                    freeze_audio_path = ad_mp3_path
-                    total_freeze_duration = ad_duration
+                logger.info(
+                    f"Cue {cue_index}: Freeze segment with silence buffers - "
+                    f"500ms + AD={ad_duration:.2f}s + 500ms = {total_freeze_duration:.2f}s"
+                )

                # 4. Create freeze segment with prepared audio
                freeze_segment_path = temp_dir_path / f"freeze_segment_{i}.mp4"
@ -774,6 +755,43 @@ class VideoRendererService:
        ]
        await self._run_ffmpeg(cmd)

+    async def _generate_silence(
+        self,
+        duration: float,
+        output_path: str,
+        props: dict[str, Any]
+    ):
+        """
+        Generate a silent AAC audio file of the given duration via ffmpeg.
+
+        Used to create 500ms silence buffers before/after AD audio.
+
+        Args:
+            duration: Silence length in seconds; must be > 0 (else ValueError)
+            output_path: Path to output audio file (ffmpeg is run with -y)
+            props: Needs 'sample_rate' and 'channels'; '2' -> stereo, else mono
+        """
+        if duration <= 0:
+            raise ValueError(f"Invalid silence duration: {duration}")
+
+        logger.debug(
+            f"Generating {duration:.2f}s silence: output={output_path}"
+        )
+
+        cmd = [
+            self.ffmpeg_path,
+            "-y",
+            "-f", "lavfi",  # read from a libavfilter source instead of a file
+            "-i", f"anullsrc=r={props['sample_rate']}:cl={'stereo' if props['channels'] == '2' else 'mono'}",
+            "-t", str(duration),  # bound the output length; the null source has no natural end
+            "-c:a", "aac",
+            "-ar", props["sample_rate"],
+            "-ac", props["channels"],  # NOTE(review): compared to the string '2' above - presumably props values are str; confirm
+            "-b:a", "192k",
+            output_path
+        ]
+        await self._run_ffmpeg(cmd)
+
    async def _concatenate_segments(
        self,
        segment_paths: list[str],
--- a/backend/app/services/vtt_retimer.py
+++ b/backend/app/services/vtt_retimer.py
@ -18,41 +18,37 @@ class VTTRetimerService:
        """
        Generate new VTT with adjusted timings for pause-insert accessible video.

-        Uses the "full gap overlap" algorithm where:
-        - Video plays until pause_point (just before next sentence)
-        - Freeze frame with AD audio plays for ad_duration
-        - Video resumes from resume_from (just after previous sentence)
+        Uses the simplified midpoint algorithm with silence buffers:
+        - Video plays until pause_point (midpoint between sentences)
+        - Freeze frame shows with: 500ms silence + AD audio + 500ms silence
+        - Video resumes from the same pause_point

-        This means the effective time offset is: ad_duration + (pause_point - resume_from)
-        because after the AD, we jump BACK in the source timeline.
-
-        Captions in the overlap zone [resume_from, pause_point] only show once
-        (during the first playback, before the AD).
+        The effective time offset is: ad_duration + 1.0 seconds (for the silence buffers)

        Args:
            original_vtt: Original VTT content
-            analysis: Gemini analysis with placements containing pause_point, resume_from, and ad_duration
+            analysis: Gemini analysis with placements containing pause_point and ad_duration

        Returns:
            Re-timed VTT content
        """
        placements = analysis.get("placements", [])

+        # Silence buffer duration (500ms before + 500ms after AD)
+        silence_buffer_total = 1.0
+
        # Build list of (pause_point, effective_offset) sorted by time
-        # effective_offset = ad_duration + overlap (where overlap = pause_point - resume_from)
+        # effective_offset = ad_duration + 1.0s (for silence buffers)
        pauses = []
        for placement in placements:
            pause_point = placement.get("pause_point")
-            resume_from = placement.get("resume_from", pause_point)  # Fallback for backwards compat
            ad_duration = placement.get("ad_duration", 0)
            if pause_point is not None and ad_duration > 0:
-                # Overlap is the video portion that plays twice (between resume_from and pause_point)
-                overlap = max(0, pause_point - resume_from) if resume_from else 0
-                effective_offset = ad_duration + overlap
+                effective_offset = ad_duration + silence_buffer_total
                pauses.append((pause_point, effective_offset))
                logger.debug(
                    f"Pause at {pause_point:.2f}s: ad_duration={ad_duration:.2f}s, "
-                    f"overlap={overlap:.2f}s, effective_offset={effective_offset:.2f}s"
+                    f"silence_buffers=1.0s, effective_offset={effective_offset:.2f}s"
                )

        pauses.sort(key=lambda x: x[0])
@ -61,7 +57,7 @@ class VTTRetimerService:
            logger.info("No pauses to apply, returning original VTT")
            return original_vtt

-        logger.info(f"Re-timing VTT with {len(pauses)} pause insertions (full-gap-overlap algorithm)")
+        logger.info(f"Re-timing VTT with {len(pauses)} pause insertions (midpoint + silence buffers)")

        # Parse and retime cues
        cues = self._parse_vtt(original_vtt)
@ -69,8 +65,6 @@ class VTTRetimerService:

        for cue in cues:
            # Calculate cumulative offset from all pauses that occur before this cue's start
-            # Captions in overlap zone [resume_from, pause_point] only shift on pause_point,
-            # so they show during the first playback (before AD), not the second
            cumulative_offset = sum(
                effective_offset for pause_point, effective_offset in pauses
                if pause_point <= cue["start_time"]
--- a/backend/app/services/whisper_service.py
+++ b/backend/app/services/whisper_service.py
@ -345,16 +345,18 @@ class WhisperService:
        speaking_threshold: float = 2.0
    ) -> tuple[float, float, str | None]:
        """
-        Snap a Gemini pause point to the nearest sentence boundary using ordered logic.
+        Snap a Gemini pause point to the midpoint of the nearest gap between sentences.

-        Algorithm (in order):
+        Simplified algorithm:
        1. Check if "during speaking" (words within ±threshold)
-           - If NO → Use Gemini's exact pause point, no overlap
-        2. If during speaking, find nearest sentence boundary
-        3. Apply appropriate buffering based on context:
-           - Case A: Beginning of sentence, no previous → pause 500ms before sentence starts
-           - Case B: End of sentence, no next → pause 500ms after sentence ends
-           - Case C: Gap between sentences → full double buffer
+           - If NO → Use Gemini's exact pause point
+        2. If during speaking, find nearest sentence gap and snap to MIDPOINT
+        3. Edge cases:
+           - Case A: First sentence in video → pause at video start (0.0)
+           - Case B: Last sentence in video → pause at the end of that sentence (boundary time)
+
+        The video renderer adds 500ms silence buffers before/after AD audio,
+        so no overlap or catch-up logic is needed here.

        Args:
            gemini_pause: Original pause point from Gemini (seconds)
@ -365,6 +367,7 @@ class WhisperService:

        Returns:
            Tuple of (pause_point, resume_from, warning_message_or_none)
+            Note: resume_from always equals pause_point with the simplified algorithm
        """
        # Step 1: Check if "during speaking" (words within ±threshold)
        if not self._is_during_speaking(gemini_pause, words, speaking_threshold):
@ -389,63 +392,44 @@ class WhisperService:
            f"at {closest_boundary.time:.2f}s (distance: {abs(closest_boundary.time - gemini_pause):.2f}s)"
        )

-        # Step 3: Apply appropriate buffering based on context
-        edge_buffer = 0.5  # 500ms buffer for edge cases
-
-        # Case A: Beginning of sentence with no previous sentence (first sentence in video)
+        # Case A: First sentence in video (no previous sentence) → snap to video start
        if closest_boundary.boundary_type == "sentence_start" and not closest_boundary.has_previous_sentence:
-            # Pause 500ms BEFORE the sentence starts (away from speech)
-            pause_point = max(0.0, closest_boundary.time - edge_buffer)
-            resume_from = pause_point  # No overlap
+            pause_point = 0.0
            logger.info(
                f"Case A (first sentence): pause_point={pause_point:.2f}s "
-                f"(500ms before sentence start at {closest_boundary.time:.2f}s)"
+                f"(snapped to video start)"
            )
-            return pause_point, resume_from, None
+            return pause_point, pause_point, None

-        # Case B: End of sentence with no next sentence (last sentence in video)
+        # Case B: Last sentence in video (no next sentence) → snap to boundary time
        if closest_boundary.boundary_type == "sentence_end" and not closest_boundary.has_next_sentence:
-            # Pause 500ms AFTER the sentence ends (away from speech)
-            pause_point = closest_boundary.time + edge_buffer
-            resume_from = pause_point  # No overlap
+            pause_point = closest_boundary.time
            logger.info(
                f"Case B (last sentence): pause_point={pause_point:.2f}s "
-                f"(500ms after sentence end at {closest_boundary.time:.2f}s)"
+                f"(snapped to video end at sentence boundary)"
            )
-            return pause_point, resume_from, None
+            return pause_point, pause_point, None

-        # Case C: Gap between two sentences (normal case with double buffer)
+        # Case C: Gap between two sentences → snap to MIDPOINT of the gap
        if closest_boundary.gap:
            gap = closest_boundary.gap
-            # Small edge buffer to avoid cutting exactly at speech boundaries
-            small_buffer = min(0.05, gap.duration * 0.1)
-
-            # Full double buffer:
-            # - pause_point: Just BEFORE next sentence (gap.end - small_buffer)
-            # - resume_from: Just AFTER previous sentence (gap.start + small_buffer)
-            pause_point = gap.end - small_buffer
-            resume_from = gap.start + small_buffer
+            # Calculate midpoint between end of previous sentence and start of next
+            midpoint = (gap.start + gap.end) / 2.0

            logger.info(
                f"Case C (between sentences): gap={gap.start:.2f}s-{gap.end:.2f}s, "
-                f"pause_point={pause_point:.2f}s, resume_from={resume_from:.2f}s, "
-                f"overlap={pause_point - resume_from:.2f}s"
+                f"midpoint={midpoint:.2f}s (resume from same point)"
            )
-            return pause_point, resume_from, None
+            return midpoint, midpoint, None

-        # Fallback: No gap associated with boundary - use the boundary time with small buffer
+        # Fallback: No gap associated with boundary - use the boundary time directly
        # This shouldn't normally happen but handles edge cases
-        if closest_boundary.boundary_type == "sentence_end":
-            pause_point = closest_boundary.time + 0.05  # Small buffer after end
-        else:
-            pause_point = max(0.0, closest_boundary.time - 0.05)  # Small buffer before start
-        resume_from = pause_point
-
+        pause_point = closest_boundary.time
        logger.info(
            f"Fallback: Using boundary at {closest_boundary.time:.2f}s, "
            f"pause_point={pause_point:.2f}s (no gap available)"
        )
-        return pause_point, resume_from, None
+        return pause_point, pause_point, None

    def refine_all_pause_points(
        self,
@ -518,13 +502,9 @@ class WhisperService:
        """
        Consolidate AD cues whose pause points are within threshold seconds of each other.

-        Consolidated cues are treated as a single AD segment:
-        - All cues in a group share the same pause_point (front buffer applied once)
-        - Only the LAST cue in the group keeps resume_from (back buffer applied once)
-        - Middle cues have resume_from = pause_point (no video between ADs)
-
-        This ensures consolidated ADs play seamlessly back-to-back without
-        repeating the overlap video segment between each one.
+        Consolidated cues share the same pause_point and play back-to-back during
+        the freeze frame. With the simplified midpoint algorithm, resume_from always
+        equals pause_point, so no complex buffer logic is needed.

        Args:
            placements: List of refined placement dicts
@ -532,12 +512,11 @@ class WhisperService:
            warnings: List to append warning messages to

        Returns:
-            Updated placements with consolidated pause points and resume_from values
+            Updated placements with consolidated pause points
        """
        if len(placements) < 2:
            return placements

-        # First pass: identify consolidated groups and assign same pause_point
        consolidated = [placements[0].copy()]

        for i in range(1, len(placements)):
@ -553,12 +532,10 @@ class WhisperService:
                if 0 < gap <= threshold:
                    # Consolidate: set current cue to use same pause point as previous
                    original_pause = current_pause
-                    original_resume = current.get("resume_from")
                    current["pause_point"] = previous_pause
+                    current["resume_from"] = previous_pause  # Always same as pause_point
                    current["consolidated_with_previous"] = True
                    current["original_pause_point_before_consolidation"] = original_pause
-                    current["original_resume_from_before_consolidation"] = original_resume
-                    # Keep tracking the group's resume_from (we'll assign it to the last cue later)

                    logger.info(
                        f"Consolidated cue {current['ad_cue_index']} with previous cue: "
@ -571,31 +548,6 @@ class WhisperService:
                    )
            consolidated.append(current)

-        # Second pass: fix resume_from values for consolidated groups
-        # Only the LAST cue in each group should have the back buffer (resume_from < pause_point)
-        # All other cues should have resume_from = pause_point (no video between ADs)
-        for i in range(len(consolidated)):
-            current = consolidated[i]
-            current_pause = current.get("pause_point")
-
-            if current_pause is None:
-                continue
-
-            # Check if next cue has the same pause_point (meaning current is NOT last in group)
-            if i < len(consolidated) - 1:
-                next_pause = consolidated[i + 1].get("pause_point")
-                if next_pause == current_pause:
-                    # Current is NOT the last in the group - remove back buffer
-                    # Set resume_from = pause_point so no video plays between this AD and the next
-                    original_resume = current.get("resume_from")
-                    if original_resume != current_pause:
-                        current["resume_from"] = current_pause
-                        current["resume_from_removed_for_consolidation"] = original_resume
-                        logger.debug(
-                            f"Cue {current.get('ad_cue_index')}: Removed back buffer for seamless "
-                            f"consolidated playback (resume_from {original_resume:.2f}s -> {current_pause:.2f}s)"
-                        )
-
        # Log the final consolidated groups
        self._log_consolidated_groups(consolidated)