diff --git a/backend/app/services/video_renderer.py b/backend/app/services/video_renderer.py
index 90daac1..94439c1 100644
--- a/backend/app/services/video_renderer.py
+++ b/backend/app/services/video_renderer.py
@@ -285,15 +285,12 @@ class VideoRendererService:
             key=lambda p: p["pause_point"]
         )
 
-        # Debug logging for pause points (full-gap-overlap algorithm)
-        logger.info(f"Pause-insert (full-gap-overlap): {len(sorted_placements)} placements with pause points")
+        # Debug logging for pause points (midpoint algorithm with silence buffers)
+        logger.info(f"Pause-insert (midpoint + 500ms silence buffers): {len(sorted_placements)} placements")
         for i, p in enumerate(sorted_placements):
-            resume_from = p.get('resume_from', p.get('pause_point'))  # Fallback for backwards compat
-            overlap = p.get('pause_point', 0) - resume_from if resume_from else 0
             logger.info(
                 f"  Placement {i}: cue_index={p.get('ad_cue_index')}, "
-                f"pause_at={p.get('pause_point'):.2f}s, resume_from={resume_from:.2f}s, "
-                f"overlap={overlap:.2f}s, ad_duration={p.get('ad_duration'):.2f}s, "
+                f"pause_at={p.get('pause_point'):.2f}s, ad_duration={p.get('ad_duration'):.2f}s, "
                 f"consolidated={p.get('consolidated_with_previous', False)}"
             )
 
@@ -320,7 +317,6 @@ class VideoRendererService:
                 pause_point = placement["pause_point"]
                 cue_index = placement["ad_cue_index"]
                 ad_duration = placement["ad_duration"]
-                resume_from = placement.get("resume_from", pause_point)  # Fallback for backwards compat
 
                 # Validate pause_point is within video bounds
                 if pause_point >= source_duration:
@@ -330,11 +326,6 @@ class VideoRendererService:
                     )
                     pause_point = max(0, source_duration - 0.1)  # Clamp to 100ms before end
 
-                if resume_from >= source_duration:
-                    resume_from = pause_point
-
-                overlap = max(0, pause_point - resume_from)
-
                 # Get the AD audio for this cue
                 ad_mp3_path = cue_to_mp3.get(cue_index)
                 if not ad_mp3_path:
@@ -362,40 +353,30 @@ class VideoRendererService:
                 )
 
                 # 3. Prepare audio for freeze segment
-                # If there's overlap (Case C - between sentences), we need catch-up audio
-                # The freeze frame stays visible while source audio "catches up" to pause_point
-                # This avoids visual jump-back artifacts
-                if overlap > 0.01:  # Only if meaningful overlap (> 10ms)
-                    # Extract catch-up audio from source video [resume_from, pause_point]
-                    catchup_audio_path = temp_dir_path / f"catchup_{i}.m4a"
-                    await self._extract_audio_segment(
-                        source_video_path,
-                        resume_from,
-                        overlap,
-                        str(catchup_audio_path),
-                        video_props
-                    )
+                # Add 500ms silence before AND after the AD audio for smooth transitions
+                silence_duration = 0.5  # 500ms
+                silence_path = temp_dir_path / f"silence_{i}.m4a"
+                await self._generate_silence(
+                    silence_duration,
+                    str(silence_path),
+                    video_props
+                )
 
-                    # Concatenate AD audio + catch-up audio
-                    # Order: AD plays first, then source audio catches up
-                    combined_audio_path = temp_dir_path / f"combined_audio_{i}.m4a"
-                    await self._concatenate_audio(
-                        [ad_mp3_path, str(catchup_audio_path)],
-                        str(combined_audio_path),
-                        video_props
-                    )
+                # Concatenate: 500ms silence + AD audio + 500ms silence
+                combined_audio_path = temp_dir_path / f"combined_audio_{i}.m4a"
+                await self._concatenate_audio(
+                    [str(silence_path), ad_mp3_path, str(silence_path)],
+                    str(combined_audio_path),
+                    video_props
+                )
 
-                    freeze_audio_path = str(combined_audio_path)
-                    total_freeze_duration = ad_duration + overlap
+                freeze_audio_path = str(combined_audio_path)
+                total_freeze_duration = ad_duration + (2 * silence_duration)  # AD + 1.0s total silence
 
-                    logger.info(
-                        f"Cue {cue_index}: Audio catch-up enabled - "
-                        f"AD={ad_duration:.2f}s + catchup={overlap:.2f}s = {total_freeze_duration:.2f}s"
-                    )
-                else:
-                    # No overlap (Case A/B) - just use AD audio
-                    freeze_audio_path = ad_mp3_path
-                    total_freeze_duration = ad_duration
+                logger.info(
+                    f"Cue {cue_index}: Freeze segment with silence buffers - "
+                    f"500ms + AD={ad_duration:.2f}s + 500ms = {total_freeze_duration:.2f}s"
+                )
 
                 # 4. Create freeze segment with prepared audio
                 freeze_segment_path = temp_dir_path / f"freeze_segment_{i}.mp4"
@@ -774,6 +755,43 @@ class VideoRendererService:
         ]
         await self._run_ffmpeg(cmd)
 
+    async def _generate_silence(
+        self,
+        duration: float,
+        output_path: str,
+        props: dict[str, Any]
+    ):
+        """
+        Render a silent AAC clip of the requested length with ffmpeg.
+
+        The renderer pads AD audio with 500ms of this silence on both sides.
+
+        Args:
+            duration: Length of the clip in seconds; must be positive
+            output_path: Where ffmpeg writes the clip (overwritten via -y)
+            props: Video properties; 'sample_rate' and 'channels' are read
+
+        Raises:
+            ValueError: If duration is zero or negative
+        """
+        if duration <= 0:
+            raise ValueError(f"Invalid silence duration: {duration}")
+        logger.debug(f"Generating {duration:.2f}s silence: output={output_path}")
+
+        ffmpeg_bin = self.ffmpeg_path
+        rate, channel_count = props["sample_rate"], props["channels"]
+        # Only an exact '2' maps to stereo; every other value falls back to mono
+        layout = "stereo" if channel_count == "2" else "mono"
+        null_source = ["-f", "lavfi", "-i", f"anullsrc=r={rate}:cl={layout}"]
+        encoding = ["-c:a", "aac", "-ar", rate, "-ac", channel_count, "-b:a", "192k"]
+        await self._run_ffmpeg([
+            ffmpeg_bin, "-y",
+            *null_source,
+            "-t", str(duration),
+            *encoding,
+            output_path,
+        ])
+
     async def _concatenate_segments(
         self,
         segment_paths: list[str],
diff --git a/backend/app/services/vtt_retimer.py b/backend/app/services/vtt_retimer.py
index 6373e56..c355c0e 100644
--- a/backend/app/services/vtt_retimer.py
+++ b/backend/app/services/vtt_retimer.py
@@ -18,41 +18,37 @@ class VTTRetimerService:
         """
         Generate new VTT with adjusted timings for pause-insert accessible video.
 
-        Uses the "full gap overlap" algorithm where:
-        - Video plays until pause_point (just before next sentence)
-        - Freeze frame with AD audio plays for ad_duration
-        - Video resumes from resume_from (just after previous sentence)
+        Uses the simplified midpoint algorithm with silence buffers:
+        - Video plays until pause_point (midpoint between sentences)
+        - Freeze frame shows with: 500ms silence + AD audio + 500ms silence
+        - Video resumes from the same pause_point
 
-        This means the effective time offset is: ad_duration + (pause_point - resume_from)
-        because after the AD, we jump BACK in the source timeline.
-
-        Captions in the overlap zone [resume_from, pause_point] only show once
-        (during the first playback, before the AD).
+        The effective time offset is: ad_duration + 1.0 seconds (for the silence buffers)
 
         Args:
             original_vtt: Original VTT content
-            analysis: Gemini analysis with placements containing pause_point, resume_from, and ad_duration
+            analysis: Gemini analysis with placements containing pause_point and ad_duration
 
         Returns:
             Re-timed VTT content
         """
         placements = analysis.get("placements", [])
 
+        # Silence buffer duration (500ms before + 500ms after AD)
+        silence_buffer_total = 1.0
+
         # Build list of (pause_point, effective_offset) sorted by time
-        # effective_offset = ad_duration + overlap (where overlap = pause_point - resume_from)
+        # effective_offset = ad_duration + 1.0s (for silence buffers)
         pauses = []
         for placement in placements:
             pause_point = placement.get("pause_point")
-            resume_from = placement.get("resume_from", pause_point)  # Fallback for backwards compat
             ad_duration = placement.get("ad_duration", 0)
             if pause_point is not None and ad_duration > 0:
-                # Overlap is the video portion that plays twice (between resume_from and pause_point)
-                overlap = max(0, pause_point - resume_from) if resume_from else 0
-                effective_offset = ad_duration + overlap
+                effective_offset = ad_duration + silence_buffer_total
                 pauses.append((pause_point, effective_offset))
                 logger.debug(
                     f"Pause at {pause_point:.2f}s: ad_duration={ad_duration:.2f}s, "
-                    f"overlap={overlap:.2f}s, effective_offset={effective_offset:.2f}s"
+                    f"silence_buffers=1.0s, effective_offset={effective_offset:.2f}s"
                 )
 
         pauses.sort(key=lambda x: x[0])
@@ -61,7 +57,7 @@ class VTTRetimerService:
             logger.info("No pauses to apply, returning original VTT")
             return original_vtt
 
-        logger.info(f"Re-timing VTT with {len(pauses)} pause insertions (full-gap-overlap algorithm)")
+        logger.info(f"Re-timing VTT with {len(pauses)} pause insertions (midpoint + silence buffers)")
 
         # Parse and retime cues
         cues = self._parse_vtt(original_vtt)
@@ -69,8 +65,6 @@ class VTTRetimerService:
 
         for cue in cues:
             # Calculate cumulative offset from all pauses that occur before this cue's start
-            # Captions in overlap zone [resume_from, pause_point] only shift on pause_point,
-            # so they show during the first playback (before AD), not the second
             cumulative_offset = sum(
                 effective_offset for pause_point, effective_offset in pauses
                 if pause_point <= cue["start_time"]
diff --git a/backend/app/services/whisper_service.py b/backend/app/services/whisper_service.py
index 5fb895c..fc8215c 100644
--- a/backend/app/services/whisper_service.py
+++ b/backend/app/services/whisper_service.py
@@ -345,16 +345,18 @@ class WhisperService:
         speaking_threshold: float = 2.0
     ) -> tuple[float, float, str | None]:
         """
-        Snap a Gemini pause point to the nearest sentence boundary using ordered logic.
+        Snap a Gemini pause point to the nearest sentence boundary.
 
-        Algorithm (in order):
+        Simplified algorithm:
         1. Check if "during speaking" (words within ±threshold)
-           - If NO → Use Gemini's exact pause point, no overlap
-        2. If during speaking, find nearest sentence boundary
-        3. Apply appropriate buffering based on context:
-           - Case A: Beginning of sentence, no previous → pause 500ms before sentence starts
-           - Case B: End of sentence, no next → pause 500ms after sentence ends
-           - Case C: Gap between sentences → full double buffer
+           - If NO → Use Gemini's exact pause point
+        2. If during speaking, find nearest sentence gap and snap to MIDPOINT
+        3. Edge cases:
+           - Case A: First sentence in video → pause at video start (0.0)
+           - Case B: Last sentence in video → pause at the end of that sentence (boundary time)
+
+        The video renderer adds 500ms silence buffers before/after AD audio,
+        so no overlap or catch-up logic is needed here.
 
         Args:
             gemini_pause: Original pause point from Gemini (seconds)
@@ -365,6 +367,7 @@ class WhisperService:
 
         Returns:
             Tuple of (pause_point, resume_from, warning_message_or_none)
+            Note: resume_from always equals pause_point with the simplified algorithm
         """
         # Step 1: Check if "during speaking" (words within ±threshold)
         if not self._is_during_speaking(gemini_pause, words, speaking_threshold):
@@ -389,63 +392,44 @@ class WhisperService:
             f"at {closest_boundary.time:.2f}s (distance: {abs(closest_boundary.time - gemini_pause):.2f}s)"
         )
 
-        # Step 3: Apply appropriate buffering based on context
-        edge_buffer = 0.5  # 500ms buffer for edge cases
-
-        # Case A: Beginning of sentence with no previous sentence (first sentence in video)
+        # Case A: First sentence in video (no previous sentence) → snap to video start
         if closest_boundary.boundary_type == "sentence_start" and not closest_boundary.has_previous_sentence:
-            # Pause 500ms BEFORE the sentence starts (away from speech)
-            pause_point = max(0.0, closest_boundary.time - edge_buffer)
-            resume_from = pause_point  # No overlap
+            pause_point = 0.0
             logger.info(
                 f"Case A (first sentence): pause_point={pause_point:.2f}s "
-                f"(500ms before sentence start at {closest_boundary.time:.2f}s)"
+                f"(snapped to video start)"
             )
-            return pause_point, resume_from, None
+            return pause_point, pause_point, None
 
-        # Case B: End of sentence with no next sentence (last sentence in video)
+        # Case B: Last sentence in video (no next sentence) → snap to boundary time
         if closest_boundary.boundary_type == "sentence_end" and not closest_boundary.has_next_sentence:
-            # Pause 500ms AFTER the sentence ends (away from speech)
-            pause_point = closest_boundary.time + edge_buffer
-            resume_from = pause_point  # No overlap
+            pause_point = closest_boundary.time
             logger.info(
                 f"Case B (last sentence): pause_point={pause_point:.2f}s "
-                f"(500ms after sentence end at {closest_boundary.time:.2f}s)"
+                f"(snapped to video end at sentence boundary)"
             )
-            return pause_point, resume_from, None
+            return pause_point, pause_point, None
 
-        # Case C: Gap between two sentences (normal case with double buffer)
+        # Case C: Gap between two sentences → snap to MIDPOINT of the gap
         if closest_boundary.gap:
             gap = closest_boundary.gap
-            # Small edge buffer to avoid cutting exactly at speech boundaries
-            small_buffer = min(0.05, gap.duration * 0.1)
-
-            # Full double buffer:
-            # - pause_point: Just BEFORE next sentence (gap.end - small_buffer)
-            # - resume_from: Just AFTER previous sentence (gap.start + small_buffer)
-            pause_point = gap.end - small_buffer
-            resume_from = gap.start + small_buffer
+            # Calculate midpoint between end of previous sentence and start of next
+            midpoint = (gap.start + gap.end) / 2.0
 
             logger.info(
                 f"Case C (between sentences): gap={gap.start:.2f}s-{gap.end:.2f}s, "
-                f"pause_point={pause_point:.2f}s, resume_from={resume_from:.2f}s, "
-                f"overlap={pause_point - resume_from:.2f}s"
+                f"midpoint={midpoint:.2f}s (resume from same point)"
             )
-            return pause_point, resume_from, None
+            return midpoint, midpoint, None
 
-        # Fallback: No gap associated with boundary - use the boundary time with small buffer
+        # Fallback: No gap associated with boundary - use the boundary time directly
         # This shouldn't normally happen but handles edge cases
-        if closest_boundary.boundary_type == "sentence_end":
-            pause_point = closest_boundary.time + 0.05  # Small buffer after end
-        else:
-            pause_point = max(0.0, closest_boundary.time - 0.05)  # Small buffer before start
-        resume_from = pause_point
-
+        pause_point = closest_boundary.time
         logger.info(
             f"Fallback: Using boundary at {closest_boundary.time:.2f}s, "
             f"pause_point={pause_point:.2f}s (no gap available)"
         )
-        return pause_point, resume_from, None
+        return pause_point, pause_point, None
 
     def refine_all_pause_points(
         self,
@@ -518,13 +502,9 @@ class WhisperService:
         """
         Consolidate AD cues whose pause points are within threshold seconds of each other.
 
-        Consolidated cues are treated as a single AD segment:
-        - All cues in a group share the same pause_point (front buffer applied once)
-        - Only the LAST cue in the group keeps resume_from (back buffer applied once)
-        - Middle cues have resume_from = pause_point (no video between ADs)
-
-        This ensures consolidated ADs play seamlessly back-to-back without
-        repeating the overlap video segment between each one.
+        Consolidated cues share the same pause_point and play back-to-back during
+        the freeze frame. With the simplified midpoint algorithm, resume_from always
+        equals pause_point, so no complex buffer logic is needed.
 
         Args:
             placements: List of refined placement dicts
@@ -532,12 +512,11 @@ class WhisperService:
             warnings: List to append warning messages to
 
         Returns:
-            Updated placements with consolidated pause points and resume_from values
+            Updated placements with consolidated pause points
         """
         if len(placements) < 2:
             return placements
 
-        # First pass: identify consolidated groups and assign same pause_point
         consolidated = [placements[0].copy()]
 
         for i in range(1, len(placements)):
@@ -553,12 +532,10 @@ class WhisperService:
                 if 0 < gap <= threshold:
                     # Consolidate: set current cue to use same pause point as previous
                     original_pause = current_pause
-                    original_resume = current.get("resume_from")
                     current["pause_point"] = previous_pause
+                    current["resume_from"] = previous_pause  # Always same as pause_point
                     current["consolidated_with_previous"] = True
                     current["original_pause_point_before_consolidation"] = original_pause
-                    current["original_resume_from_before_consolidation"] = original_resume
-                    # Keep tracking the group's resume_from (we'll assign it to the last cue later)
 
                     logger.info(
                         f"Consolidated cue {current['ad_cue_index']} with previous cue: "
@@ -571,31 +548,6 @@ class WhisperService:
                     )
             consolidated.append(current)
 
-        # Second pass: fix resume_from values for consolidated groups
-        # Only the LAST cue in each group should have the back buffer (resume_from < pause_point)
-        # All other cues should have resume_from = pause_point (no video between ADs)
-        for i in range(len(consolidated)):
-            current = consolidated[i]
-            current_pause = current.get("pause_point")
-
-            if current_pause is None:
-                continue
-
-            # Check if next cue has the same pause_point (meaning current is NOT last in group)
-            if i < len(consolidated) - 1:
-                next_pause = consolidated[i + 1].get("pause_point")
-                if next_pause == current_pause:
-                    # Current is NOT the last in the group - remove back buffer
-                    # Set resume_from = pause_point so no video plays between this AD and the next
-                    original_resume = current.get("resume_from")
-                    if original_resume != current_pause:
-                        current["resume_from"] = current_pause
-                        current["resume_from_removed_for_consolidation"] = original_resume
-                        logger.debug(
-                            f"Cue {current.get('ad_cue_index')}: Removed back buffer for seamless "
-                            f"consolidated playback (resume_from {original_resume:.2f}s -> {current_pause:.2f}s)"
-                        )
-
         # Log the final consolidated groups
         self._log_consolidated_groups(consolidated)