From 54667fbcb8933f65b72a2dcd48eed98e670c45bd Mon Sep 17 00:00:00 2001
From: michael <michael@modernfreedom.com>
Date: Fri, 26 Dec 2025 12:05:32 -0600
Subject: [PATCH] fix: resolve audio/video sync issues in accessible video
 renderer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Update _get_video_properties() to also extract the video pix_fmt and the
  audio sample_rate and channel count, in addition to the existing video properties
- Add _extract_segment_reencoded() for frame-accurate cuts using
  re-encoding instead of stream copy (fixes keyframe-only cut limitation)
- Add _create_freeze_segment_matched() to encode freeze segments with the
  source's audio properties (fixes silent pauses caused by sample rate mismatch)
- Update _render_pause_insert_method() to use new methods with uniform
  encoding parameters
- Add -video_track_timescale 90000 for consistent timebase across segments

Root causes fixed:
1. -c copy could only cut at keyframes, causing audio dropouts
2. Sample rate mismatch (48kHz source vs 44.1kHz MP3) caused silent
   freeze-frame segments when concatenated

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 backend/app/services/video_renderer.py | 151 ++++++++++++++++++++-----
 1 file changed, 122 insertions(+), 29 deletions(-)

diff --git a/backend/app/services/video_renderer.py b/backend/app/services/video_renderer.py
index abf2ab2..e47ec99 100644
--- a/backend/app/services/video_renderer.py
+++ b/backend/app/services/video_renderer.py
@@ -199,9 +199,9 @@ class VideoRendererService:
         with tempfile.TemporaryDirectory() as temp_dir:
             temp_dir_path = Path(temp_dir)
 
-            # Get video properties for re-encoding freeze frames
+            # Get detailed video AND audio properties for uniform encoding
             video_props = await self._get_video_properties(source_video_path)
-            logger.info(f"Video properties: {video_props}")
+            logger.info(f"Source Properties: {video_props}")
 
             segment_files = []
             current_time = 0.0
@@ -220,14 +220,15 @@ class VideoRendererService:
                     logger.warning(f"No AD audio found for cue {cue_index}, skipping")
                     continue
 
-                # 1. Extract video segment from current_time to pause_point
+                # 1. Extract video segment from current_time to pause_point (re-encoded for frame accuracy)
                 if pause_point > current_time:
                     segment_path = temp_dir_path / f"segment_{i}_video.mp4"
-                    await self._extract_segment(
+                    await self._extract_segment_reencoded(
                         source_video_path,
                         current_time,
                         pause_point - current_time,
-                        str(segment_path)
+                        str(segment_path),
+                        video_props
                     )
                     segment_files.append(str(segment_path))
 
@@ -239,9 +240,9 @@ class VideoRendererService:
                     str(freeze_frame_path)
                 )
 
-                # 3. Create freeze segment with AD audio
+                # 3. Create freeze segment with AD audio (matched to source properties)
                 freeze_segment_path = temp_dir_path / f"freeze_segment_{i}.mp4"
-                await self._create_freeze_segment(
+                await self._create_freeze_segment_matched(
                     str(freeze_frame_path),
                     ad_mp3_path,
                     ad_duration,
@@ -252,15 +253,16 @@ class VideoRendererService:
 
                 current_time = pause_point
 
-            # 4. Add final segment from last pause point to end
+            # 4. Add final segment from last pause point to end (re-encoded for uniformity)
             source_duration = await self._get_video_duration(source_video_path)
             if current_time < source_duration:
                 final_segment_path = temp_dir_path / "segment_final.mp4"
-                await self._extract_segment(
+                await self._extract_segment_reencoded(
                     source_video_path,
                     current_time,
                     source_duration - current_time,
-                    str(final_segment_path)
+                    str(final_segment_path),
+                    video_props
                 )
                 segment_files.append(str(final_segment_path))
 
@@ -292,12 +294,11 @@ class VideoRendererService:
         return float(result.stdout.strip())
 
     async def _get_video_properties(self, video_path: str) -> dict[str, Any]:
-        """Get video properties (resolution, framerate, codec) using ffprobe."""
+        """Get detailed video and audio properties to ensure matching during concatenation."""
         cmd = [
             self.ffprobe_path,
             "-v", "quiet",
-            "-select_streams", "v:0",
-            "-show_entries", "stream=width,height,r_frame_rate,codec_name",
+            "-show_streams",
             "-of", "json",
             video_path
         ]
@@ -311,23 +312,39 @@ class VideoRendererService:
 
         import json
         data = json.loads(result.stdout)
-        stream = data.get("streams", [{}])[0]
 
-        # Parse frame rate (e.g., "30000/1001" or "30/1")
-        fps_str = stream.get("r_frame_rate", "30/1")
-        if "/" in fps_str:
-            num, den = fps_str.split("/")
-            fps = float(num) / float(den)
-        else:
-            fps = float(fps_str)
-
-        return {
-            "width": stream.get("width", 1920),
-            "height": stream.get("height", 1080),
-            "fps": fps,
-            "codec": stream.get("codec_name", "h264")
+        # Fallback defaults, used only when ffprobe omits a field (44100 Hz is common for MP3)
+        props = {
+            "width": 1920,
+            "height": 1080,
+            "fps": 30.0,
+            "sample_rate": "44100",
+            "channels": "2",
+            "pix_fmt": "yuv420p",
+            "codec": "h264"
         }
 
+        for stream in data.get("streams", []):
+            if stream.get("codec_type") == "video":
+                props["width"] = stream.get("width", props["width"])
+                props["height"] = stream.get("height", props["height"])
+                props["pix_fmt"] = stream.get("pix_fmt", props["pix_fmt"])
+                props["codec"] = stream.get("codec_name", props["codec"])
+
+                # Parse frame rate (e.g., "30000/1001" or "30/1")
+                fps_str = stream.get("r_frame_rate", "30/1")
+                if "/" in fps_str:
+                    num, den = fps_str.split("/")
+                    props["fps"] = float(num) / float(den)
+                else:
+                    props["fps"] = float(fps_str)
+
+            elif stream.get("codec_type") == "audio":
+                props["sample_rate"] = stream.get("sample_rate", props["sample_rate"])
+                props["channels"] = str(stream.get("channels", props["channels"]))
+
+        return props
+
     async def _extract_segment(
         self,
         source_path: str,
@@ -335,7 +352,7 @@ class VideoRendererService:
         duration: float,
         output_path: str
     ):
-        """Extract a video segment using ffmpeg."""
+        """Extract a video segment using ffmpeg (stream copy - for overlay method)."""
         cmd = [
             self.ffmpeg_path,
             "-y",
@@ -348,6 +365,43 @@ class VideoRendererService:
         ]
         await self._run_ffmpeg(cmd)
 
+    async def _extract_segment_reencoded(
+        self,
+        source_path: str,
+        start_time: float,
+        duration: float,
+        output_path: str,
+        props: dict[str, Any]
+    ):
+        """
+        Extract segment with RE-ENCODING for frame-accurate cuts.
+
+        Crucial for pause-insert method to avoid:
+        - Keyframe-only cuts causing audio dropouts
+        - Timestamp desynchronization
+        """
+        cmd = [
+            self.ffmpeg_path,
+            "-y",
+            "-ss", str(start_time),
+            "-i", source_path,
+            "-t", str(duration),
+            # Video Encoding
+            "-c:v", "libx264",
+            "-preset", "fast",
+            "-pix_fmt", props["pix_fmt"],
+            "-r", str(props["fps"]),
+            # Audio Encoding (Force match source)
+            "-c:a", "aac",
+            "-ar", props["sample_rate"],
+            "-ac", props["channels"],
+            "-b:a", "192k",
+            # Ensure timestamp continuity
+            "-video_track_timescale", "90000",
+            output_path
+        ]
+        await self._run_ffmpeg(cmd)
+
     async def _extract_frame(self, video_path: str, time_point: float, output_path: str):
         """Extract a single frame as PNG using ffmpeg."""
         cmd = [
@@ -369,7 +423,7 @@ class VideoRendererService:
         output_path: str,
         video_props: dict[str, Any]
     ):
-        """Create a freeze-frame video segment with audio overlay."""
+        """Create a freeze-frame video segment with audio overlay (for overlay method)."""
         width = video_props.get("width", 1920)
         height = video_props.get("height", 1080)
         fps = video_props.get("fps", 30)
@@ -394,6 +448,45 @@ class VideoRendererService:
         ]
         await self._run_ffmpeg(cmd)
 
+    async def _create_freeze_segment_matched(
+        self,
+        frame_path: str,
+        audio_path: str,
+        duration: float,
+        output_path: str,
+        props: dict[str, Any]
+    ):
+        """
+        Create freeze frame that rigidly matches the source video properties.
+
+        This fixes the "silent pause" issue caused by sample rate mismatch
+        when concatenating with extracted video segments.
+        """
+        cmd = [
+            self.ffmpeg_path,
+            "-y",
+            "-loop", "1",
+            "-i", frame_path,
+            "-i", audio_path,
+            "-c:v", "libx264",
+            "-preset", "fast",
+            "-tune", "stillimage",
+            "-pix_fmt", props["pix_fmt"],
+            "-r", str(props["fps"]),
+            # Scale (keeping aspect ratio) and pad so the output is exactly props width x height
+            "-vf", f"scale={props['width']}:{props['height']}:force_original_aspect_ratio=decrease,pad={props['width']}:{props['height']}:(ow-iw)/2:(oh-ih)/2",
+            # Audio Encoding (CRITICAL: Match source sample rate and channels)
+            "-c:a", "aac",
+            "-ar", props["sample_rate"],
+            "-ac", props["channels"],
+            "-b:a", "192k",
+            "-t", str(duration),
+            "-video_track_timescale", "90000",
+            "-shortest",
+            output_path
+        ]
+        await self._run_ffmpeg(cmd)
+
     async def _concatenate_segments(
         self,
         segment_paths: list[str],