fix: resolve audio/video sync issues in accessible video renderer

- Update _get_video_properties() to extract audio sample_rate, channels, and pix_fmt in addition to video properties
- Add _extract_segment_reencoded() for frame-accurate cuts using re-encoding instead of stream copy (fixes keyframe-only cut limitation)
- Add _create_freeze_segment_matched() to enforce source audio property matching (fixes silent pauses caused by sample rate mismatch)
- Update _render_pause_insert_method() to use new methods with uniform encoding parameters
- Add -video_track_timescale 90000 for consistent timebase across segments

Root causes fixed:
1. -c copy could only cut at keyframes, causing audio dropouts
2. Sample rate mismatch (48kHz source vs 44.1kHz MP3) caused silent freeze-frame segments when concatenated

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-26 12:05:32 -06:00 · 2025-12-26 12:05:32 -06:00 · 54667fbcb8
commit 54667fbcb8
parent 6acb452cfa
1 changed file with 122 additions and 29 deletions
--- a/backend/app/services/video_renderer.py
+++ b/backend/app/services/video_renderer.py
@ -199,9 +199,9 @@ class VideoRendererService:
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_dir_path = Path(temp_dir)

-            # Get video properties for re-encoding freeze frames
+            # Get detailed video AND audio properties for uniform encoding
            video_props = await self._get_video_properties(source_video_path)
-            logger.info(f"Video properties: {video_props}")
+            logger.info(f"Source Properties: {video_props}")

            segment_files = []
            current_time = 0.0
@ -220,14 +220,15 @@ class VideoRendererService:
                    logger.warning(f"No AD audio found for cue {cue_index}, skipping")
                    continue

-                # 1. Extract video segment from current_time to pause_point
+                # 1. Extract video segment from current_time to pause_point (re-encoded for frame accuracy)
                if pause_point > current_time:
                    segment_path = temp_dir_path / f"segment_{i}_video.mp4"
-                    await self._extract_segment(
+                    await self._extract_segment_reencoded(
                        source_video_path,
                        current_time,
                        pause_point - current_time,
-                        str(segment_path)
+                        str(segment_path),
+                        video_props
                    )
                    segment_files.append(str(segment_path))

@ -239,9 +240,9 @@ class VideoRendererService:
                    str(freeze_frame_path)
                )

-                # 3. Create freeze segment with AD audio
+                # 3. Create freeze segment with AD audio (matched to source properties)
                freeze_segment_path = temp_dir_path / f"freeze_segment_{i}.mp4"
-                await self._create_freeze_segment(
+                await self._create_freeze_segment_matched(
                    str(freeze_frame_path),
                    ad_mp3_path,
                    ad_duration,
@ -252,15 +253,16 @@ class VideoRendererService:

                current_time = pause_point

-            # 4. Add final segment from last pause point to end
+            # 4. Add final segment from last pause point to end (re-encoded for uniformity)
            source_duration = await self._get_video_duration(source_video_path)
            if current_time < source_duration:
                final_segment_path = temp_dir_path / "segment_final.mp4"
-                await self._extract_segment(
+                await self._extract_segment_reencoded(
                    source_video_path,
                    current_time,
                    source_duration - current_time,
-                    str(final_segment_path)
+                    str(final_segment_path),
+                    video_props
                )
                segment_files.append(str(final_segment_path))

@ -292,12 +294,11 @@ class VideoRendererService:
        return float(result.stdout.strip())

    async def _get_video_properties(self, video_path: str) -> dict[str, Any]:
-        """Get video properties (resolution, framerate, codec) using ffprobe."""
+        """Get detailed video and audio properties to ensure matching during concatenation."""
        cmd = [
            self.ffprobe_path,
            "-v", "quiet",
-            "-select_streams", "v:0",
-            "-show_entries", "stream=width,height,r_frame_rate,codec_name",
+            "-show_streams",
            "-of", "json",
            video_path
        ]
@ -311,23 +312,39 @@ class VideoRendererService:

        import json
        data = json.loads(result.stdout)
-        stream = data.get("streams", [{}])[0]

-        # Parse frame rate (e.g., "30000/1001" or "30/1")
-        fps_str = stream.get("r_frame_rate", "30/1")
-        if "/" in fps_str:
-            num, den = fps_str.split("/")
-            fps = float(num) / float(den)
-        else:
-            fps = float(fps_str)
-
-        return {
-            "width": stream.get("width", 1920),
-            "height": stream.get("height", 1080),
-            "fps": fps,
-            "codec": stream.get("codec_name", "h264")
+        # Defaults (44100 is common for MP3, but we detect from source)
+        props = {
+            "width": 1920,
+            "height": 1080,
+            "fps": 30.0,
+            "sample_rate": "44100",
+            "channels": "2",
+            "pix_fmt": "yuv420p",
+            "codec": "h264"
        }

+        for stream in data.get("streams", []):
+            if stream.get("codec_type") == "video":
+                props["width"] = stream.get("width", props["width"])
+                props["height"] = stream.get("height", props["height"])
+                props["pix_fmt"] = stream.get("pix_fmt", props["pix_fmt"])
+                props["codec"] = stream.get("codec_name", props["codec"])
+
+                # Parse frame rate (e.g., "30000/1001" or "30/1")
+                fps_str = stream.get("r_frame_rate", "30/1")
+                if "/" in fps_str:
+                    num, den = fps_str.split("/")
+                    props["fps"] = float(num) / float(den)
+                else:
+                    props["fps"] = float(fps_str)
+
+            elif stream.get("codec_type") == "audio":
+                props["sample_rate"] = stream.get("sample_rate", props["sample_rate"])
+                props["channels"] = str(stream.get("channels", props["channels"]))
+
+        return props
+
    async def _extract_segment(
        self,
        source_path: str,
@ -335,7 +352,7 @@ class VideoRendererService:
        duration: float,
        output_path: str
    ):
-        """Extract a video segment using ffmpeg."""
+        """Extract a video segment using ffmpeg (stream copy - for overlay method)."""
        cmd = [
            self.ffmpeg_path,
            "-y",
@ -348,6 +365,43 @@ class VideoRendererService:
        ]
        await self._run_ffmpeg(cmd)

+    async def _extract_segment_reencoded(
+        self,
+        source_path: str,
+        start_time: float,
+        duration: float,
+        output_path: str,
+        props: dict[str, Any]
+    ):
+        """
+        Extract segment with RE-ENCODING for frame-accurate cuts.
+
+        Crucial for pause-insert method to avoid:
+        - Keyframe-only cuts causing audio dropouts
+        - Timestamp desynchronization
+        """
+        cmd = [
+            self.ffmpeg_path,
+            "-y",
+            "-ss", str(start_time),
+            "-i", source_path,
+            "-t", str(duration),
+            # Video Encoding
+            "-c:v", "libx264",
+            "-preset", "fast",
+            "-pix_fmt", props["pix_fmt"],
+            "-r", str(props["fps"]),
+            # Audio Encoding (Force match source)
+            "-c:a", "aac",
+            "-ar", props["sample_rate"],
+            "-ac", props["channels"],
+            "-b:a", "192k",
+            # Ensure timestamp continuity
+            "-video_track_timescale", "90000",
+            output_path
+        ]
+        await self._run_ffmpeg(cmd)
+
    async def _extract_frame(self, video_path: str, time_point: float, output_path: str):
        """Extract a single frame as PNG using ffmpeg."""
        cmd = [
@ -369,7 +423,7 @@ class VideoRendererService:
        output_path: str,
        video_props: dict[str, Any]
    ):
-        """Create a freeze-frame video segment with audio overlay."""
+        """Create a freeze-frame video segment with audio overlay (for overlay method)."""
        width = video_props.get("width", 1920)
        height = video_props.get("height", 1080)
        fps = video_props.get("fps", 30)
@ -394,6 +448,45 @@ class VideoRendererService:
        ]
        await self._run_ffmpeg(cmd)

+    async def _create_freeze_segment_matched(
+        self,
+        frame_path: str,
+        audio_path: str,
+        duration: float,
+        output_path: str,
+        props: dict[str, Any]
+    ):
+        """
+        Create freeze frame that rigidly matches the source video properties.
+
+        This fixes the "silent pause" issue caused by sample rate mismatch
+        when concatenating with extracted video segments.
+        """
+        cmd = [
+            self.ffmpeg_path,
+            "-y",
+            "-loop", "1",
+            "-i", frame_path,
+            "-i", audio_path,
+            "-c:v", "libx264",
+            "-preset", "fast",
+            "-tune", "stillimage",
+            "-pix_fmt", props["pix_fmt"],
+            "-r", str(props["fps"]),
+            # Scale filter to ensure dimensions match exactly
+            "-vf", f"scale={props['width']}:{props['height']}:force_original_aspect_ratio=decrease,pad={props['width']}:{props['height']}:(ow-iw)/2:(oh-ih)/2",
+            # Audio Encoding (CRITICAL: Match source sample rate and channels)
+            "-c:a", "aac",
+            "-ar", props["sample_rate"],
+            "-ac", props["channels"],
+            "-b:a", "192k",
+            "-t", str(duration),
+            "-video_track_timescale", "90000",
+            "-shortest",
+            output_path
+        ]
+        await self._run_ffmpeg(cmd)
+
    async def _concatenate_segments(
        self,
        segment_paths: list[str],