From 54667fbcb8933f65b72a2dcd48eed98e670c45bd Mon Sep 17 00:00:00 2001 From: michael Date: Fri, 26 Dec 2025 12:05:32 -0600 Subject: [PATCH] fix: resolve audio/video sync issues in accessible video renderer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update _get_video_properties() to extract audio sample_rate, channels, and pix_fmt in addition to video properties - Add _extract_segment_reencoded() for frame-accurate cuts using re-encoding instead of stream copy (fixes keyframe-only cut limitation) - Add _create_freeze_segment_matched() to enforce source audio property matching (fixes silent pauses caused by sample rate mismatch) - Update _render_pause_insert_method() to use new methods with uniform encoding parameters - Add -video_track_timescale 90000 for consistent timebase across segments Root causes fixed: 1. -c copy could only cut at keyframes, causing audio dropouts 2. Sample rate mismatch (48kHz source vs 44.1kHz MP3) caused silent freeze-frame segments when concatenated 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- backend/app/services/video_renderer.py | 151 ++++++++++++++++++++----- 1 file changed, 122 insertions(+), 29 deletions(-) diff --git a/backend/app/services/video_renderer.py b/backend/app/services/video_renderer.py index abf2ab2..e47ec99 100644 --- a/backend/app/services/video_renderer.py +++ b/backend/app/services/video_renderer.py @@ -199,9 +199,9 @@ class VideoRendererService: with tempfile.TemporaryDirectory() as temp_dir: temp_dir_path = Path(temp_dir) - # Get video properties for re-encoding freeze frames + # Get detailed video AND audio properties for uniform encoding video_props = await self._get_video_properties(source_video_path) - logger.info(f"Video properties: {video_props}") + logger.info(f"Source Properties: {video_props}") segment_files = [] current_time = 0.0 @@ -220,14 +220,15 @@ class VideoRendererService: logger.warning(f"No AD audio found for cue {cue_index}, skipping") continue - # 1. Extract video segment from current_time to pause_point + # 1. Extract video segment from current_time to pause_point (re-encoded for frame accuracy) if pause_point > current_time: segment_path = temp_dir_path / f"segment_{i}_video.mp4" - await self._extract_segment( + await self._extract_segment_reencoded( source_video_path, current_time, pause_point - current_time, - str(segment_path) + str(segment_path), + video_props ) segment_files.append(str(segment_path)) @@ -239,9 +240,9 @@ class VideoRendererService: str(freeze_frame_path) ) - # 3. Create freeze segment with AD audio + # 3. Create freeze segment with AD audio (matched to source properties) freeze_segment_path = temp_dir_path / f"freeze_segment_{i}.mp4" - await self._create_freeze_segment( + await self._create_freeze_segment_matched( str(freeze_frame_path), ad_mp3_path, ad_duration, @@ -252,15 +253,16 @@ class VideoRendererService: current_time = pause_point - # 4. Add final segment from last pause point to end + # 4. Add final segment from last pause point to end (re-encoded for uniformity) source_duration = await self._get_video_duration(source_video_path) if current_time < source_duration: final_segment_path = temp_dir_path / "segment_final.mp4" - await self._extract_segment( + await self._extract_segment_reencoded( source_video_path, current_time, source_duration - current_time, - str(final_segment_path) + str(final_segment_path), + video_props ) segment_files.append(str(final_segment_path)) @@ -292,12 +294,11 @@ class VideoRendererService: return float(result.stdout.strip()) async def _get_video_properties(self, video_path: str) -> dict[str, Any]: - """Get video properties (resolution, framerate, codec) using ffprobe.""" + """Get detailed video and audio properties to ensure matching during concatenation.""" cmd = [ self.ffprobe_path, "-v", "quiet", - "-select_streams", "v:0", - "-show_entries", "stream=width,height,r_frame_rate,codec_name", + "-show_streams", "-of", "json", video_path ] @@ -311,23 +312,39 @@ class VideoRendererService: import json data = json.loads(result.stdout) - stream = data.get("streams", [{}])[0] - # Parse frame rate (e.g., "30000/1001" or "30/1") - fps_str = stream.get("r_frame_rate", "30/1") - if "/" in fps_str: - num, den = fps_str.split("/") - fps = float(num) / float(den) - else: - fps = float(fps_str) - - return { - "width": stream.get("width", 1920), - "height": stream.get("height", 1080), - "fps": fps, - "codec": stream.get("codec_name", "h264") + # Defaults (44100 is common for MP3, but we detect from source) + props = { + "width": 1920, + "height": 1080, + "fps": 30.0, + "sample_rate": "44100", + "channels": "2", + "pix_fmt": "yuv420p", + "codec": "h264" } + for stream in data.get("streams", []): + if stream.get("codec_type") == "video": + props["width"] = stream.get("width", props["width"]) + props["height"] = stream.get("height", props["height"]) + props["pix_fmt"] = stream.get("pix_fmt", props["pix_fmt"]) + props["codec"] = stream.get("codec_name", props["codec"]) + + # Parse frame rate (e.g., "30000/1001" or "30/1") + fps_str = stream.get("r_frame_rate", "30/1") + if "/" in fps_str: + num, den = fps_str.split("/") + props["fps"] = float(num) / float(den) + else: + props["fps"] = float(fps_str) + + elif stream.get("codec_type") == "audio": + props["sample_rate"] = stream.get("sample_rate", props["sample_rate"]) + props["channels"] = str(stream.get("channels", props["channels"])) + + return props + async def _extract_segment( self, source_path: str, @@ -335,7 +352,7 @@ class VideoRendererService: duration: float, output_path: str ): - """Extract a video segment using ffmpeg.""" + """Extract a video segment using ffmpeg (stream copy - for overlay method).""" cmd = [ self.ffmpeg_path, "-y", @@ -348,6 +365,43 @@ class VideoRendererService: ] await self._run_ffmpeg(cmd) + async def _extract_segment_reencoded( + self, + source_path: str, + start_time: float, + duration: float, + output_path: str, + props: dict[str, Any] + ): + """ + Extract segment with RE-ENCODING for frame-accurate cuts. + + Crucial for pause-insert method to avoid: + - Keyframe-only cuts causing audio dropouts + - Timestamp desynchronization + """ + cmd = [ + self.ffmpeg_path, + "-y", + "-ss", str(start_time), + "-i", source_path, + "-t", str(duration), + # Video Encoding + "-c:v", "libx264", + "-preset", "fast", + "-pix_fmt", props["pix_fmt"], + "-r", str(props["fps"]), + # Audio Encoding (Force match source) + "-c:a", "aac", + "-ar", props["sample_rate"], + "-ac", props["channels"], + "-b:a", "192k", + # Ensure timestamp continuity + "-video_track_timescale", "90000", + output_path + ] + await self._run_ffmpeg(cmd) + async def _extract_frame(self, video_path: str, time_point: float, output_path: str): """Extract a single frame as PNG using ffmpeg.""" cmd = [ @@ -369,7 +423,7 @@ class VideoRendererService: output_path: str, video_props: dict[str, Any] ): - """Create a freeze-frame video segment with audio overlay.""" + """Create a freeze-frame video segment with audio overlay (for overlay method).""" width = video_props.get("width", 1920) height = video_props.get("height", 1080) fps = video_props.get("fps", 30) @@ -394,6 +448,45 @@ class VideoRendererService: ] await self._run_ffmpeg(cmd) + async def _create_freeze_segment_matched( + self, + frame_path: str, + audio_path: str, + duration: float, + output_path: str, + props: dict[str, Any] + ): + """ + Create freeze frame that rigidly matches the source video properties. + + This fixes the "silent pause" issue caused by sample rate mismatch + when concatenating with extracted video segments. + """ + cmd = [ + self.ffmpeg_path, + "-y", + "-loop", "1", + "-i", frame_path, + "-i", audio_path, + "-c:v", "libx264", + "-preset", "fast", + "-tune", "stillimage", + "-pix_fmt", props["pix_fmt"], + "-r", str(props["fps"]), + # Scale filter to ensure dimensions match exactly + "-vf", f"scale={props['width']}:{props['height']}:force_original_aspect_ratio=decrease,pad={props['width']}:{props['height']}:(ow-iw)/2:(oh-ih)/2", + # Audio Encoding (CRITICAL: Match source sample rate and channels) + "-c:a", "aac", + "-ar", props["sample_rate"], + "-ac", props["channels"], + "-b:a", "192k", + "-t", str(duration), + "-video_track_timescale", "90000", + "-shortest", + output_path + ] + await self._run_ffmpeg(cmd) + async def _concatenate_segments( self, segment_paths: list[str],