fix: resolve audio/video sync issues in accessible video renderer

- Update _get_video_properties() to extract audio sample_rate, channels,
  and pix_fmt in addition to video properties
- Add _extract_segment_reencoded() for frame-accurate cuts using
  re-encoding instead of stream copy (fixes keyframe-only cut limitation)
- Add _create_freeze_segment_matched() to enforce source audio property
  matching (fixes silent pauses caused by sample rate mismatch)
- Update _render_pause_insert_method() to use new methods with uniform
  encoding parameters
- Add -video_track_timescale 90000 for consistent timebase across segments

Root causes fixed:
1. -c copy could only cut at keyframes, causing audio dropouts
2. Sample rate mismatch (48kHz source vs 44.1kHz MP3) caused silent
   freeze-frame segments when concatenated

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
michael 2025-12-26 12:05:32 -06:00
parent 6acb452cfa
commit 54667fbcb8

View file

@ -199,9 +199,9 @@ class VideoRendererService:
with tempfile.TemporaryDirectory() as temp_dir:
temp_dir_path = Path(temp_dir)
# Get video properties for re-encoding freeze frames
# Get detailed video AND audio properties for uniform encoding
video_props = await self._get_video_properties(source_video_path)
logger.info(f"Video properties: {video_props}")
logger.info(f"Source Properties: {video_props}")
segment_files = []
current_time = 0.0
@ -220,14 +220,15 @@ class VideoRendererService:
logger.warning(f"No AD audio found for cue {cue_index}, skipping")
continue
# 1. Extract video segment from current_time to pause_point
# 1. Extract video segment from current_time to pause_point (re-encoded for frame accuracy)
if pause_point > current_time:
segment_path = temp_dir_path / f"segment_{i}_video.mp4"
await self._extract_segment(
await self._extract_segment_reencoded(
source_video_path,
current_time,
pause_point - current_time,
str(segment_path)
str(segment_path),
video_props
)
segment_files.append(str(segment_path))
@ -239,9 +240,9 @@ class VideoRendererService:
str(freeze_frame_path)
)
# 3. Create freeze segment with AD audio
# 3. Create freeze segment with AD audio (matched to source properties)
freeze_segment_path = temp_dir_path / f"freeze_segment_{i}.mp4"
await self._create_freeze_segment(
await self._create_freeze_segment_matched(
str(freeze_frame_path),
ad_mp3_path,
ad_duration,
@ -252,15 +253,16 @@ class VideoRendererService:
current_time = pause_point
# 4. Add final segment from last pause point to end
# 4. Add final segment from last pause point to end (re-encoded for uniformity)
source_duration = await self._get_video_duration(source_video_path)
if current_time < source_duration:
final_segment_path = temp_dir_path / "segment_final.mp4"
await self._extract_segment(
await self._extract_segment_reencoded(
source_video_path,
current_time,
source_duration - current_time,
str(final_segment_path)
str(final_segment_path),
video_props
)
segment_files.append(str(final_segment_path))
@ -292,12 +294,11 @@ class VideoRendererService:
return float(result.stdout.strip())
async def _get_video_properties(self, video_path: str) -> dict[str, Any]:
"""Get video properties (resolution, framerate, codec) using ffprobe."""
"""Get detailed video and audio properties to ensure matching during concatenation."""
cmd = [
self.ffprobe_path,
"-v", "quiet",
"-select_streams", "v:0",
"-show_entries", "stream=width,height,r_frame_rate,codec_name",
"-show_streams",
"-of", "json",
video_path
]
@ -311,23 +312,39 @@ class VideoRendererService:
import json
data = json.loads(result.stdout)
stream = data.get("streams", [{}])[0]
# Parse frame rate (e.g., "30000/1001" or "30/1")
fps_str = stream.get("r_frame_rate", "30/1")
if "/" in fps_str:
num, den = fps_str.split("/")
fps = float(num) / float(den)
else:
fps = float(fps_str)
return {
"width": stream.get("width", 1920),
"height": stream.get("height", 1080),
"fps": fps,
"codec": stream.get("codec_name", "h264")
# Defaults (44100 is common for MP3, but we detect from source)
props = {
"width": 1920,
"height": 1080,
"fps": 30.0,
"sample_rate": "44100",
"channels": "2",
"pix_fmt": "yuv420p",
"codec": "h264"
}
for stream in data.get("streams", []):
if stream.get("codec_type") == "video":
props["width"] = stream.get("width", props["width"])
props["height"] = stream.get("height", props["height"])
props["pix_fmt"] = stream.get("pix_fmt", props["pix_fmt"])
props["codec"] = stream.get("codec_name", props["codec"])
# Parse frame rate (e.g., "30000/1001" or "30/1")
fps_str = stream.get("r_frame_rate", "30/1")
if "/" in fps_str:
num, den = fps_str.split("/")
props["fps"] = float(num) / float(den)
else:
props["fps"] = float(fps_str)
elif stream.get("codec_type") == "audio":
props["sample_rate"] = stream.get("sample_rate", props["sample_rate"])
props["channels"] = str(stream.get("channels", props["channels"]))
return props
async def _extract_segment(
self,
source_path: str,
@ -335,7 +352,7 @@ class VideoRendererService:
duration: float,
output_path: str
):
"""Extract a video segment using ffmpeg."""
"""Extract a video segment using ffmpeg (stream copy - for overlay method)."""
cmd = [
self.ffmpeg_path,
"-y",
@ -348,6 +365,43 @@ class VideoRendererService:
]
await self._run_ffmpeg(cmd)
async def _extract_segment_reencoded(
    self,
    source_path: str,
    start_time: float,
    duration: float,
    output_path: str,
    props: dict[str, Any]
):
    """
    Cut a time range out of the source and re-encode it.

    Used by the pause-insert method instead of stream copy so that the cut
    is not restricted to keyframes (which caused audio dropouts) and so
    every segment shares the same encoding parameters and timebase.

    Args:
        source_path: Path of the video to cut from.
        start_time: Offset passed to ``-ss`` (placed before ``-i``).
        duration: Length passed to ``-t``.
        output_path: Destination file for the encoded segment.
        props: Source properties; reads ``pix_fmt``, ``fps``,
            ``sample_rate`` and ``channels``.
    """
    # Where to cut: seek option precedes the input, duration follows it.
    cut_args = [
        "-ss", str(start_time),
        "-i", source_path,
        "-t", str(duration),
    ]

    # Video is always re-encoded with x264 at the source pixel format/rate.
    video_args = [
        "-c:v", "libx264",
        "-preset", "fast",
        "-pix_fmt", props["pix_fmt"],
        "-r", str(props["fps"]),
    ]

    # Audio is forced to the source sample rate and channel count.
    audio_args = [
        "-c:a", "aac",
        "-ar", props["sample_rate"],
        "-ac", props["channels"],
        "-b:a", "192k",
    ]

    # Same track timescale on every segment keeps timestamps continuous.
    timebase_args = ["-video_track_timescale", "90000"]

    ffmpeg_cmd = [
        self.ffmpeg_path,
        "-y",
        *cut_args,
        *video_args,
        *audio_args,
        *timebase_args,
        output_path,
    ]
    await self._run_ffmpeg(ffmpeg_cmd)
async def _extract_frame(self, video_path: str, time_point: float, output_path: str):
"""Extract a single frame as PNG using ffmpeg."""
cmd = [
@ -369,7 +423,7 @@ class VideoRendererService:
output_path: str,
video_props: dict[str, Any]
):
"""Create a freeze-frame video segment with audio overlay."""
"""Create a freeze-frame video segment with audio overlay (for overlay method)."""
width = video_props.get("width", 1920)
height = video_props.get("height", 1080)
fps = video_props.get("fps", 30)
@ -394,6 +448,45 @@ class VideoRendererService:
]
await self._run_ffmpeg(cmd)
async def _create_freeze_segment_matched(
    self,
    frame_path: str,
    audio_path: str,
    duration: float,
    output_path: str,
    props: dict[str, Any]
):
    """
    Build a freeze-frame clip whose encoding matches the source video.

    A still image is looped as the video track and the given audio file is
    used as the audio track. Audio is resampled to the source sample rate
    and channel count; a mismatch here was the cause of the "silent pause"
    problem when these clips were concatenated with extracted segments.

    Args:
        frame_path: Still image to loop as the video track.
        audio_path: Audio file to use as the audio track.
        duration: Length passed to ``-t``.
        output_path: Destination file for the encoded clip.
        props: Source properties; reads ``pix_fmt``, ``fps``, ``width``,
            ``height``, ``sample_rate`` and ``channels``.
    """
    # Looped still image first, audio description second.
    input_args = [
        "-loop", "1",
        "-i", frame_path,
        "-i", audio_path,
    ]

    encode_args = [
        "-c:v", "libx264",
        "-preset", "fast",
        "-tune", "stillimage",
        "-pix_fmt", props["pix_fmt"],
        "-r", str(props["fps"]),
    ]

    # Fit the frame inside the source dimensions, then pad to exactly that size.
    target_w = props["width"]
    target_h = props["height"]
    fit_and_pad = (
        f"scale={target_w}:{target_h}:force_original_aspect_ratio=decrease,"
        f"pad={target_w}:{target_h}:(ow-iw)/2:(oh-ih)/2"
    )

    # CRITICAL: audio must match the source sample rate and channels.
    audio_args = [
        "-c:a", "aac",
        "-ar", props["sample_rate"],
        "-ac", props["channels"],
        "-b:a", "192k",
    ]

    # Bound the output length and use the shared track timescale.
    output_args = [
        "-t", str(duration),
        "-video_track_timescale", "90000",
        "-shortest",
    ]

    ffmpeg_cmd = [
        self.ffmpeg_path,
        "-y",
        *input_args,
        *encode_args,
        "-vf", fit_and_pad,
        *audio_args,
        *output_args,
        output_path,
    ]
    await self._run_ffmpeg(ffmpeg_cmd)
async def _concatenate_segments(
self,
segment_paths: list[str],