fix: resolve audio/video sync issues in accessible video renderer
- Update _get_video_properties() to extract audio sample_rate, channels, and pix_fmt in addition to video properties
- Add _extract_segment_reencoded() for frame-accurate cuts using re-encoding instead of stream copy (fixes keyframe-only cut limitation)
- Add _create_freeze_segment_matched() to enforce source audio property matching (fixes silent pauses caused by sample rate mismatch)
- Update _render_pause_insert_method() to use new methods with uniform encoding parameters
- Add -video_track_timescale 90000 for consistent timebase across segments

Root causes fixed:
1. -c copy could only cut at keyframes, causing audio dropouts
2. Sample rate mismatch (48kHz source vs 44.1kHz MP3) caused silent freeze-frame segments when concatenated

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
6acb452cfa
commit
54667fbcb8
1 changed files with 122 additions and 29 deletions
|
|
@ -199,9 +199,9 @@ class VideoRendererService:
|
|||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
temp_dir_path = Path(temp_dir)
|
||||
|
||||
# Get video properties for re-encoding freeze frames
|
||||
# Get detailed video AND audio properties for uniform encoding
|
||||
video_props = await self._get_video_properties(source_video_path)
|
||||
logger.info(f"Video properties: {video_props}")
|
||||
logger.info(f"Source Properties: {video_props}")
|
||||
|
||||
segment_files = []
|
||||
current_time = 0.0
|
||||
|
|
@ -220,14 +220,15 @@ class VideoRendererService:
|
|||
logger.warning(f"No AD audio found for cue {cue_index}, skipping")
|
||||
continue
|
||||
|
||||
# 1. Extract video segment from current_time to pause_point
|
||||
# 1. Extract video segment from current_time to pause_point (re-encoded for frame accuracy)
|
||||
if pause_point > current_time:
|
||||
segment_path = temp_dir_path / f"segment_{i}_video.mp4"
|
||||
await self._extract_segment(
|
||||
await self._extract_segment_reencoded(
|
||||
source_video_path,
|
||||
current_time,
|
||||
pause_point - current_time,
|
||||
str(segment_path)
|
||||
str(segment_path),
|
||||
video_props
|
||||
)
|
||||
segment_files.append(str(segment_path))
|
||||
|
||||
|
|
@ -239,9 +240,9 @@ class VideoRendererService:
|
|||
str(freeze_frame_path)
|
||||
)
|
||||
|
||||
# 3. Create freeze segment with AD audio
|
||||
# 3. Create freeze segment with AD audio (matched to source properties)
|
||||
freeze_segment_path = temp_dir_path / f"freeze_segment_{i}.mp4"
|
||||
await self._create_freeze_segment(
|
||||
await self._create_freeze_segment_matched(
|
||||
str(freeze_frame_path),
|
||||
ad_mp3_path,
|
||||
ad_duration,
|
||||
|
|
@ -252,15 +253,16 @@ class VideoRendererService:
|
|||
|
||||
current_time = pause_point
|
||||
|
||||
# 4. Add final segment from last pause point to end
|
||||
# 4. Add final segment from last pause point to end (re-encoded for uniformity)
|
||||
source_duration = await self._get_video_duration(source_video_path)
|
||||
if current_time < source_duration:
|
||||
final_segment_path = temp_dir_path / "segment_final.mp4"
|
||||
await self._extract_segment(
|
||||
await self._extract_segment_reencoded(
|
||||
source_video_path,
|
||||
current_time,
|
||||
source_duration - current_time,
|
||||
str(final_segment_path)
|
||||
str(final_segment_path),
|
||||
video_props
|
||||
)
|
||||
segment_files.append(str(final_segment_path))
|
||||
|
||||
|
|
@ -292,12 +294,11 @@ class VideoRendererService:
|
|||
return float(result.stdout.strip())
|
||||
|
||||
async def _get_video_properties(self, video_path: str) -> dict[str, Any]:
|
||||
"""Get video properties (resolution, framerate, codec) using ffprobe."""
|
||||
"""Get detailed video and audio properties to ensure matching during concatenation."""
|
||||
cmd = [
|
||||
self.ffprobe_path,
|
||||
"-v", "quiet",
|
||||
"-select_streams", "v:0",
|
||||
"-show_entries", "stream=width,height,r_frame_rate,codec_name",
|
||||
"-show_streams",
|
||||
"-of", "json",
|
||||
video_path
|
||||
]
|
||||
|
|
@ -311,23 +312,39 @@ class VideoRendererService:
|
|||
|
||||
import json
|
||||
data = json.loads(result.stdout)
|
||||
stream = data.get("streams", [{}])[0]
|
||||
|
||||
# Parse frame rate (e.g., "30000/1001" or "30/1")
|
||||
fps_str = stream.get("r_frame_rate", "30/1")
|
||||
if "/" in fps_str:
|
||||
num, den = fps_str.split("/")
|
||||
fps = float(num) / float(den)
|
||||
else:
|
||||
fps = float(fps_str)
|
||||
|
||||
return {
|
||||
"width": stream.get("width", 1920),
|
||||
"height": stream.get("height", 1080),
|
||||
"fps": fps,
|
||||
"codec": stream.get("codec_name", "h264")
|
||||
# Defaults (44100 is common for MP3, but we detect from source)
|
||||
props = {
|
||||
"width": 1920,
|
||||
"height": 1080,
|
||||
"fps": 30.0,
|
||||
"sample_rate": "44100",
|
||||
"channels": "2",
|
||||
"pix_fmt": "yuv420p",
|
||||
"codec": "h264"
|
||||
}
|
||||
|
||||
for stream in data.get("streams", []):
|
||||
if stream.get("codec_type") == "video":
|
||||
props["width"] = stream.get("width", props["width"])
|
||||
props["height"] = stream.get("height", props["height"])
|
||||
props["pix_fmt"] = stream.get("pix_fmt", props["pix_fmt"])
|
||||
props["codec"] = stream.get("codec_name", props["codec"])
|
||||
|
||||
# Parse frame rate (e.g., "30000/1001" or "30/1")
|
||||
fps_str = stream.get("r_frame_rate", "30/1")
|
||||
if "/" in fps_str:
|
||||
num, den = fps_str.split("/")
|
||||
props["fps"] = float(num) / float(den)
|
||||
else:
|
||||
props["fps"] = float(fps_str)
|
||||
|
||||
elif stream.get("codec_type") == "audio":
|
||||
props["sample_rate"] = stream.get("sample_rate", props["sample_rate"])
|
||||
props["channels"] = str(stream.get("channels", props["channels"]))
|
||||
|
||||
return props
|
||||
|
||||
async def _extract_segment(
|
||||
self,
|
||||
source_path: str,
|
||||
|
|
@ -335,7 +352,7 @@ class VideoRendererService:
|
|||
duration: float,
|
||||
output_path: str
|
||||
):
|
||||
"""Extract a video segment using ffmpeg."""
|
||||
"""Extract a video segment using ffmpeg (stream copy - for overlay method)."""
|
||||
cmd = [
|
||||
self.ffmpeg_path,
|
||||
"-y",
|
||||
|
|
@ -348,6 +365,43 @@ class VideoRendererService:
|
|||
]
|
||||
await self._run_ffmpeg(cmd)
|
||||
|
||||
async def _extract_segment_reencoded(
    self,
    source_path: str,
    start_time: float,
    duration: float,
    output_path: str,
    props: dict[str, Any]
):
    """
    Cut a time range out of the source and re-encode it (pause-insert method).

    Unlike a stream-copy extraction, both streams are transcoded, so the cut
    is not restricted to keyframe positions. Every segment produced this way
    shares the same codec settings, which is what later concatenation relies on.

    Args:
        source_path: Input media file handed to ffmpeg via ``-i``.
        start_time: Seek position in seconds (``-ss``, placed before ``-i``).
        duration: Length of the cut in seconds (``-t``).
        output_path: Destination file; overwritten if present (``-y``).
        props: Encoding targets. The keys ``pix_fmt``, ``fps``,
            ``sample_rate`` and ``channels`` are required. ``fps`` is
            stringified here; the other three are placed into the argument
            list as-is, so they are presumably already strings -- verify
            against the caller.

    Raises:
        KeyError: If ``props`` lacks one of the required keys.
    """
    # Input side: overwrite output, seek, open the source, bound the length.
    input_args = [
        self.ffmpeg_path,
        "-y",
        "-ss", str(start_time),
        "-i", source_path,
        "-t", str(duration),
    ]

    # Video is always transcoded to H.264 with the requested pixel format / rate.
    video_args = [
        "-c:v", "libx264",
        "-preset", "fast",
        "-pix_fmt", props["pix_fmt"],
        "-r", str(props["fps"]),
    ]

    # Audio is forced to AAC with the requested sample rate and channel count.
    audio_args = [
        "-c:a", "aac",
        "-ar", props["sample_rate"],
        "-ac", props["channels"],
        "-b:a", "192k",
    ]

    # Fixed track timescale so all generated segments share one timebase.
    output_args = ["-video_track_timescale", "90000", output_path]

    await self._run_ffmpeg([*input_args, *video_args, *audio_args, *output_args])
|
||||
|
||||
async def _extract_frame(self, video_path: str, time_point: float, output_path: str):
|
||||
"""Extract a single frame as PNG using ffmpeg."""
|
||||
cmd = [
|
||||
|
|
@ -369,7 +423,7 @@ class VideoRendererService:
|
|||
output_path: str,
|
||||
video_props: dict[str, Any]
|
||||
):
|
||||
"""Create a freeze-frame video segment with audio overlay."""
|
||||
"""Create a freeze-frame video segment with audio overlay (for overlay method)."""
|
||||
width = video_props.get("width", 1920)
|
||||
height = video_props.get("height", 1080)
|
||||
fps = video_props.get("fps", 30)
|
||||
|
|
@ -394,6 +448,45 @@ class VideoRendererService:
|
|||
]
|
||||
await self._run_ffmpeg(cmd)
|
||||
|
||||
async def _create_freeze_segment_matched(
    self,
    frame_path: str,
    audio_path: str,
    duration: float,
    output_path: str,
    props: dict[str, Any]
):
    """
    Build a still-image clip carrying the given audio, encoded to match ``props``.

    The image is looped as the video track and the audio file becomes the
    audio track. Video is scaled/padded to the requested dimensions and the
    audio is re-encoded to the requested sample rate and channel count, so
    the clip can be concatenated with re-encoded source segments without a
    parameter mismatch (the stated cause of the "silent pause" issue).

    Args:
        frame_path: Still image used as the looping video input.
        audio_path: Audio file used as the second input.
        duration: Upper bound on clip length in seconds (``-t``); ``-shortest``
            is also set, so the output ends with the shorter stream.
        output_path: Destination file; overwritten if present (``-y``).
        props: Encoding targets. The keys ``pix_fmt``, ``fps``, ``width``,
            ``height``, ``sample_rate`` and ``channels`` are required.
            ``sample_rate`` and ``channels`` are placed into the argument
            list as-is, so they are presumably already strings -- verify
            against the caller.

    Raises:
        KeyError: If ``props`` lacks one of the required keys.
    """
    # Two inputs: the looped still image, then the audio description track.
    argv = [
        self.ffmpeg_path,
        "-y",
        "-loop", "1",
        "-i", frame_path,
        "-i", audio_path,
    ]

    # H.264 tuned for static content, at the requested pixel format / rate.
    argv += [
        "-c:v", "libx264",
        "-preset", "fast",
        "-tune", "stillimage",
        "-pix_fmt", props["pix_fmt"],
        "-r", str(props["fps"]),
    ]

    # Fit inside width x height keeping aspect ratio, then pad centered
    # so the final dimensions match exactly.
    frame_size = f"{props['width']}:{props['height']}"
    argv += [
        "-vf",
        f"scale={frame_size}:force_original_aspect_ratio=decrease,"
        f"pad={frame_size}:(ow-iw)/2:(oh-ih)/2",
    ]

    # Audio must match the requested sample rate and channel layout.
    argv += [
        "-c:a", "aac",
        "-ar", props["sample_rate"],
        "-ac", props["channels"],
        "-b:a", "192k",
    ]

    # Length cap, shared track timescale, and stop at the shorter stream.
    argv += [
        "-t", str(duration),
        "-video_track_timescale", "90000",
        "-shortest",
        output_path,
    ]

    await self._run_ffmpeg(argv)
|
||||
|
||||
async def _concatenate_segments(
|
||||
self,
|
||||
segment_paths: list[str],
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue