fix: use actual freeze segment durations for VTT subtitle retiming

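Subtitles were drifting progressively out of sync (appearing ~1.0s early
per audio description (AD) insert) because the VTT retimer computed each
freeze duration from the nominal formula (ad_duration + 1.0s of silence
buffer) rather than using the actual duration of the rendered freeze
segment.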

Changes:
- video_renderer: Measure actual freeze segment duration after creation
- video_renderer: Return updated placements with actual_freeze_duration
- vtt_retimer: Prefer actual_freeze_duration over calculated values
- render_task: Pass actual durations to VTT retimer

This ensures subtitle timing matches the real video timeline regardless
of any FFmpeg encoding variations.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
michael 2026-01-05 15:52:57 -06:00
parent e44210ea64
commit add958008a
4 changed files with 198 additions and 14 deletions

View file

@ -300,7 +300,7 @@ class VideoRendererService:
ad_segments: list[tuple[int, str]], # [(cue_index, mp3_path), ...]
analysis: dict[str, Any],
output_path: str,
) -> str:
) -> tuple[str, list[dict] | None]:
"""
Render accessible video based on Gemini analysis.
@ -311,7 +311,9 @@ class VideoRendererService:
output_path: Where to save the output MP4
Returns:
Path to rendered accessible video
Tuple of (output_path, updated_placements)
- output_path: Path to rendered accessible video
- updated_placements: Placements with actual_freeze_duration added (pause-insert only)
"""
method = analysis.get("method", "pause_insert")
@ -323,9 +325,10 @@ class VideoRendererService:
try:
if method == "overlay":
return await self._render_overlay_method(
result_path = await self._render_overlay_method(
source_video_path, ad_segments, analysis, output_path
)
return (result_path, None)
else:
return await self._render_pause_insert_method(
source_video_path, ad_segments, analysis, output_path
@ -457,7 +460,7 @@ class VideoRendererService:
ad_segments: list[tuple[int, str]],
analysis: dict[str, Any],
output_path: str,
) -> str:
) -> tuple[str, list[dict]]:
"""
Render with pause-insert method:
1. Split video at each pause point
@ -486,7 +489,7 @@ class VideoRendererService:
if not sorted_placements:
logger.warning("No pause points found, copying source video")
await self._copy_video(source_video_path, output_path)
return output_path
return (output_path, [])
with tempfile.TemporaryDirectory() as temp_dir:
temp_dir_path = Path(temp_dir)
@ -642,6 +645,30 @@ class VideoRendererService:
await asyncio.gather(*phase3_tasks)
logger.info(f"Phase 3 complete: created {len(freeze_segment_paths)} freeze segments")
# ============================================================
# PHASE 3.5: Measure actual freeze segment durations for VTT retiming
# ============================================================
logger.info("Measuring actual freeze segment durations...")
for p in valid_placements:
i = p["index"]
freeze_path = freeze_segment_paths[i]
actual_duration = await self._get_video_duration(freeze_path)
p["actual_freeze_duration"] = actual_duration
# Log any discrepancy between expected and actual duration
expected = p["ad_duration"] + (2 * silence_duration)
discrepancy = actual_duration - expected
if abs(discrepancy) > 0.01: # 10ms threshold
logger.warning(
f"Freeze segment duration mismatch for cue {p['cue_index']}: "
f"expected={expected:.3f}s, actual={actual_duration:.3f}s, "
f"discrepancy={discrepancy:+.3f}s"
)
else:
logger.debug(
f"Freeze segment cue {p['cue_index']}: duration={actual_duration:.3f}s (expected={expected:.3f}s)"
)
# ============================================================
# PHASE 4: Assemble segment list in correct order
# ============================================================
@ -678,7 +705,24 @@ class VideoRendererService:
await self._copy_video(source_video_path, output_path)
logger.info(f"Pause-insert render complete: {output_path}")
return output_path
# Build updated placements with actual_freeze_duration
# Map from cue_index to actual_freeze_duration
actual_durations = {
p["cue_index"]: p["actual_freeze_duration"]
for p in valid_placements
}
# Update original placements with actual freeze durations
updated_placements = []
for placement in sorted_placements:
updated = placement.copy()
cue_index = placement.get("ad_cue_index")
if cue_index in actual_durations:
updated["actual_freeze_duration"] = actual_durations[cue_index]
updated_placements.append(updated)
return (output_path, updated_placements)
async def _get_video_duration(self, video_path: str) -> float:
"""Get video duration in seconds using ffprobe."""

View file

@ -66,8 +66,12 @@ class VTTRetimerService:
self,
placements: list[dict]
) -> list[tuple[float, float]]:
"""Build sorted list of (pause_point, effective_offset) tuples."""
silence_buffer_total = 1.0 # 500ms + 500ms
"""Build sorted list of (pause_point, effective_offset) tuples.
Uses actual_freeze_duration when available (measured from rendered video),
otherwise falls back to calculated value (ad_duration + 1.0s).
"""
silence_buffer_total = 1.0 # 500ms + 500ms (fallback calculation)
pauses = []
for placement in placements:
@ -75,12 +79,22 @@ class VTTRetimerService:
ad_duration = placement.get("ad_duration", 0)
if pause_point is not None and ad_duration > 0:
effective_offset = ad_duration + silence_buffer_total
# Prefer actual freeze duration if available (measured from rendered video)
actual_freeze = placement.get("actual_freeze_duration")
if actual_freeze is not None:
effective_offset = actual_freeze
logger.debug(
f"Pause at {pause_point:.2f}s: using actual_freeze_duration={effective_offset:.2f}s "
f"(ad_duration={ad_duration:.2f}s)"
)
else:
effective_offset = ad_duration + silence_buffer_total
logger.debug(
f"Pause at {pause_point:.2f}s: using calculated freeze_duration={effective_offset:.2f}s "
f"(ad_duration={ad_duration:.2f}s + 1.0s buffer)"
)
pauses.append((pause_point, effective_offset))
logger.debug(
f"Pause at {pause_point:.2f}s: ad_duration={ad_duration:.2f}s, "
f"freeze_duration={effective_offset:.2f}s"
)
return sorted(pauses, key=lambda x: x[0])

View file

@ -202,13 +202,18 @@ async def _async_render_accessible_video(job_id: str, language: str):
output_video_path = os.path.join(temp_dir, "accessible_video.mp4")
logger.info(f"Rendering accessible video using {method} method...")
await video_renderer_service.render_accessible_video(
rendered_path, updated_placements = await video_renderer_service.render_accessible_video(
source_video_path,
ad_segments,
analysis,
output_video_path
)
# Update analysis with actual freeze durations for VTT retiming
if updated_placements:
analysis["placements"] = updated_placements
logger.info(f"Updated {len(updated_placements)} placements with actual freeze durations")
# 7. Upload rendered video to GCS
video_blob_path = f"{job_id}/{language}/accessible_video.mp4"
video_blob = gcs_service.bucket.blob(video_blob_path)

View file

@ -389,3 +389,124 @@ class TestVTTRetimerHelpers:
assert len(filtered) == 2
assert filtered[0]["text"] == "long enough"
assert filtered[1]["text"] == "just enough"
def test_build_pause_list_uses_actual_freeze_duration(self, retimer):
    """Check that a measured freeze duration overrides the computed one.

    A placement that carries ``actual_freeze_duration`` must contribute that
    value as its offset; a placement without it must fall back to
    ``ad_duration`` plus the 1.0s buffer.
    """
    measured = {
        "pause_point": 10.0,
        "ad_duration": 3.0,
        "actual_freeze_duration": 4.5,
    }
    # No measured duration here, so the retimer has to compute 5.0 + 1.0.
    unmeasured = {"pause_point": 20.0, "ad_duration": 5.0}

    pauses = retimer._build_pause_list([measured, unmeasured])

    assert len(pauses) == 2
    first_pause, second_pause = pauses
    # Measured value (4.5s) is used as-is.
    assert first_pause == (10.0, 4.5)
    # Fallback value: 5.0s + 1.0s = 6.0s.
    assert second_pause == (20.0, 6.0)
class TestVTTRetimerActualFreezeDuration:
    """Test VTT retiming with the ``actual_freeze_duration`` placement field.

    Each test passes a one-cue VTT document and a list of placements to
    ``retime_for_pause_insert``, parses the result back with ``_parse_vtt``
    and checks the shifted cue times.
    """

    # The VTT timestamps used here have millisecond resolution, so compare
    # with an absolute 1 ms tolerance. A relative tolerance of 1e-3 on values
    # around 35 s would accept errors of ~35 ms, enough to hide the kind of
    # drift these tests are meant to catch.
    TIME_TOLERANCE = 1e-3

    @pytest.fixture
    def retimer(self):
        """Create a VTTRetimerService instance."""
        return VTTRetimerService()

    @staticmethod
    def _single_cue_vtt(start, end, text):
        """Return a minimal VTT document: the header line plus one cue."""
        # NOTE(review): the WebVTT format expects a blank line between the
        # "WEBVTT" header and the first cue; the fixture text is kept exactly
        # as found here -- confirm against the parser's expectations.
        return f"WEBVTT\n{start} --> {end}\n{text}\n"

    @staticmethod
    def _retimed_cues(retimer, vtt, placements):
        """Retime ``vtt`` for the given placements and return the parsed cues."""
        result = retimer.retime_for_pause_insert(vtt, {"placements": placements})
        return retimer._parse_vtt(result)

    def _assert_cue_times(self, cue, start, end):
        """Assert that a parsed cue starts and ends at the expected seconds."""
        assert cue["start_time"] == pytest.approx(start, abs=self.TIME_TOLERANCE)
        assert cue["end_time"] == pytest.approx(end, abs=self.TIME_TOLERANCE)

    def test_uses_actual_freeze_duration_when_provided(self, retimer):
        """Should use actual_freeze_duration instead of calculated value."""
        vtt = self._single_cue_vtt("00:00:15.000", "00:00:18.000", "Later caption")
        placements = [{
            "pause_point": 10.0,
            "ad_duration": 3.0,
            "actual_freeze_duration": 4.5,  # Actual is 0.5s longer than calculated
        }]

        cues = self._retimed_cues(retimer, vtt, placements)

        # Should use actual_freeze_duration (4.5s) not calculated (4.0s)
        assert len(cues) == 1
        self._assert_cue_times(cues[0], 19.5, 22.5)  # 15 + 4.5, 18 + 4.5

    def test_falls_back_to_calculated_when_actual_not_provided(self, retimer):
        """Should use calculated value when actual_freeze_duration not provided."""
        vtt = self._single_cue_vtt("00:00:15.000", "00:00:18.000", "Later caption")
        placements = [{
            "pause_point": 10.0,
            "ad_duration": 3.0,
            # No actual_freeze_duration
        }]

        cues = self._retimed_cues(retimer, vtt, placements)

        # Should use calculated (3.0 + 1.0 = 4.0s)
        assert len(cues) == 1
        self._assert_cue_times(cues[0], 19.0, 22.0)  # 15 + 4, 18 + 4

    def test_mixed_actual_and_calculated(self, retimer):
        """Should handle mix of actual and calculated freeze durations."""
        vtt = self._single_cue_vtt("00:00:25.000", "00:00:28.000", "Late caption")
        placements = [
            {"pause_point": 10.0, "ad_duration": 3.0, "actual_freeze_duration": 4.2},
            {"pause_point": 20.0, "ad_duration": 5.0},  # No actual
        ]

        cues = self._retimed_cues(retimer, vtt, placements)

        # Total offset = 4.2 (actual) + 6.0 (calculated: 5+1) = 10.2
        assert len(cues) == 1
        self._assert_cue_times(cues[0], 35.2, 38.2)  # 25 + 10.2, 28 + 10.2

    def test_cue_spanning_pause_with_actual_duration(self, retimer):
        """Cue spanning pause should use actual_freeze_duration for split timing."""
        vtt = self._single_cue_vtt("00:00:08.000", "00:00:12.000", "Spanning caption")
        placements = [{
            "pause_point": 10.0,
            "ad_duration": 3.0,
            "actual_freeze_duration": 4.3,  # Slightly longer than calculated
        }]

        cues = self._retimed_cues(retimer, vtt, placements)

        # Should split into two segments
        assert len(cues) == 2
        # Segment 1: 8s-10s (before freeze, no offset)
        self._assert_cue_times(cues[0], 8.0, 10.0)
        # Segment 2: 14.3s-16.3s (after freeze, +4.3s offset)
        self._assert_cue_times(cues[1], 14.3, 16.3)