fix: use actual freeze segment durations for VTT subtitle retiming

Subtitles were appearing progressively out of sync (~1.0s early per AD) because the VTT retimer calculated freeze durations theoretically rather than using the actual rendered segment durations.

Changes:
- video_renderer: Measure actual freeze segment duration after creation
- video_renderer: Return updated placements with actual_freeze_duration
- vtt_retimer: Prefer actual_freeze_duration over calculated values
- render_task: Pass actual durations to VTT retimer

This ensures subtitle timing matches the real video timeline regardless of any FFmpeg encoding variations.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-05 15:52:57 -06:00 · 2026-01-05 15:52:57 -06:00 · add958008a
commit add958008a
parent e44210ea64
4 changed files with 198 additions and 14 deletions
--- a/backend/app/services/video_renderer.py
+++ b/backend/app/services/video_renderer.py
@ -300,7 +300,7 @@ class VideoRendererService:
        ad_segments: list[tuple[int, str]],  # [(cue_index, mp3_path), ...]
        analysis: dict[str, Any],
        output_path: str,
-    ) -> str:
+    ) -> tuple[str, list[dict] | None]:
        """
        Render accessible video based on Gemini analysis.

@ -311,7 +311,9 @@ class VideoRendererService:
            output_path: Where to save the output MP4

        Returns:
-            Path to rendered accessible video
+            Tuple of (output_path, updated_placements)
+            - output_path: Path to rendered accessible video
+            - updated_placements: Placements with actual_freeze_duration added (pause-insert method); None for the overlay method, [] when no pause points are found
        """
        method = analysis.get("method", "pause_insert")

@ -323,9 +325,10 @@ class VideoRendererService:

        try:
            if method == "overlay":
-                return await self._render_overlay_method(
+                result_path = await self._render_overlay_method(
                    source_video_path, ad_segments, analysis, output_path
                )
+                return (result_path, None)
            else:
                return await self._render_pause_insert_method(
                    source_video_path, ad_segments, analysis, output_path
@ -457,7 +460,7 @@ class VideoRendererService:
        ad_segments: list[tuple[int, str]],
        analysis: dict[str, Any],
        output_path: str,
-    ) -> str:
+    ) -> tuple[str, list[dict]]:
        """
        Render with pause-insert method:
        1. Split video at each pause point
@ -486,7 +489,7 @@ class VideoRendererService:
        if not sorted_placements:
            logger.warning("No pause points found, copying source video")
            await self._copy_video(source_video_path, output_path)
-            return output_path
+            return (output_path, [])

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_dir_path = Path(temp_dir)
@ -642,6 +645,30 @@ class VideoRendererService:
            await asyncio.gather(*phase3_tasks)
            logger.info(f"Phase 3 complete: created {len(freeze_segment_paths)} freeze segments")

+            # ============================================================
+            # PHASE 3.5: Measure actual freeze segment durations for VTT retiming
+            # ============================================================
+            logger.info("Measuring actual freeze segment durations...")
+            for p in valid_placements:
+                i = p["index"]
+                freeze_path = freeze_segment_paths[i]
+                actual_duration = await self._get_video_duration(freeze_path)
+                p["actual_freeze_duration"] = actual_duration
+
+                # Log any discrepancy between expected and actual duration
+                expected = p["ad_duration"] + (2 * silence_duration)
+                discrepancy = actual_duration - expected
+                if abs(discrepancy) > 0.01:  # 10ms threshold
+                    logger.warning(
+                        f"Freeze segment duration mismatch for cue {p['cue_index']}: "
+                        f"expected={expected:.3f}s, actual={actual_duration:.3f}s, "
+                        f"discrepancy={discrepancy:+.3f}s"
+                    )
+                else:
+                    logger.debug(
+                        f"Freeze segment cue {p['cue_index']}: duration={actual_duration:.3f}s (expected={expected:.3f}s)"
+                    )
+
            # ============================================================
            # PHASE 4: Assemble segment list in correct order
            # ============================================================
@ -678,7 +705,24 @@ class VideoRendererService:
                await self._copy_video(source_video_path, output_path)

            logger.info(f"Pause-insert render complete: {output_path}")
-            return output_path
+
+            # Build updated placements with actual_freeze_duration
+            # Map from cue_index to actual_freeze_duration
+            actual_durations = {
+                p["cue_index"]: p["actual_freeze_duration"]
+                for p in valid_placements
+            }
+
+            # Update original placements with actual freeze durations
+            updated_placements = []
+            for placement in sorted_placements:
+                updated = placement.copy()
+                cue_index = placement.get("ad_cue_index")
+                if cue_index in actual_durations:
+                    updated["actual_freeze_duration"] = actual_durations[cue_index]
+                updated_placements.append(updated)
+
+            return (output_path, updated_placements)

    async def _get_video_duration(self, video_path: str) -> float:
        """Get video duration in seconds using ffprobe."""
--- a/backend/app/services/vtt_retimer.py
+++ b/backend/app/services/vtt_retimer.py
@ -66,8 +66,12 @@ class VTTRetimerService:
        self,
        placements: list[dict]
    ) -> list[tuple[float, float]]:
-        """Build sorted list of (pause_point, effective_offset) tuples."""
-        silence_buffer_total = 1.0  # 500ms + 500ms
+        """Build sorted list of (pause_point, effective_offset) tuples.
+
+        Uses actual_freeze_duration when available (measured from rendered video),
+        otherwise falls back to calculated value (ad_duration + 1.0s).
+        """
+        silence_buffer_total = 1.0  # 500ms + 500ms (fallback calculation)

        pauses = []
        for placement in placements:
@ -75,12 +79,22 @@ class VTTRetimerService:
            ad_duration = placement.get("ad_duration", 0)

            if pause_point is not None and ad_duration > 0:
-                effective_offset = ad_duration + silence_buffer_total
+                # Prefer actual freeze duration if available (measured from rendered video)
+                actual_freeze = placement.get("actual_freeze_duration")
+                if actual_freeze is not None:
+                    effective_offset = actual_freeze
+                    logger.debug(
+                        f"Pause at {pause_point:.2f}s: using actual_freeze_duration={effective_offset:.2f}s "
+                        f"(ad_duration={ad_duration:.2f}s)"
+                    )
+                else:
+                    effective_offset = ad_duration + silence_buffer_total
+                    logger.debug(
+                        f"Pause at {pause_point:.2f}s: using calculated freeze_duration={effective_offset:.2f}s "
+                        f"(ad_duration={ad_duration:.2f}s + 1.0s buffer)"
+                    )
+
                pauses.append((pause_point, effective_offset))
-                logger.debug(
-                    f"Pause at {pause_point:.2f}s: ad_duration={ad_duration:.2f}s, "
-                    f"freeze_duration={effective_offset:.2f}s"
-                )

        return sorted(pauses, key=lambda x: x[0])

--- a/backend/app/tasks/render_accessible_video.py
+++ b/backend/app/tasks/render_accessible_video.py
@ -202,13 +202,18 @@ async def _async_render_accessible_video(job_id: str, language: str):
            output_video_path = os.path.join(temp_dir, "accessible_video.mp4")

            logger.info(f"Rendering accessible video using {method} method...")
-            await video_renderer_service.render_accessible_video(
+            rendered_path, updated_placements = await video_renderer_service.render_accessible_video(
                source_video_path,
                ad_segments,
                analysis,
                output_video_path
            )

+            # Update analysis with actual freeze durations for VTT retiming
+            if updated_placements:
+                analysis["placements"] = updated_placements
+                logger.info(f"Updated {len(updated_placements)} placements with actual freeze durations")
+
            # 7. Upload rendered video to GCS
            video_blob_path = f"{job_id}/{language}/accessible_video.mp4"
            video_blob = gcs_service.bucket.blob(video_blob_path)
--- a/backend/tests/unit/test_vtt_retimer.py
+++ b/backend/tests/unit/test_vtt_retimer.py
@ -389,3 +389,124 @@ class TestVTTRetimerHelpers:
        assert len(filtered) == 2
        assert filtered[0]["text"] == "long enough"
        assert filtered[1]["text"] == "just enough"
+
+    def test_build_pause_list_uses_actual_freeze_duration(self, retimer):
+        """_build_pause_list should use actual_freeze_duration when available."""
+        placements = [
+            {"pause_point": 10.0, "ad_duration": 3.0, "actual_freeze_duration": 4.5},
+            {"pause_point": 20.0, "ad_duration": 5.0},  # No actual, should calculate
+        ]
+
+        pauses = retimer._build_pause_list(placements)
+
+        assert len(pauses) == 2
+        # First pause: should use actual_freeze_duration (4.5s)
+        assert pauses[0] == (10.0, 4.5)
+        # Second pause: should use calculated (5.0 + 1.0 = 6.0s)
+        assert pauses[1] == (20.0, 6.0)
+
+
+class TestVTTRetimerActualFreezeDuration:
+    """Test VTT retiming with actual_freeze_duration field."""
+
+    @pytest.fixture
+    def retimer(self):
+        """Create a VTTRetimerService instance."""
+        return VTTRetimerService()
+
+    def test_uses_actual_freeze_duration_when_provided(self, retimer):
+        """Should use actual_freeze_duration instead of calculated value."""
+        vtt = """WEBVTT
+
+00:00:15.000 --> 00:00:18.000
+Later caption
+"""
+        analysis = {
+            "placements": [{
+                "pause_point": 10.0,
+                "ad_duration": 3.0,
+                "actual_freeze_duration": 4.5  # Actual is 0.5s longer than calculated
+            }]
+        }
+
+        result = retimer.retime_for_pause_insert(vtt, analysis)
+        cues = retimer._parse_vtt(result)
+
+        # Should use actual_freeze_duration (4.5s) not calculated (4.0s)
+        assert len(cues) == 1
+        assert cues[0]["start_time"] == 19.5  # 15 + 4.5
+        assert cues[0]["end_time"] == 22.5    # 18 + 4.5
+
+    def test_falls_back_to_calculated_when_actual_not_provided(self, retimer):
+        """Should use calculated value when actual_freeze_duration not provided."""
+        vtt = """WEBVTT
+
+00:00:15.000 --> 00:00:18.000
+Later caption
+"""
+        analysis = {
+            "placements": [{
+                "pause_point": 10.0,
+                "ad_duration": 3.0
+                # No actual_freeze_duration
+            }]
+        }
+
+        result = retimer.retime_for_pause_insert(vtt, analysis)
+        cues = retimer._parse_vtt(result)
+
+        # Should use calculated (3.0 + 1.0 = 4.0s)
+        assert len(cues) == 1
+        assert cues[0]["start_time"] == 19.0  # 15 + 4
+        assert cues[0]["end_time"] == 22.0    # 18 + 4
+
+    def test_mixed_actual_and_calculated(self, retimer):
+        """Should handle mix of actual and calculated freeze durations."""
+        vtt = """WEBVTT
+
+00:00:25.000 --> 00:00:28.000
+Late caption
+"""
+        analysis = {
+            "placements": [
+                {"pause_point": 10.0, "ad_duration": 3.0, "actual_freeze_duration": 4.2},
+                {"pause_point": 20.0, "ad_duration": 5.0}  # No actual
+            ]
+        }
+
+        result = retimer.retime_for_pause_insert(vtt, analysis)
+        cues = retimer._parse_vtt(result)
+
+        # Total offset = 4.2 (actual) + 6.0 (calculated: 5+1) = 10.2
+        assert len(cues) == 1
+        assert cues[0]["start_time"] == pytest.approx(35.2, rel=1e-3)  # 25 + 10.2
+        assert cues[0]["end_time"] == pytest.approx(38.2, rel=1e-3)    # 28 + 10.2
+
+    def test_cue_spanning_pause_with_actual_duration(self, retimer):
+        """Cue spanning pause should use actual_freeze_duration for split timing."""
+        vtt = """WEBVTT
+
+00:00:08.000 --> 00:00:12.000
+Spanning caption
+"""
+        analysis = {
+            "placements": [{
+                "pause_point": 10.0,
+                "ad_duration": 3.0,
+                "actual_freeze_duration": 4.3  # Slightly longer than calculated
+            }]
+        }
+
+        result = retimer.retime_for_pause_insert(vtt, analysis)
+        cues = retimer._parse_vtt(result)
+
+        # Should split into two segments
+        assert len(cues) == 2
+
+        # Segment 1: 8s-10s (before freeze, no offset)
+        assert cues[0]["start_time"] == 8.0
+        assert cues[0]["end_time"] == 10.0
+
+        # Segment 2: 14.3s-16.3s (after freeze, +4.3s offset)
+        assert cues[1]["start_time"] == pytest.approx(14.3, rel=1e-3)
+        assert cues[1]["end_time"] == pytest.approx(16.3, rel=1e-3)