From c8a610b3f765bb27775af526d1ff7b64f9d82b36 Mon Sep 17 00:00:00 2001
From: Vadym Samoilenko <vadymsamoilenko@oliver.agency>
Date: Fri, 8 May 2026 13:23:08 +0100
Subject: [PATCH] fix(vtt): auto-fix overlapping cues from AI-generated output

Gemini occasionally produces captions where a cue's start_time is
earlier than the previous cue's end_time. Add
VTTEditor.fix_overlapping_cues(), which trims the earlier cue's end_time
to 1ms before the next cue's start, but never to less than 1ms after the
earlier cue's own start. It is applied to the captions VTT (after Whisper
alignment) and to the audio-description (AD) VTT before both are uploaded.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/app/lib/vtt.py             | 14 ++++++++++++++
 backend/app/tasks/ingest_and_ai.py |  7 +++++++
 2 files changed, 21 insertions(+)

diff --git a/backend/app/lib/vtt.py b/backend/app/lib/vtt.py
index 80bf8c2..4dc7435 100644
--- a/backend/app/lib/vtt.py
+++ b/backend/app/lib/vtt.py
@@ -207,6 +207,20 @@ class VTTEditor:
 
         return len(errors) == 0, errors
 
+    @staticmethod
+    def fix_overlapping_cues(vtt_content: str) -> str:
+        """Return VTT rebuilt so a cue ends 1ms before a successor that overlaps it.
+
+        A trimmed cue always keeps its end 1ms past its own start_time, so overlap
+        can remain when the successor starts no later than 1ms after that start.
+        """
+        cues = VTTParser.parse(vtt_content)
+        for prev, cur in zip(cues, cues[1:]):
+            if cur.start_time < prev.end_time:
+                trimmed = cur.start_time - 0.001
+                prev.end_time = prev.start_time + 0.001 if trimmed <= prev.start_time else trimmed
+        return VTTParser.build(cues)
+
     @staticmethod
     def get_cue_count(vtt_content: str) -> int:
         """Get the number of cues in VTT content"""
diff --git a/backend/app/tasks/ingest_and_ai.py b/backend/app/tasks/ingest_and_ai.py
index f567cc9..fb6e9bb 100644
--- a/backend/app/tasks/ingest_and_ai.py
+++ b/backend/app/tasks/ingest_and_ai.py
@@ -218,8 +218,15 @@ async def ingest_and_ai_task_impl(job_id: str):
 
                 # Align caption timings with Whisper word-level timestamps (Bug 5)
                 captions_vtt = await _align_captions_with_whisper(captions_vtt, temp_path, job_id)
+                # Fix overlapping cues that Gemini occasionally produces
+                captions_vtt = VTTEditor.fix_overlapping_cues(captions_vtt)
                 ai_result["captions_vtt"] = captions_vtt
 
+                # Fix overlapping cues in AD VTT as well
+                ai_result["audio_description_vtt"] = VTTEditor.fix_overlapping_cues(
+                    ai_result["audio_description_vtt"]
+                )
+
                 # Upload VTT files to GCS using detected language
                 captions_gcs_uri = await upload_vtt_to_gcs(
                     ai_result["captions_vtt"],