From c8a610b3f765bb27775af526d1ff7b64f9d82b36 Mon Sep 17 00:00:00 2001
From: Vadym Samoilenko
Date: Fri, 8 May 2026 13:23:08 +0100
Subject: [PATCH] fix(vtt): auto-fix overlapping cues from AI-generated output

Gemini occasionally produces captions where a cue's start_time is earlier
than the previous cue's end_time. Add VTTEditor.fix_overlapping_cues()
that trims each cue's end_time to 1ms before the next cue's start,
applied to both captions and AD VTT immediately after AI generation.

Co-Authored-By: Claude Sonnet 4.6
---
 backend/app/lib/vtt.py             | 32 ++++++++++++++++++++++++++++++++
 backend/app/tasks/ingest_and_ai.py |  7 +++++++
 2 files changed, 39 insertions(+)

diff --git a/backend/app/lib/vtt.py b/backend/app/lib/vtt.py
index 80bf8c2..4dc7435 100644
--- a/backend/app/lib/vtt.py
+++ b/backend/app/lib/vtt.py
@@ -207,6 +207,38 @@ class VTTEditor:
 
         return len(errors) == 0, errors
 
+    @staticmethod
+    def fix_overlapping_cues(vtt_content: str) -> str:
+        """Trim cue end times so no cue overlaps the cue that follows it.
+
+        Args:
+            vtt_content: WebVTT text to repair (e.g. AI-generated output).
+
+        Returns:
+            Rebuilt VTT text when at least one cue was trimmed; otherwise
+            the input itself, so clean or empty content is never rewritten.
+        """
+        if not vtt_content:
+            return vtt_content
+
+        min_gap = 0.001  # 1 ms, the resolution of WebVTT timestamps
+        cues = VTTParser.parse(vtt_content)
+        trimmed = False
+        for prev_cue, next_cue in zip(cues, cues[1:]):
+            if next_cue.start_time >= prev_cue.end_time:
+                continue
+            # Clamp previous cue end to 1ms before next cue start
+            new_end = next_cue.start_time - min_gap
+            # Never let end_time go at or below start_time.
+            # NOTE(review): if next_cue starts at or before prev_cue, this
+            # floor leaves the pair overlapping - confirm that is acceptable.
+            if new_end <= prev_cue.start_time:
+                new_end = prev_cue.start_time + min_gap
+            prev_cue.end_time = new_end
+            trimmed = True
+
+        return VTTParser.build(cues) if trimmed else vtt_content
+
     @staticmethod
     def get_cue_count(vtt_content: str) -> int:
         """Get the number of cues in VTT content"""
diff --git a/backend/app/tasks/ingest_and_ai.py b/backend/app/tasks/ingest_and_ai.py
index f567cc9..fb6e9bb 100644
--- a/backend/app/tasks/ingest_and_ai.py
+++ b/backend/app/tasks/ingest_and_ai.py
@@ -218,8 +218,15 @@ async def ingest_and_ai_task_impl(job_id: str):
 
         # Align caption timings with Whisper word-level timestamps (Bug 5)
         captions_vtt = await _align_captions_with_whisper(captions_vtt, temp_path, job_id)
+        # Fix overlapping cues that Gemini occasionally produces
+        captions_vtt = VTTEditor.fix_overlapping_cues(captions_vtt)
         ai_result["captions_vtt"] = captions_vtt
 
+        # Fix overlapping cues in AD VTT as well
+        ai_result["audio_description_vtt"] = VTTEditor.fix_overlapping_cues(
+            ai_result["audio_description_vtt"]
+        )
+
         # Upload VTT files to GCS using detected language
         captions_gcs_uri = await upload_vtt_to_gcs(
             ai_result["captions_vtt"],