diff --git a/backend/app/services/video_renderer.py b/backend/app/services/video_renderer.py index 90daac1..94439c1 100644 --- a/backend/app/services/video_renderer.py +++ b/backend/app/services/video_renderer.py @@ -285,15 +285,12 @@ class VideoRendererService: key=lambda p: p["pause_point"] ) - # Debug logging for pause points (full-gap-overlap algorithm) - logger.info(f"Pause-insert (full-gap-overlap): {len(sorted_placements)} placements with pause points") + # Debug logging for pause points (midpoint algorithm with silence buffers) + logger.info(f"Pause-insert (midpoint + 500ms silence buffers): {len(sorted_placements)} placements") for i, p in enumerate(sorted_placements): - resume_from = p.get('resume_from', p.get('pause_point')) # Fallback for backwards compat - overlap = p.get('pause_point', 0) - resume_from if resume_from else 0 logger.info( f" Placement {i}: cue_index={p.get('ad_cue_index')}, " - f"pause_at={p.get('pause_point'):.2f}s, resume_from={resume_from:.2f}s, " - f"overlap={overlap:.2f}s, ad_duration={p.get('ad_duration'):.2f}s, " + f"pause_at={p.get('pause_point'):.2f}s, ad_duration={p.get('ad_duration'):.2f}s, " f"consolidated={p.get('consolidated_with_previous', False)}" ) @@ -320,7 +317,6 @@ class VideoRendererService: pause_point = placement["pause_point"] cue_index = placement["ad_cue_index"] ad_duration = placement["ad_duration"] - resume_from = placement.get("resume_from", pause_point) # Fallback for backwards compat # Validate pause_point is within video bounds if pause_point >= source_duration: @@ -330,11 +326,6 @@ class VideoRendererService: ) pause_point = max(0, source_duration - 0.1) # Clamp to 100ms before end - if resume_from >= source_duration: - resume_from = pause_point - - overlap = max(0, pause_point - resume_from) - # Get the AD audio for this cue ad_mp3_path = cue_to_mp3.get(cue_index) if not ad_mp3_path: @@ -362,40 +353,30 @@ class VideoRendererService: ) # 3. Prepare audio for freeze segment - # If there's overlap (Case C - between sentences), we need catch-up audio - # The freeze frame stays visible while source audio "catches up" to pause_point - # This avoids visual jump-back artifacts - if overlap > 0.01: # Only if meaningful overlap (> 10ms) - # Extract catch-up audio from source video [resume_from, pause_point] - catchup_audio_path = temp_dir_path / f"catchup_{i}.m4a" - await self._extract_audio_segment( - source_video_path, - resume_from, - overlap, - str(catchup_audio_path), - video_props - ) + # Add 500ms silence before AND after the AD audio for smooth transitions + silence_duration = 0.5 # 500ms + silence_path = temp_dir_path / f"silence_{i}.m4a" + await self._generate_silence( + silence_duration, + str(silence_path), + video_props + ) - # Concatenate AD audio + catch-up audio - # Order: AD plays first, then source audio catches up - combined_audio_path = temp_dir_path / f"combined_audio_{i}.m4a" - await self._concatenate_audio( - [ad_mp3_path, str(catchup_audio_path)], - str(combined_audio_path), - video_props - ) + # Concatenate: 500ms silence + AD audio + 500ms silence + combined_audio_path = temp_dir_path / f"combined_audio_{i}.m4a" + await self._concatenate_audio( + [str(silence_path), ad_mp3_path, str(silence_path)], + str(combined_audio_path), + video_props + ) - freeze_audio_path = str(combined_audio_path) - total_freeze_duration = ad_duration + overlap + freeze_audio_path = str(combined_audio_path) + total_freeze_duration = ad_duration + (2 * silence_duration) # AD + 1.0s total silence - logger.info( - f"Cue {cue_index}: Audio catch-up enabled - " - f"AD={ad_duration:.2f}s + catchup={overlap:.2f}s = {total_freeze_duration:.2f}s" - ) - else: - # No overlap (Case A/B) - just use AD audio - freeze_audio_path = ad_mp3_path - total_freeze_duration = ad_duration + logger.info( + f"Cue {cue_index}: Freeze segment with silence buffers - " + f"500ms + AD={ad_duration:.2f}s + 500ms = {total_freeze_duration:.2f}s" + ) # 4. Create freeze segment with prepared audio freeze_segment_path = temp_dir_path / f"freeze_segment_{i}.mp4" @@ -774,6 +755,43 @@ class VideoRendererService: ] await self._run_ffmpeg(cmd) + async def _generate_silence( + self, + duration: float, + output_path: str, + props: dict[str, Any] + ): + """ + Generate a silent audio file of specified duration. + + Used to create 500ms silence buffers before/after AD audio. + + Args: + duration: Duration of silence in seconds + output_path: Path to output audio file + props: Video properties (for sample_rate, channels) + """ + if duration <= 0: + raise ValueError(f"Invalid silence duration: {duration}") + + logger.debug( + f"Generating {duration:.2f}s silence: output={output_path}" + ) + + cmd = [ + self.ffmpeg_path, + "-y", + "-f", "lavfi", + "-i", f"anullsrc=r={props['sample_rate']}:cl={'stereo' if props['channels'] == '2' else 'mono'}", + "-t", str(duration), + "-c:a", "aac", + "-ar", props["sample_rate"], + "-ac", props["channels"], + "-b:a", "192k", + output_path + ] + await self._run_ffmpeg(cmd) + async def _concatenate_segments( self, segment_paths: list[str], diff --git a/backend/app/services/vtt_retimer.py b/backend/app/services/vtt_retimer.py index 6373e56..c355c0e 100644 --- a/backend/app/services/vtt_retimer.py +++ b/backend/app/services/vtt_retimer.py @@ -18,41 +18,37 @@ class VTTRetimerService: """ Generate new VTT with adjusted timings for pause-insert accessible video. - Uses the "full gap overlap" algorithm where: - - Video plays until pause_point (just before next sentence) - - Freeze frame with AD audio plays for ad_duration - - Video resumes from resume_from (just after previous sentence) + Uses the simplified midpoint algorithm with silence buffers: + - Video plays until pause_point (midpoint between sentences) + - Freeze frame shows with: 500ms silence + AD audio + 500ms silence + - Video resumes from the same pause_point - This means the effective time offset is: ad_duration + (pause_point - resume_from) - because after the AD, we jump BACK in the source timeline. - - Captions in the overlap zone [resume_from, pause_point] only show once - (during the first playback, before the AD). + The effective time offset is: ad_duration + 1.0 seconds (for the silence buffers) Args: original_vtt: Original VTT content - analysis: Gemini analysis with placements containing pause_point, resume_from, and ad_duration + analysis: Gemini analysis with placements containing pause_point and ad_duration Returns: Re-timed VTT content """ placements = analysis.get("placements", []) + # Silence buffer duration (500ms before + 500ms after AD) + silence_buffer_total = 1.0 + # Build list of (pause_point, effective_offset) sorted by time - # effective_offset = ad_duration + overlap (where overlap = pause_point - resume_from) + # effective_offset = ad_duration + 1.0s (for silence buffers) pauses = [] for placement in placements: pause_point = placement.get("pause_point") - resume_from = placement.get("resume_from", pause_point) # Fallback for backwards compat ad_duration = placement.get("ad_duration", 0) if pause_point is not None and ad_duration > 0: - # Overlap is the video portion that plays twice (between resume_from and pause_point) - overlap = max(0, pause_point - resume_from) if resume_from else 0 - effective_offset = ad_duration + overlap + effective_offset = ad_duration + silence_buffer_total pauses.append((pause_point, effective_offset)) logger.debug( f"Pause at {pause_point:.2f}s: ad_duration={ad_duration:.2f}s, " - f"overlap={overlap:.2f}s, effective_offset={effective_offset:.2f}s" + f"silence_buffers=1.0s, effective_offset={effective_offset:.2f}s" ) pauses.sort(key=lambda x: x[0]) @@ -61,7 +57,7 @@ class VTTRetimerService: logger.info("No pauses to apply, returning original VTT") return original_vtt - logger.info(f"Re-timing VTT with {len(pauses)} pause insertions (full-gap-overlap algorithm)") + logger.info(f"Re-timing VTT with {len(pauses)} pause insertions (midpoint + silence buffers)") # Parse and retime cues cues = self._parse_vtt(original_vtt) @@ -69,8 +65,6 @@ class VTTRetimerService: for cue in cues: # Calculate cumulative offset from all pauses that occur before this cue's start - # Captions in overlap zone [resume_from, pause_point] only shift on pause_point, - # so they show during the first playback (before AD), not the second cumulative_offset = sum( effective_offset for pause_point, effective_offset in pauses if pause_point <= cue["start_time"] diff --git a/backend/app/services/whisper_service.py b/backend/app/services/whisper_service.py index 5fb895c..fc8215c 100644 --- a/backend/app/services/whisper_service.py +++ b/backend/app/services/whisper_service.py @@ -345,16 +345,18 @@ class WhisperService: speaking_threshold: float = 2.0 ) -> tuple[float, float, str | None]: """ - Snap a Gemini pause point to the nearest sentence boundary using ordered logic. + Snap a Gemini pause point to the nearest sentence boundary. - Algorithm (in order): + Simplified algorithm: 1. Check if "during speaking" (words within ±threshold) - - If NO → Use Gemini's exact pause point, no overlap - 2. If during speaking, find nearest sentence boundary - 3. Apply appropriate buffering based on context: - - Case A: Beginning of sentence, no previous → pause 500ms before sentence starts - - Case B: End of sentence, no next → pause 500ms after sentence ends - - Case C: Gap between sentences → full double buffer + - If NO → Use Gemini's exact pause point + 2. If during speaking, find nearest sentence gap and snap to MIDPOINT + 3. Edge cases: + - Case A: First sentence in video → pause at video start (0.0) + - Case B: Last sentence in video → pause at video end + + The video renderer adds 500ms silence buffers before/after AD audio, + so no overlap or catch-up logic is needed here. Args: gemini_pause: Original pause point from Gemini (seconds) @@ -365,6 +367,7 @@ class WhisperService: Returns: Tuple of (pause_point, resume_from, warning_message_or_none) + Note: resume_from always equals pause_point with the simplified algorithm """ # Step 1: Check if "during speaking" (words within ±threshold) if not self._is_during_speaking(gemini_pause, words, speaking_threshold): @@ -389,63 +392,44 @@ class WhisperService: f"at {closest_boundary.time:.2f}s (distance: {abs(closest_boundary.time - gemini_pause):.2f}s)" ) - # Step 3: Apply appropriate buffering based on context - edge_buffer = 0.5 # 500ms buffer for edge cases - - # Case A: Beginning of sentence with no previous sentence (first sentence in video) + # Case A: First sentence in video (no previous sentence) → snap to video start if closest_boundary.boundary_type == "sentence_start" and not closest_boundary.has_previous_sentence: - # Pause 500ms BEFORE the sentence starts (away from speech) - pause_point = max(0.0, closest_boundary.time - edge_buffer) - resume_from = pause_point # No overlap + pause_point = 0.0 logger.info( f"Case A (first sentence): pause_point={pause_point:.2f}s " - f"(500ms before sentence start at {closest_boundary.time:.2f}s)" + f"(snapped to video start)" ) - return pause_point, resume_from, None + return pause_point, pause_point, None - # Case B: End of sentence with no next sentence (last sentence in video) + # Case B: Last sentence in video (no next sentence) → snap to boundary time if closest_boundary.boundary_type == "sentence_end" and not closest_boundary.has_next_sentence: - # Pause 500ms AFTER the sentence ends (away from speech) - pause_point = closest_boundary.time + edge_buffer - resume_from = pause_point # No overlap + pause_point = closest_boundary.time logger.info( f"Case B (last sentence): pause_point={pause_point:.2f}s " - f"(500ms after sentence end at {closest_boundary.time:.2f}s)" + f"(snapped to video end at sentence boundary)" ) - return pause_point, resume_from, None + return pause_point, pause_point, None - # Case C: Gap between two sentences (normal case with double buffer) + # Case C: Gap between two sentences → snap to MIDPOINT of the gap if closest_boundary.gap: gap = closest_boundary.gap - # Small edge buffer to avoid cutting exactly at speech boundaries - small_buffer = min(0.05, gap.duration * 0.1) - - # Full double buffer: - # - pause_point: Just BEFORE next sentence (gap.end - small_buffer) - # - resume_from: Just AFTER previous sentence (gap.start + small_buffer) - pause_point = gap.end - small_buffer - resume_from = gap.start + small_buffer + # Calculate midpoint between end of previous sentence and start of next + midpoint = (gap.start + gap.end) / 2.0 logger.info( f"Case C (between sentences): gap={gap.start:.2f}s-{gap.end:.2f}s, " - f"pause_point={pause_point:.2f}s, resume_from={resume_from:.2f}s, " - f"overlap={pause_point - resume_from:.2f}s" + f"midpoint={midpoint:.2f}s (resume from same point)" ) - return pause_point, resume_from, None + return midpoint, midpoint, None - # Fallback: No gap associated with boundary - use the boundary time with small buffer + # Fallback: No gap associated with boundary - use the boundary time directly # This shouldn't normally happen but handles edge cases - if closest_boundary.boundary_type == "sentence_end": - pause_point = closest_boundary.time + 0.05 # Small buffer after end - else: - pause_point = max(0.0, closest_boundary.time - 0.05) # Small buffer before start - resume_from = pause_point - + pause_point = closest_boundary.time logger.info( f"Fallback: Using boundary at {closest_boundary.time:.2f}s, " f"pause_point={pause_point:.2f}s (no gap available)" ) - return pause_point, resume_from, None + return pause_point, pause_point, None def refine_all_pause_points( self, @@ -518,13 +502,9 @@ class WhisperService: """ Consolidate AD cues whose pause points are within threshold seconds of each other. - Consolidated cues are treated as a single AD segment: - - All cues in a group share the same pause_point (front buffer applied once) - - Only the LAST cue in the group keeps resume_from (back buffer applied once) - - Middle cues have resume_from = pause_point (no video between ADs) - - This ensures consolidated ADs play seamlessly back-to-back without - repeating the overlap video segment between each one. + Consolidated cues share the same pause_point and play back-to-back during + the freeze frame. With the simplified midpoint algorithm, resume_from always + equals pause_point, so no complex buffer logic is needed. Args: placements: List of refined placement dicts @@ -532,12 +512,11 @@ class WhisperService: warnings: List to append warning messages to Returns: - Updated placements with consolidated pause points and resume_from values + Updated placements with consolidated pause points """ if len(placements) < 2: return placements - # First pass: identify consolidated groups and assign same pause_point consolidated = [placements[0].copy()] for i in range(1, len(placements)): @@ -553,12 +532,10 @@ class WhisperService: if 0 < gap <= threshold: # Consolidate: set current cue to use same pause point as previous original_pause = current_pause - original_resume = current.get("resume_from") current["pause_point"] = previous_pause + current["resume_from"] = previous_pause # Always same as pause_point current["consolidated_with_previous"] = True current["original_pause_point_before_consolidation"] = original_pause - current["original_resume_from_before_consolidation"] = original_resume - # Keep tracking the group's resume_from (we'll assign it to the last cue later) logger.info( f"Consolidated cue {current['ad_cue_index']} with previous cue: " @@ -571,31 +548,6 @@ class WhisperService: ) consolidated.append(current) - # Second pass: fix resume_from values for consolidated groups - # Only the LAST cue in each group should have the back buffer (resume_from < pause_point) - # All other cues should have resume_from = pause_point (no video between ADs) - for i in range(len(consolidated)): - current = consolidated[i] - current_pause = current.get("pause_point") - - if current_pause is None: - continue - - # Check if next cue has the same pause_point (meaning current is NOT last in group) - if i < len(consolidated) - 1: - next_pause = consolidated[i + 1].get("pause_point") - if next_pause == current_pause: - # Current is NOT the last in the group - remove back buffer - # Set resume_from = pause_point so no video plays between this AD and the next - original_resume = current.get("resume_from") - if original_resume != current_pause: - current["resume_from"] = current_pause - current["resume_from_removed_for_consolidation"] = original_resume - logger.debug( - f"Cue {current.get('ad_cue_index')}: Removed back buffer for seamless " - f"consolidated playback (resume_from {original_resume:.2f}s -> {current_pause:.2f}s)" - ) - # Log the final consolidated groups self._log_consolidated_groups(consolidated)