@dataclass
class SentenceBoundary:
    """A sentence boundary (start or end) used for pause point snapping.

    Two kinds of boundary are produced by
    ``WhisperService._find_sentence_boundaries``:

    - ``sentence_end``: the end time of a word that closes a sentence.
    - ``sentence_start``: the start time of the word that follows such a word
      (or of the very first word of the transcript).
    """

    time: float  # The boundary timestamp
    boundary_type: str  # "sentence_start" or "sentence_end"
    word_index: int  # Index of the associated word in the words list
    has_previous_sentence: bool  # Is there speech before this boundary?
    has_next_sentence: bool  # Is there speech after this boundary?
    gap: SpeechGap | None  # Gap this boundary borders (double-buffer case)


class WhisperService:
    """Service for speech analysis using faster-whisper."""

    # NOTE(review): only the methods that are fully visible in this patch are
    # reproduced below; the remainder of the class lies outside this view.

    def _is_during_speaking(
        self,
        pause_point: float,
        words: list[WordTimestamp],
        threshold: float = 2.0
    ) -> bool:
        """
        Check if a pause point is "during speaking" (words nearby).

        A word counts as nearby when the pause point lies anywhere inside the
        word's time span widened by ``threshold`` on both sides. Testing the
        whole interval (rather than only the distance to the start and end
        timestamps) also catches a pause point that falls in the middle of a
        word or segment longer than ``2 * threshold``.

        Args:
            pause_point: The timestamp to check
            words: List of word timestamps from Whisper
            threshold: Max distance in seconds to consider "nearby" (default: 2.0s)

        Returns:
            True if any word is within ±threshold seconds of the pause point
        """
        # Assumes word.start <= word.end for every word.
        return any(
            word.start - threshold <= pause_point <= word.end + threshold
            for word in words
        )

    def _find_sentence_boundaries(
        self,
        words: list[WordTimestamp],
        gaps: list[SpeechGap]
    ) -> list[SentenceBoundary]:
        """
        Find all sentence boundaries (starts and ends) from the transcript.

        Boundaries are identified from:
        1. Words ending with sentence punctuation - these mark sentence ends
        2. Words following sentence-ending words - these mark sentence starts
        3. Fallback: If no punctuation found, use the longest gap as a boundary

        The first word always yields a ``sentence_start`` boundary, and the
        last word yields a ``sentence_end`` boundary even without punctuation.

        Args:
            words: List of word timestamps from Whisper
            gaps: List of speech gaps between words

        Returns:
            List of SentenceBoundary objects sorted by time
        """
        if not words:
            return []

        # Non-ASCII marks are written as escapes so that Unicode compatibility
        # normalisation of this file cannot silently fold them into ASCII.
        sentence_end_punctuation = (
            '.', '!', '?',
            '\u2026',  # horizontal ellipsis
            '\u3002',  # ideographic full stop
            '\uff01',  # full-width exclamation mark
            '\uff1f',  # full-width question mark
        )
        match_tolerance = 0.01  # A gap and a word edge match within 10ms
        last_idx = len(words) - 1

        # Track which word indices end sentences
        sentence_ending_indices: set[int] = {
            i for i, word in enumerate(words)
            if word.word.rstrip().endswith(sentence_end_punctuation)
        }

        # If no sentence punctuation found, use the longest gap as a fallback
        if not sentence_ending_indices and gaps:
            longest_gap = max(gaps, key=lambda g: g.duration)
            # Find the word that ends where this gap starts (the last word
            # cannot be followed by a gap, so it is excluded)
            for i, word in enumerate(words[:-1]):
                if abs(word.end - longest_gap.start) < match_tolerance:
                    sentence_ending_indices.add(i)
                    logger.info(
                        f"No sentence punctuation found, using longest gap "
                        f"({longest_gap.duration:.2f}s) at {longest_gap.start:.2f}s as boundary"
                    )
                    break

        boundaries: list[SentenceBoundary] = []

        # Create boundaries from sentence-ending words
        for i in sorted(sentence_ending_indices):
            word = words[i]

            # Find the gap that starts right after this word (if any)
            associated_gap = next(
                (g for g in gaps if abs(g.start - word.end) < match_tolerance),
                None,
            )
            has_next = i < last_idx

            # Sentence END boundary
            boundaries.append(SentenceBoundary(
                time=word.end,
                boundary_type="sentence_end",
                word_index=i,
                has_previous_sentence=i > 0,  # Any word before this one
                has_next_sentence=has_next,  # Any word after this one
                gap=associated_gap
            ))

            # Sentence START boundary (next word's start), only when a gap
            # separates the two sentences
            if has_next and associated_gap is not None:
                boundaries.append(SentenceBoundary(
                    time=words[i + 1].start,
                    boundary_type="sentence_start",
                    word_index=i + 1,
                    has_previous_sentence=True,  # The sentence that just ended
                    has_next_sentence=any(j > i for j in sentence_ending_indices),
                    gap=associated_gap
                ))

        # The first word is never produced by the loop above (starts are only
        # created at index i + 1), so always add its boundary here.
        boundaries.append(SentenceBoundary(
            time=words[0].start,
            boundary_type="sentence_start",
            word_index=0,
            has_previous_sentence=False,  # Nothing before first word
            has_next_sentence=bool(sentence_ending_indices) or len(words) > 1,
            gap=None
        ))

        # The last word closes the transcript even without punctuation
        if last_idx not in sentence_ending_indices:
            boundaries.append(SentenceBoundary(
                time=words[last_idx].end,
                boundary_type="sentence_end",
                word_index=last_idx,
                has_previous_sentence=last_idx > 0,
                has_next_sentence=False,  # Nothing after last word
                gap=None
            ))

        return sorted(boundaries, key=lambda b: b.time)

    def snap_pause_point(
        self,
        gemini_pause: float,
        words: list[WordTimestamp],
        gaps: list[SpeechGap],
        boundaries: list[SentenceBoundary],
        speaking_threshold: float = 2.0
    ) -> tuple[float, float, str | None]:
        """
        Snap a Gemini pause point to the nearest sentence boundary using ordered logic.

        Algorithm (in order):
        1. Check if "during speaking" (words within ±threshold)
           - If NO → Use Gemini's exact pause point, no overlap
        2. If during speaking, find nearest sentence boundary
        3. Apply appropriate buffering based on context:
           - Case A: Beginning of sentence, no previous → pause 500ms before sentence starts
           - Case B: End of sentence, no next → pause 500ms after sentence ends
           - Case C: Gap between sentences → full double buffer
           - Fallback: boundary without a gap → 50ms away from the boundary

        Args:
            gemini_pause: Original pause point from Gemini (seconds)
            words: List of word timestamps from Whisper
            gaps: List of speech gaps from identify_speech_gaps(). Not read by
                this method; the gap used in Case C comes from the boundary.
            boundaries: List of sentence boundaries from _find_sentence_boundaries()
            speaking_threshold: Max distance to consider "during speaking" (default: 2.0s)

        Returns:
            Tuple of (pause_point, resume_from, warning_message_or_none)
        """
        # Step 1: Check if "during speaking" (words within ±threshold)
        if not self._is_during_speaking(gemini_pause, words, speaking_threshold):
            logger.info(
                f"Pause point {gemini_pause:.2f}s is NOT during speaking "
                f"(no words within ±{speaking_threshold}s), using Gemini's exact point"
            )
            return gemini_pause, gemini_pause, None

        # Step 2: During speaking - find nearest sentence boundary
        if not boundaries:
            logger.warning(f"No sentence boundaries found, using Gemini's exact point {gemini_pause:.2f}s")
            return gemini_pause, gemini_pause, "No sentence boundaries found in transcript"

        closest_boundary = min(boundaries, key=lambda b: abs(b.time - gemini_pause))
        logger.debug(
            f"Nearest boundary to {gemini_pause:.2f}s: {closest_boundary.boundary_type} "
            f"at {closest_boundary.time:.2f}s (distance: {abs(closest_boundary.time - gemini_pause):.2f}s)"
        )

        # Step 3: Apply appropriate buffering based on context
        edge_buffer = 0.5  # 500ms buffer at the outer edges of the speech
        fallback_buffer = 0.05  # 50ms buffer when no gap is available
        is_start = closest_boundary.boundary_type == "sentence_start"
        is_end = closest_boundary.boundary_type == "sentence_end"

        # Case A: Beginning of sentence with no previous sentence
        if is_start and not closest_boundary.has_previous_sentence:
            # Pause BEFORE the sentence starts, clamped at the video start
            pause_point = max(0.0, closest_boundary.time - edge_buffer)
            logger.info(
                f"Case A (first sentence): pause_point={pause_point:.2f}s "
                f"(500ms before sentence start at {closest_boundary.time:.2f}s)"
            )
            return pause_point, pause_point, None  # No overlap

        # Case B: End of sentence with no next sentence
        if is_end and not closest_boundary.has_next_sentence:
            # Pause AFTER the sentence ends (away from speech)
            pause_point = closest_boundary.time + edge_buffer
            logger.info(
                f"Case B (last sentence): pause_point={pause_point:.2f}s "
                f"(500ms after sentence end at {closest_boundary.time:.2f}s)"
            )
            return pause_point, pause_point, None  # No overlap

        # Case C: Gap between two sentences (normal case with double buffer)
        gap = closest_boundary.gap
        if gap is not None:
            # Small edge buffer to avoid cutting exactly at speech boundaries,
            # capped at 10% of the gap so very short gaps keep pause >= resume
            small_buffer = min(0.05, gap.duration * 0.1)

            # pause just BEFORE the next sentence, resume just AFTER the
            # previous one, so the gap is replayed around the description
            pause_point = gap.end - small_buffer
            resume_from = gap.start + small_buffer
            logger.info(
                f"Case C (between sentences): gap={gap.start:.2f}s-{gap.end:.2f}s, "
                f"pause_point={pause_point:.2f}s, resume_from={resume_from:.2f}s, "
                f"overlap={pause_point - resume_from:.2f}s"
            )
            return pause_point, resume_from, None

        # Fallback: no gap associated with the boundary - step a small
        # distance away from the speech on the silent side
        if is_end:
            pause_point = closest_boundary.time + fallback_buffer
        else:
            pause_point = max(0.0, closest_boundary.time - fallback_buffer)

        logger.info(
            f"Fallback: Using boundary at {closest_boundary.time:.2f}s, "
            f"pause_point={pause_point:.2f}s (no gap available)"
        )
        return pause_point, pause_point, None
If during speaking → snap to nearest boundary with appropriate buffering - This creates a small overlap where a tiny bit of video plays twice, - but provides maximum natural buffer around the audio description. + Phase 2: Consolidate cues that are within 5s of each other (after all refinements) Args: placements: List of placement dicts from Gemini analysis + words: Word timestamps from Whisper transcription gaps: Speech gaps from Whisper analysis consolidation_threshold: If consecutive cues have pause points within this many seconds, combine them to play back-to-back (default: 5.0s) @@ -284,51 +478,31 @@ class WhisperService: refined_placements = [] warnings = [] + # Pre-compute sentence boundaries once for all placements + boundaries = self._find_sentence_boundaries(words, gaps) + logger.info(f"Found {len(boundaries)} sentence boundaries for pause point refinement") + + # Phase 1: Refine each pause point individually for placement in placements: refined = placement.copy() if placement.get("pause_point") is not None: original = placement["pause_point"] - pause_point, resume_from, warning = self.snap_pause_point(original, gaps) + pause_point, resume_from, warning = self.snap_pause_point( + original, words, gaps, boundaries + ) refined["pause_point"] = pause_point refined["resume_from"] = resume_from refined["original_pause_point"] = original # Preserve for debugging if warning: - # Special handling for first AD cue: if no sentence break found (but speech exists), - # insert at the very beginning of the video to avoid mid-sentence insertion - if placement["ad_cue_index"] == 0: - refined["pause_point"] = 0.0 - refined["resume_from"] = 0.0 - warnings.append( - f"Cue 0: No sentence break found within search window of {original:.2f}s, " - "inserting AD at video start (0.0s)" - ) - logger.info( - f"First AD cue: No sentence break found near {original:.2f}s, " - "using video start (0.0s) to avoid mid-sentence insertion" - ) - else: - warnings.append(f"Cue 
{placement['ad_cue_index']}: {warning}") - logger.warning(f"Pause point refinement warning for cue {placement['ad_cue_index']}: {warning}") - elif pause_point == resume_from: - # No overlap - either no speech nearby, or some other reason - # Log this as info since it's a valid scenario (non-dialogue section) - logger.info( - f"Cue {placement['ad_cue_index']}: Using exact pause point {pause_point:.2f}s " - f"(no overlap - likely non-dialogue section)" - ) - elif abs(pause_point - original) > 0.1: - logger.info( - f"Refined pause point for cue {placement['ad_cue_index']}: " - f"{original:.2f}s -> pause_at={pause_point:.2f}s, resume_from={resume_from:.2f}s " - f"(overlap={pause_point - resume_from:.2f}s)" - ) + warnings.append(f"Cue {placement['ad_cue_index']}: {warning}") + logger.warning(f"Pause point refinement warning for cue {placement['ad_cue_index']}: {warning}") refined_placements.append(refined) - # Consolidate cues that are close together to avoid mid-sentence insertions + # Phase 2: Consolidate cues that are close together (AFTER all individual refinements) refined_placements = self._consolidate_close_cues( refined_placements, consolidation_threshold, warnings ) @@ -365,7 +539,6 @@ class WhisperService: # First pass: identify consolidated groups and assign same pause_point consolidated = [placements[0].copy()] - group_resume_from = placements[0].get("resume_from") # Track the back buffer for the group for i in range(1, len(placements)): current = placements[i].copy() @@ -396,10 +569,6 @@ class WhisperService: f"Cue {current['ad_cue_index']}: Consolidated with previous cue " f"(pause points were {gap:.2f}s apart, playing back-to-back)" ) - else: - # New group starts - update group_resume_from for the new group - group_resume_from = current.get("resume_from") - consolidated.append(current) # Second pass: fix resume_from values for consolidated groups diff --git a/backend/app/tasks/render_accessible_video.py b/backend/app/tasks/render_accessible_video.py index 
c6d555e..d035874 100644 --- a/backend/app/tasks/render_accessible_video.py +++ b/backend/app/tasks/render_accessible_video.py @@ -417,9 +417,10 @@ async def _refine_pause_points_with_whisper( gaps = whisper_service.identify_speech_gaps(words) logger.info(f"Found {len(gaps)} speech gaps in video for job {job_id}") - # Refine pause points + # Refine pause points (Phase 1: individual refinement, Phase 2: consolidation) refined_placements, warnings = whisper_service.refine_all_pause_points( analysis.get("placements", []), + words, gaps )