refactor: rewrite pause point refinement algorithm with ordered logic

Completely rewrites the Whisper-based pause point refinement to use a two-phase approach with explicit ordering:

Phase 1 - Individual refinement:
1. Check if pause point is "during speaking" (words within ±2s)
   - If NOT during speaking → use Gemini's exact point, no overlap
2. If during speaking, find nearest sentence boundary
3. Apply appropriate buffering based on context:
   - Case A: First sentence → pause 500ms before sentence starts
   - Case B: Last sentence → pause 500ms after sentence ends
   - Case C: Between sentences → full double buffer (overlap)

Phase 2 - Consolidation (after all refinements):
- Consolidate cues within 5s of each other to play back-to-back

Key changes:
- Add SentenceBoundary dataclass for tracking boundaries with context
- Add _is_during_speaking() helper to detect speech proximity
- Add _find_sentence_boundaries() with longest-gap fallback
- Rewrite snap_pause_point() with new ordered algorithm
- Update refine_all_pause_points() to pass words and use two phases

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 08:19:03 -06:00 · 2025-12-29 08:19:03 -06:00 · 3588d3fa14
commit 3588d3fa14
parent d092800676
2 changed files with 267 additions and 97 deletions
--- a/backend/app/services/whisper_service.py
+++ b/backend/app/services/whisper_service.py
@ -45,6 +45,22 @@ class SpeechGap:
        return {"sentence": 1, "phrase": 2, "word": 3}.get(self.gap_type, 4)


+@dataclass
+class SentenceBoundary:
+    """One edge (start or end) of a sentence, used when snapping pause points.
+
+    The boundary_type field says which edge `time` refers to:
+    - sentence_end: `time` is the end time of the word that closes a sentence
+    - sentence_start: `time` is the start time of the word that opens a sentence
+    """
+    time: float                # The boundary timestamp
+    boundary_type: str         # "sentence_start" or "sentence_end"
+    word_index: int            # Index of the associated word in the words list
+    has_previous_sentence: bool  # Is there a sentence before this boundary?
+    has_next_sentence: bool      # Is there a sentence after this boundary?
+    gap: SpeechGap | None      # Gap matched right after the sentence-ending word, else None
+
+
 class WhisperService:
    """Service for speech analysis using faster-whisper."""

@ -175,105 +191,283 @@ class WhisperService:

        return sorted(gaps, key=lambda g: g.start)

+    def _is_during_speaking(
+        self,
+        pause_point: float,
+        words: list[WordTimestamp],
+        threshold: float = 2.0
+    ) -> bool:
+        """
+        Report whether any word starts or ends close to a pause point.
+
+        Args:
+            pause_point: The timestamp to check
+            words: List of word timestamps from Whisper
+            threshold: Max distance in seconds to consider "nearby" (default: 2.0s)
+
+        Returns:
+            True if some word's start or end is within threshold of pause_point, else False
+        """
+        for word in words:
+            # NOTE(review): only the word's endpoints are compared, not the span between them
+            if abs(word.start - pause_point) <= threshold or abs(word.end - pause_point) <= threshold:
+                return True
+        return False
+
+    def _find_sentence_boundaries(
+        self,
+        words: list[WordTimestamp],
+        gaps: list[SpeechGap]
+    ) -> list[SentenceBoundary]:
+        """
+        Build the list of sentence start/end boundaries for a transcript.
+
+        Boundaries are identified from:
+        1. Words ending with sentence punctuation (. ! ? or full-width 。！？) - sentence ends
+        2. The word right after a sentence-ending word (when a gap matched) - sentence starts
+        3. Fallback: with no punctuation and some gaps, the word ending at the longest gap
+
+        Args:
+            words: List of word timestamps from Whisper
+            gaps: List of speech gaps between words
+
+        Returns:
+            [] for empty words; else boundaries sorted by time, incl. first start and last end
+        """
+        if not words:
+            return []
+
+        boundaries: list[SentenceBoundary] = []
+        sentence_end_punctuation = ('.', '!', '?', '...', '。', '！', '？')
+
+        # Track which word indices end sentences
+        sentence_ending_indices: set[int] = set()
+
+        # Find all sentence-ending words
+        for i, word in enumerate(words):
+            word_text = word.word.rstrip()
+            if word_text.endswith(sentence_end_punctuation):
+                sentence_ending_indices.add(i)
+
+        # If no sentence punctuation found, use the longest gap as a fallback
+        if not sentence_ending_indices and gaps:
+            longest_gap = max(gaps, key=lambda g: g.duration)
+            # Find the word (excluding the last one) whose end matches the gap start
+            for i, word in enumerate(words[:-1]):
+                if abs(word.end - longest_gap.start) < 0.01:  # Match within 10ms
+                    sentence_ending_indices.add(i)
+                    logger.info(
+                        f"No sentence punctuation found, using longest gap "
+                        f"({longest_gap.duration:.2f}s) at {longest_gap.start:.2f}s as boundary"
+                    )
+                    break
+
+        # Create boundaries from sentence-ending words
+        for i in sorted(sentence_ending_indices):
+            word = words[i]
+
+            # Find the gap after this word (if any)
+            associated_gap = None
+            for gap in gaps:
+                if abs(gap.start - word.end) < 0.01:  # Match within 10ms
+                    associated_gap = gap
+                    break
+
+            # True if an earlier sentence-ending word exists, or simply if this is not word 0
+            has_previous = any(j < i for j in sentence_ending_indices) or i > 0
+
+            # Check if there's a next sentence (any word after this one)
+            has_next = i < len(words) - 1
+
+            # Add sentence END boundary
+            boundaries.append(SentenceBoundary(
+                time=word.end,
+                boundary_type="sentence_end",
+                word_index=i,
+                has_previous_sentence=has_previous,
+                has_next_sentence=has_next,
+                gap=associated_gap
+            ))
+
+            # Add sentence START boundary (next word's start) if there's a next word
+            if has_next and associated_gap:
+                next_word = words[i + 1]
+                # For sentence_start, check if there was a previous sentence
+                # (the sentence that just ended counts as previous)
+                boundaries.append(SentenceBoundary(
+                    time=next_word.start,
+                    boundary_type="sentence_start",
+                    word_index=i + 1,
+                    has_previous_sentence=True,  # The sentence that just ended
+                    has_next_sentence=any(j > i for j in sentence_ending_indices),
+                    gap=associated_gap
+                ))
+
+        # Also add boundaries for the very first and last words if not already covered
+        if words:
+            # First word boundary; starts made above use index i + 1, so this check never finds one
+            first_word = words[0]
+            has_first_boundary = any(
+                b.boundary_type == "sentence_start" and b.word_index == 0
+                for b in boundaries
+            )
+            if not has_first_boundary:
+                boundaries.append(SentenceBoundary(
+                    time=first_word.start,
+                    boundary_type="sentence_start",
+                    word_index=0,
+                    has_previous_sentence=False,  # Nothing before first word
+                    has_next_sentence=len(sentence_ending_indices) > 0 or len(words) > 1,
+                    gap=None
+                ))
+
+            # Last word boundary (if it's a sentence end not already covered)
+            last_idx = len(words) - 1
+            if last_idx not in sentence_ending_indices:
+                last_word = words[last_idx]
+                boundaries.append(SentenceBoundary(
+                    time=last_word.end,
+                    boundary_type="sentence_end",
+                    word_index=last_idx,
+                    has_previous_sentence=len(sentence_ending_indices) > 0 or last_idx > 0,
+                    has_next_sentence=False,  # Nothing after last word
+                    gap=None
+                ))
+
+        return sorted(boundaries, key=lambda b: b.time)
+
    def snap_pause_point(
        self,
        gemini_pause: float,
+        words: list[WordTimestamp],
        gaps: list[SpeechGap],
-        max_search_window: float | None = None
+        boundaries: list[SentenceBoundary],
+        speaking_threshold: float = 2.0
    ) -> tuple[float, float, str | None]:
        """
-        Snap a Gemini pause point to the nearest sentence break, or use exact point if no speech nearby.
+        Snap a Gemini pause point to the nearest sentence boundary using ordered logic.

-        Three possible outcomes:
-        1. No speech nearby: Use Gemini's exact pause point (no overlap needed)
-        2. Speech nearby but no sentence break: Return warning, use Gemini's point
-        3. Sentence break found: Apply "full gap overlap" algorithm
-
-        The "full gap overlap" algorithm (case 3):
-        - pause_point: Just BEFORE the next sentence starts (gap.end - buffer)
-        - resume_from: Just AFTER the previous sentence ends (gap.start + buffer)
-        This uses the entire gap as buffer on BOTH sides of the AD.
+        Algorithm (in order):
+        1. Check if "during speaking" (words within ±threshold)
+           - If NO → Use Gemini's exact pause point, no overlap
+        2. If during speaking, find nearest sentence boundary
+        3. Apply appropriate buffering based on context:
+           - Case A: Beginning of sentence, no previous → pause 500ms before it starts (not below 0.0)
+           - Case B: End of sentence, no next → pause 500ms after sentence ends
+           - Case C: Gap between sentences → full double buffer (no gap → 50ms nudge, no overlap)

        Args:
            gemini_pause: Original pause point from Gemini (seconds)
+            words: List of word timestamps from Whisper
            gaps: List of speech gaps from identify_speech_gaps()
-            max_search_window: Max seconds to search in each direction (default: self.max_search_window)
+            boundaries: List of sentence boundaries from _find_sentence_boundaries()
+            speaking_threshold: Max distance to consider "during speaking" (default: 2.0s)

        Returns:
            Tuple of (pause_point, resume_from, warning_message_or_none)
        """
-        search_window = max_search_window or self.max_search_window
-
-        # First, check if there's ANY speech (any gap type) within the search window
-        # Gaps only exist between words, so if there are no gaps nearby, there's no speech
-        any_gaps_nearby = [
-            g for g in gaps
-            if g.start >= gemini_pause - search_window
-            and g.start <= gemini_pause + search_window
-        ]
-
-        if not any_gaps_nearby:
-            # No speech detected near this pause point - use Gemini's exact recommendation
-            # No overlap needed since there's no dialogue to buffer around
+        # Step 1: Check if "during speaking" (words within ±threshold)
+        if not self._is_during_speaking(gemini_pause, words, speaking_threshold):
+            # Not during speaking - use Gemini's exact pause point
            logger.info(
-                f"No speech detected within +/-{search_window}s of {gemini_pause:.2f}s, "
-                "using Gemini's exact pause point (no overlap)"
+                f"Pause point {gemini_pause:.2f}s is NOT during speaking "
+                f"(no words within ±{speaking_threshold}s), using Gemini's exact point"
            )
-            # Return None for warning - this is not a problem, just a different scenario
            return gemini_pause, gemini_pause, None

-        # There IS speech nearby - now look for sentence breaks specifically
-        sentence_gaps = [g for g in any_gaps_nearby if g.gap_type == "sentence"]
+        # Step 2: During speaking - find nearest sentence boundary
+        if not boundaries:
+            # No boundaries found at all - use Gemini's point with warning
+            logger.warning(f"No sentence boundaries found, using Gemini's exact point {gemini_pause:.2f}s")
+            return gemini_pause, gemini_pause, "No sentence boundaries found in transcript"

-        if not sentence_gaps:
-            # Speech exists but no sentence break found - return warning
-            # The caller may apply special handling (e.g., move first AD to video start)
-            return gemini_pause, gemini_pause, f"No sentence break found within +/-{search_window}s of {gemini_pause:.2f}s"
-
-        # Sort by distance from gemini_pause (closest first)
-        sentence_gaps.sort(key=lambda g: abs(g.start - gemini_pause))
-
-        best_gap = sentence_gaps[0]
-
-        # Small edge buffer to avoid cutting exactly at speech boundaries (prevents clicks/pops)
-        # Use 50ms as minimum, but cap at 10% of gap duration for very short gaps
-        edge_buffer = min(0.05, best_gap.duration * 0.1)
-
-        # "Full gap overlap" algorithm:
-        # - pause_point: Play video until just BEFORE next sentence (gap.end - edge_buffer)
-        # - resume_from: Resume just AFTER previous sentence (gap.start + edge_buffer)
-        # This means the gap portion (minus 2x edge_buffer) gets played twice,
-        # providing maximum natural buffer on both sides of the AD.
-        pause_point = best_gap.end - edge_buffer
-        resume_from = best_gap.start + edge_buffer
+        # Pick the boundary nearest to Gemini's point. NOTE(review): the distance is not capped here.
+        closest_boundary = min(boundaries, key=lambda b: abs(b.time - gemini_pause))

        logger.debug(
-            f"Full-gap-overlap: gap={best_gap.start:.3f}s-{best_gap.end:.3f}s "
-            f"(duration={best_gap.duration:.3f}s), edge_buffer={edge_buffer:.3f}s, "
-            f"pause_point={pause_point:.3f}s, resume_from={resume_from:.3f}s, "
-            f"overlap_duration={pause_point - resume_from:.3f}s"
+            f"Nearest boundary to {gemini_pause:.2f}s: {closest_boundary.boundary_type} "
+            f"at {closest_boundary.time:.2f}s (distance: {abs(closest_boundary.time - gemini_pause):.2f}s)"
        )

+        # Step 3: Apply appropriate buffering based on context
+        edge_buffer = 0.5  # 500ms buffer for edge cases
+
+        # Case A: Beginning of sentence with no previous sentence (first sentence in video)
+        if closest_boundary.boundary_type == "sentence_start" and not closest_boundary.has_previous_sentence:
+            # Pause 500ms BEFORE the sentence starts (away from speech)
+            pause_point = max(0.0, closest_boundary.time - edge_buffer)
+            resume_from = pause_point  # No overlap
+            logger.info(
+                f"Case A (first sentence): pause_point={pause_point:.2f}s "
+                f"(500ms before sentence start at {closest_boundary.time:.2f}s)"
+            )
+            return pause_point, resume_from, None
+
+        # Case B: End of sentence with no next sentence (last sentence in video)
+        if closest_boundary.boundary_type == "sentence_end" and not closest_boundary.has_next_sentence:
+            # Pause 500ms AFTER the sentence ends (away from speech)
+            pause_point = closest_boundary.time + edge_buffer
+            resume_from = pause_point  # No overlap
+            logger.info(
+                f"Case B (last sentence): pause_point={pause_point:.2f}s "
+                f"(500ms after sentence end at {closest_boundary.time:.2f}s)"
+            )
+            return pause_point, resume_from, None
+
+        # Case C: Gap between two sentences (normal case with double buffer)
+        if closest_boundary.gap:
+            gap = closest_boundary.gap
+            # Small edge buffer to avoid cutting exactly at speech boundaries
+            small_buffer = min(0.05, gap.duration * 0.1)
+
+            # Full double buffer:
+            # - pause_point: Just BEFORE next sentence (gap.end - small_buffer)
+            # - resume_from: Just AFTER previous sentence (gap.start + small_buffer)
+            pause_point = gap.end - small_buffer
+            resume_from = gap.start + small_buffer
+
+            logger.info(
+                f"Case C (between sentences): gap={gap.start:.2f}s-{gap.end:.2f}s, "
+                f"pause_point={pause_point:.2f}s, resume_from={resume_from:.2f}s, "
+                f"overlap={pause_point - resume_from:.2f}s"
+            )
+            return pause_point, resume_from, None
+
+        # Fallback: the boundary has no associated gap - nudge 50ms away from the boundary time
+        # (after a sentence end, before a sentence start) and resume from the same point
+        if closest_boundary.boundary_type == "sentence_end":
+            pause_point = closest_boundary.time + 0.05  # Small buffer after end
+        else:
+            pause_point = max(0.0, closest_boundary.time - 0.05)  # Small buffer before start
+        resume_from = pause_point
+
+        logger.info(
+            f"Fallback: Using boundary at {closest_boundary.time:.2f}s, "
+            f"pause_point={pause_point:.2f}s (no gap available)"
+        )
        return pause_point, resume_from, None

    def refine_all_pause_points(
        self,
        placements: list[dict],
+        words: list[WordTimestamp],
        gaps: list[SpeechGap],
        consolidation_threshold: float = 5.0
    ) -> tuple[list[dict], list[str]]:
        """
        Refine all pause points in a Gemini analysis result.

-        Uses the "full gap overlap" algorithm where:
-        - pause_point: Where to pause video (just before next sentence)
-        - resume_from: Where to resume video (just after previous sentence)
+        Two-phase algorithm:
+        Phase 1: Refine each pause point individually using ordered logic:
+            1. Check if "during speaking" (words within ±2s)
+            2. If not during speaking → use Gemini's exact point
+            3. If during speaking → snap to nearest boundary with appropriate buffering

-        This creates a small overlap where a tiny bit of video plays twice,
-        but provides maximum natural buffer around the audio description.
+        Phase 2: Consolidate cues that are within 5s of each other (after all refinements)

        Args:
            placements: List of placement dicts from Gemini analysis
+            words: Word timestamps from Whisper transcription
            gaps: Speech gaps from Whisper analysis
            consolidation_threshold: If consecutive cues have pause points within
                this many seconds, combine them to play back-to-back (default: 5.0s)
@ -284,51 +478,31 @@ class WhisperService:
        refined_placements = []
        warnings = []

+        # Pre-compute sentence boundaries once for all placements
+        boundaries = self._find_sentence_boundaries(words, gaps)
+        logger.info(f"Found {len(boundaries)} sentence boundaries for pause point refinement")
+
+        # Phase 1: Refine each pause point individually
        for placement in placements:
            refined = placement.copy()

            if placement.get("pause_point") is not None:
                original = placement["pause_point"]
-                pause_point, resume_from, warning = self.snap_pause_point(original, gaps)
+                pause_point, resume_from, warning = self.snap_pause_point(
+                    original, words, gaps, boundaries
+                )

                refined["pause_point"] = pause_point
                refined["resume_from"] = resume_from
                refined["original_pause_point"] = original  # Preserve for debugging

                if warning:
-                    # Special handling for first AD cue: if no sentence break found (but speech exists),
-                    # insert at the very beginning of the video to avoid mid-sentence insertion
-                    if placement["ad_cue_index"] == 0:
-                        refined["pause_point"] = 0.0
-                        refined["resume_from"] = 0.0
-                        warnings.append(
-                            f"Cue 0: No sentence break found within search window of {original:.2f}s, "
-                            "inserting AD at video start (0.0s)"
-                        )
-                        logger.info(
-                            f"First AD cue: No sentence break found near {original:.2f}s, "
-                            "using video start (0.0s) to avoid mid-sentence insertion"
-                        )
-                    else:
-                        warnings.append(f"Cue {placement['ad_cue_index']}: {warning}")
-                        logger.warning(f"Pause point refinement warning for cue {placement['ad_cue_index']}: {warning}")
-                elif pause_point == resume_from:
-                    # No overlap - either no speech nearby, or some other reason
-                    # Log this as info since it's a valid scenario (non-dialogue section)
-                    logger.info(
-                        f"Cue {placement['ad_cue_index']}: Using exact pause point {pause_point:.2f}s "
-                        f"(no overlap - likely non-dialogue section)"
-                    )
-                elif abs(pause_point - original) > 0.1:
-                    logger.info(
-                        f"Refined pause point for cue {placement['ad_cue_index']}: "
-                        f"{original:.2f}s -> pause_at={pause_point:.2f}s, resume_from={resume_from:.2f}s "
-                        f"(overlap={pause_point - resume_from:.2f}s)"
-                    )
+                    warnings.append(f"Cue {placement['ad_cue_index']}: {warning}")
+                    logger.warning(f"Pause point refinement warning for cue {placement['ad_cue_index']}: {warning}")

            refined_placements.append(refined)

-        # Consolidate cues that are close together to avoid mid-sentence insertions
+        # Phase 2: Consolidate cues that are close together (AFTER all individual refinements)
        refined_placements = self._consolidate_close_cues(
            refined_placements, consolidation_threshold, warnings
        )
@ -365,7 +539,6 @@ class WhisperService:

        # First pass: identify consolidated groups and assign same pause_point
        consolidated = [placements[0].copy()]
-        group_resume_from = placements[0].get("resume_from")  # Track the back buffer for the group

        for i in range(1, len(placements)):
            current = placements[i].copy()
@ -396,10 +569,6 @@ class WhisperService:
                        f"Cue {current['ad_cue_index']}: Consolidated with previous cue "
                        f"(pause points were {gap:.2f}s apart, playing back-to-back)"
                    )
-                else:
-                    # New group starts - update group_resume_from for the new group
-                    group_resume_from = current.get("resume_from")
-
            consolidated.append(current)

        # Second pass: fix resume_from values for consolidated groups
--- a/backend/app/tasks/render_accessible_video.py
+++ b/backend/app/tasks/render_accessible_video.py
@ -417,9 +417,10 @@ async def _refine_pause_points_with_whisper(
    gaps = whisper_service.identify_speech_gaps(words)
    logger.info(f"Found {len(gaps)} speech gaps in video for job {job_id}")

-    # Refine pause points
+    # Refine pause points (Phase 1: individual refinement, Phase 2: consolidation)
    refined_placements, warnings = whisper_service.refine_all_pause_points(
        analysis.get("placements", []),
+        words,
        gaps
    )