# video-accessibility/backend/app/schemas/accessible_video.py

"""Schemas for accessible video generation with embedded audio descriptions."""

from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field


class AccessibleVideoMethod(str, Enum):
    """Method used for integrating audio descriptions into video."""
    # The ``str`` mixin makes every member an actual string, so a member
    # compares equal to its raw value (``AccessibleVideoMethod.OVERLAY == "overlay"``).

    # Glossed as "duck audio" in the GeminiAccessibleVideoAnalysis.method description.
    OVERLAY = "overlay"
    # Glossed as "freeze-frame" in the GeminiAccessibleVideoAnalysis.method description.
    PAUSE_INSERT = "pause_insert"


class SentenceBoundary(BaseModel):
    """A sentence boundary detected in the video's spoken audio.

    Only complete sentences ending with terminal punctuation are valid pause points.
    Commas and other mid-sentence punctuation are NOT sentence boundaries.
    """
    # All four fields are required; both numeric fields are expressed in seconds.
    # NOTE(review): pydantic copies this docstring and the field descriptions into
    # the generated JSON schema, so rewording them changes what schema consumers
    # see - confirm with the caller before editing the text.
    text: str = Field(..., description="The transcribed sentence ending with terminal punctuation")
    end_time: float = Field(..., description="Precise timestamp when the sentence ends (seconds)")
    # The "terminal punctuation only" rule lives solely in the description text;
    # no validator in this class restricts the value to '.', '?' or '!'.
    punctuation: str = Field(..., description="Terminal punctuation ONLY: '.', '?', or '!' - NEVER commas")
    silence_after: float = Field(..., description="Duration of silence after this sentence (seconds)")


class ADPlacementCue(BaseModel):
    """Placement instruction for a single audio description cue from Gemini analysis."""

    # Layout: five required fields identify the cue and its timing, followed by
    # optional groups that only apply to one integration method. Every time
    # value in this model is in seconds.
    # Defaults are passed by keyword (``default=``); this is the same Field
    # parameter the positional form fills, so runtime behavior is unchanged.

    # --- Required: cue identity and timing ---
    ad_cue_index: int = Field(
        ...,
        description="Index of the AD cue in the VTT (0-based)",
    )
    original_start_time: float = Field(
        ...,
        description="Original VTT start time in seconds",
    )
    original_end_time: float = Field(
        ...,
        description="Original VTT end time in seconds",
    )
    target_start_time: float = Field(
        ...,
        description="Target time in output video (seconds)",
    )
    ad_duration: float = Field(
        ...,
        description="Duration of the AD TTS audio in seconds",
    )

    # --- Optional: pause-insert method ---
    pause_point: Optional[float] = Field(
        default=None,
        description="Where to pause the video - just before the next sentence starts (gap.end - buffer). Used for pause-insert method.",
    )
    resume_from: Optional[float] = Field(
        default=None,
        description="Where to resume video after AD plays - just after the previous sentence ends (gap.start + buffer). Creates a small overlap for natural transitions.",
    )
    pause_point_rationale: Optional[str] = Field(
        default=None,
        description="Explanation of why this pause point was chosen, referencing the sentence boundary.",
    )

    # --- Optional: Whisper refinement tracking ---
    original_pause_point: Optional[float] = Field(
        default=None,
        description="Original pause point from Gemini before Whisper refinement (seconds).",
    )

    # --- Optional: overlay method ---
    duck_start: Optional[float] = Field(
        default=None,
        description="When to start ducking original audio (seconds). Used for overlay method.",
    )
    duck_end: Optional[float] = Field(
        default=None,
        description="When to end ducking original audio (seconds). Used for overlay method.",
    )


class GeminiAccessibleVideoAnalysis(BaseModel):
    """Response schema for Gemini accessible video analysis.

    This model captures the AI's determination of the optimal method
    for integrating audio descriptions and the specific placement
    instructions for each AD cue.
    """
    # Required fields: method, method_rationale, dialogue_density, placements.
    # Defaulted fields: sentence_boundaries, total_added_duration, warnings.
    # NOTE(review): pydantic copies the docstring and field descriptions into the
    # generated JSON schema; if that schema is what Gemini receives, rewording
    # them changes the model's instructions - confirm with the caller.
    method: AccessibleVideoMethod = Field(
        ...,
        description="Chosen method: overlay (duck audio) or pause_insert (freeze-frame)"
    )
    method_rationale: str = Field(
        ...,
        description="Explanation of why this method was chosen based on video analysis"
    )
    # The only field here with numeric validation: values outside [0, 1] are rejected.
    dialogue_density: float = Field(
        ...,
        ge=0,
        le=1,
        description="Score from 0-1 indicating how much dialogue/speech is in the video"
    )
    # Described as required for pause_insert, but the schema itself defaults to an
    # empty list and does not enforce that relationship.
    sentence_boundaries: list[SentenceBoundary] = Field(
        default_factory=list,
        description="All sentence endings detected in the video's spoken audio (required for pause_insert)"
    )
    placements: list[ADPlacementCue] = Field(
        ...,
        description="Placement instructions for each AD cue"
    )
    # Defaults to 0 when omitted from the response.
    total_added_duration: float = Field(
        default=0,
        description="Total pause time added to video (pause-insert method only, in seconds)"
    )
    # default_factory gives each instance its own fresh list.
    warnings: list[str] = Field(
        default_factory=list,
        description="Any potential issues or concerns detected during analysis"
    )


class ADCueSegment(BaseModel):
    """Represents a single synthesized AD cue segment."""
    # Every field is required.
    cue_index: int = Field(..., description="Index of the cue (0-based)")
    # start_time / end_time come from the VTT cue, whereas duration is the
    # length of the synthesized audio (per the field descriptions), so
    # duration need not equal end_time - start_time.
    start_time: float = Field(..., description="Original start time from VTT")
    end_time: float = Field(..., description="Original end time from VTT")
    duration: float = Field(..., description="Actual TTS audio duration in seconds")
    gcs_uri: str = Field(..., description="GCS URI to the individual MP3 segment")
    text: str = Field(..., description="The AD text that was synthesized")


class AccessibleVideoRenderRequest(BaseModel):
    """Request to render an accessible video for a job/language."""
    # Both fields are required plain strings with no format validation here.
    job_id: str
    # NOTE(review): expected format (e.g. language code vs. display name) is not
    # visible in this file - confirm against the caller.
    language: str


class AccessibleVideoProgress(BaseModel):
    """Progress status for accessible video rendering."""
    # status is a free-form str: the four values listed in its description are
    # documentation only and are not enforced by this schema.
    status: str = Field(..., description="pending | rendering | completed | failed")
    # All remaining fields are optional and default to None.
    method: Optional[AccessibleVideoMethod] = None
    error_message: Optional[str] = None
    # NOTE(review): timestamps are carried as strings; presumably ISO-formatted
    # like the other timestamp fields in this module - confirm.
    started_at: Optional[str] = None
    completed_at: Optional[str] = None


# === QC Review Accessible Video Editing Schemas ===


class PausePointResponse(BaseModel):
    """Pause point timing data for QC editing."""

    # All timing values in this model are in milliseconds.
    # Defaults are passed by keyword (``default=``); this is the same Field
    # parameter the positional form fills, so runtime behavior is unchanged.
    cue_index: int = Field(
        ...,
        description="AD cue index this pause point belongs to",
    )
    original_ms: float = Field(
        ...,
        description="Rendered timeline position (ms) - for display",
    )
    source_ms: Optional[float] = Field(
        default=None,
        description="Source video cut point (ms) - for re-rendering (None = use original_ms)",
    )
    adjusted_ms: Optional[float] = Field(
        default=None,
        description="User-adjusted timestamp (ms)",
    )
    # The bounds are reported to the client; nothing in this class checks
    # adjusted_ms against them.
    min_bound_ms: float = Field(
        ...,
        description="Minimum allowed value (ms)",
    )
    max_bound_ms: float = Field(
        ...,
        description="Maximum allowed value (ms)",
    )


class VideoSegmentResponse(BaseModel):
    """Metadata for a video segment."""

    # Timing values are in milliseconds.
    # Defaults are passed by keyword (``default=``); this is the same Field
    # parameter the positional form fills, so runtime behavior is unchanged.
    segment_index: int = Field(
        ...,
        description="0-based segment index",
    )
    start_ms: float = Field(
        ...,
        description="Start timestamp in source video (ms)",
    )
    end_ms: float = Field(
        ...,
        description="End timestamp in source video (ms)",
    )
    gcs_uri: str = Field(
        ...,
        description="GCS path to segment MP4",
    )
    duration_ms: float = Field(
        ...,
        description="Actual segment duration (ms)",
    )
    # The two fields below go together per their descriptions: cue_index is
    # meant for freeze-frame segments. The schema does not enforce the pairing.
    is_freeze_frame: bool = Field(
        default=False,
        description="True if freeze frame with AD audio",
    )
    cue_index: Optional[int] = Field(
        default=None,
        description="AD cue index (freeze frames only)",
    )


class TTSRegenerationItem(BaseModel):
    """A queued TTS regeneration request."""

    # Defaults are passed by keyword (``default=``); this is the same Field
    # parameter the positional form fills, so runtime behavior is unchanged.
    cue_index: int = Field(
        ...,
        description="AD cue index to regenerate",
    )
    requested_at: str = Field(
        ...,
        description="ISO timestamp when requested",
    )
    new_text: Optional[str] = Field(
        default=None,
        description="Override text (if provided)",
    )
    # A new item starts as "pending". status is a plain str, so the values
    # listed in the description are not enforced by this schema.
    status: str = Field(
        default="pending",
        description="pending | processing | completed | failed",
    )
    error_message: Optional[str] = None


class AccessibleVideoEditStateResponse(BaseModel):
    """Current editable state for accessible video during QC review."""

    # total_duration_ms is the only required field; every collection defaults
    # to a fresh empty list and the remaining scalars default to None.
    # Defaults are passed by keyword (``default=``); this is the same Field
    # parameter the positional form fills, so runtime behavior is unchanged.

    # --- Collections ---
    pause_points: list[PausePointResponse] = Field(
        default_factory=list,
        description="All pause points with original and adjusted values",
    )
    video_segments: list[VideoSegmentResponse] = Field(
        default_factory=list,
        description="Video segment metadata for timeline display",
    )
    tts_regeneration_queue: list[TTSRegenerationItem] = Field(
        default_factory=list,
        description="Queued TTS regeneration requests",
    )

    # --- Render metadata ---
    last_render_at: Optional[str] = Field(
        default=None,
        description="ISO timestamp of last accessible video render",
    )
    total_duration_ms: float = Field(
        ...,
        description="Total accessible video duration (ms)",
    )
    accessible_video_url: Optional[str] = Field(
        default=None,
        description="Signed URL for accessible video preview",
    )


class PausePointUpdateRequest(BaseModel):
    """Request to update a pause point's adjusted timing."""
    # Single required field. The body does not say which pause point to
    # update, and no range check is applied to the value here.
    adjusted_ms: float = Field(
        ...,
        description="New pause point timestamp in milliseconds"
    )


class TTSRegenerationQueueRequest(BaseModel):
    """Request to queue TTS regeneration for specific cues."""
    # Required list; the schema accepts an empty list and duplicate indices,
    # since no length or uniqueness constraint is declared.
    cue_indices: list[int] = Field(
        ...,
        description="List of AD cue indices to regenerate"
    )


class TTSRegenerationRemoveRequest(BaseModel):
    """Request to remove a cue from the TTS regeneration queue."""
    # Removes one cue per request (single required index), unlike the queue
    # request, which takes a list of indices.
    cue_index: int = Field(..., description="AD cue index to remove from queue")


class RerenderAccessibleVideoRequest(BaseModel):
    """Request to re-render accessible video with QC changes."""

    # Opt-in flag: omitted from the request body means False, i.e. no Whisper
    # refinement pass. The default is passed by keyword (``default=``), the
    # same Field parameter the positional form fills.
    whisper_refine: bool = Field(
        default=False,
        description="Run Whisper pause point refinement (enable if cue count/position changed)",
    )