Existing jobs in the database don't have a source_ms field. Making it optional allows the API to load these jobs without validation errors. The re-render task already handles the fallback to original_ms when source_ms is None. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
211 lines
8.7 KiB
Python
211 lines
8.7 KiB
Python
"""Schemas for accessible video generation with embedded audio descriptions."""
|
|
|
|
from enum import Enum
|
|
from typing import Optional
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
class AccessibleVideoMethod(str, Enum):
    """Method used for integrating audio descriptions into video."""

    # The str mixin makes each member a real string, so members compare
    # equal to (and serialize as) their literal values below.
    OVERLAY = "overlay"
    PAUSE_INSERT = "pause_insert"
|
|
|
|
|
|
class SentenceBoundary(BaseModel):
    """A sentence boundary detected in the video's spoken audio.

    Only complete sentences ending with terminal punctuation are valid pause points.
    Commas and other mid-sentence punctuation are NOT sentence boundaries.
    """

    # All four fields are required (default=... marks a field as required).
    text: str = Field(
        default=...,
        description="The transcribed sentence ending with terminal punctuation",
    )
    end_time: float = Field(
        default=...,
        description="Precise timestamp when the sentence ends (seconds)",
    )
    punctuation: str = Field(
        default=...,
        description="Terminal punctuation ONLY: '.', '?', or '!' - NEVER commas",
    )
    silence_after: float = Field(
        default=...,
        description="Duration of silence after this sentence (seconds)",
    )
|
|
|
|
|
|
class ADPlacementCue(BaseModel):
    """Placement instruction for a single audio description cue from Gemini analysis."""

    # --- Required for every cue, regardless of method ---
    ad_cue_index: int = Field(
        default=...,
        description="Index of the AD cue in the VTT (0-based)",
    )
    original_start_time: float = Field(
        default=...,
        description="Original VTT start time in seconds",
    )
    original_end_time: float = Field(
        default=...,
        description="Original VTT end time in seconds",
    )
    target_start_time: float = Field(
        default=...,
        description="Target time in output video (seconds)",
    )
    ad_duration: float = Field(
        default=...,
        description="Duration of the AD TTS audio in seconds",
    )

    # --- Pause-insert method only (all optional, default None) ---
    pause_point: Optional[float] = Field(
        default=None,
        description="Where to pause the video - just before the next sentence starts (gap.end - buffer). Used for pause-insert method.",
    )
    resume_from: Optional[float] = Field(
        default=None,
        description="Where to resume video after AD plays - just after the previous sentence ends (gap.start + buffer). Creates a small overlap for natural transitions.",
    )
    pause_point_rationale: Optional[str] = Field(
        default=None,
        description="Explanation of why this pause point was chosen, referencing the sentence boundary.",
    )

    # --- Whisper refinement tracking: keeps the pre-refinement value ---
    original_pause_point: Optional[float] = Field(
        default=None,
        description="Original pause point from Gemini before Whisper refinement (seconds).",
    )

    # --- Overlay method only (all optional, default None) ---
    duck_start: Optional[float] = Field(
        default=None,
        description="When to start ducking original audio (seconds). Used for overlay method.",
    )
    duck_end: Optional[float] = Field(
        default=None,
        description="When to end ducking original audio (seconds). Used for overlay method.",
    )
|
|
|
|
|
|
class GeminiAccessibleVideoAnalysis(BaseModel):
    """Response schema for Gemini accessible video analysis.

    This model captures the AI's determination of the optimal method
    for integrating audio descriptions and the specific placement
    instructions for each AD cue.
    """

    # --- Method decision (required) ---
    method: AccessibleVideoMethod = Field(
        default=...,
        description="Chosen method: overlay (duck audio) or pause_insert (freeze-frame)",
    )
    method_rationale: str = Field(
        default=...,
        description="Explanation of why this method was chosen based on video analysis",
    )
    # Required, and constrained to the closed interval [0, 1] via ge/le.
    dialogue_density: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Score from 0-1 indicating how much dialogue/speech is in the video",
    )

    # --- Detected speech structure; defaults to a fresh empty list ---
    sentence_boundaries: list[SentenceBoundary] = Field(
        default_factory=list,
        description="All sentence endings detected in the video's spoken audio (required for pause_insert)",
    )

    # --- Per-cue placement instructions (required) ---
    placements: list[ADPlacementCue] = Field(
        default=...,
        description="Placement instructions for each AD cue",
    )

    # --- Optional summary data with defaults ---
    total_added_duration: float = Field(
        default=0,
        description="Total pause time added to video (pause-insert method only, in seconds)",
    )
    warnings: list[str] = Field(
        default_factory=list,
        description="Any potential issues or concerns detected during analysis",
    )
|
|
|
|
|
|
class ADCueSegment(BaseModel):
    """Represents a single synthesized AD cue segment."""

    # Every field is required; there are no defaults on this model.
    cue_index: int = Field(
        default=...,
        description="Index of the cue (0-based)",
    )
    start_time: float = Field(
        default=...,
        description="Original start time from VTT",
    )
    end_time: float = Field(
        default=...,
        description="Original end time from VTT",
    )
    duration: float = Field(
        default=...,
        description="Actual TTS audio duration in seconds",
    )
    gcs_uri: str = Field(
        default=...,
        description="GCS URI to the individual MP3 segment",
    )
    text: str = Field(
        default=...,
        description="The AD text that was synthesized",
    )
|
|
|
|
|
|
class AccessibleVideoRenderRequest(BaseModel):
    """Request to render an accessible video for a job/language."""

    # Both fields are required strings: no default is declared for either.
    job_id: str
    language: str
|
|
|
|
|
|
class AccessibleVideoProgress(BaseModel):
    """Progress status for accessible video rendering."""

    # Only `status` is required. It is typed as a free-form str; the
    # description lists the expected values but nothing here enforces them.
    status: str = Field(
        default=...,
        description="pending | rendering | completed | failed",
    )

    # Everything below is optional and defaults to None.
    method: Optional[AccessibleVideoMethod] = None
    error_message: Optional[str] = None
    started_at: Optional[str] = None
    completed_at: Optional[str] = None
|
|
|
|
|
|
# === QC Review Accessible Video Editing Schemas ===
|
|
|
|
|
|
class PausePointResponse(BaseModel):
    """Pause point timing data for QC editing."""

    cue_index: int = Field(
        default=...,
        description="AD cue index this pause point belongs to",
    )

    # --- Timestamps (ms) ---
    original_ms: float = Field(
        default=...,
        description="Rendered timeline position (ms) - for display",
    )
    # Optional: per its description, None means "use original_ms".
    source_ms: Optional[float] = Field(
        default=None,
        description="Source video cut point (ms) - for re-rendering (None = use original_ms)",
    )
    adjusted_ms: Optional[float] = Field(
        default=None,
        description="User-adjusted timestamp (ms)",
    )

    # --- Allowed range (ms); both bounds are required ---
    min_bound_ms: float = Field(
        default=...,
        description="Minimum allowed value (ms)",
    )
    max_bound_ms: float = Field(
        default=...,
        description="Maximum allowed value (ms)",
    )
|
|
|
|
|
|
class VideoSegmentResponse(BaseModel):
    """Metadata for a video segment."""

    # --- Required segment identity, timing and location ---
    segment_index: int = Field(
        default=...,
        description="0-based segment index",
    )
    start_ms: float = Field(
        default=...,
        description="Start timestamp in source video (ms)",
    )
    end_ms: float = Field(
        default=...,
        description="End timestamp in source video (ms)",
    )
    gcs_uri: str = Field(
        default=...,
        description="GCS path to segment MP4",
    )
    duration_ms: float = Field(
        default=...,
        description="Actual segment duration (ms)",
    )

    # --- Freeze-frame markers; default to "not a freeze frame" ---
    is_freeze_frame: bool = Field(
        default=False,
        description="True if freeze frame with AD audio",
    )
    cue_index: Optional[int] = Field(
        default=None,
        description="AD cue index (freeze frames only)",
    )
|
|
|
|
|
|
class TTSRegenerationItem(BaseModel):
    """A queued TTS regeneration request."""

    # --- Required ---
    cue_index: int = Field(
        default=...,
        description="AD cue index to regenerate",
    )
    requested_at: str = Field(
        default=...,
        description="ISO timestamp when requested",
    )

    # --- Optional ---
    new_text: Optional[str] = Field(
        default=None,
        description="Override text (if provided)",
    )
    # Free-form str that starts out as "pending"; the listed values are
    # documented in the description, not validated by this model.
    status: str = Field(
        default="pending",
        description="pending | processing | completed | failed",
    )
    error_message: Optional[str] = None
|
|
|
|
|
|
class AccessibleVideoEditStateResponse(BaseModel):
    """Current editable state for accessible video during QC review."""

    # --- Collections; each defaults to a fresh empty list ---
    pause_points: list[PausePointResponse] = Field(
        default_factory=list,
        description="All pause points with original and adjusted values",
    )
    video_segments: list[VideoSegmentResponse] = Field(
        default_factory=list,
        description="Video segment metadata for timeline display",
    )
    tts_regeneration_queue: list[TTSRegenerationItem] = Field(
        default_factory=list,
        description="Queued TTS regeneration requests",
    )

    # --- Render metadata ---
    last_render_at: Optional[str] = Field(
        default=None,
        description="ISO timestamp of last accessible video render",
    )
    # The only required field on this model.
    total_duration_ms: float = Field(
        default=...,
        description="Total accessible video duration (ms)",
    )
    accessible_video_url: Optional[str] = Field(
        default=None,
        description="Signed URL for accessible video preview",
    )
|
|
|
|
|
|
class PausePointUpdateRequest(BaseModel):
    """Request to update a pause point's adjusted timing."""

    # Required. No range constraint is declared on this model.
    adjusted_ms: float = Field(
        default=...,
        description="New pause point timestamp in milliseconds",
    )
|
|
|
|
|
|
class TTSRegenerationQueueRequest(BaseModel):
    """Request to queue TTS regeneration for specific cues."""

    # Required list of ints; an empty list is not rejected by this model.
    cue_indices: list[int] = Field(
        default=...,
        description="List of AD cue indices to regenerate",
    )
|
|
|
|
|
|
class TTSRegenerationRemoveRequest(BaseModel):
    """Request to remove a cue from the TTS regeneration queue."""

    # Required: identifies the single cue to drop from the queue.
    cue_index: int = Field(
        default=...,
        description="AD cue index to remove from queue",
    )
|
|
|
|
|
|
class RerenderAccessibleVideoRequest(BaseModel):
    """Request to re-render accessible video with QC changes."""

    # Opt-in flag: defaults to False, so an empty request body is valid.
    whisper_refine: bool = Field(
        default=False,
        description="Run Whisper pause point refinement (enable if cue count/position changed)",
    )
|