"""Pydantic models and status enum describing accessibility-processing jobs."""

from datetime import datetime
from enum import Enum
from typing import Any, Literal, Optional

from pydantic import BaseModel, Field, constr


class JobStatus(str, Enum):
    """Status values a job can carry.

    Members are ``str`` subclasses, so each member compares equal to its
    plain string value.
    """

    CREATED = "created"
    INGESTING = "ingesting"
    AI_PROCESSING = "ai_processing"
    PENDING_QC = "pending_qc"
    APPROVED_ENGLISH = "approved_english"  # For English source videos
    APPROVED_SOURCE = "approved_source"  # For non-English source videos
    REJECTED = "rejected"
    QC_FEEDBACK = "qc_feedback"
    TRANSLATING = "translating"
    TTS_GENERATING = "tts_generating"
    # TTS synthesis failed after retries, requires reprocessing
    TTS_FAILED = "tts_failed"
    # Accessible video rendering in progress
    RENDERING_VIDEO = "rendering_video"
    # Accessible video rendering failed, requires reprocessing
    RENDER_FAILED = "render_failed"
    # Re-rendering accessible video during QC review
    RENDERING_QC = "rendering_qc"
    PENDING_FINAL_REVIEW = "pending_final_review"
    COMPLETED = "completed"

    @classmethod
    def is_approved(cls, status: str) -> bool:
        """Check if status indicates source approval (any language).

        Args:
            status: A status string; a ``JobStatus`` member also works because
                members compare equal to their string values.

        Returns:
            True when *status* equals ``approved_english`` or
            ``approved_source``, otherwise False.
        """
        return status in (cls.APPROVED_ENGLISH.value, cls.APPROVED_SOURCE.value)


class Source(BaseModel):
    """Source file reference and language metadata for a job."""

    filename: str
    original_filename: Optional[str] = None
    gcs_uri: str
    duration_s: Optional[float] = None
    # Final source language (from detection or explicit).
    # Length limits are expressed through Field rather than calling constr()
    # inside the annotation, which type checkers reject; validation is the same.
    language: str = Field(default="en", min_length=2, max_length=10)
    # User-provided hint for non-English videos
    language_hint: Optional[str] = None
    # AI-detected language from Gemini
    detected_language: Optional[str] = None


class TTSPreferences(BaseModel):
    """TTS voice preferences for audio description generation."""

    provider: Literal["gemini", "google", "elevenlabs"] = "gemini"
    default_voice: str = "Kore"  # Default Gemini voice
    voices_per_language: dict[str, str] = {}  # {"en": "Kore", "es": "Aoede"}

    # TTS quality and style settings
    # flash = fast/cheap, pro = higher quality
    model: Literal["flash", "pro"] = "flash"
    # Speech rate multiplier
    speed: float = Field(default=1.0, ge=0.5, le=2.0)
    style_preset: Literal[
        "neutral",
        "calm",
        "energetic",
        "professional",
        "warm",
        "documentary",
        "custom",
    ] = "neutral"
    # Used when style_preset is "custom"
    custom_style_prompt: Optional[str] = None

    # ElevenLabs-specific settings.
    # Both are documented as 0.0-1.0, so the range is enforced here the same
    # way `speed` is bounded above; None still means "not set".
    # 0.0-1.0, default 0.5 when used
    stability: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    # 0.0-1.0, default 0.5 when used
    similarity_boost: Optional[float] = Field(default=None, ge=0.0, le=1.0)


class RequestedOutputs(BaseModel):
    """Deliverables and processing options requested for a job."""

    captions_vtt: bool = True
    audio_description_vtt: bool = True
    audio_description_mp3: bool = True
    # Rendered video with embedded audio descriptions
    accessible_video_mp4: bool = False
    # User-selected method
    accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None
    # SDH (Subtitles for Deaf and Hard of Hearing) captions with speaker
    # labels, sound effects, music notation
    sdh_vtt: bool = False
    languages: list[str] = []
    transcreation: list[str] = []
    tts_preferences: Optional[TTSPreferences] = None
    translation_mode: Literal["traditional", "video_native"] = "video_native"


class PausePointData(BaseModel):
    """Pause point timing data for accessible video editing during QC."""

    # AD cue index this pause point belongs to
    cue_index: int
    # Rendered timeline position (ms) - for UI display
    original_ms: float
    # Source video cut point (ms) - for re-rendering (None = use original_ms)
    source_ms: Optional[float] = None
    # User-adjusted timestamp (ms), None = use original
    adjusted_ms: Optional[float] = None
    # Minimum allowed value (end of previous AD segment)
    min_bound_ms: float
    # Maximum allowed value (start of next AD segment)
    max_bound_ms: float


class VideoSegmentMetadata(BaseModel):
    """Metadata for a video segment between pause points."""

    segment_index: int  # 0-based segment index
    start_ms: float  # Start timestamp in source video (ms)
    end_ms: float  # End timestamp in source video (ms)
    gcs_uri: str  # GCS path to segment MP4
    duration_ms: float  # Actual segment duration (ms)
    # True if this is a freeze frame segment with AD audio
    is_freeze_frame: bool = False
    # AD cue index (only for freeze frame segments)
    cue_index: Optional[int] = None


class TTSRegenerationRequest(BaseModel):
    """Request to regenerate TTS for a specific cue during QC."""

    cue_index: int
    requested_at: datetime
    # If provided, use this text instead of current VTT
    new_text: Optional[str] = None
    status: Literal["pending", "processing", "completed", "failed"] = "pending"
    error_message: Optional[str] = None


class AccessibleVideoEditState(BaseModel):
    """Editable state for accessible video during QC review."""

    pause_points: list[PausePointData] = []
    video_segments: list[VideoSegmentMetadata] = []
    tts_regeneration_queue: list[TTSRegenerationRequest] = []
    last_render_at: Optional[datetime] = None
    # Default: off (user enables if cue positions changed)
    whisper_refine_enabled: bool = False


class LangOutput(BaseModel):
    """GCS references and related state for one set of generated outputs."""

    captions_vtt_gcs: Optional[str] = None
    # SDH-format captions (speaker labels, sound effects, music)
    sdh_captions_vtt_gcs: Optional[str] = None
    ad_vtt_gcs: Optional[str] = None
    ad_mp3_gcs: Optional[str] = None

    # Accessible video outputs
    # Rendered accessible MP4
    accessible_video_gcs: Optional[str] = None
    accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None
    # Re-timed captions for pause-insert method
    retimed_captions_vtt_gcs: Optional[str] = None
    # GCS path prefix for per-cue MP3 segments
    ad_cues_gcs_prefix: Optional[str] = None
    # Per-cue manifest: [{cue_index, gcs_uri, text, duration_s}]
    ad_cue_manifest: Optional[list[dict]] = None

    # QC editing state for accessible video
    # GCS prefix for persisted video segments
    video_segments_gcs_prefix: Optional[str] = None
    accessible_video_edit_state: Optional[AccessibleVideoEditState] = None

    origin: Optional[
        Literal["translate", "transcreate", "gemini_translate", "video_native"]
    ] = None
    qa_notes: Optional[str] = None
    # WCAG-compliant combined speech+description transcript
    descriptive_transcript_gcs: Optional[str] = None


class ReviewHistoryItem(BaseModel):
    """A single timestamped entry in a review history."""

    at: datetime
    status: str
    by: Optional[str] = None
    notes: Optional[str] = None
class Review(BaseModel):
    """Review notes, reviewer reference and history entries for a job."""

    notes: Optional[str] = ""
    reviewer_id: Optional[str] = None
    history: list[ReviewHistoryItem] = []


class AISection(BaseModel):
    """AI-related data attached to a job: ingestion JSON and a confidence value."""

    ingestion_json: Optional[dict[str, Any]] = None
    confidence: Optional[float] = None


class AccessibleVideoProgressItem(BaseModel):
    """Progress tracking for accessible video rendering per language."""

    status: Literal["pending", "rendering", "completed", "failed"] = "pending"
    method: Optional[Literal["overlay", "pause_insert"]] = None
    error_message: Optional[str] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None


class Job(BaseModel):
    """A processing job: source, requested outputs, status, review and results.

    ``id`` is populated from the ``_id`` alias; ``populate_by_name`` also allows
    passing it as ``id``. ``use_enum_values`` stores ``status`` as the enum's
    string value rather than the ``JobStatus`` member.
    """

    id: Optional[str] = Field(None, alias="_id")
    client_id: str
    title: str
    source: Source
    requested_outputs: RequestedOutputs
    # use_enum_values is only applied during validation, and defaults are not
    # validated unless validate_default=True. Without it, a Job created with no
    # explicit status would hold the JobStatus member while every Job created
    # with a status holds the plain string.
    status: JobStatus = Field(default=JobStatus.CREATED, validate_default=True)
    review: Review = Review()
    outputs: Optional[dict[str, LangOutput]] = None
    accessible_video_progress: Optional[dict[str, AccessibleVideoProgressItem]] = None
    ai: Optional[AISection] = None
    error: Optional[dict[str, Any]] = None
    # Track auto-rewritten TTS cues
    tts_rewrites: Optional[list[dict[str, Any]]] = None
    # Platform project this job belongs to (Client → Project → Job)
    project_id: Optional[str] = None
    # Brand names present in the video for accurate product identification
    brand_context: Optional[str] = None
    # External project ID for AI cost attribution
    cost_tracker_project_id: Optional[str] = None
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None

    class Config:
        populate_by_name = True
        use_enum_values = True


class JobCreate(BaseModel):
    """Payload for creating a job."""

    title: str
    # True = English source, False = other language (auto-detect)
    source_is_english: bool = True
    # Optional hint when source_is_english=False
    language_hint: Optional[str] = None
    requested_outputs: RequestedOutputs
    # Comma-separated brand names present in the video
    # (e.g. "Sellotape, Coca-Cola")
    brand_context: Optional[str] = None


class JobUpdate(BaseModel):
    """Partial update payload for a job; every field is optional."""

    title: Optional[str] = None
    status: Optional[JobStatus] = None
    review: Optional[Review] = None
    outputs: Optional[dict[str, LangOutput]] = None
    ai: Optional[AISection] = None
    error: Optional[dict[str, Any]] = None