# video-accessibility/backend/app/models/job.py

from datetime import datetime
from enum import Enum
from typing import Any, Literal, Optional

from pydantic import BaseModel, ConfigDict, Field, constr


class JobStatus(str, Enum):
    """Status values a job can take, each stored as a lowercase string."""

    CREATED = "created"
    INGESTING = "ingesting"
    AI_PROCESSING = "ai_processing"
    PENDING_QC = "pending_qc"
    # Source approved, English source videos
    APPROVED_ENGLISH = "approved_english"
    # Source approved, non-English source videos
    APPROVED_SOURCE = "approved_source"
    REJECTED = "rejected"
    QC_FEEDBACK = "qc_feedback"
    TRANSLATING = "translating"
    TTS_GENERATING = "tts_generating"
    # TTS synthesis failed after retries, requires reprocessing
    TTS_FAILED = "tts_failed"
    # Accessible video rendering in progress
    RENDERING_VIDEO = "rendering_video"
    # Accessible video rendering failed, requires reprocessing
    RENDER_FAILED = "render_failed"
    # Re-rendering accessible video during QC review
    RENDERING_QC = "rendering_qc"
    PENDING_FINAL_REVIEW = "pending_final_review"
    COMPLETED = "completed"

    @classmethod
    def is_approved(cls, status: str) -> bool:
        """Return True if *status* equals either source-approval value.

        Both the English-source and the non-English-source approval
        statuses count as approved.
        """
        approved_values = (
            cls.APPROVED_ENGLISH.value,
            cls.APPROVED_SOURCE.value,
        )
        return status in approved_values


class Source(BaseModel):
    """Source video of a job: file names, storage URI, duration and language."""

    filename: str
    # NOTE(review): presumably the file name as originally uploaded — confirm.
    original_filename: Optional[str] = None
    gcs_uri: str
    duration_s: Optional[float] = None
    # Final source language (from detection or explicit).
    # The 2-10 character limit is expressed through Field constraints instead
    # of calling `constr(...)` inside the annotation: validation is the same,
    # but the annotation stays a plain type that static checkers accept.
    language: str = Field(default="en", min_length=2, max_length=10)
    # User-provided hint for non-English videos
    language_hint: Optional[str] = None
    # AI-detected language from Gemini
    detected_language: Optional[str] = None


class TTSPreferences(BaseModel):
    """TTS voice preferences for audio description generation.

    Numeric settings are range-checked at validation time: ``speed`` must be
    within 0.5-2.0, and the ElevenLabs-specific ``stability`` and
    ``similarity_boost`` must be within 0.0-1.0 when provided.
    """

    provider: Literal["gemini", "google", "elevenlabs"] = "gemini"
    # Default Gemini voice
    default_voice: str = "Kore"
    # Per-language voice overrides, e.g. {"en": "Kore", "es": "Aoede"}
    voices_per_language: dict[str, str] = {}

    # TTS quality and style settings
    # flash = fast/cheap, pro = higher quality
    model: Literal["flash", "pro"] = "flash"
    # Speech rate multiplier
    speed: float = Field(default=1.0, ge=0.5, le=2.0)
    style_preset: Literal[
        "neutral", "calm", "energetic", "professional", "warm", "documentary", "custom"
    ] = "neutral"
    # Used when style_preset is "custom"
    custom_style_prompt: Optional[str] = None

    # ElevenLabs-specific settings.
    # Both are documented as 0.0-1.0 (default 0.5 when used); the bounds are
    # enforced here so an out-of-range value is rejected at validation time,
    # the same way `speed` is bounded above.
    stability: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    similarity_boost: Optional[float] = Field(default=None, ge=0.0, le=1.0)


class RequestedOutputs(BaseModel):
    """Which deliverables are requested for a job, and how to produce them.

    Captions, audio-description VTT and audio-description MP3 are requested
    by default; the accessible video and SDH captions are opt-in.
    """

    captions_vtt: bool = True
    audio_description_vtt: bool = True
    audio_description_mp3: bool = True
    # Rendered video with embedded audio descriptions
    accessible_video_mp4: bool = False
    # User-selected rendering method; None when the user made no selection
    accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None
    # SDH (Subtitles for Deaf and Hard of Hearing) captions with speaker
    # labels, sound effects, music notation
    sdh_vtt: bool = False
    # NOTE(review): presumably target language codes for translation — confirm.
    languages: list[str] = []
    # NOTE(review): presumably language codes to transcreate rather than
    # translate — confirm against the pipeline.
    transcreation: list[str] = []
    tts_preferences: Optional[TTSPreferences] = None
    translation_mode: Literal["traditional", "video_native"] = "video_native"


class PausePointData(BaseModel):
    """Pause point timing data for accessible video editing during QC.

    All ``*_ms`` fields are in milliseconds. No field enforces that the
    bounds are ordered or that the timestamps lie within them.
    """

    # AD cue index this pause point belongs to
    cue_index: int
    # Rendered timeline position (ms) - for UI display
    original_ms: float
    # Source video cut point (ms) - for re-rendering (None = use original_ms)
    source_ms: Optional[float] = None
    # User-adjusted timestamp (ms), None = use original
    adjusted_ms: Optional[float] = None
    # Minimum allowed value (end of previous AD segment)
    min_bound_ms: float
    # Maximum allowed value (start of next AD segment)
    max_bound_ms: float


class VideoSegmentMetadata(BaseModel):
    """Metadata for a video segment between pause points.

    Timestamps and duration are in milliseconds.
    """

    # 0-based segment index
    segment_index: int
    # Start timestamp in source video (ms)
    start_ms: float
    # End timestamp in source video (ms)
    end_ms: float
    # GCS path to segment MP4
    gcs_uri: str
    # Actual segment duration (ms)
    duration_ms: float
    # True if this is a freeze frame segment with AD audio
    is_freeze_frame: bool = False
    # AD cue index (only for freeze frame segments)
    cue_index: Optional[int] = None


class TTSRegenerationRequest(BaseModel):
    """Request to regenerate TTS for a specific cue during QC."""

    # Index of the cue whose audio should be regenerated
    cue_index: int
    requested_at: datetime
    # If provided, use this text instead of current VTT
    new_text: Optional[str] = None
    # A request starts out as "pending"
    status: Literal["pending", "processing", "completed", "failed"] = "pending"
    # NOTE(review): presumably only populated when status is "failed" — confirm.
    error_message: Optional[str] = None


class AccessibleVideoEditState(BaseModel):
    """Editable state for accessible video during QC review.

    Holds the pause points, the video segments, and the queue of TTS
    regeneration requests; every collection defaults to empty.
    """

    pause_points: list[PausePointData] = []
    video_segments: list[VideoSegmentMetadata] = []
    tts_regeneration_queue: list[TTSRegenerationRequest] = []
    # None until a render timestamp has been recorded
    last_render_at: Optional[datetime] = None
    # Default: off (user enables if cue positions changed)
    whisper_refine_enabled: bool = False


class LangOutput(BaseModel):
    """Output artifacts and related metadata for one entry of a job's outputs.

    Every field is optional and defaults to None. Fields ending in ``_gcs``
    or ``_gcs_prefix`` hold GCS locations.
    NOTE(review): instances are presumably keyed by language code in
    ``Job.outputs`` — confirm against the code that populates it.
    """

    captions_vtt_gcs: Optional[str] = None
    # SDH-format captions (speaker labels, sound effects, music)
    sdh_captions_vtt_gcs: Optional[str] = None
    ad_vtt_gcs: Optional[str] = None
    ad_mp3_gcs: Optional[str] = None
    # Accessible video outputs
    # Rendered accessible MP4
    accessible_video_gcs: Optional[str] = None
    accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None
    # Re-timed captions for pause-insert method
    retimed_captions_vtt_gcs: Optional[str] = None
    # GCS path prefix for per-cue MP3 segments
    ad_cues_gcs_prefix: Optional[str] = None
    # Per-cue manifest: [{cue_index, gcs_uri, text, duration_s}]
    # (entries are untyped dicts, so this shape is not validated)
    ad_cue_manifest: Optional[list[dict]] = None
    # QC editing state for accessible video
    # GCS prefix for persisted video segments
    video_segments_gcs_prefix: Optional[str] = None
    accessible_video_edit_state: Optional[AccessibleVideoEditState] = None
    # How this output was produced
    origin: Optional[Literal["translate", "transcreate", "gemini_translate", "video_native"]] = None
    qa_notes: Optional[str] = None
    # WCAG-compliant combined speech+description transcript
    descriptive_transcript_gcs: Optional[str] = None


class ReviewHistoryItem(BaseModel):
    """One entry in a review history: a timestamp, a status and optional details."""

    # When this entry was recorded
    at: datetime
    # Free-form string; not validated against JobStatus
    status: str
    # NOTE(review): presumably the identifier of the acting user — confirm.
    by: Optional[str] = None
    notes: Optional[str] = None


class Review(BaseModel):
    """Review information for a job: notes, reviewer id and history entries."""

    # Defaults to an empty string, but None is also accepted
    notes: Optional[str] = ""
    reviewer_id: Optional[str] = None
    # Starts out empty
    history: list[ReviewHistoryItem] = []


class AISection(BaseModel):
    """AI-related data attached to a job; both fields are optional."""

    # Arbitrary JSON-like mapping; its schema is not validated here
    # NOTE(review): presumably the raw AI ingestion result — confirm.
    ingestion_json: Optional[dict[str, Any]] = None
    # No range constraint is enforced on this value
    confidence: Optional[float] = None


class AccessibleVideoProgressItem(BaseModel):
    """Progress tracking for accessible video rendering per language."""

    # Starts out as "pending"
    status: Literal["pending", "rendering", "completed", "failed"] = "pending"
    # Rendering method, if one has been recorded
    method: Optional[Literal["overlay", "pause_insert"]] = None
    # NOTE(review): presumably only populated when status is "failed" — confirm.
    error_message: Optional[str] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None


class Job(BaseModel):
    """A processing job: source video, requested outputs, status and results.

    Configuration:
        populate_by_name: ``id`` can be supplied either by its field name or
            by its alias ``_id``.
        use_enum_values: enum fields such as ``status`` are stored as their
            plain string values rather than as enum members.
    """

    # Declared with ConfigDict; the nested `class Config` form is deprecated
    # in Pydantic v2.
    model_config = ConfigDict(populate_by_name=True, use_enum_values=True)

    # NOTE(review): alias "_id" presumably mirrors the database document id — confirm.
    id: Optional[str] = Field(None, alias="_id")
    client_id: str
    title: str
    source: Source
    requested_outputs: RequestedOutputs
    # `use_enum_values` converts enums to their values during validation, and
    # defaults are not validated unless asked. Without `validate_default`, a
    # job created without an explicit status would hold a JobStatus member
    # while a job with a supplied status holds a plain string.
    status: JobStatus = Field(default=JobStatus.CREATED, validate_default=True)
    review: Review = Review()
    # NOTE(review): both mappings below are presumably keyed by language code — confirm.
    outputs: Optional[dict[str, LangOutput]] = None
    accessible_video_progress: Optional[dict[str, AccessibleVideoProgressItem]] = None
    ai: Optional[AISection] = None
    error: Optional[dict[str, Any]] = None
    # Track auto-rewritten TTS cues
    tts_rewrites: Optional[list[dict[str, Any]]] = None
    # Platform project this job belongs to (Client → Project → Job)
    project_id: Optional[str] = None
    # Brand names present in the video for accurate product identification
    brand_context: Optional[str] = None
    # External project ID for AI cost attribution
    cost_tracker_project_id: Optional[str] = None
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None


class JobCreate(BaseModel):
    """Payload for creating a job: title, source-language info and requested outputs."""

    title: str
    # True = English source, False = other language (auto-detect)
    source_is_english: bool = True
    # Optional hint when source_is_english=False
    language_hint: Optional[str] = None
    requested_outputs: RequestedOutputs
    # Comma-separated brand names present in the video (e.g. "Sellotape, Coca-Cola")
    brand_context: Optional[str] = None


class JobUpdate(BaseModel):
    """Partial update for a job; every field is optional and defaults to None.

    NOTE(review): None presumably means "leave this field unchanged" —
    confirm against the code that applies the update.
    """

    title: Optional[str] = None
    # This model sets no `use_enum_values`, so a supplied status is kept as
    # a JobStatus member here.
    status: Optional[JobStatus] = None
    review: Optional[Review] = None
    outputs: Optional[dict[str, LangOutput]] = None
    ai: Optional[AISection] = None
    error: Optional[dict[str, Any]] = None