# amazon-transcreation/backend/app/pipeline/contracts.py

"""Pipeline data contracts - Pydantic models for inter-agent communication."""

from typing import Any

from pydantic import BaseModel


class TMEntry(BaseModel):
    """A single Translation Memory entry.

    ``seg_key``, ``date``, ``en``, ``lc`` and ``tx`` have no default and must
    be supplied; ``nt``, ``channel`` and ``sub_channel`` default to the empty
    string. Every value is typed as a plain ``str``.
    """
    # --- Required values -------------------------------------------------
    seg_key: str
    date: str  # held as a string; no date type or format is declared here
    en: str  # presumably the English source text - confirm against the TM loader
    lc: str  # presumably the locale code - confirm
    tx: str  # presumably the translated text - confirm
    # --- Optional values (default "") -------------------------------------
    nt: str = ""  # presumably a free-text note - confirm
    channel: str = ""
    sub_channel: str = ""
    # NOTE(review): in Pydantic v2 an annotated name with a leading underscore
    # is treated as a private attribute rather than a model field, so it is
    # not populated from input data, validated, or serialised. Confirm that
    # this is the intended behaviour for `_text`.
    _text: str = ""

    # Pydantic `from_attributes` option: allows instances to be validated
    # from objects exposing these names as attributes, not only from mappings.
    model_config = {"from_attributes": True}


class SourceLineContract(BaseModel):
    """A parsed source line from the input xlsx.

    ``line_id``, ``row_order`` and ``en_gb`` are required. The remaining text
    columns are optional and default to ``None``; ``is_display_format``
    defaults to ``False``.
    """
    line_id: str
    row_order: int  # presumably the row's position in the sheet - confirm
    en_gb: str  # presumably the en-GB source copy - confirm
    # Optional per-line metadata; None when no value is available.
    copy_type: str | None = None
    creative_guidance: str | None = None
    visual_ref: str | None = None
    # NOTE(review): the limit is typed as a string, not an int, so any
    # numeric interpretation must happen in the consumer - confirm format.
    char_limit: str | None = None
    is_display_format: bool = False


class FileManifest(BaseModel):
    """Manifest of all files loaded for a job.

    Every field is optional: list fields default to an empty list and the
    single-file fields default to ``None``. Values are plain strings; this
    model does not itself check that the referenced files exist.
    """
    # NOTE: a literal [] default is acceptable on a Pydantic model because
    # Pydantic copies field defaults per instance; do not copy this pattern
    # into plain classes or function signatures.
    tm_files: list[str] = []
    # Single reference files; None when no such file is recorded.
    glossary_file: str | None = None
    blacklist_file: str | None = None
    tov_global_file: str | None = None
    tov_supplement_file: str | None = None
    locale_considerations_file: str | None = None
    date_pct_formats_file: str | None = None
    # Per-job user-uploaded supplementary files (paths on disk). Each
    # entry's filename may begin with a locale code prefix to gate the
    # file to that locale only (e.g. "de-DE_glossary.txt"); files without
    # a recognised locale prefix apply to all locales.
    supplementary_files: list[str] = []


class JobParams(BaseModel):
    """Parameters for a transcreation job.

    ``job_id``, ``client_id``, ``locale_code``, ``channel``, ``programme``
    and ``campaign_name`` are required. ``sub_channel``, ``context_prompt``
    and ``llm_model`` default to ``None``; ``tm_channels`` defaults to an
    empty list.
    """
    job_id: str
    client_id: str
    locale_code: str  # target locale; the exact code format is not enforced here
    channel: str
    sub_channel: str | None = None
    programme: str
    campaign_name: str
    context_prompt: str | None = None
    # presumably the channels whose TM entries are eligible for this job -
    # confirm against the TM sweep stage
    tm_channels: list[str] = []
    # None leaves the model choice to the consumer (no default is named here).
    llm_model: str | None = None


class ParsedJob(BaseModel):
    """Output of Agent 1 (Validator): validated job parameters + source.

    Bundles the job parameters, the parsed source lines and the file
    manifest. All three fields are required (none declares a default).
    """
    job_params: JobParams
    source_lines: list[SourceLineContract]
    file_manifest: FileManifest


class ConfirmedMatch(BaseModel):
    """A confirmed TM match for a source line.

    ``seg_key``, ``pass_found``, ``date``, ``en`` and ``tx`` are required;
    ``nt``, ``channel`` and ``sub_channel`` default to the empty string and
    ``is_cross_channel`` defaults to ``False``.
    """
    seg_key: str
    # presumably the number of the sweep pass that produced this match -
    # confirm the valid range against the sweep implementation
    pass_found: int
    date: str  # held as a string; no date type or format is declared here
    en: str  # presumably the English text of the matched entry - confirm
    tx: str  # presumably the translated text of the matched entry - confirm
    nt: str = ""
    channel: str = ""
    sub_channel: str = ""
    is_cross_channel: bool = False


class TMSweepResult(BaseModel):
    """TM sweep results for a single source line.

    Only ``line_id`` is required. By default the result carries no confirmed
    matches, pass 4 is marked as not triggered with no result, and
    ``no_match`` is ``False``.
    """
    line_id: str
    confirmed_matches: list[ConfirmedMatch] = []
    pass_4_triggered: bool = False
    # None unless a pass-4 match is recorded; this model does not enforce any
    # relationship between this field and `pass_4_triggered`.
    pass_4_result: ConfirmedMatch | None = None
    # NOTE(review): defaults to False even when `confirmed_matches` is empty,
    # so producers must set it explicitly - confirm against the sweep stage.
    no_match: bool = False


class RankingDeclaration(BaseModel):
    """Ranking decision for a single source line.

    Only ``line_id`` is required. Defaults describe a line with no winning
    entry, no runner-ups, a ``"low"`` confidence tier and an option count
    of 3.
    """
    line_id: str
    winning_entry: ConfirmedMatch | None = None
    runner_ups: list[ConfirmedMatch] = []
    # Free-form string; the set of permitted tier names is not constrained
    # by this model.
    confidence_tier: str = "low"
    # presumably the number of draft options to generate for the line -
    # confirm against the drafting stage
    option_count: int = 3
    is_new_creative_line: bool = False
    notes: str = ""


class DraftOption(BaseModel):
    """A single draft transcreation option.

    All three fields are required strings.
    """
    text: str  # the drafted copy itself
    backtranslation: str
    rationale: str


class DraftOutput(BaseModel):
    """Transcreation draft output for a single source line.

    ``line_id`` and ``option_1`` are required, so every output carries at
    least one option; ``option_2`` and ``option_3`` default to ``None``.
    """
    line_id: str
    option_1: DraftOption
    option_2: DraftOption | None = None
    option_3: DraftOption | None = None
    # presumably identifiers of the TM entries the draft drew on - confirm
    # which identifier is used
    tm_entries_cited: list[str] = []
    adaptations_applied: list[str] = []


class ComplianceViolation(BaseModel):
    """A single compliance violation found during checking.

    ``type``, ``option_affected`` and ``description`` are required;
    ``severity`` defaults to ``"warning"``.
    """
    # Free-form category string; permitted values are not constrained here.
    type: str
    # presumably the number of the draft option the violation applies to -
    # confirm the numbering against the compliance stage
    option_affected: int
    description: str
    # Free-form string; the set of severity levels is not constrained here.
    severity: str = "warning"


class ComplianceResult(BaseModel):
    """Compliance check result for a single source line.

    ``line_id`` and ``passed`` are required; ``violations`` defaults to an
    empty list and ``character_counts`` to an empty dict.
    """
    line_id: str
    # Set by the producer; this model does not derive it from `violations`.
    passed: bool
    violations: list[ComplianceViolation] = []
    # Maps a string key to an integer count. The key scheme is not defined
    # here - presumably one key per draft option; confirm with the producer.
    character_counts: dict[str, int] = {}


class PipelineContext(BaseModel):
    """Full pipeline context passed between agents.

    Only ``job_params`` is required. Every other field starts empty (empty
    list, empty string, a default ``FileManifest`` or zero).
    """
    job_params: JobParams
    source_lines: list[SourceLineContract] = []
    # NOTE: the default is a model instance created at class-definition time;
    # this relies on Pydantic copying field defaults per instance rather than
    # sharing one object across contexts.
    file_manifest: FileManifest = FileManifest()
    # Per-stage result lists, all empty by default.
    tm_sweep_results: list[TMSweepResult] = []
    ranking_declarations: list[RankingDeclaration] = []
    draft_outputs: list[DraftOutput] = []
    compliance_results: list[ComplianceResult] = []
    linguistic_summary: str = ""

    # Token usage accumulators (updated by agents after each LLM call)
    total_input_tokens: int = 0
    total_output_tokens: int = 0
    # Currency/unit is not specified in this module.
    total_estimated_cost: float = 0.0