TM upload-replacement bug (critical):
- Uploads were writing to /storage/clients/<uuid>/tm/... but the pipeline
reads from /storage/amazon/tm/... — replacements were silently ignored
- upload_tm_file now writes to the canonical pipeline path
/storage/amazon/tm/<locale>/flat_<channel>_<lc>.json (overwrites in place)
- Filename casing is preserved when an existing file is being replaced
(the on-disk seeded files use mixed casing: flat_MASS, flat_value,
flat_PrimeSpeed); otherwise the filename falls back to CHANNEL_FILE_MAP,
then to the casing the user typed
- Registry upsert by (client_id, locale_code, channel): replaces row in
place rather than inserting duplicates
- Verified: replacement file at canonical path, registry COUNT=1, no dupes
Supplementary files now reach the LLM (critical):
- New supplementary_files field on FileManifest
- _resolve_file_manifest scans /storage/jobs/<job_id>/supplementary/ and
populates the manifest, with per-locale gating by filename prefix
(e.g. de-DE_glossary.txt only goes to de-DE; global_brief.txt goes to all)
- _format_supplementary_for_prompt reads each file (.txt/.md/.json/.csv/.tsv
/.docx) and inlines its text into the LLM user message under a
"## SUPPLEMENTARY MATERIAL" header, capped at 40k chars per file
- .docx files are extracted via inline zipfile read (no new dependency)
New job wizard:
- Per-supplementary-file locale dropdown ("Global" or one of 12 locales)
- Filename gets prefixed with the locale on upload (de-DE_brief.docx)
Admin TM upload:
- Channel field is now a free-text input with autocomplete suggestions
(datalist of known channels) — lets users add brand-new channels like
PrimeCBM that didn't exist before
Pipeline scaling:
- Bumped dynamic max_tokens tiers: jobs with 80+ lines now get a 64k output
budget (was 32k); 132-line briefs no longer truncate. Sonnet 4.6 caps at 64k
- Added stop_reason logging — a "max_tokens" stop now shows up loud and
clear in the logs instead of the output being silently truncated
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
152 lines
4.1 KiB
Python
152 lines
4.1 KiB
Python
"""Pipeline data contracts - Pydantic models for inter-agent communication."""
|
|
|
|
from typing import Any
|
|
|
|
from pydantic import BaseModel
|
|
|
|
|
|
class TMEntry(BaseModel):
    """A single Translation Memory entry.

    ``seg_key``, ``date``, ``en``, ``lc`` and ``tx`` are required strings;
    the remaining fields default to the empty string.
    """

    # Required fields. Note that ``date`` is carried as a plain string.
    seg_key: str
    date: str
    en: str
    lc: str
    tx: str

    # Optional fields; each defaults to "".
    nt: str = ""
    channel: str = ""
    sub_channel: str = ""

    # NOTE(review): Pydantic treats underscore-prefixed names as private
    # attributes rather than validated fields, so this value is not read
    # from input data and is not serialised -- confirm that is intended.
    _text: str = ""

    # Allow validation from objects that expose these names as attributes
    # (e.g. ORM rows), not only from mappings.
    model_config = {"from_attributes": True}
|
|
|
|
|
|
class SourceLineContract(BaseModel):
    """A parsed source line from the input xlsx.

    ``line_id``, ``row_order`` and ``en_gb`` are required; every other
    field is optional.
    """

    # Required fields.
    line_id: str
    row_order: int
    en_gb: str

    # Optional fields; None when not supplied.
    copy_type: str | None = None
    creative_guidance: str | None = None
    visual_ref: str | None = None
    # Carried as a string, not an int.
    char_limit: str | None = None

    is_display_format: bool = False
|
|
|
|
|
|
class FileManifest(BaseModel):
    """Manifest of all files loaded for a job.

    Every field has a default, so an empty manifest can be built with
    ``FileManifest()``. Single-file slots are ``None`` when unset.
    """

    tm_files: list[str] = []

    glossary_file: str | None = None
    blacklist_file: str | None = None
    tov_global_file: str | None = None
    tov_supplement_file: str | None = None
    locale_considerations_file: str | None = None
    date_pct_formats_file: str | None = None

    # Per-job user-uploaded supplementary files (paths on disk). Each
    # entry's filename may begin with a locale code prefix to gate the
    # file to that locale only (e.g. "de-DE_glossary.txt"); files without
    # a recognised locale prefix apply to all locales.
    supplementary_files: list[str] = []
|
|
|
|
|
|
class JobParams(BaseModel):
    """Parameters for a transcreation job.

    Identifiers, locale, channel, programme and campaign name are
    required; ``sub_channel``, ``context_prompt``, ``tm_channels`` and
    ``llm_model`` are optional.
    """

    job_id: str
    client_id: str
    locale_code: str
    channel: str
    sub_channel: str | None = None
    programme: str
    campaign_name: str
    context_prompt: str | None = None
    # Empty list when no TM channels are given.
    tm_channels: list[str] = []
    # None when no model is specified.
    llm_model: str | None = None
|
|
|
|
|
|
class ParsedJob(BaseModel):
    """Output of Agent 1 (Validator): validated job parameters + source.

    All three fields are required.
    """

    job_params: JobParams
    source_lines: list[SourceLineContract]
    file_manifest: FileManifest
|
|
|
|
|
|
class ConfirmedMatch(BaseModel):
    """A confirmed TM match for a source line.

    ``seg_key``, ``pass_found``, ``date``, ``en`` and ``tx`` are required;
    the remaining fields have defaults.
    """

    # Required fields.
    seg_key: str
    # presumably the number of the sweep pass that found this match --
    # TODO confirm against the sweep agent.
    pass_found: int
    date: str
    en: str
    tx: str

    # Optional fields; strings default to "".
    nt: str = ""
    channel: str = ""
    sub_channel: str = ""

    is_cross_channel: bool = False
|
|
|
|
|
|
class TMSweepResult(BaseModel):
    """TM sweep results for a single source line.

    Only ``line_id`` is required.
    """

    line_id: str
    confirmed_matches: list[ConfirmedMatch] = []

    # Pass-4 bookkeeping: a flag plus an optional single match.
    pass_4_triggered: bool = False
    pass_4_result: ConfirmedMatch | None = None

    no_match: bool = False
|
|
|
|
|
|
class RankingDeclaration(BaseModel):
    """Ranking decision for a single source line.

    Only ``line_id`` is required. The defaults describe a decision with no
    winning entry: ``confidence_tier`` is "low" and ``option_count`` is 3.
    """

    line_id: str

    # None when there is no winning entry.
    winning_entry: ConfirmedMatch | None = None
    runner_ups: list[ConfirmedMatch] = []

    confidence_tier: str = "low"
    option_count: int = 3
    is_new_creative_line: bool = False
    notes: str = ""
|
|
|
|
|
|
class DraftOption(BaseModel):
    """A single draft transcreation option.

    All three fields are required strings.
    """

    text: str
    backtranslation: str
    rationale: str
|
|
|
|
|
|
class DraftOutput(BaseModel):
    """Transcreation draft output for a single source line.

    ``line_id`` and ``option_1`` are required; the second and third
    options are optional.
    """

    line_id: str

    # At least one option is always present; the others may be None.
    option_1: DraftOption
    option_2: DraftOption | None = None
    option_3: DraftOption | None = None

    tm_entries_cited: list[str] = []
    adaptations_applied: list[str] = []
|
|
|
|
|
|
class ComplianceViolation(BaseModel):
    """A single compliance violation found during checking.

    ``type``, ``option_affected`` and ``description`` are required;
    ``severity`` defaults to "warning".
    """

    # Field name shadows the builtin ``type``; kept because it is part of
    # the model's external schema.
    type: str
    # presumably the 1-based option number (option_1..option_3) --
    # TODO confirm against the compliance agent.
    option_affected: int
    description: str
    severity: str = "warning"
|
|
|
|
|
|
class ComplianceResult(BaseModel):
    """Compliance check result for a single source line.

    ``line_id`` and ``passed`` are required; ``violations`` and
    ``character_counts`` default to empty containers.
    """

    line_id: str
    passed: bool
    violations: list[ComplianceViolation] = []
    # Maps a string key to an integer count.
    character_counts: dict[str, int] = {}
|
|
|
|
|
|
class PipelineContext(BaseModel):
    """Full pipeline context passed between agents.

    Only ``job_params`` is required; every other field starts out empty
    (or zero).
    """

    job_params: JobParams
    source_lines: list[SourceLineContract] = []
    # Defaults to an empty manifest.
    file_manifest: FileManifest = FileManifest()

    # Per-stage result collections.
    tm_sweep_results: list[TMSweepResult] = []
    ranking_declarations: list[RankingDeclaration] = []
    draft_outputs: list[DraftOutput] = []
    compliance_results: list[ComplianceResult] = []
    linguistic_summary: str = ""

    # Token usage accumulators (updated by agents after each LLM call)
    total_input_tokens: int = 0
    total_output_tokens: int = 0
    total_estimated_cost: float = 0.0
|