Full-stack Amazon AI Transcreation Platform with: - FastAPI backend (async, PostgreSQL, Redis, Celery) with 11 DB tables - JWT auth (SSO-ready abstract provider pattern) - 6-agent pipeline orchestrator with deterministic modules - Next.js 14 frontend with Amazon branding (Ember fonts, orange/dark theme) - Job wizard, monitoring HUD, output review, admin screens - 154 TM/reference files imported, 12 locales configured - Docker Compose for all services Agents 2-5 (TM retrieval, ranker, transcreator, compliance) are stubs pending Phase 3 LLM integration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
133 lines
3.9 KiB
Python
133 lines
3.9 KiB
Python
"""Translation Memory file loader.
|
|
|
|
Reads JSONL files in two formats:
|
|
1. Compact: {"t": "seg_key|date|en|lc|tx|nt|channel|sub_channel"}
|
|
2. Multi-field: {"seg_key": "...", "date": "...", "en": "...", ...}
|
|
|
|
Applies a locale hard-match gate: only entries matching the target locale are returned.
|
|
"""
|
|
|
|
import json
|
|
from typing import Any
|
|
|
|
from app.pipeline.contracts import TMEntry
|
|
|
|
|
|
class TMFileLoadError(Exception):
    """Signal that a TM file could not be read or its contents parsed.

    Carries no extra state; the message passed to the constructor is the
    only payload.
    """
|
|
|
|
|
|
def load_tm_file(
    file_path: str,
    target_locale: str,
) -> list[TMEntry]:
    """Load and parse a JSONL TM file, filtering by locale.

    Blank lines are skipped, and so are lines for which ``_parse_entry``
    returns ``None``. A line that is not valid JSON aborts the whole load.

    Args:
        file_path: Path to the JSONL file.
        target_locale: Target locale code (e.g., "de_DE"). Only entries
            whose ``lc`` equals this value exactly are returned.

    Returns:
        List of TMEntry objects matching the target locale, in file order.

    Raises:
        TMFileLoadError: If the file is missing, cannot be opened or read,
            is not valid UTF-8, or contains a line that is not valid JSON.
            The underlying exception is attached as ``__cause__``.
    """
    entries: list[TMEntry] = []

    try:
        # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading
        # byte-order mark, which str.strip() does not remove and which would
        # otherwise make json.loads reject the first line.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            for line_num, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue

                try:
                    data = json.loads(line)
                except json.JSONDecodeError as exc:
                    raise TMFileLoadError(
                        f"Invalid JSON on line {line_num}: {exc}"
                    ) from exc

                entry = _parse_entry(data, line_num)
                if entry is None:
                    continue

                # Locale hard-match gate
                if entry.lc == target_locale:
                    entries.append(entry)

    except FileNotFoundError as exc:
        raise TMFileLoadError(f"TM file not found: {file_path}") from exc
    except UnicodeDecodeError as exc:
        raise TMFileLoadError(f"Encoding error reading TM file: {exc}") from exc
    except OSError as exc:
        # Permission problems, directories passed as files, I/O failures:
        # the documented contract is a single exception type for all of them.
        raise TMFileLoadError(f"Cannot read TM file {file_path}: {exc}") from exc

    return entries
|
|
|
|
|
|
def _parse_entry(data: dict[str, Any], line_num: int) -> TMEntry | None:
|
|
"""Parse a single JSON object into a TMEntry.
|
|
|
|
Detects compact vs multi-field format automatically.
|
|
|
|
Args:
|
|
data: Parsed JSON dict.
|
|
line_num: Line number for error reporting.
|
|
|
|
Returns:
|
|
TMEntry or None if the entry is malformed.
|
|
"""
|
|
# Compact format: {"t": "seg_key|date|en|lc|tx|nt|channel|sub_channel"}
|
|
if "t" in data and isinstance(data["t"], str):
|
|
parts = data["t"].split("|")
|
|
if len(parts) < 5:
|
|
return None # Malformed compact entry
|
|
|
|
return TMEntry(
|
|
seg_key=parts[0] if len(parts) > 0 else "",
|
|
date=parts[1] if len(parts) > 1 else "",
|
|
en=parts[2] if len(parts) > 2 else "",
|
|
lc=parts[3] if len(parts) > 3 else "",
|
|
tx=parts[4] if len(parts) > 4 else "",
|
|
nt=parts[5] if len(parts) > 5 else "",
|
|
channel=parts[6] if len(parts) > 6 else "",
|
|
sub_channel=parts[7] if len(parts) > 7 else "",
|
|
_text=data["t"],
|
|
)
|
|
|
|
# Multi-field format
|
|
if "seg_key" in data and "en" in data:
|
|
return TMEntry(
|
|
seg_key=str(data.get("seg_key", "")),
|
|
date=str(data.get("date", "")),
|
|
en=str(data.get("en", "")),
|
|
lc=str(data.get("lc", "")),
|
|
tx=str(data.get("tx", "")),
|
|
nt=str(data.get("nt", "")),
|
|
channel=str(data.get("channel", "")),
|
|
sub_channel=str(data.get("sub_channel", "")),
|
|
)
|
|
|
|
return None
|
|
|
|
|
|
def load_multiple_tm_files(
    file_paths: list[str],
    target_locale: str,
) -> list[TMEntry]:
    """Load several TM files and concatenate their locale-filtered entries.

    Files are processed in the order given, and each file's entries keep
    the order returned by ``load_tm_file``. Any exception raised while
    loading a file propagates to the caller.

    Args:
        file_paths: List of file paths to load.
        target_locale: Target locale code, passed to ``load_tm_file``
            unchanged for every file.

    Returns:
        Combined list of TMEntry objects from all files.
    """
    return [
        tm_entry
        for tm_path in file_paths
        for tm_entry in load_tm_file(tm_path, target_locale)
    ]
|