amazon-transcreation/backend/app/pipeline/modules/tm_file_loader.py
DJP 98fa16bfc3 feat: complete Phase 1-2 scaffold — backend, frontend, pipeline skeleton
Full-stack Amazon AI Transcreation Platform with:
- FastAPI backend (async, PostgreSQL, Redis, Celery) with 11 DB tables
- JWT auth (SSO-ready abstract provider pattern)
- 6-agent pipeline orchestrator with deterministic modules
- Next.js 14 frontend with Amazon branding (Ember fonts, orange/dark theme)
- Job wizard, monitoring HUD, output review, admin screens
- 154 TM/reference files imported, 12 locales configured
- Docker Compose for all services

Agents 2-5 (TM retrieval, ranker, transcreator, compliance) are stubs
pending Phase 3 LLM integration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 12:31:43 -04:00

133 lines
3.9 KiB
Python

"""Translation Memory file loader.
Reads JSONL files in two formats:
1. Compact: {"t": "seg_key|date|en|lc|tx|nt|channel|sub_channel"}
2. Multi-field: {"seg_key": "...", "date": "...", "en": "...", ...}
Applies a locale hard-match gate: only entries matching the target locale are returned.
"""
import json
from typing import Any
from app.pipeline.contracts import TMEntry
class TMFileLoadError(Exception):
    """Signal that a Translation Memory file could not be read or parsed."""
def load_tm_file(
    file_path: str,
    target_locale: str,
) -> list[TMEntry]:
    """Load and parse a JSONL TM file, keeping only entries for one locale.

    Blank lines are skipped. Lines whose JSON value is not an object, and
    objects that ``_parse_entry`` rejects as malformed, are skipped as well.

    Args:
        file_path: Path to the JSONL file.
        target_locale: Target locale code (e.g., "de_DE"). Only entries whose
            ``lc`` field equals this value exactly are returned.

    Returns:
        List of TMEntry objects matching the target locale, in file order.

    Raises:
        TMFileLoadError: If the file is missing, cannot be opened or read,
            is not valid UTF-8, or contains a line that is not valid JSON.
    """
    entries: list[TMEntry] = []
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            for line_num, raw_line in enumerate(f, start=1):
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as exc:
                    raise TMFileLoadError(
                        f"Invalid JSON on line {line_num}: {exc}"
                    ) from exc
                # Valid JSON that is not an object (number, string, list, ...)
                # cannot be a TM entry; skip it like any other malformed entry
                # instead of letting the parser fail with a TypeError.
                if not isinstance(data, dict):
                    continue
                entry = _parse_entry(data, line_num)
                if entry is None:
                    continue
                # Locale hard-match gate
                if entry.lc == target_locale:
                    entries.append(entry)
    except FileNotFoundError as exc:
        raise TMFileLoadError(f"TM file not found: {file_path}") from exc
    except UnicodeDecodeError as exc:
        raise TMFileLoadError(f"Encoding error reading TM file: {exc}") from exc
    except OSError as exc:
        # Permission problems, directories passed as files, I/O failures, etc.
        raise TMFileLoadError(f"Cannot read TM file {file_path}: {exc}") from exc
    return entries
def _parse_entry(data: dict[str, Any], line_num: int) -> TMEntry | None:
"""Parse a single JSON object into a TMEntry.
Detects compact vs multi-field format automatically.
Args:
data: Parsed JSON dict.
line_num: Line number for error reporting.
Returns:
TMEntry or None if the entry is malformed.
"""
# Compact format: {"t": "seg_key|date|en|lc|tx|nt|channel|sub_channel"}
if "t" in data and isinstance(data["t"], str):
parts = data["t"].split("|")
if len(parts) < 5:
return None # Malformed compact entry
return TMEntry(
seg_key=parts[0] if len(parts) > 0 else "",
date=parts[1] if len(parts) > 1 else "",
en=parts[2] if len(parts) > 2 else "",
lc=parts[3] if len(parts) > 3 else "",
tx=parts[4] if len(parts) > 4 else "",
nt=parts[5] if len(parts) > 5 else "",
channel=parts[6] if len(parts) > 6 else "",
sub_channel=parts[7] if len(parts) > 7 else "",
_text=data["t"],
)
# Multi-field format
if "seg_key" in data and "en" in data:
return TMEntry(
seg_key=str(data.get("seg_key", "")),
date=str(data.get("date", "")),
en=str(data.get("en", "")),
lc=str(data.get("lc", "")),
tx=str(data.get("tx", "")),
nt=str(data.get("nt", "")),
channel=str(data.get("channel", "")),
sub_channel=str(data.get("sub_channel", "")),
)
return None
def load_multiple_tm_files(
    file_paths: list[str],
    target_locale: str,
) -> list[TMEntry]:
    """Load several TM files and concatenate their locale-filtered entries.

    Files are processed in the order given; each one is read with
    ``load_tm_file`` and its entries are appended to the result in file order.

    Args:
        file_paths: Paths of the TM files to load.
        target_locale: Target locale code passed through to each load.

    Returns:
        A single list holding the TMEntry objects from every file.
    """
    return [
        entry
        for tm_path in file_paths
        for entry in load_tm_file(tm_path, target_locale)
    ]