amazon-transcreation/backend/app/pipeline/modules/tm_file_loader.py
DJP e97d4f81b7 fix: improve TM parser EN/TX split and fix report SQL errors
The compact TM format parser was storing the combined EN+TX text in both
fields, causing the LLM retrieval agent to fail at matching source lines
against TM entries — resulting in all-low confidence tiers. Added
_split_en_tx() heuristic that detects the language boundary at the first
non-ASCII sentence. Also includes raw _text in LLM prompt for context.

Fixed get_jobs_over_time GroupingError by using literal_column for
date_trunc, added date filters to status_breakdown, and fixed Decimal
serialization in locale stats.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 17:47:53 -04:00

244 lines
7.5 KiB
Python

"""Translation Memory file loader.
Reads JSONL files in two formats:
1. Compact: {"t": "<seg_key + metadata> <locale-code> <EN source> <TX>"} (space-separated; parsed by splitting on the xx-xx locale code)
2. Multi-field: {"seg_key": "...", "date": "...", "en": "...", ...}
Applies a locale hard-match gate: only entries matching the target locale are returned.
"""
import json
from typing import Any
from app.pipeline.contracts import TMEntry
class TMFileLoadError(Exception):
    """Signal that a TM file could not be opened, decoded, or parsed."""
def _normalize_locale(code: str) -> str:
    """Return *code* lowercased with ``_`` replaced by ``-`` for comparison."""
    return code.lower().replace("_", "-")


def load_tm_file(
    file_path: str,
    target_locale: str,
) -> list[TMEntry]:
    """Load and parse a JSONL TM file, keeping only entries for one locale.

    Blank lines are skipped, as are lines for which ``_parse_entry`` returns
    ``None``. The locale gate ignores case and treats ``_`` and ``-`` as the
    same separator, so a target of "de_DE" matches an entry whose locale is
    "de-de" (the form produced for compact-format entries).

    Args:
        file_path: Path to the JSONL file.
        target_locale: Target locale code (e.g., "de_DE" or "de-de"). Only
            entries matching this locale are returned.

    Returns:
        List of TMEntry objects matching the target locale, in file order.

    Raises:
        TMFileLoadError: If the file is missing, cannot be read or decoded,
            or contains a non-blank line that is not valid JSON.
    """
    wanted_locale = _normalize_locale(target_locale)
    entries: list[TMEntry] = []
    try:
        # "utf-8-sig" decodes exactly like "utf-8" but also drops a leading
        # BOM, which would otherwise make json.loads reject the first line.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            for line_num, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as exc:
                    raise TMFileLoadError(
                        f"Invalid JSON on line {line_num}: {exc}"
                    ) from exc
                entry = _parse_entry(data, line_num)
                if entry is None:
                    continue
                # Locale hard-match gate (case- and separator-insensitive)
                if _normalize_locale(entry.lc) == wanted_locale:
                    entries.append(entry)
    except FileNotFoundError as exc:
        raise TMFileLoadError(f"TM file not found: {file_path}") from exc
    except UnicodeDecodeError as exc:
        raise TMFileLoadError(
            f"Encoding error reading TM file: {exc}"
        ) from exc
    except OSError as exc:
        # Permission errors, directories, etc. are also "cannot be read".
        raise TMFileLoadError(
            f"Cannot read TM file {file_path}: {exc}"
        ) from exc
    return entries
def _parse_entry(data: dict[str, Any], line_num: int) -> TMEntry | None:
    """Parse a single decoded JSON value into a TMEntry.

    Detects compact vs multi-field format automatically.

    Compact format:
        {"t": "{seg_key} {note_type} {locale_code} {EN_source} {TX}"}
        Handled by ``_parse_compact_entry``, which splits on the locale
        code (e.g., 'de-de').

    Multi-field format:
        {"seg_key": "...", "en": "...", "lc": "...", "tx": "...", ...}
        Requires the "seg_key" and "en" keys; every other field defaults to
        an empty string. Values are converted with ``str``; a JSON ``null``
        becomes "" rather than the literal text "None".

    Args:
        data: Value decoded from one JSONL line. Expected to be a dict, but
            any JSON value is tolerated.
        line_num: Line number of the entry. Currently unused in this
            function; kept so the signature stays stable for callers.

    Returns:
        TMEntry, or None if the value is not a dict or matches neither format.
    """
    # A JSONL line can decode to a list, number, string or null; only
    # objects can be TM entries. Without this guard `"t" in data` raises
    # TypeError for numbers/null and misbehaves for plain strings.
    if not isinstance(data, dict):
        return None

    # Compact format: {"t": "..."}
    compact_text = data.get("t")
    if isinstance(compact_text, str):
        return _parse_compact_entry(compact_text)

    # Multi-field format
    if "seg_key" in data and "en" in data:

        def field_text(key: str) -> str:
            """Return the field as text, mapping a missing/null value to ""."""
            value = data.get(key)
            return "" if value is None else str(value)

        return TMEntry(
            seg_key=field_text("seg_key"),
            date=field_text("date"),
            en=field_text("en"),
            lc=field_text("lc"),
            tx=field_text("tx"),
            nt=field_text("nt"),
            channel=field_text("channel"),
            sub_channel=field_text("sub_channel"),
        )
    return None
# Locale-code pattern: two lowercase ASCII letters, a hyphen, and two more
# lowercase ASCII letters, delimited by word boundaries (e.g. "de-de",
# "fr-fr", "es-es"). Uppercase or underscore forms such as "de-DE" or
# "de_de" do not match. _parse_compact_entry uses the first match in the
# compact string as its split point.
# NOTE(review): this import sits mid-file; PEP 8 places imports at the top.
import re
_LOCALE_RE = re.compile(r"\b([a-z]{2}-[a-z]{2})\b")
def _split_en_tx(text: str) -> tuple[str, str]:
"""Attempt to split combined EN/TX text from compact TM entries.
The compact format packs both the English source and the target-language
translation into a single string after the locale code. There is no
explicit delimiter, but in practice the boundary is almost always at a
sentence-ending period followed by a capital letter that starts the
target-language text.
Heuristic (works for the vast majority of real Amazon TM files):
1. Split on ". " boundaries.
2. Walk forward and assume the first sentence that contains non-ASCII
characters (ö, ü, é, ñ, etc.) marks the start of the TX portion.
3. If all sentences are ASCII-only (e.g. short entries), fall back to
a 50/50 split at the nearest sentence boundary.
Returns (en, tx).
"""
if not text:
return ("", "")
# Split on sentence boundaries (period followed by space + capital)
parts = re.split(r"(?<=\.)\s+(?=[A-ZÄÖÜÉÈÊÀÁÂÃÇÑ])", text)
if len(parts) <= 1:
# No clear sentence boundary — return full text as EN, empty TX
return (text, text)
# Walk forward: first part with non-ASCII = start of TX
for i, part in enumerate(parts):
if i == 0:
continue # first part is always EN
if re.search(r"[^\x00-\x7F]", part):
en = " ".join(parts[:i]).strip()
tx = " ".join(parts[i:]).strip()
return (en, tx)
# All ASCII — split at midpoint sentence boundary
mid = len(parts) // 2
if mid == 0:
mid = 1
en = " ".join(parts[:mid]).strip()
tx = " ".join(parts[mid:]).strip()
return (en, tx)
def _parse_compact_entry(t_value: str) -> TMEntry | None:
    """Build a TMEntry from the compact 't' string.

    Format: '{seg_key_with_metadata} {locale_code} {EN_source} {TX}'
    Example: 'Value Q1 24 Radio 001 VO de-de As Sophie opened... Sophie öffnet...'

    The first locale code (xx-xx) found is the split point:
    - The stripped text BEFORE it becomes ``seg_key`` and is scanned for
      channel keywords, a note-type keyword and a two-digit date token.
    - The stripped text AFTER it is handed to ``_split_en_tx`` to obtain the
      EN source and TX translation.

    Returns:
        TMEntry, or None when the string contains no locale code.
    """
    locale_match = _LOCALE_RE.search(t_value)
    if locale_match is None:
        return None

    head = t_value[: locale_match.start()].strip()
    tail = t_value[locale_match.end() :].strip()
    tokens = head.split()

    channel_keywords = {
        "mass", "value", "onsite", "outbound", "radio",
        "tv_olv", "display", "ooh", "dooh", "social", "print",
        "digital", "crm", "push",
    }
    note_keywords = {"vo", "bvo", "super", "headline", "legal", "cta",
                     "body", "disclaimer", "endline"}

    # First keyword hit is the channel; if there are further hits, the last
    # one is the sub-channel.
    channel_hits = [tok for tok in tokens if tok.lower() in channel_keywords]
    channel = channel_hits[0] if channel_hits else ""
    sub_channel = channel_hits[-1] if len(channel_hits) > 1 else ""

    # Note type is the right-most token that is a known note keyword.
    note_type = next(
        (tok for tok in reversed(tokens) if tok.lower() in note_keywords),
        "",
    )

    en_text, tx_text = _split_en_tx(tail)

    # The first standalone two-digit number in the metadata is used as the
    # date (intended to be the year).
    two_digit = re.search(r"\b(\d{2})\b", head)
    date = two_digit.group(1) if two_digit else ""

    return TMEntry(
        seg_key=head,
        date=date,
        en=en_text,
        lc=locale_match.group(1),
        tx=tx_text,
        nt=note_type,
        channel=channel,
        sub_channel=sub_channel,
        _text=t_value,
    )
def load_multiple_tm_files(
    file_paths: list[str],
    target_locale: str,
) -> list[TMEntry]:
    """Load several TM files and concatenate their entries.

    Each path is passed to ``load_tm_file`` in order with the same target
    locale. Any exception raised by ``load_tm_file`` propagates.

    Args:
        file_paths: Paths of the TM files to load.
        target_locale: Target locale code.

    Returns:
        A single list with the entries of every file, in the order the
        files were given.
    """
    return [
        entry
        for path in file_paths
        for entry in load_tm_file(path, target_locale)
    ]