The compact TM format parser was storing the combined EN+TX text in both fields, causing the LLM retrieval agent to fail at matching source lines against TM entries — resulting in all-low confidence tiers. Added _split_en_tx() heuristic that detects the language boundary at the first non-ASCII sentence. Also includes raw _text in LLM prompt for context. Fixed get_jobs_over_time GroupingError by using literal_column for date_trunc, added date filters to status_breakdown, and fixed Decimal serialization in locale stats. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
244 lines
7.5 KiB
Python
244 lines
7.5 KiB
Python
"""Translation Memory file loader.
|
|
|
|
Reads JSONL files in two formats:
|
|
1. Compact: {"t": "<seg_key and metadata> <locale code> <EN source> <TX>"} (space-separated; the locale code, e.g. "de-de", is the split point)
|
|
2. Multi-field: {"seg_key": "...", "date": "...", "en": "...", ...}
|
|
|
|
Applies a locale hard-match gate: only entries matching the target locale are returned.
|
|
"""
|
|
|
|
import json
import re
from typing import Any

from app.pipeline.contracts import TMEntry
|
|
|
|
|
|
class TMFileLoadError(Exception):
    """Raised when a TM file cannot be loaded or parsed."""
|
|
|
|
|
|
def load_tm_file(
    file_path: str,
    target_locale: str,
) -> list[TMEntry]:
    """Load and parse a JSONL TM file, keeping only entries for one locale.

    Each non-blank line is decoded as JSON and handed to ``_parse_entry``.
    Lines for which ``_parse_entry`` returns ``None`` are skipped silently.

    Args:
        file_path: Path to the JSONL file. The file is opened as UTF-8.
        target_locale: Target locale code (e.g., "de_DE" or "de-de"). Only
            entries matching this locale are returned. The comparison
            ignores case and treats "_" and "-" as the same separator, so
            the hyphenated codes extracted from compact entries match
            underscore-style targets.

    Returns:
        List of TMEntry objects matching the target locale, in file order.

    Raises:
        TMFileLoadError: If the file is missing, cannot be read, is not
            valid UTF-8, or contains a non-blank line that is not valid
            JSON. The original exception is attached as ``__cause__``.
    """
    # Normalise the target once; it is invariant across all lines.
    wanted_locale = target_locale.replace("_", "-").lower()
    entries: list[TMEntry] = []

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            for line_num, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue

                try:
                    data = json.loads(line)
                except json.JSONDecodeError as exc:
                    raise TMFileLoadError(
                        f"Invalid JSON on line {line_num}: {exc}"
                    ) from exc

                entry = _parse_entry(data, line_num)
                if entry is None:
                    continue

                # Locale hard-match gate (case- and separator-insensitive)
                if entry.lc.replace("_", "-").lower() == wanted_locale:
                    entries.append(entry)

    except FileNotFoundError as exc:
        raise TMFileLoadError(f"TM file not found: {file_path}") from exc
    except UnicodeDecodeError as exc:
        raise TMFileLoadError(f"Encoding error reading TM file: {exc}") from exc
    except OSError as exc:
        # Must come after FileNotFoundError (a subclass of OSError). Covers
        # the remaining "cannot be read" cases such as permission errors.
        raise TMFileLoadError(f"Cannot read TM file {file_path}: {exc}") from exc

    return entries
|
|
|
|
|
|
def _parse_entry(data: dict[str, Any], line_num: int) -> TMEntry | None:
    """Parse a single decoded JSON value into a TMEntry.

    Detects compact vs multi-field format automatically.

    Compact format:
        {"t": "{seg_key and metadata} {locale_code} {EN_source} {TX}"}
        Recognised by a string-valued "t" key; parsing is delegated to
        ``_parse_compact_entry``.

    Multi-field format:
        {"seg_key": "...", "en": "...", "lc": "...", "tx": "...", ...}
        Recognised by the presence of both "seg_key" and "en". Every field
        is converted to ``str``; a missing field or JSON ``null`` becomes
        the empty string.

    Args:
        data: Decoded JSON value for one line. Expected to be a dict; any
            other JSON type is treated as malformed.
        line_num: Line number of the entry in its file. Currently unused;
            kept so the signature stays stable for callers.

    Returns:
        TMEntry, or None if the value matches neither format.
    """
    # json.loads can yield lists, strings, numbers, ... for a valid line.
    # Those are malformed entries, not a reason to crash with TypeError.
    if not isinstance(data, dict):
        return None

    # Compact format: {"t": "..."}
    compact = data.get("t")
    if isinstance(compact, str):
        return _parse_compact_entry(compact)

    # Multi-field format
    if "seg_key" in data and "en" in data:

        def _field(name: str) -> str:
            # str(None) would store the literal text "None" for JSON nulls.
            value = data.get(name)
            return "" if value is None else str(value)

        return TMEntry(
            seg_key=_field("seg_key"),
            date=_field("date"),
            en=_field("en"),
            lc=_field("lc"),
            tx=_field("tx"),
            nt=_field("nt"),
            channel=_field("channel"),
            sub_channel=_field("sub_channel"),
        )

    return None
|
|
|
|
|
|
# Regex to find locale code pattern like "de-de", "fr-fr", "es-es" etc.
|
|
import re
|
|
_LOCALE_RE = re.compile(r"\b([a-z]{2}-[a-z]{2})\b")
|
|
|
|
|
|
def _split_en_tx(text: str) -> tuple[str, str]:
|
|
"""Attempt to split combined EN/TX text from compact TM entries.
|
|
|
|
The compact format packs both the English source and the target-language
|
|
translation into a single string after the locale code. There is no
|
|
explicit delimiter, but in practice the boundary is almost always at a
|
|
sentence-ending period followed by a capital letter that starts the
|
|
target-language text.
|
|
|
|
Heuristic (works for the vast majority of real Amazon TM files):
|
|
1. Split on ". " boundaries.
|
|
2. Walk forward and assume the first sentence that contains non-ASCII
|
|
characters (ö, ü, é, ñ, etc.) marks the start of the TX portion.
|
|
3. If all sentences are ASCII-only (e.g. short entries), fall back to
|
|
a 50/50 split at the nearest sentence boundary.
|
|
|
|
Returns (en, tx).
|
|
"""
|
|
if not text:
|
|
return ("", "")
|
|
|
|
# Split on sentence boundaries (period followed by space + capital)
|
|
parts = re.split(r"(?<=\.)\s+(?=[A-ZÄÖÜÉÈÊÀÁÂÃÇÑ])", text)
|
|
if len(parts) <= 1:
|
|
# No clear sentence boundary — return full text as EN, empty TX
|
|
return (text, text)
|
|
|
|
# Walk forward: first part with non-ASCII = start of TX
|
|
for i, part in enumerate(parts):
|
|
if i == 0:
|
|
continue # first part is always EN
|
|
if re.search(r"[^\x00-\x7F]", part):
|
|
en = " ".join(parts[:i]).strip()
|
|
tx = " ".join(parts[i:]).strip()
|
|
return (en, tx)
|
|
|
|
# All ASCII — split at midpoint sentence boundary
|
|
mid = len(parts) // 2
|
|
if mid == 0:
|
|
mid = 1
|
|
en = " ".join(parts[:mid]).strip()
|
|
tx = " ".join(parts[mid:]).strip()
|
|
return (en, tx)
|
|
|
|
|
|
def _parse_compact_entry(t_value: str) -> TMEntry | None:
    """Build a TMEntry from the compact 't' string.

    Format: '{seg_key_with_metadata} {locale_code} {EN_source} {TX}'
    Example: 'Value Q1 24 Radio 001 VO de-de As Sophie opened... Sophie öffnet...'

    The first locale code (xx-xx) found by ``_LOCALE_RE`` is the split point:
    - The text BEFORE it becomes ``seg_key`` and is also scanned, token by
      token, for channel keywords, a note-type keyword and a two-digit
      number used as the date.
    - The text AFTER it is divided into EN and TX by ``_split_en_tx``.

    NOTE(review): the locale pattern matches any lowercase 'xx-xx' token, so
    a hyphenated word in the metadata could be taken for the locale — confirm
    against real TM data.

    Args:
        t_value: Raw value of the "t" key.

    Returns:
        TMEntry (with the raw string passed as ``_text``), or None when no
        locale code is found.
    """
    locale_hit = _LOCALE_RE.search(t_value)
    if locale_hit is None:
        return None

    metadata = t_value[: locale_hit.start()].strip()
    combined_text = t_value[locale_hit.end() :].strip()
    tokens = metadata.split()

    # Channel: the first keyword token. Sub-channel: the last keyword token
    # when more than one is present (each later hit overwrites the previous).
    channel_keywords = {
        "mass", "value", "onsite", "outbound", "radio",
        "tv_olv", "display", "ooh", "dooh", "social", "print",
        "digital", "crm", "push",
    }
    channel_hits = [tok for tok in tokens if tok.lower() in channel_keywords]
    channel = channel_hits[0] if channel_hits else ""
    sub_channel = channel_hits[-1] if len(channel_hits) > 1 else ""

    # Note type: the keyword token closest to the locale code.
    note_keywords = {"vo", "bvo", "super", "headline", "legal", "cta",
                     "body", "disclaimer", "endline"}
    note_type = next(
        (tok for tok in reversed(tokens) if tok.lower() in note_keywords),
        "",
    )

    # Date: the first standalone two-digit number in the metadata, if any.
    year_hit = re.search(r"\b(\d{2})\b", metadata)
    date = year_hit.group(1) if year_hit else ""

    en_text, tx_text = _split_en_tx(combined_text)

    return TMEntry(
        seg_key=metadata,
        date=date,
        en=en_text,
        lc=locale_hit.group(1),
        tx=tx_text,
        nt=note_type,
        channel=channel,
        sub_channel=sub_channel,
        _text=t_value,
    )
|
|
|
|
|
|
def load_multiple_tm_files(
    file_paths: list[str],
    target_locale: str,
) -> list[TMEntry]:
    """Load several TM files and concatenate their entries.

    Files are loaded one after another with ``load_tm_file``; the result
    keeps the order of ``file_paths`` and, within each file, the order
    returned by ``load_tm_file``. Any exception raised while loading a file
    propagates to the caller.

    Args:
        file_paths: List of file paths to load.
        target_locale: Target locale code, passed through to each load.

    Returns:
        Combined list of TMEntry objects from all files.
    """
    return [
        entry
        for path in file_paths
        for entry in load_tm_file(path, target_locale)
    ]
|