amazon-transcreation/backend/app/pipeline/modules/tm_file_loader.py

"""Translation Memory file loader.

Reads JSONL files in two formats:
1. Compact: {"t": "{seg_key} {note_type} {locale_code} {EN_source} {TX}"}
   (space-separated; the lowercase locale code, e.g. "de-de", is the split point)
2. Multi-field: {"seg_key": "...", "date": "...", "en": "...", ...}

Applies a locale hard-match gate: only entries matching the target locale are returned.
"""

import json
from typing import Any

from app.pipeline.contracts import TMEntry


class TMFileLoadError(Exception):
    """Signal that a Translation Memory file could not be read or parsed."""


def load_tm_file(
    file_path: str,
    target_locale: str,
) -> list[TMEntry]:
    """Load and parse a JSONL TM file, filtering by locale.

    Each non-blank line must be a JSON document. Lines whose JSON value is
    not an object, or that ``_parse_entry`` rejects as malformed, are skipped.
    Entries are kept only when their locale equals ``target_locale``; the
    comparison ignores case and treats ``_`` and ``-`` as the same separator,
    so ``"de_DE"`` matches entries tagged ``"de-de"``.

    Args:
        file_path: Absolute path to the JSONL file.
        target_locale: Target locale code (e.g., "de_DE" or "de-de"). Only
            entries matching this locale will be returned.

    Returns:
        List of TMEntry objects matching the target locale, in file order.

    Raises:
        TMFileLoadError: If the file is missing, unreadable, not valid UTF-8,
            or contains a line that is not valid JSON.
    """

    def _canonical_locale(code: str) -> str:
        # Compact entries carry "xx-xx" while callers commonly pass "xx_XX";
        # fold both spellings to a single comparable form.
        return code.strip().replace("_", "-").lower()

    wanted_locale = _canonical_locale(target_locale)
    entries: list[TMEntry] = []

    try:
        # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM, which
        # would otherwise make the first line fail JSON parsing.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            for line_num, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue

                try:
                    data = json.loads(line)
                except json.JSONDecodeError as exc:
                    raise TMFileLoadError(
                        f"Invalid JSON on line {line_num}: {exc}"
                    ) from exc

                # Valid JSON that is not an object (number, string, list,
                # null) cannot be a TM entry; skip it like any other
                # malformed entry instead of letting a TypeError escape.
                if not isinstance(data, dict):
                    continue

                entry = _parse_entry(data, line_num)
                if entry is None:
                    continue

                # Locale hard-match gate (case- and separator-insensitive)
                if _canonical_locale(entry.lc) == wanted_locale:
                    entries.append(entry)

    except FileNotFoundError as exc:
        raise TMFileLoadError(f"TM file not found: {file_path}") from exc
    except UnicodeDecodeError as exc:
        raise TMFileLoadError(
            f"Encoding error reading TM file: {exc}"
        ) from exc
    except OSError as exc:
        # Permission errors, directories passed as files, I/O failures, etc.
        raise TMFileLoadError(
            f"Cannot read TM file {file_path}: {exc}"
        ) from exc

    return entries


def _parse_entry(data: dict[str, Any], line_num: int) -> TMEntry | None:
    """Parse a single JSON object into a TMEntry.

    Detects compact vs multi-field format automatically.

    Compact format (V25 spec):
        {"t": "{seg_key} {note_type} {locale_code} {EN_source} {TX}"}
        The locale code (e.g., 'de-de') is the split point between metadata,
        EN source text, and target-language translation. Parsing is delegated
        to ``_parse_compact_entry``.

    Multi-field format:
        {"seg_key": "...", "en": "...", "lc": "...", "tx": "...", ...}
        Requires the "seg_key" and "en" keys; every other field is optional.
        Missing fields and explicit JSON nulls both become "".

    Args:
        data: Parsed JSON value for one line. Anything that is not a dict is
            treated as malformed.
        line_num: Line number for error reporting (currently unused; kept for
            interface stability).

    Returns:
        TMEntry or None if the entry is malformed.
    """
    # json.loads can yield numbers, strings, lists or None; membership tests
    # and key lookups on those would raise or misbehave, so reject them here.
    if not isinstance(data, dict):
        return None

    # Compact format: {"t": "..."}
    compact_text = data.get("t")
    if isinstance(compact_text, str):
        return _parse_compact_entry(compact_text)

    # Multi-field format
    if "seg_key" in data and "en" in data:

        def _field(key: str) -> str:
            # A JSON null must not turn into the literal string "None".
            value = data.get(key)
            return "" if value is None else str(value)

        return TMEntry(
            seg_key=_field("seg_key"),
            date=_field("date"),
            en=_field("en"),
            lc=_field("lc"),
            tx=_field("tx"),
            nt=_field("nt"),
            channel=_field("channel"),
            sub_channel=_field("sub_channel"),
        )

    return None


# Locale-code pattern: two lowercase ASCII letters, a hyphen, and two more
# lowercase letters (e.g. "de-de", "fr-fr", "es-es"), delimited by word
# boundaries. Uppercase or underscore spellings such as "de-DE" or "de_DE"
# do not match.
# NOTE(review): this import sits mid-file; conventionally it belongs with the
# other imports at the top of the module.
import re
_LOCALE_RE = re.compile(r"\b([a-z]{2}-[a-z]{2})\b")


def _split_en_tx(text: str) -> tuple[str, str]:
    """Attempt to split combined EN/TX text from compact TM entries.

    The compact format packs both the English source and the target-language
    translation into a single string after the locale code.  There is no
    explicit delimiter, but in practice the boundary is almost always at a
    sentence-ending period followed by a capital letter that starts the
    target-language text.

    Heuristic (works for the vast majority of real Amazon TM files):
      1. Split on ". " boundaries.
      2. Walk forward and assume the first sentence that contains non-ASCII
         characters (ö, ü, é, ñ, etc.) marks the start of the TX portion.
      3. If all sentences are ASCII-only (e.g. short entries), fall back to
         a 50/50 split at the nearest sentence boundary.

    Returns (en, tx).
    """
    if not text:
        return ("", "")

    # Split on sentence boundaries (period followed by space + capital)
    parts = re.split(r"(?<=\.)\s+(?=[A-ZÄÖÜÉÈÊÀÁÂÃÇÑ])", text)
    if len(parts) <= 1:
        # No clear sentence boundary — return full text as EN, empty TX
        return (text, text)

    # Walk forward: first part with non-ASCII = start of TX
    for i, part in enumerate(parts):
        if i == 0:
            continue  # first part is always EN
        if re.search(r"[^\x00-\x7F]", part):
            en = " ".join(parts[:i]).strip()
            tx = " ".join(parts[i:]).strip()
            return (en, tx)

    # All ASCII — split at midpoint sentence boundary
    mid = len(parts) // 2
    if mid == 0:
        mid = 1
    en = " ".join(parts[:mid]).strip()
    tx = " ".join(parts[mid:]).strip()
    return (en, tx)


def _parse_compact_entry(t_value: str) -> TMEntry | None:
    """Build a TMEntry from the compact 't' string found in real TM files.

    Layout: '{seg_key_with_metadata} {locale_code} {EN_source} {TX}'
    Example: 'Value Q1 24 Radio 001 VO de-de As Sophie opened... Sophie öffnet...'

    The first locale code (xx-xx) found in the string anchors the parse:
    - The text before it becomes the seg_key and is scanned token by token
      for channel keywords, a note-type keyword and a two-digit number.
    - The text after it is handed to ``_split_en_tx`` to separate the EN
      source from the TX translation.

    Returns None when the string contains no locale code.
    """
    locale_hit = _LOCALE_RE.search(t_value)
    if locale_hit is None:
        return None

    header = t_value[: locale_hit.start()].strip()
    payload = t_value[locale_hit.end():].strip()
    tokens = header.split()

    # Channel: the first keyword token; sub-channel: the last keyword token
    # when more than one is present.
    channel_keywords = {
        "mass", "value", "onsite", "outbound", "radio",
        "tv_olv", "display", "ooh", "dooh", "social", "print",
        "digital", "crm", "push",
    }
    channel_hits = [tok for tok in tokens if tok.lower() in channel_keywords]
    channel = channel_hits[0] if channel_hits else ""
    sub_channel = channel_hits[-1] if len(channel_hits) > 1 else ""

    # Note type: the keyword token closest to the locale code.
    note_keywords = {"vo", "bvo", "super", "headline", "legal", "cta",
                     "body", "disclaimer", "endline"}
    note_type = next(
        (tok for tok in reversed(tokens) if tok.lower() in note_keywords),
        "",
    )

    en_text, tx_text = _split_en_tx(payload)

    # Date: the first standalone two-digit number in the header, if any.
    year_hit = re.search(r"\b(\d{2})\b", header)
    date = year_hit.group(1) if year_hit else ""

    return TMEntry(
        seg_key=header,
        date=date,
        en=en_text,
        lc=locale_hit.group(1),
        tx=tx_text,
        nt=note_type,
        channel=channel,
        sub_channel=sub_channel,
        _text=t_value,
    )


def load_multiple_tm_files(
    file_paths: list[str],
    target_locale: str,
) -> list[TMEntry]:
    """Load several TM files and concatenate their entries.

    Files are read one after another with ``load_tm_file``; entries keep the
    order of ``file_paths`` and, within each file, the order returned by
    ``load_tm_file``. Any exception raised while loading a file propagates.

    Args:
        file_paths: List of file paths to load.
        target_locale: Target locale code passed through to ``load_tm_file``.

    Returns:
        Combined list of TMEntry objects from all files.
    """
    return [
        entry
        for tm_path in file_paths
        for entry in load_tm_file(tm_path, target_locale)
    ]