# amazon-transcreation/backend/app/pipeline/modules/tm_file_loader.py

"""Translation Memory file loader.

Reads JSONL files in two formats:
1. Compact: {"t": "seg_key|date|en|lc|tx|nt|channel|sub_channel"}
2. Multi-field: {"seg_key": "...", "date": "...", "en": "...", ...}

Applies a locale hard-match gate: only entries matching the target locale are returned.
"""

import json
from typing import Any

from app.pipeline.contracts import TMEntry


class TMFileLoadError(Exception):
    """Signal that a TM file could not be read or its contents parsed."""


def load_tm_file(
    file_path: str,
    target_locale: str,
) -> list[TMEntry]:
    """Load and parse a JSONL TM file, filtering by locale.

    The file is read line by line. Blank lines are skipped, as are entries
    that ``_parse_entry`` rejects (returns ``None`` for). A line that is not
    valid JSON aborts the whole load.

    Args:
        file_path: Path to the JSONL file.
        target_locale: Target locale code (e.g., "de_DE"). Only entries
            whose ``lc`` equals this value exactly are returned.

    Returns:
        List of TMEntry objects matching the target locale, in file order.

    Raises:
        TMFileLoadError: If the file is missing, cannot be opened or read,
            is not valid UTF-8, or contains a line that is not valid JSON.
            The underlying exception is attached as ``__cause__``.
    """
    entries: list[TMEntry] = []

    try:
        # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM,
        # which json.loads would otherwise reject on the first line.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            for line_num, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue

                try:
                    data = json.loads(line)
                except json.JSONDecodeError as exc:
                    raise TMFileLoadError(
                        f"Invalid JSON on line {line_num}: {exc}"
                    ) from exc

                entry = _parse_entry(data, line_num)

                # Locale hard-match gate: malformed entries (None) and
                # entries for any other locale are dropped.
                if entry is not None and entry.lc == target_locale:
                    entries.append(entry)

    except FileNotFoundError as exc:
        raise TMFileLoadError(f"TM file not found: {file_path}") from exc
    except UnicodeDecodeError as exc:
        raise TMFileLoadError(
            f"Encoding error reading TM file: {exc}"
        ) from exc
    except OSError as exc:
        # Permission errors, directories, I/O failures, etc. Must come after
        # FileNotFoundError, which is itself an OSError subclass.
        raise TMFileLoadError(
            f"Cannot read TM file {file_path}: {exc}"
        ) from exc

    return entries


def _parse_entry(data: dict[str, Any], line_num: int) -> TMEntry | None:
    """Parse a single decoded JSON value into a TMEntry.

    Detects compact vs multi-field format automatically:

    * Compact: ``{"t": "seg_key|date|en|lc|tx|nt|channel|sub_channel"}``.
      At least the first five pipe-separated fields must be present; missing
      trailing fields default to ``""`` and fields beyond the eighth are
      ignored. The raw string is passed to ``TMEntry`` as ``_text``.
    * Multi-field: an object containing at least ``seg_key`` and ``en``.
      Missing or ``null`` fields become ``""``; other values are converted
      with ``str()``.

    Args:
        data: Decoded JSON value for one line. Expected to be a dict, but any
            other JSON value (string, number, list, ...) is tolerated and
            treated as malformed.
        line_num: Line number of the entry in the source file. Currently
            unused by this function.

    Returns:
        TMEntry or None if the entry is malformed.
    """
    # A JSONL line can decode to any JSON value; only objects can be entries.
    # Without this guard a bare number or string raises TypeError below.
    if not isinstance(data, dict):
        return None

    # Compact format: {"t": "seg_key|date|en|lc|tx|nt|channel|sub_channel"}
    compact = data.get("t")
    if isinstance(compact, str):
        parts = compact.split("|")
        if len(parts) < 5:
            return None  # Malformed compact entry

        # At least 5 fields are present here; pad the optional tail so the
        # unpacking below always sees exactly 8 values.
        seg_key, date, en, lc, tx, nt, channel, sub_channel = (
            parts + [""] * 3
        )[:8]
        return TMEntry(
            seg_key=seg_key,
            date=date,
            en=en,
            lc=lc,
            tx=tx,
            nt=nt,
            channel=channel,
            sub_channel=sub_channel,
            _text=compact,
        )

    # Multi-field format
    if "seg_key" in data and "en" in data:

        def text(key: str) -> str:
            # JSON null must not turn into the literal string "None".
            value = data.get(key)
            return "" if value is None else str(value)

        return TMEntry(
            seg_key=text("seg_key"),
            date=text("date"),
            en=text("en"),
            lc=text("lc"),
            tx=text("tx"),
            nt=text("nt"),
            channel=text("channel"),
            sub_channel=text("sub_channel"),
        )

    return None


def load_multiple_tm_files(
    file_paths: list[str],
    target_locale: str,
) -> list[TMEntry]:
    """Load several TM files and concatenate their locale-filtered entries.

    Files are loaded one after another via ``load_tm_file``, in the order
    given; any exception it raises propagates to the caller.

    Args:
        file_paths: Paths of the TM files to load.
        target_locale: Target locale code passed to each load.

    Returns:
        A single list holding the entries of every file, in file order.
    """
    return [
        tm_entry
        for tm_path in file_paths
        for tm_entry in load_tm_file(tm_path, target_locale)
    ]