# amazon-transcreation/scripts/tm_format_migrator.py

#!/usr/bin/env python3
"""
TM Format Migrator
==================
Converts compact TM files (single "t" field) to the multi-field JSONL format.

Compact format:
  {"t": "{seg_key} {note_type} {locale_code} {EN_source} {TX_target}"}

Multi-field output:
  {"seg_key": "...", "date": "", "en": "", "lc": "...", "tx": "", "nt": "...", "_text": "..."}

Since the EN/TX boundary is ambiguous (no delimiter between languages), both
en and tx are left empty with a combined _text field for manual review.

Usage:
  python scripts/tm_format_migrator.py [--dry-run] <input_dir> <output_dir>
"""

import argparse
import json
import re
import sys
import tempfile
from pathlib import Path

# Lowercase locale code of the form xx-xx (e.g., es-es, de-de, fr-fr),
# delimited by word boundaries; group 1 is the code itself.
LOCALE_PATTERN = re.compile(r"\b([a-z]{2}-[a-z]{2})\b")

# Note types expected between the sequence number and the locale code.
# NOTE(review): nothing in this module as shown references this list.
KNOWN_NOTE_TYPES = [
    "BVO",
    "VO",
    "SRT",
    "SUPER",
    "GFX",
    "BANNER",
    "CTA",
    "HEADLINE",
    "SUBHEAD",
    "BODY",
    "COPY",
    "LEGAL",
    "DISCLAIMER",
    "TITLE",
    "DESC",
    "ENDCARD",
    "PRE-ROLL",
    "MID-ROLL",
    "POST-ROLL",
    "OLV",
    "RADIO",
    "DISPLAY",
    "SOCIAL",
    "PRINT",
    "OOH",
    "DIGITAL",
    "AUDIO",
]


def parse_compact_line(raw_text: str) -> dict | None:
    """
    Parse a compact TM entry's 't' field into structured fields.

    Returns a dict with seg_key, date, en, lc, tx, nt, _text or None on failure.
    """
    match = LOCALE_PATTERN.search(raw_text)
    if not match:
        return None

    locale_code = match.group(1)
    locale_start = match.start()
    locale_end = match.end()

    # Everything before the locale code contains seg_key + note_type
    prefix = raw_text[:locale_start].rstrip()

    # Everything after the locale code is the combined EN + TX text
    combined_text = raw_text[locale_end:].strip()

    # Split prefix into seg_key and note_type
    # The note_type is typically the last whitespace-separated token(s) before the locale
    # Strategy: walk backwards from the end of prefix to find note_type tokens
    prefix_tokens = prefix.split()

    if not prefix_tokens:
        return None

    # Find where the note_type begins by looking for known note types
    # or by finding the last numeric token (sequence number)
    note_type_tokens = []
    seg_key_tokens = []

    # Walk backwards through tokens to find the note_type boundary
    found_seq_number = False
    for i in range(len(prefix_tokens) - 1, -1, -1):
        token = prefix_tokens[i]
        # A sequence number is a 3-digit number like 001, 002, etc.
        if re.match(r"^\d{2,4}$", token) and not found_seq_number:
            found_seq_number = True
            seg_key_tokens = prefix_tokens[: i + 1]
            note_type_tokens = prefix_tokens[i + 1 :]
            break

    # If we didn't find a clear sequence number, fall back to putting
    # everything in seg_key with empty note_type
    if not found_seq_number:
        seg_key_tokens = prefix_tokens
        note_type_tokens = []

    seg_key = " ".join(seg_key_tokens)
    note_type = " ".join(note_type_tokens)

    return {
        "seg_key": seg_key,
        "date": "",
        "en": "",
        "lc": locale_code,
        "tx": "",
        "nt": note_type,
        "_text": combined_text,  # TODO: manually split into en/tx
    }


def migrate_file(input_path: Path, output_path: Path) -> dict:
    """
    Migrate a single compact TM file to multi-field JSONL format.

    The input is read line by line, one JSON value per line. Blank lines are
    skipped. Lines with a 't' field are parsed with parse_compact_line and
    written in the multi-field format; lines that already carry 'seg_key'
    (and no 't') are written through unchanged. Any line that cannot be
    handled is reported on stderr and counted as an error, and processing
    continues with the next line.

    Args:
        input_path: Compact TM file to read (UTF-8).
        output_path: JSONL file to write (UTF-8); parent directories are
            created and an existing file is overwritten.

    Returns:
        A stats dict with integer counts under "processed", "skipped" and
        "errors", plus "locales", the set of locale codes of the entries
        that were parsed from the compact format.
    """
    stats = {"processed": 0, "skipped": 0, "errors": 0, "locales": set()}

    def report_error(line_num: int, message: str) -> None:
        # Single place for the warning format and the error counter.
        print(
            f"  WARNING: {input_path.name}:{line_num} - {message}",
            file=sys.stderr,
        )
        stats["errors"] += 1

    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(input_path, "r", encoding="utf-8") as infile, \
         open(output_path, "w", encoding="utf-8") as outfile:

        for line_num, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                stats["skipped"] += 1
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as exc:
                report_error(line_num, f"Invalid JSON: {exc}")
                continue

            # A line can be valid JSON without being an object (list, string,
            # number, ...); such values have no fields to read.
            if not isinstance(entry, dict):
                report_error(
                    line_num,
                    f"Expected a JSON object, got {type(entry).__name__}",
                )
                continue

            raw_text = entry.get("t")
            if raw_text is None:
                # Not a compact format line; check if already multi-field
                if "seg_key" in entry:
                    # Already in new format, pass through
                    outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
                    stats["processed"] += 1
                else:
                    report_error(line_num, "No 't' or 'seg_key' field found")
                continue

            # Only text can be parsed (and sliced for the warning below).
            if not isinstance(raw_text, str):
                report_error(
                    line_num,
                    f"'t' field is not a string: {raw_text!r}",
                )
                continue

            parsed = parse_compact_line(raw_text)
            if parsed is None:
                report_error(
                    line_num,
                    f"Could not parse locale from: {raw_text[:80]}...",
                )
                continue

            outfile.write(json.dumps(parsed, ensure_ascii=False) + "\n")
            stats["processed"] += 1
            stats["locales"].add(parsed["lc"])

    return stats


def migrate_directory(input_dir: Path, output_dir: Path) -> None:
    """
    Migrate every .json file found under input_dir (recursively) into output_dir.

    Each input file is written to the same relative location under output_dir
    with its suffix changed to .jsonl. Per-file results and a final summary
    are printed to stdout. When no .json file is found, a single message is
    printed and nothing else happens.
    """
    sources = sorted(input_dir.rglob("*.json"))
    if not sources:
        print(f"No .json files found in {input_dir}")
        return

    print(f"Found {len(sources)} .json file(s) in {input_dir}\n")

    counter_keys = ("processed", "skipped", "errors")
    totals = dict.fromkeys(counter_keys, 0)
    files_done = 0
    seen_locales = set()

    for source in sources:
        # Mirror the input's relative layout below output_dir.
        rel = source.relative_to(input_dir)
        target = output_dir / rel.with_suffix(".jsonl")

        print(f"Processing: {rel}")
        result = migrate_file(source, target)

        files_done += 1
        for key in counter_keys:
            totals[key] += result[key]
        seen_locales.update(result["locales"])

        print(
            f"  -> {result['processed']} entries migrated, "
            f"{result['skipped']} skipped, "
            f"{result['errors']} errors"
        )

    rule = "=" * 60
    locale_list = ", ".join(sorted(seen_locales))

    print("\n" + rule)
    print("Migration Summary")
    print(rule)
    print(f"  Files processed:   {files_done}")
    print(f"  Total entries:     {totals['processed']}")
    print(f"  Skipped lines:     {totals['skipped']}")
    print(f"  Errors:            {totals['errors']}")
    print(f"  Locales found:     {locale_list}")
    print(rule)


def main() -> None:
    """
    Command-line entry point: parse arguments and run the migration.

    Positional arguments are the input directory and the output directory.
    Exits with status 1 when the input directory does not exist. With
    --dry-run, the full parse and report still run but nothing is created or
    written under the output directory.
    """
    parser = argparse.ArgumentParser(
        description="Migrate compact TM files to multi-field JSONL format.",
    )
    parser.add_argument(
        "input_dir",
        type=Path,
        help="Directory containing compact .json TM files",
    )
    parser.add_argument(
        "output_dir",
        type=Path,
        help="Directory to write migrated .jsonl files",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Parse and report without writing output files",
    )

    args = parser.parse_args()

    if not args.input_dir.is_dir():
        print(f"Error: Input directory does not exist: {args.input_dir}", file=sys.stderr)
        sys.exit(1)

    if args.dry_run:
        print("[DRY RUN] No files will be written.\n")
        # migrate_directory always writes its results, so a dry run points it
        # at a scratch directory that is removed as soon as the report is
        # done. The requested output directory is never created or touched.
        with tempfile.TemporaryDirectory(prefix="tm_migrate_dry_run_") as scratch_dir:
            migrate_directory(args.input_dir, Path(scratch_dir))
        return

    args.output_dir.mkdir(parents=True, exist_ok=True)
    migrate_directory(args.input_dir, args.output_dir)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()