amazon-transcreation/scripts/validate_tm_files.py

#!/usr/bin/env python3
"""
TM File Validator
=================
Reads all TM files from a directory and reports on format, line counts,
locale codes, channels, and parse errors.

Usage:
  python scripts/validate_tm_files.py <tm_directory>
"""

import argparse
import json
import re
import sys
from collections import defaultdict
from pathlib import Path

# Matches a lowercase locale code of the form "xx-yy" (two letters, a hyphen,
# two letters, e.g. "de-de") delimited by word boundaries. validate_file uses
# it to pull a locale out of the "t" text of compact entries.
# NOTE(review): any lowercase two-letter pair joined by a hyphen matches
# (e.g. "to-do"), and only the first hit is used — confirm the compact format
# guarantees the locale appears before any free text.
LOCALE_PATTERN = re.compile(r"\b([a-z]{2}-[a-z]{2})\b")


def detect_format(entry: object) -> str:
    """Classify a parsed TM entry by the fields it carries.

    Args:
        entry: A value decoded from one JSON line. Values that are not JSON
            objects (dicts) are classified as "unknown" instead of raising.

    Returns:
        "compact" when the entry has a string "t" field, "multi-field" when
        it has a "seg_key" field, and "unknown" otherwise. The "t" check
        takes precedence when both fields are present.
    """
    # json.loads may yield a list, string, number, or None. Membership and
    # subscript tests on those either raise TypeError or give false positives
    # (e.g. "t" in "text"), so reject them up front.
    if not isinstance(entry, dict):
        return "unknown"
    # A missing "t" yields None from .get(), which is not a str.
    if isinstance(entry.get("t"), str):
        return "compact"
    if "seg_key" in entry:
        return "multi-field"
    return "unknown"


def extract_channel_from_filename(filename: str) -> str | None:
    """
    Extract channel name from a TM filename.

    Expected pattern: flat_{Channel}_{locale}.json(l)
    Examples: flat_MASS_de-de.json -> MASS, flat_PrimeSpeed_ca-es.json -> PrimeSpeed
    """
    match = re.match(r"flat_(.+?)_[a-z]{2}-[a-z]{2}\.jsonl?$", filename, re.IGNORECASE)
    if match:
        return match.group(1)
    return None


def _record_entry(entry: object, line_num: int, stats: dict) -> None:
    """Classify one parsed entry and fold its format, locale, and errors into *stats*."""
    errors = stats["parse_errors"]

    # A line can be valid JSON without being an object (list, string, number,
    # null). Report it here instead of letting field lookups raise.
    if not isinstance(entry, dict):
        stats["formats_seen"]["unknown"] += 1
        errors.append(
            f"Line {line_num}: Expected a JSON object, got {type(entry).__name__}"
        )
        return

    fmt = detect_format(entry)
    stats["formats_seen"][fmt] += 1

    if fmt == "compact":
        # Compact entries carry the locale inside their "t" text.
        match = LOCALE_PATTERN.search(entry.get("t", ""))
        if match:
            stats["locales"].add(match.group(1))
        else:
            errors.append(f"Line {line_num}: No locale found in compact entry")
    elif fmt == "multi-field":
        lc = entry.get("lc", "")
        if isinstance(lc, str):
            # A missing or empty "lc" is tolerated and simply adds no locale.
            if lc:
                stats["locales"].add(lc)
        elif lc:
            # A non-string locale would be unhashable or break the sorted,
            # joined locale listing later, so record it as an error instead.
            errors.append(
                f"Line {line_num}: Non-string 'lc' value ({type(lc).__name__})"
            )
    else:
        errors.append(
            f"Line {line_num}: Unknown format (no 't' or 'seg_key' field)"
        )


def validate_file(file_path: Path) -> dict:
    """Validate a single line-delimited TM file and return statistics.

    Each non-blank line is parsed as JSON, classified with detect_format, and
    inspected for a locale. Problems are collected rather than raised, so a
    single bad line or badly encoded file does not abort the caller.

    Args:
        file_path: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        A dict with the keys:
          "filename"     - the file's base name
          "path"         - the path as a string
          "line_count"   - number of non-blank lines
          "format"       - most frequent format seen, or "empty" if none
          "formats_seen" - defaultdict mapping format name to entry count
          "locales"      - set of locale strings found
          "parse_errors" - list of human-readable error messages
          "empty_lines"  - number of blank lines

    Raises:
        OSError: If the file cannot be opened or read.
    """
    stats = {
        "filename": file_path.name,
        "path": str(file_path),
        "line_count": 0,
        "format": None,
        "formats_seen": defaultdict(int),
        "locales": set(),
        "parse_errors": [],
        "empty_lines": 0,
    }

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            for line_num, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    stats["empty_lines"] += 1
                    continue

                stats["line_count"] += 1

                try:
                    entry = json.loads(line)
                except json.JSONDecodeError as exc:
                    stats["parse_errors"].append(
                        f"Line {line_num}: Invalid JSON - {exc}"
                    )
                    continue

                _record_entry(entry, line_num, stats)
    except UnicodeDecodeError as exc:
        # Decoding happens in buffered chunks, so no exact line number is
        # available; report at file level and keep what was read so far.
        stats["parse_errors"].append(f"File is not valid UTF-8 - {exc}")

    # Determine dominant format
    if stats["formats_seen"]:
        stats["format"] = max(stats["formats_seen"], key=stats["formats_seen"].get)
    else:
        stats["format"] = "empty"

    return stats


def validate_directory(tm_dir: Path) -> None:
    """Validate every TM file under *tm_dir* and print a report to stdout.

    The directory is searched recursively for regular files whose suffix is
    ".json" or ".jsonl". Each file gets a short per-file section (showing at
    most five of its errors), followed by a summary of totals, formats,
    locales, and channels across all files.

    Args:
        tm_dir: Root directory to scan.
    """
    rule = "=" * 80
    wanted_suffixes = (".json", ".jsonl")

    candidates = [
        path
        for path in tm_dir.rglob("*")
        if path.suffix in wanted_suffixes and path.is_file()
    ]
    candidates.sort()

    if not candidates:
        print(f"No .json or .jsonl files found in {tm_dir}")
        return

    print(f"Scanning {len(candidates)} file(s) in {tm_dir}\n")
    print(rule)

    # Aggregates across all files, reported in the summary.
    locales_seen = set()
    channels_seen = set()
    per_format = defaultdict(int)
    entry_total = 0
    error_total = 0

    for path in candidates:
        result = validate_file(path)
        channel = extract_channel_from_filename(path.name)
        problems = result["parse_errors"]
        problem_count = len(problems)

        if channel:
            channels_seen.add(channel)
        locales_seen.update(result["locales"])
        entry_total += result["line_count"]
        error_total += problem_count
        for fmt, count in result["formats_seen"].items():
            per_format[fmt] += count

        # Show paths relative to the scanned root when possible.
        if path.is_relative_to(tm_dir):
            shown_path = path.relative_to(tm_dir)
        else:
            shown_path = path
        locale_list = ", ".join(sorted(result["locales"])) or "none"
        verdict = f"{problem_count} ERROR(S)" if problem_count else "OK"

        print(f"\n  File:    {shown_path}")
        print(f"  Format:  {result['format']}")
        print(f"  Lines:   {result['line_count']} (+ {result['empty_lines']} empty)")
        print(f"  Locales: {locale_list}")
        if channel:
            print(f"  Channel: {channel}")
        print(f"  Status:  {verdict}")

        # Cap the per-file error listing at five lines.
        for problem in problems[:5]:
            print(f"    - {problem}")
        if problem_count > 5:
            print(f"    ... and {problem_count - 5} more errors")

    # Summary
    print("\n" + rule)
    print("VALIDATION SUMMARY")
    print(rule)
    print(f"  Total files:    {len(candidates)}")
    print(f"  Total entries:  {entry_total}")
    print(f"  Total errors:   {error_total}")
    print()
    print("  Format breakdown:")
    for fmt, count in sorted(per_format.items()):
        print(f"    {fmt:12s}  {count:>6d} entries")
    print()
    print(f"  Locales covered ({len(locales_seen)}):")
    for locale in sorted(locales_seen):
        print(f"    - {locale}")
    print()
    print(f"  Channels covered ({len(channels_seen)}):")
    for name in sorted(channels_seen):
        print(f"    - {name}")
    print(rule)


def main() -> None:
    """Parse the command line and validate the requested TM directory.

    Takes one positional argument, the directory to scan. If that path is
    not an existing directory, prints an error to stderr and exits with
    status 1; otherwise hands the directory to validate_directory.
    """
    cli = argparse.ArgumentParser(
        description="Validate TM files and report on format, locales, and errors.",
    )
    cli.add_argument(
        "tm_directory",
        type=Path,
        help="Directory containing TM files (.json or .jsonl)",
    )
    target = cli.parse_args().tm_directory

    if target.is_dir():
        validate_directory(target)
        return

    print(f"Error: Directory does not exist: {target}", file=sys.stderr)
    sys.exit(1)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()