amazon-transcreation/scripts/tm_format_migrator.py
DJP 98fa16bfc3 feat: complete Phase 1-2 scaffold — backend, frontend, pipeline skeleton
Full-stack Amazon AI Transcreation Platform with:
- FastAPI backend (async, PostgreSQL, Redis, Celery) with 11 DB tables
- JWT auth (SSO-ready abstract provider pattern)
- 6-agent pipeline orchestrator with deterministic modules
- Next.js 14 frontend with Amazon branding (Ember fonts, orange/dark theme)
- Job wizard, monitoring HUD, output review, admin screens
- 154 TM/reference files imported, 12 locales configured
- Docker Compose for all services

Agents 2-5 (TM retrieval, ranker, transcreator, compliance) are stubs
pending Phase 3 LLM integration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 12:31:43 -04:00

243 lines
7.9 KiB
Python

#!/usr/bin/env python3
"""
TM Format Migrator
==================
Converts compact TM files (single "t" field) to the multi-field JSONL format.
Compact format:
{"t": "{seg_key} {note_type} {locale_code} {EN_source} {TX_target}"}
Multi-field output:
{"seg_key": "...", "date": "", "en": "", "lc": "...", "tx": "", "nt": "...", "_text": "..."}
Since the EN/TX boundary is ambiguous (no delimiter between languages), both
en and tx are left empty with a combined _text field for manual review.
Usage:
python scripts/tm_format_migrator.py <input_dir> <output_dir>
"""
import argparse
import json
import re
import sys
from pathlib import Path
# Locale codes are written as two lowercase letters, a hyphen, and two more
# lowercase letters (e.g., es-es, de-de, fr-fr), bounded by word boundaries.
# The code itself is captured in group 1.
LOCALE_PATTERN = re.compile(r"\b([a-z]{2}-[a-z]{2})\b")

# Known note types that appear between the sequence number and the locale code
KNOWN_NOTE_TYPES = [
    "BVO", "VO", "SRT", "SUPER", "GFX", "BANNER",
    "CTA", "HEADLINE", "SUBHEAD", "BODY", "COPY", "LEGAL",
    "DISCLAIMER", "TITLE", "DESC", "ENDCARD",
    "PRE-ROLL", "MID-ROLL", "POST-ROLL",
    "OLV", "RADIO", "DISPLAY", "SOCIAL", "PRINT", "OOH",
    "DIGITAL", "AUDIO",
]
def parse_compact_line(raw_text: str) -> dict | None:
"""
Parse a compact TM entry's 't' field into structured fields.
Returns a dict with seg_key, date, en, lc, tx, nt, _text or None on failure.
"""
match = LOCALE_PATTERN.search(raw_text)
if not match:
return None
locale_code = match.group(1)
locale_start = match.start()
locale_end = match.end()
# Everything before the locale code contains seg_key + note_type
prefix = raw_text[:locale_start].rstrip()
# Everything after the locale code is the combined EN + TX text
combined_text = raw_text[locale_end:].strip()
# Split prefix into seg_key and note_type
# The note_type is typically the last whitespace-separated token(s) before the locale
# Strategy: walk backwards from the end of prefix to find note_type tokens
prefix_tokens = prefix.split()
if not prefix_tokens:
return None
# Find where the note_type begins by looking for known note types
# or by finding the last numeric token (sequence number)
note_type_tokens = []
seg_key_tokens = []
# Walk backwards through tokens to find the note_type boundary
found_seq_number = False
for i in range(len(prefix_tokens) - 1, -1, -1):
token = prefix_tokens[i]
# A sequence number is a 3-digit number like 001, 002, etc.
if re.match(r"^\d{2,4}$", token) and not found_seq_number:
found_seq_number = True
seg_key_tokens = prefix_tokens[: i + 1]
note_type_tokens = prefix_tokens[i + 1 :]
break
# If we didn't find a clear sequence number, fall back to putting
# everything in seg_key with empty note_type
if not found_seq_number:
seg_key_tokens = prefix_tokens
note_type_tokens = []
seg_key = " ".join(seg_key_tokens)
note_type = " ".join(note_type_tokens)
return {
"seg_key": seg_key,
"date": "",
"en": "",
"lc": locale_code,
"tx": "",
"nt": note_type,
"_text": combined_text, # TODO: manually split into en/tx
}
def migrate_file(input_path: Path, output_path: Path) -> dict:
    """
    Migrate a single compact TM file to multi-field JSONL format.

    The input is read line by line, one JSON value per line. Blank lines are
    skipped. Entries that already contain a 'seg_key' field (and no 't'
    field) are written through unchanged. Lines that cannot be migrated are
    reported on stderr and counted as errors rather than aborting the file:
    invalid JSON, JSON values that are not objects, objects with neither 't'
    nor 'seg_key', a non-string 't', or a 't' that cannot be parsed.

    Args:
        input_path: Compact TM file to read (UTF-8).
        output_path: JSONL file to write (UTF-8). Its parent directories are
            created if missing and an existing file is overwritten.

    Returns:
        A stats dict with integer 'processed', 'skipped' and 'errors' counts
        and a 'locales' set holding the locale codes of the parsed compact
        entries.
    """
    stats = {"processed": 0, "skipped": 0, "errors": 0, "locales": set()}
    output_path.parent.mkdir(parents=True, exist_ok=True)

    def warn(line_num: int, message: str) -> None:
        # Single place for the "<file>:<line> - <message>" warning layout.
        print(
            f" WARNING: {input_path.name}:{line_num} - {message}",
            file=sys.stderr,
        )

    with open(input_path, "r", encoding="utf-8") as infile, \
            open(output_path, "w", encoding="utf-8") as outfile:
        for line_num, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                stats["skipped"] += 1
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as exc:
                warn(line_num, f"Invalid JSON: {exc}")
                stats["errors"] += 1
                continue

            # A line can be valid JSON without being an object (list, string,
            # number); such values have no fields to migrate.
            if not isinstance(entry, dict):
                warn(line_num, "Entry is not a JSON object")
                stats["errors"] += 1
                continue

            raw_text = entry.get("t")
            if raw_text is None:
                # Not a compact format line; check if already multi-field
                if "seg_key" in entry:
                    # Already in new format, pass through
                    outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
                    stats["processed"] += 1
                else:
                    warn(line_num, "No 't' or 'seg_key' field found")
                    stats["errors"] += 1
                continue

            # Only text can be parsed or sliced for the warning below.
            if not isinstance(raw_text, str):
                warn(line_num, "'t' field is not a string")
                stats["errors"] += 1
                continue

            parsed = parse_compact_line(raw_text)
            if parsed is None:
                warn(
                    line_num,
                    f"Could not parse locale from: {raw_text[:80]}...",
                )
                stats["errors"] += 1
                continue

            outfile.write(json.dumps(parsed, ensure_ascii=False) + "\n")
            stats["processed"] += 1
            stats["locales"].add(parsed["lc"])

    return stats
def migrate_directory(input_dir: Path, output_dir: Path) -> None:
    """
    Migrate every .json file found under input_dir (recursively) to output_dir.

    Each output file keeps its path relative to input_dir, with the suffix
    changed to .jsonl. Per-file results and a final summary are printed to
    stdout. Nothing is migrated when no .json files are found.
    """
    sources = sorted(input_dir.rglob("*.json"))
    if len(sources) == 0:
        print(f"No .json files found in {input_dir}")
        return

    print(f"Found {len(sources)} .json file(s) in {input_dir}\n")

    counter_keys = ("processed", "skipped", "errors")
    totals = {"files": 0, "processed": 0, "skipped": 0, "errors": 0, "locales": set()}

    for source in sources:
        # Mirror the input layout below output_dir.
        rel = source.relative_to(input_dir)
        destination = output_dir / rel.with_suffix(".jsonl")
        print(f"Processing: {rel}")

        file_stats = migrate_file(source, destination)

        totals["files"] += 1
        for key in counter_keys:
            totals[key] += file_stats[key]
        totals["locales"] |= file_stats["locales"]

        migrated = file_stats["processed"]
        skipped = file_stats["skipped"]
        failed = file_stats["errors"]
        print(f" -> {migrated} entries migrated, {skipped} skipped, {failed} errors")

    rule = "=" * 60
    locale_list = ", ".join(sorted(totals["locales"]))
    summary_lines = [
        "\n" + rule,
        "Migration Summary",
        rule,
        f" Files processed: {totals['files']}",
        f" Total entries: {totals['processed']}",
        f" Skipped lines: {totals['skipped']}",
        f" Errors: {totals['errors']}",
        f" Locales found: {locale_list}",
        rule,
    ]
    for text in summary_lines:
        print(text)
def main() -> None:
    """
    Command-line entry point: migrate input_dir into output_dir.

    Exits with status 1 when input_dir is not an existing directory. With
    --dry-run the migration runs in full so that warnings and the summary are
    still reported, but the results go to a temporary directory that is
    removed afterwards; output_dir is neither created nor modified.
    """
    parser = argparse.ArgumentParser(
        description="Migrate compact TM files to multi-field JSONL format.",
    )
    parser.add_argument(
        "input_dir",
        type=Path,
        help="Directory containing compact .json TM files",
    )
    parser.add_argument(
        "output_dir",
        type=Path,
        help="Directory to write migrated .jsonl files",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Parse and report without writing output files",
    )
    args = parser.parse_args()

    if not args.input_dir.is_dir():
        print(f"Error: Input directory does not exist: {args.input_dir}", file=sys.stderr)
        sys.exit(1)

    if args.dry_run:
        # Imported here because only the dry-run path needs it.
        import tempfile

        print("[DRY RUN] No files will be written.\n")
        # migrate_directory always writes its results, so point it at a
        # scratch directory that is deleted on exit; output_dir stays untouched.
        with tempfile.TemporaryDirectory() as scratch_dir:
            migrate_directory(args.input_dir, Path(scratch_dir))
        return

    args.output_dir.mkdir(parents=True, exist_ok=True)
    migrate_directory(args.input_dir, args.output_dir)


if __name__ == "__main__":
    main()