Full-stack Amazon AI Transcreation Platform with: - FastAPI backend (async, PostgreSQL, Redis, Celery) with 11 DB tables - JWT auth (SSO-ready abstract provider pattern) - 6-agent pipeline orchestrator with deterministic modules - Next.js 14 frontend with Amazon branding (Ember fonts, orange/dark theme) - Job wizard, monitoring HUD, output review, admin screens - 154 TM/reference files imported, 12 locales configured - Docker Compose for all services Agents 2-5 (TM retrieval, ranker, transcreator, compliance) are stubs pending Phase 3 LLM integration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
243 lines
7.9 KiB
Python
243 lines
7.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
TM Format Migrator
|
|
==================
|
|
Converts compact TM files (single "t" field) to the multi-field JSONL format.
|
|
|
|
Compact format:
|
|
{"t": "{seg_key} {note_type} {locale_code} {EN_source} {TX_target}"}
|
|
|
|
Multi-field output:
|
|
{"seg_key": "...", "date": "", "en": "", "lc": "...", "tx": "", "nt": "...", "_text": "..."}
|
|
|
|
Since the EN/TX boundary is ambiguous (no delimiter between languages), both
|
|
en and tx are left empty with a combined _text field for manual review.
|
|
|
|
Usage:
    python scripts/tm_format_migrator.py [--dry-run] <input_dir> <output_dir>
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Regex to find locale code in xx-xx format (e.g., es-es, de-de, fr-fr)
|
|
LOCALE_PATTERN = re.compile(r"\b([a-z]{2}-[a-z]{2})\b")
|
|
|
|
# Known note types that appear between the sequence number and the locale code
|
|
KNOWN_NOTE_TYPES = [
|
|
"BVO", "VO", "SRT", "SUPER", "GFX", "BANNER", "CTA", "HEADLINE",
|
|
"SUBHEAD", "BODY", "COPY", "LEGAL", "DISCLAIMER", "TITLE", "DESC",
|
|
"ENDCARD", "PRE-ROLL", "MID-ROLL", "POST-ROLL", "OLV", "RADIO",
|
|
"DISPLAY", "SOCIAL", "PRINT", "OOH", "DIGITAL", "AUDIO",
|
|
]
|
|
|
|
|
|
def parse_compact_line(raw_text: str) -> dict | None:
|
|
"""
|
|
Parse a compact TM entry's 't' field into structured fields.
|
|
|
|
Returns a dict with seg_key, date, en, lc, tx, nt, _text or None on failure.
|
|
"""
|
|
match = LOCALE_PATTERN.search(raw_text)
|
|
if not match:
|
|
return None
|
|
|
|
locale_code = match.group(1)
|
|
locale_start = match.start()
|
|
locale_end = match.end()
|
|
|
|
# Everything before the locale code contains seg_key + note_type
|
|
prefix = raw_text[:locale_start].rstrip()
|
|
|
|
# Everything after the locale code is the combined EN + TX text
|
|
combined_text = raw_text[locale_end:].strip()
|
|
|
|
# Split prefix into seg_key and note_type
|
|
# The note_type is typically the last whitespace-separated token(s) before the locale
|
|
# Strategy: walk backwards from the end of prefix to find note_type tokens
|
|
prefix_tokens = prefix.split()
|
|
|
|
if not prefix_tokens:
|
|
return None
|
|
|
|
# Find where the note_type begins by looking for known note types
|
|
# or by finding the last numeric token (sequence number)
|
|
note_type_tokens = []
|
|
seg_key_tokens = []
|
|
|
|
# Walk backwards through tokens to find the note_type boundary
|
|
found_seq_number = False
|
|
for i in range(len(prefix_tokens) - 1, -1, -1):
|
|
token = prefix_tokens[i]
|
|
# A sequence number is a 3-digit number like 001, 002, etc.
|
|
if re.match(r"^\d{2,4}$", token) and not found_seq_number:
|
|
found_seq_number = True
|
|
seg_key_tokens = prefix_tokens[: i + 1]
|
|
note_type_tokens = prefix_tokens[i + 1 :]
|
|
break
|
|
|
|
# If we didn't find a clear sequence number, fall back to putting
|
|
# everything in seg_key with empty note_type
|
|
if not found_seq_number:
|
|
seg_key_tokens = prefix_tokens
|
|
note_type_tokens = []
|
|
|
|
seg_key = " ".join(seg_key_tokens)
|
|
note_type = " ".join(note_type_tokens)
|
|
|
|
return {
|
|
"seg_key": seg_key,
|
|
"date": "",
|
|
"en": "",
|
|
"lc": locale_code,
|
|
"tx": "",
|
|
"nt": note_type,
|
|
"_text": combined_text, # TODO: manually split into en/tx
|
|
}
|
|
|
|
|
|
def migrate_file(input_path: Path, output_path: Path) -> dict:
    """Migrate a single compact TM file to multi-field JSONL format.

    The input is read line by line, one JSON value per line. Blank lines are
    skipped. Entries that already carry a ``seg_key`` (and no ``t``) are
    written through unchanged. Lines that are invalid JSON, are not JSON
    objects, have a non-string ``t``, or cannot be parsed are reported on
    stderr and counted as errors; they never abort the file.

    Args:
        input_path: Compact TM file to read (UTF-8).
        output_path: JSONL file to write; parent directories are created.

    Returns:
        A stats dict with the integer counts ``processed``, ``skipped`` and
        ``errors`` plus ``locales``, the set of locale codes of the entries
        parsed from compact form.
    """
    stats = {"processed": 0, "skipped": 0, "errors": 0, "locales": set()}

    def warn(line_num: int, detail: str) -> None:
        # Report one bad line on stderr and count it as an error.
        print(
            f" WARNING: {input_path.name}:{line_num} - {detail}",
            file=sys.stderr,
        )
        stats["errors"] += 1

    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(input_path, "r", encoding="utf-8") as infile, \
            open(output_path, "w", encoding="utf-8") as outfile:

        for line_num, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                stats["skipped"] += 1
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as exc:
                warn(line_num, f"Invalid JSON: {exc}")
                continue

            # Valid JSON need not be an object (e.g. a list or bare string);
            # such values have no .get(), so reject them explicitly.
            if not isinstance(entry, dict):
                warn(
                    line_num,
                    f"Expected a JSON object, got {type(entry).__name__}",
                )
                continue

            raw_text = entry.get("t")
            if raw_text is None:
                # Not a compact format line; check if already multi-field
                if "seg_key" in entry:
                    # Already in new format, pass through
                    outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
                    stats["processed"] += 1
                else:
                    warn(line_num, "No 't' or 'seg_key' field found")
                continue

            # The parser and the warning below both need a string.
            if not isinstance(raw_text, str):
                warn(
                    line_num,
                    f"'t' field is not a string (got {type(raw_text).__name__})",
                )
                continue

            parsed = parse_compact_line(raw_text)
            if parsed is None:
                warn(line_num, f"Could not parse locale from: {raw_text[:80]}...")
                continue

            outfile.write(json.dumps(parsed, ensure_ascii=False) + "\n")
            stats["processed"] += 1
            stats["locales"].add(parsed["lc"])

    return stats
|
|
|
|
|
|
def migrate_directory(input_dir: Path, output_dir: Path) -> None:
    """Migrate every ``*.json`` file under ``input_dir`` into ``output_dir``.

    Files are found recursively and handled in sorted order. Each output
    keeps the input's path relative to ``input_dir`` with the suffix changed
    to ``.jsonl``. A line is printed per file, followed by a summary of the
    accumulated counts and locales. If no ``.json`` file exists, a notice is
    printed and nothing else happens.
    """
    sources = sorted(input_dir.rglob("*.json"))
    if len(sources) == 0:
        print(f"No .json files found in {input_dir}")
        return

    print(f"Found {len(sources)} .json file(s) in {input_dir}\n")

    file_count = 0
    counters = {"processed": 0, "skipped": 0, "errors": 0}
    seen_locales = set()

    for source in sources:
        # Mirror the input's relative location beneath output_dir.
        rel = source.relative_to(input_dir)
        target = output_dir / rel.with_suffix(".jsonl")

        print(f"Processing: {rel}")
        result = migrate_file(source, target)

        file_count += 1
        for key in counters:
            counters[key] += result[key]
        seen_locales.update(result["locales"])

        print(
            f" -> {result['processed']} entries migrated, "
            f"{result['skipped']} skipped, "
            f"{result['errors']} errors"
        )

    rule = "=" * 60
    locale_list = ", ".join(sorted(seen_locales))

    print("\n" + rule)
    print("Migration Summary")
    print(rule)
    print(f" Files processed: {file_count}")
    print(f" Total entries: {counters['processed']}")
    print(f" Skipped lines: {counters['skipped']}")
    print(f" Errors: {counters['errors']}")
    print(f" Locales found: {locale_list}")
    print(rule)
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse arguments and run the migration.

    Exits with status 1 when the input directory does not exist. With
    ``--dry-run`` the migration runs against a throwaway temporary directory,
    so the per-file report and the summary are still printed but nothing is
    created under ``output_dir``.
    """
    # Imported locally because only the dry-run path needs it.
    import tempfile

    parser = argparse.ArgumentParser(
        description="Migrate compact TM files to multi-field JSONL format.",
    )
    parser.add_argument(
        "input_dir",
        type=Path,
        help="Directory containing compact .json TM files",
    )
    parser.add_argument(
        "output_dir",
        type=Path,
        help="Directory to write migrated .jsonl files",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Parse and report without writing output files",
    )

    args = parser.parse_args()

    if not args.input_dir.is_dir():
        print(f"Error: Input directory does not exist: {args.input_dir}", file=sys.stderr)
        sys.exit(1)

    if args.dry_run:
        print("[DRY RUN] No files will be written.\n")
        # Honor the flag: results go to a scratch directory that is removed
        # when the block exits, leaving output_dir untouched.
        with tempfile.TemporaryDirectory(prefix="tm_migrate_dry_run_") as scratch:
            migrate_directory(args.input_dir, Path(scratch))
        return

    args.output_dir.mkdir(parents=True, exist_ok=True)
    migrate_directory(args.input_dir, args.output_dir)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|