amazon-transcreation/seed/import_reference_files.py
DJP 98fa16bfc3 feat: complete Phase 1-2 scaffold — backend, frontend, pipeline skeleton
Full-stack Amazon AI Transcreation Platform with:
- FastAPI backend (async, PostgreSQL, Redis, Celery) with 11 DB tables
- JWT auth (SSO-ready abstract provider pattern)
- 6-agent pipeline orchestrator with deterministic modules
- Next.js 14 frontend with Amazon branding (Ember fonts, orange/dark theme)
- Job wizard, monitoring HUD, output review, admin screens
- 154 TM/reference files imported, 12 locales configured
- Docker Compose for all services

Agents 2-5 (TM retrieval, ranker, transcreator, compliance) are stubs
pending Phase 3 LLM integration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 12:31:43 -04:00

307 lines
9.8 KiB
Python

#!/usr/bin/env python3
"""
Import Reference Files
======================
Reads existing reference and TM files from the source directory and copies
them into the storage/ directory structure used by the platform.
Source structure (from Agent build directory):
    JSON REFS and TMs/
        0-GLOBAL FOR ALL LOCALES/
            Amazon_TOV_Guidelines_for_Transcreation_290224.json
        1-PER LANGUAGE/
            CA_ES/
                ca_ES_glossary.json
                flat_MASS_ca-es.json
                ...

Target structure:
    storage/amazon/
        tm/{locale}/flat_{channel}_{lc}.json
        ref/glossary/{prefix}_glossary.json
        ref/blacklist/{prefix}_blacklist.json
        ref/date_pct_formats/{prefix}_date_percent_formats.json
        ref/locale_considerations/{prefix}_local_considerations.json
        ref/tov_global/Amazon_TOV_Guidelines_for_Transcreation_290224.json
        ref/tov_supplement/{LOCALE}_TOV_Guidelines.json

Usage:
    python seed/import_reference_files.py [--source <path>] [--target <path>] [--dry-run]
"""
import argparse
import re
import shutil
import sys
from pathlib import Path
# Default paths
DEFAULT_SOURCE = (
Path(__file__).resolve().parent.parent.parent
/ "Agent build + supporting JSONs"
/ "JSON REFS and TMs"
)
DEFAULT_TARGET = Path(__file__).resolve().parent.parent / "storage" / "amazon"
CLIENT_ID = "amazon"
# Map source folder names (e.g. CA_ES) to the locale used for TM directories (ca-ES).
# Folder names follow the pattern {LANG}_{REGION}, in uppercase.
# TM filenames embed a lowercase, hyphenated locale (ca-es); reference
# filenames use an underscore-separated prefix (ca_ES).
def folder_to_tm_locale(folder_name: str) -> str:
    """Turn a source folder name such as ``CA_ES`` into a TM locale (``ca-ES``).

    A name with exactly one underscore is split into language and region;
    the language is lower-cased and the region is kept as given. Any other
    name is returned lower-cased and otherwise unchanged.
    """
    pieces = folder_name.split("_")
    if len(pieces) != 2:
        # Not a {lang}_{region} pair: fall back to the lower-cased name.
        return folder_name.lower()
    language, region = pieces
    return "-".join((language.lower(), region))
def classify_file(filename: str) -> tuple[str, str | None]:
"""
Classify a file as TM or a specific reference type.
Returns:
(category, sub_type) where category is 'tm', 'ref', or 'unknown'
and sub_type provides detail for ref files.
"""
lower = filename.lower()
# TM files start with "flat_"
if lower.startswith("flat_"):
return ("tm", None)
# Reference file patterns
if "_glossary." in lower:
return ("ref", "glossary")
if "_blacklist." in lower:
return ("ref", "blacklist")
if "_date_percent_formats." in lower:
return ("ref", "date_pct_formats")
if "_local_considerations." in lower:
return ("ref", "locale_considerations")
if "tov_guidelines" in lower or "tov_guide" in lower:
return ("ref", "tov_supplement")
return ("unknown", None)
def extract_tm_locale_from_filename(filename: str) -> str | None:
"""Extract locale code from a TM filename like flat_MASS_ca-es.json."""
match = re.search(r"_([a-z]{2}-[a-z]{2})\.json$", filename, re.IGNORECASE)
if match:
return match.group(1)
return None
def import_global_files(
    source_dir: Path, target_dir: Path, dry_run: bool = False
) -> list[dict]:
    """Import the global (all-locale) JSON reference files.

    Every ``*.json`` file directly inside
    ``<source_dir>/0-GLOBAL FOR ALL LOCALES`` is copied to
    ``<target_dir>/ref/tov_global/`` and described by one
    ``reference_files`` record.

    Args:
        source_dir: Root of the source tree containing the global directory.
        target_dir: Client storage directory that receives the copies.
        dry_run: When true, no directory is created and no file is copied;
            records and progress lines are still produced.

    Returns:
        One record dict per JSON file, in sorted path order. If the global
        directory does not exist, a warning is printed and an empty list is
        returned.
    """
    global_dir = source_dir / "0-GLOBAL FOR ALL LOCALES"
    records: list[dict] = []
    if not global_dir.is_dir():
        print(f" WARNING: Global directory not found: {global_dir}")
        return records

    dest_dir = target_dir / "ref" / "tov_global"
    for file_path in sorted(global_dir.iterdir()):
        if not file_path.is_file() or file_path.suffix != ".json":
            continue
        if not dry_run:
            dest_dir.mkdir(parents=True, exist_ok=True)
            shutil.copy2(file_path, dest_dir / file_path.name)
        record = {
            "table": "reference_files",
            "file_type": "tov_global",
            "locale_scope": "global",
            "filename": file_path.name,
            # Recorded with the same "amazon/..." prefix that
            # import_locale_files uses for its records, instead of a path
            # derived from wherever target_dir happens to live on disk.
            "file_path": str(
                Path("amazon") / "ref" / "tov_global" / file_path.name
            ),
        }
        records.append(record)
        print(f" [GLOBAL] {file_path.name} -> ref/tov_global/")
    return records
def import_locale_files(
    source_dir: Path, target_dir: Path, dry_run: bool = False
) -> list[dict]:
    """Copy each locale folder's TM and reference JSON files into storage.

    Walks the sub-directories of ``<source_dir>/1-PER LANGUAGE`` in sorted
    order. Within each one, every ``*.json`` file is classified by name:
    TM files are copied to ``<target_dir>/tm/<locale>/`` and reference files
    to ``<target_dir>/ref/<sub_type>/``. Unrecognised files are reported and
    skipped. With ``dry_run`` set, nothing is created or copied, but the
    records and progress output are produced all the same.

    Returns:
        One record dict per imported file (``tm_file_registry`` or
        ``reference_files``), or an empty list if the per-language directory
        does not exist.
    """
    per_language_root = source_dir / "1-PER LANGUAGE"
    collected = []
    if not per_language_root.is_dir():
        print(f" WARNING: Per-language directory not found: {per_language_root}")
        return collected

    for locale_folder in sorted(per_language_root.iterdir()):
        if not locale_folder.is_dir():
            continue

        folder_name = locale_folder.name  # e.g., CA_ES, DE_DE
        tm_locale = folder_to_tm_locale(folder_name)  # e.g., ca-ES
        print(f"\n [{folder_name}] (locale: {tm_locale})")

        for src_file in sorted(locale_folder.iterdir()):
            if not (src_file.is_file() and src_file.suffix == ".json"):
                continue

            name = src_file.name
            category, sub_type = classify_file(name)

            if category == "tm":
                # The locale embedded in the filename is preferred for the
                # record; the folder-derived locale names the directory.
                locale_from_name = extract_tm_locale_from_filename(name)
                destination = target_dir / "tm" / tm_locale / name
                if not dry_run:
                    destination.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(src_file, destination)

                # The channel is whatever sits between "flat_" and the locale.
                channel_hit = re.match(
                    r"flat_(.+?)_[a-z]{2}-[a-z]{2}\.json$",
                    name,
                    re.IGNORECASE,
                )
                if channel_hit:
                    channel = channel_hit.group(1)
                else:
                    channel = "unknown"

                collected.append(
                    {
                        "table": "tm_file_registry",
                        "locale_code": locale_from_name or tm_locale,
                        "channel": channel,
                        "filename": name,
                        "file_path": str(Path("amazon") / "tm" / tm_locale / name),
                    }
                )
                print(f" TM: {name} -> tm/{tm_locale}/")
                continue

            if category == "ref":
                destination = target_dir / "ref" / sub_type / name
                if not dry_run:
                    destination.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(src_file, destination)

                collected.append(
                    {
                        "table": "reference_files",
                        "file_type": sub_type,
                        "locale_scope": tm_locale,
                        "filename": name,
                        "file_path": str(Path("amazon") / "ref" / sub_type / name),
                    }
                )
                print(f" REF: {name} -> ref/{sub_type}/")
                continue

            print(f" SKIP: {name} (unrecognized file type)")

    return collected
def print_db_summary(records: list[dict]) -> None:
    """Print the database rows that the given import records correspond to.

    Records are grouped by their ``table`` key: ``tm_file_registry`` entries
    are listed first, then ``reference_files`` entries, followed by the total
    number of records. Nothing is written to a database here.
    """
    rule = "=" * 70
    tm_rows = []
    ref_rows = []
    for entry in records:
        if entry["table"] == "tm_file_registry":
            tm_rows.append(entry)
        if entry["table"] == "reference_files":
            ref_rows.append(entry)

    print("\n" + rule)
    print("DATABASE RECORDS (would be inserted)")
    print(rule)

    print(f"\n TM File Registry entries: {len(tm_rows)}")
    for row in tm_rows:
        locale_code = row["locale_code"]
        channel = row["channel"]
        filename = row["filename"]
        print(
            f" INSERT tm_file_registry: locale={locale_code}, "
            f"channel={channel}, file={filename}"
        )

    print(f"\n Reference File entries: {len(ref_rows)}")
    for row in ref_rows:
        file_type = row["file_type"]
        scope = row["locale_scope"]
        filename = row["filename"]
        print(
            f" INSERT reference_files: type={file_type}, "
            f"scope={scope}, file={filename}"
        )

    print(f"\n TOTAL: {len(records)} records")
    print(rule)
def main() -> None:
    """Command-line entry point: import the files and print a report.

    Parses ``--source``, ``--target`` and ``--dry-run``, then runs the global
    import followed by the per-locale import. It prints an import summary and
    the list of database records the import corresponds to.

    Exits with status 1 (after writing an explanation to stderr) when the
    source directory does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Import reference and TM files into the storage directory structure.",
    )
    # argparse %-formats help strings, so the defaults are shown through
    # %(default)s instead of being interpolated with an f-string: a literal
    # "%" in a default path would otherwise break --help.
    parser.add_argument(
        "--source",
        type=Path,
        default=DEFAULT_SOURCE,
        help="Source directory with JSON REFS and TMs (default: %(default)s)",
    )
    parser.add_argument(
        "--target",
        type=Path,
        default=DEFAULT_TARGET,
        help="Target storage directory (default: %(default)s)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be done without copying files",
    )
    args = parser.parse_args()

    if not args.source.is_dir():
        print(f"Error: Source directory not found: {args.source}", file=sys.stderr)
        print(
            "Ensure the 'Agent build + supporting JSONs/JSON REFS and TMs/' "
            "directory exists alongside this project.",
            file=sys.stderr,
        )
        sys.exit(1)

    mode = "DRY RUN" if args.dry_run else "IMPORT"
    print(f"[{mode}] Importing reference files")
    print(f" Source: {args.source}")
    print(f" Target: {args.target}")
    print()

    if not args.dry_run:
        args.target.mkdir(parents=True, exist_ok=True)

    # Import global files
    print("--- Global Files ---")
    global_records = import_global_files(args.source, args.target, args.dry_run)

    # Import per-locale files
    print("\n--- Per-Locale Files ---")
    locale_records = import_locale_files(args.source, args.target, args.dry_run)

    all_records = global_records + locale_records

    # Summary. In dry-run mode nothing has been copied, so the label must not
    # claim otherwise.
    copied_label = "Files to copy:" if args.dry_run else "Files copied:"
    print(f"\n{'=' * 70}")
    print("IMPORT SUMMARY")
    print(f"{'=' * 70}")
    print(f" {copied_label} {len(all_records)}")
    print(f" TM files: {sum(1 for r in all_records if r['table'] == 'tm_file_registry')}")
    print(f" Ref files: {sum(1 for r in all_records if r['table'] == 'reference_files')}")

    # Print what would be inserted in the DB
    print_db_summary(all_records)
if __name__ == "__main__":
main()