Full-stack Amazon AI Transcreation Platform with: - FastAPI backend (async, PostgreSQL, Redis, Celery) with 11 DB tables - JWT auth (SSO-ready abstract provider pattern) - 6-agent pipeline orchestrator with deterministic modules - Next.js 14 frontend with Amazon branding (Ember fonts, orange/dark theme) - Job wizard, monitoring HUD, output review, admin screens - 154 TM/reference files imported, 12 locales configured - Docker Compose for all services Agents 2-5 (TM retrieval, ranker, transcreator, compliance) are stubs pending Phase 3 LLM integration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
307 lines
9.8 KiB
Python
307 lines
9.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Import Reference Files
======================

Reads existing reference and TM files from the source directory and copies
them into the storage/ directory structure used by the platform.

Source structure (from Agent build directory):
    JSON REFS and TMs/
        0-GLOBAL FOR ALL LOCALES/
            Amazon_TOV_Guidelines_for_Transcreation_290224.json
        1-PER LANGUAGE/
            CA_ES/
                ca_ES_glossary.json
                flat_MASS_ca-es.json
                ...

Target structure:
    storage/amazon/
        tm/{locale}/flat_{channel}_{lc}.json
        ref/glossary/{prefix}_glossary.json
        ref/blacklist/{prefix}_blacklist.json
        ref/date_pct_formats/{prefix}_date_percent_formats.json
        ref/locale_considerations/{prefix}_local_considerations.json
        ref/tov_global/Amazon_TOV_Guidelines_for_Transcreation_290224.json
        ref/tov_supplement/{LOCALE}_TOV_Guidelines.json

Usage:
    python seed/import_reference_files.py [--source <path>] [--target <path>] [--dry-run]
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
import shutil
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Default locations, all derived from where this script itself lives.
_SCRIPT_DIR = Path(__file__).resolve().parent
_PROJECT_DIR = _SCRIPT_DIR.parent

# Source JSONs sit in a sibling folder one level above the project directory.
DEFAULT_SOURCE = _PROJECT_DIR.parent.joinpath(
    "Agent build + supporting JSONs",
    "JSON REFS and TMs",
)

# Imported files land under the project's storage/amazon/ directory.
DEFAULT_TARGET = _PROJECT_DIR.joinpath("storage", "amazon")

CLIENT_ID = "amazon"
|
|
|
|
# Per-language source folders are named {LANG}_{REGION} in uppercase (e.g. CA_ES).
# TM storage paths use the hyphenated locale code derived from that name (ca-ES).
# TM filenames end in a lowercase hyphenated locale (ca-es), while reference
# filenames carry an underscore-separated prefix (ca_ES).
|
|
|
|
|
|
def folder_to_tm_locale(folder_name: str) -> str:
    """Convert a per-language folder name into a TM locale code.

    A two-part ``{lang}_{region}`` name becomes ``{lang}-{REGION}``: the
    language is lower-cased and the region upper-cased, so ``CA_ES``,
    ``ca_es`` and ``Ca_Es`` all map to ``ca-ES``. Any name that does not
    split into exactly two parts on ``_`` is returned lower-cased and
    otherwise unchanged.

    Args:
        folder_name: Directory name, e.g. ``CA_ES``.

    Returns:
        The locale code, e.g. ``ca-ES``.
    """
    parts = folder_name.split("_")
    if len(parts) == 2:
        language, region = parts
        # Normalise both halves so the result does not depend on how the
        # folder happens to be cased on disk.
        return f"{language.lower()}-{region.upper()}"
    return folder_name.lower()
|
|
|
|
|
|
def classify_file(filename: str) -> tuple[str, str | None]:
|
|
"""
|
|
Classify a file as TM or a specific reference type.
|
|
|
|
Returns:
|
|
(category, sub_type) where category is 'tm', 'ref', or 'unknown'
|
|
and sub_type provides detail for ref files.
|
|
"""
|
|
lower = filename.lower()
|
|
|
|
# TM files start with "flat_"
|
|
if lower.startswith("flat_"):
|
|
return ("tm", None)
|
|
|
|
# Reference file patterns
|
|
if "_glossary." in lower:
|
|
return ("ref", "glossary")
|
|
if "_blacklist." in lower:
|
|
return ("ref", "blacklist")
|
|
if "_date_percent_formats." in lower:
|
|
return ("ref", "date_pct_formats")
|
|
if "_local_considerations." in lower:
|
|
return ("ref", "locale_considerations")
|
|
if "tov_guidelines" in lower or "tov_guide" in lower:
|
|
return ("ref", "tov_supplement")
|
|
|
|
return ("unknown", None)
|
|
|
|
|
|
def extract_tm_locale_from_filename(filename: str) -> str | None:
|
|
"""Extract locale code from a TM filename like flat_MASS_ca-es.json."""
|
|
match = re.search(r"_([a-z]{2}-[a-z]{2})\.json$", filename, re.IGNORECASE)
|
|
if match:
|
|
return match.group(1)
|
|
return None
|
|
|
|
|
|
def import_global_files(
    source_dir: Path, target_dir: Path, dry_run: bool = False
) -> list[dict]:
    """Copy the global (all-locale) JSON files into ``ref/tov_global/``.

    Every ``*.json`` file directly inside
    ``<source_dir>/0-GLOBAL FOR ALL LOCALES`` is copied to
    ``<target_dir>/ref/tov_global/`` and described by one record. Files are
    processed in sorted order; anything that is not a ``.json`` file is
    ignored.

    Args:
        source_dir: Root of the source tree.
        target_dir: Storage root the files are copied into.
        dry_run: When true, nothing is created or copied; records are still
            built and progress is still printed.

    Returns:
        One ``reference_files`` record dict per file, or an empty list when
        the global directory does not exist (a warning is printed).
    """
    global_dir = source_dir / "0-GLOBAL FOR ALL LOCALES"
    records: list[dict] = []

    if not global_dir.is_dir():
        print(f" WARNING: Global directory not found: {global_dir}")
        return records

    for file_path in sorted(global_dir.iterdir()):
        if not file_path.is_file() or file_path.suffix != ".json":
            continue

        dest = target_dir / "ref" / "tov_global" / file_path.name

        if not dry_run:
            dest.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(file_path, dest)

        records.append(
            {
                "table": "reference_files",
                "file_type": "tov_global",
                "locale_scope": "global",
                "filename": file_path.name,
                # Recorded relative to the storage root as "amazon/ref/...",
                # the same base the per-locale reference records use, so all
                # reference_files rows share one path convention.
                "file_path": str(
                    Path("amazon") / "ref" / "tov_global" / file_path.name
                ),
            }
        )
        print(f" [GLOBAL] {file_path.name} -> ref/tov_global/")

    return records
|
|
|
|
|
|
def import_locale_files(
    source_dir: Path, target_dir: Path, dry_run: bool = False
) -> list[dict]:
    """Copy per-locale TM and reference JSON files into storage.

    Walks each sub-folder of ``<source_dir>/1-PER LANGUAGE`` in sorted order
    and classifies every ``*.json`` file in it:

    * TM files are copied to ``<target_dir>/tm/<locale>/`` and yield a
      ``tm_file_registry`` record.
    * Reference files are copied to ``<target_dir>/ref/<sub_type>/`` and
      yield a ``reference_files`` record.
    * Anything else is reported as skipped.

    With ``dry_run`` set nothing is written, but the records and the
    progress output are produced all the same.

    Returns:
        The record dicts in processing order, or an empty list when the
        per-language directory does not exist (a warning is printed).
    """
    per_language_root = source_dir / "1-PER LANGUAGE"
    collected = []

    if not per_language_root.is_dir():
        print(f" WARNING: Per-language directory not found: {per_language_root}")
        return collected

    # Captures the channel part of "flat_<channel>_<xx-yy>.json".
    channel_pattern = re.compile(r"flat_(.+?)_[a-z]{2}-[a-z]{2}\.json$", re.IGNORECASE)

    def place(src: Path, dest: Path) -> None:
        """Copy *src* to *dest*, creating parent folders, unless dry-running."""
        if dry_run:
            return
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, dest)

    for folder in (p for p in sorted(per_language_root.iterdir()) if p.is_dir()):
        folder_name = folder.name  # e.g., CA_ES, DE_DE
        tm_locale = folder_to_tm_locale(folder_name)  # e.g., ca-ES

        print(f"\n [{folder_name}] (locale: {tm_locale})")

        json_files = (
            f for f in sorted(folder.iterdir()) if f.is_file() and f.suffix == ".json"
        )
        for src in json_files:
            name = src.name
            category, sub_type = classify_file(name)

            if category == "tm":
                # NOTE(review): the locale taken from the filename keeps the
                # filename's casing (e.g. "ca-es"), while the directory uses
                # tm_locale (e.g. "ca-ES") - confirm which form the registry
                # expects.
                filename_locale = extract_tm_locale_from_filename(name)
                place(src, target_dir / "tm" / tm_locale / name)

                found = channel_pattern.match(name)
                collected.append(
                    {
                        "table": "tm_file_registry",
                        "locale_code": filename_locale or tm_locale,
                        "channel": found.group(1) if found else "unknown",
                        "filename": name,
                        "file_path": str(Path("amazon") / "tm" / tm_locale / name),
                    }
                )
                print(f" TM: {name} -> tm/{tm_locale}/")

            elif category == "ref":
                place(src, target_dir / "ref" / sub_type / name)

                collected.append(
                    {
                        "table": "reference_files",
                        "file_type": sub_type,
                        "locale_scope": tm_locale,
                        "filename": name,
                        "file_path": str(Path("amazon") / "ref" / sub_type / name),
                    }
                )
                print(f" REF: {name} -> ref/{sub_type}/")

            else:
                print(f" SKIP: {name} (unrecognized file type)")

    return collected
|
|
|
|
|
|
def print_db_summary(records: list[dict]) -> None:
    """Print the database rows that correspond to *records*.

    Records are grouped by their ``"table"`` value. ``tm_file_registry`` rows
    are listed first, then ``reference_files`` rows, followed by the overall
    record count. Records for any other table are counted in the total but
    not listed. Nothing is written to a database here.
    """
    rule = "=" * 70

    tm_rows = []
    ref_rows = []
    for entry in records:
        table = entry["table"]
        if table == "tm_file_registry":
            tm_rows.append(entry)
        if table == "reference_files":
            ref_rows.append(entry)

    print("\n" + rule)
    print("DATABASE RECORDS (would be inserted)")
    print(rule)

    print(f"\n TM File Registry entries: {len(tm_rows)}")
    for row in tm_rows:
        locale, channel, filename = row["locale_code"], row["channel"], row["filename"]
        print(
            f" INSERT tm_file_registry: locale={locale}, channel={channel}, file={filename}"
        )

    print(f"\n Reference File entries: {len(ref_rows)}")
    for row in ref_rows:
        file_type, scope, filename = row["file_type"], row["locale_scope"], row["filename"]
        print(
            f" INSERT reference_files: type={file_type}, scope={scope}, file={filename}"
        )

    print(f"\n TOTAL: {len(records)} records")
    print(rule)
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: import global and per-locale files.

    Parses ``--source``, ``--target`` and ``--dry-run``, exits with status 1
    if the source directory does not exist, then runs the global import
    followed by the per-locale import and prints a summary plus the database
    records that correspond to the processed files.
    """
    parser = argparse.ArgumentParser(
        description="Import reference and TM files into the storage directory structure.",
    )
    parser.add_argument(
        "--source",
        type=Path,
        default=DEFAULT_SOURCE,
        help=f"Source directory with JSON REFS and TMs (default: {DEFAULT_SOURCE})",
    )
    parser.add_argument(
        "--target",
        type=Path,
        default=DEFAULT_TARGET,
        help=f"Target storage directory (default: {DEFAULT_TARGET})",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be done without copying files",
    )

    args = parser.parse_args()

    if not args.source.is_dir():
        print(f"Error: Source directory not found: {args.source}", file=sys.stderr)
        print(
            "Ensure the 'Agent build + supporting JSONs/JSON REFS and TMs/' "
            "directory exists alongside this project.",
            file=sys.stderr,
        )
        sys.exit(1)

    mode = "DRY RUN" if args.dry_run else "IMPORT"
    print(f"[{mode}] Importing reference files")
    print(f" Source: {args.source}")
    print(f" Target: {args.target}")
    print()

    # A dry run must leave the filesystem untouched, including the target root.
    if not args.dry_run:
        args.target.mkdir(parents=True, exist_ok=True)

    # Import global files
    print("--- Global Files ---")
    global_records = import_global_files(args.source, args.target, args.dry_run)

    # Import per-locale files
    print("\n--- Per-Locale Files ---")
    locale_records = import_locale_files(args.source, args.target, args.dry_run)

    all_records = global_records + locale_records
    tm_count = sum(1 for r in all_records if r["table"] == "tm_file_registry")
    ref_count = sum(1 for r in all_records if r["table"] == "reference_files")

    # Nothing is copied during a dry run, so the summary must not claim it was.
    files_label = "Files to copy" if args.dry_run else "Files copied"

    # Summary
    print(f"\n{'=' * 70}")
    print("IMPORT SUMMARY")
    print(f"{'=' * 70}")
    print(f" {files_label}: {len(all_records)}")
    print(f" TM files: {tm_count}")
    print(f" Ref files: {ref_count}")

    # Print what would be inserted in the DB
    print_db_summary(all_records)


if __name__ == "__main__":
    main()
|