#!/usr/bin/env python3 """ Import Reference Files ====================== Reads existing reference and TM files from the source directory and copies them into the storage/ directory structure used by the platform. Source structure (from Agent build directory): JSON REFS and TMs/ 0-GLOBAL FOR ALL LOCALES/ Amazon_TOV_Guidelines_for_Transcreation_290224.json 1-PER LANGUAGE/ CA_ES/ ca_ES_glossary.json flat_MASS_ca-es.json ... Target structure: storage/amazon/ tm/{locale}/flat_{channel}_{lc}.json ref/glossary/{prefix}_glossary.json ref/blacklist/{prefix}_blacklist.json ref/date_pct_formats/{prefix}_date_percent_formats.json ref/locale_considerations/{prefix}_local_considerations.json ref/tov_global/Amazon_TOV_Guidelines_for_Transcreation_290224.json ref/tov_supplement/{LOCALE}_TOV_Guidelines.json Usage: python seed/import_reference_files.py [--source ] [--target ] [--dry-run] """ import argparse import re import shutil import sys from pathlib import Path # Default paths DEFAULT_SOURCE = ( Path(__file__).resolve().parent.parent.parent / "Agent build + supporting JSONs" / "JSON REFS and TMs" ) DEFAULT_TARGET = Path(__file__).resolve().parent.parent / "storage" / "amazon" CLIENT_ID = "amazon" # Map folder names (CA_ES) to locale codes for TM paths (ca-ES) # The folder name pattern is {lang}_{region} in uppercase # TM files use lowercase-hyphenated (ca-es), ref files use underscore (ca_ES) def folder_to_tm_locale(folder_name: str) -> str: """Convert folder name like CA_ES to TM locale like ca-ES.""" parts = folder_name.split("_") if len(parts) == 2: return f"{parts[0].lower()}-{parts[1]}" return folder_name.lower() def classify_file(filename: str) -> tuple[str, str | None]: """ Classify a file as TM or a specific reference type. Returns: (category, sub_type) where category is 'tm', 'ref', or 'unknown' and sub_type provides detail for ref files. """ lower = filename.lower() # TM files start with "flat_" if lower.startswith("flat_"): return ("tm", None) # Reference file patterns if "_glossary." in lower: return ("ref", "glossary") if "_blacklist." in lower: return ("ref", "blacklist") if "_date_percent_formats." in lower: return ("ref", "date_pct_formats") if "_local_considerations." in lower: return ("ref", "locale_considerations") if "tov_guidelines" in lower or "tov_guide" in lower: return ("ref", "tov_supplement") return ("unknown", None) def extract_tm_locale_from_filename(filename: str) -> str | None: """Extract locale code from a TM filename like flat_MASS_ca-es.json.""" match = re.search(r"_([a-z]{2}-[a-z]{2})\.json$", filename, re.IGNORECASE) if match: return match.group(1) return None def import_global_files( source_dir: Path, target_dir: Path, dry_run: bool = False ) -> list[dict]: """Import files from the global directory.""" global_dir = source_dir / "0-GLOBAL FOR ALL LOCALES" records = [] if not global_dir.is_dir(): print(f" WARNING: Global directory not found: {global_dir}") return records for file_path in sorted(global_dir.iterdir()): if not file_path.is_file() or not file_path.suffix == ".json": continue dest = target_dir / "ref" / "tov_global" / file_path.name if not dry_run: dest.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(file_path, dest) record = { "table": "reference_files", "file_type": "tov_global", "locale_scope": "global", "filename": file_path.name, "file_path": str(dest.relative_to(target_dir.parent.parent)), } records.append(record) print(f" [GLOBAL] {file_path.name} -> ref/tov_global/") return records def import_locale_files( source_dir: Path, target_dir: Path, dry_run: bool = False ) -> list[dict]: """Import per-locale TM and reference files.""" locale_dir = source_dir / "1-PER LANGUAGE" records = [] if not locale_dir.is_dir(): print(f" WARNING: Per-language directory not found: {locale_dir}") return records for folder in sorted(locale_dir.iterdir()): if not folder.is_dir(): continue folder_name = folder.name # e.g., CA_ES, DE_DE tm_locale = folder_to_tm_locale(folder_name) # e.g., ca-ES print(f"\n [{folder_name}] (locale: {tm_locale})") for file_path in sorted(folder.iterdir()): if not file_path.is_file() or file_path.suffix != ".json": continue category, sub_type = classify_file(file_path.name) if category == "tm": # TM files go to tm/{locale}/ file_locale = extract_tm_locale_from_filename(file_path.name) dest = target_dir / "tm" / tm_locale / file_path.name if not dry_run: dest.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(file_path, dest) # Extract channel name for the DB record channel_match = re.match( r"flat_(.+?)_[a-z]{2}-[a-z]{2}\.json$", file_path.name, re.IGNORECASE, ) channel = channel_match.group(1) if channel_match else "unknown" record = { "table": "tm_file_registry", "locale_code": file_locale or tm_locale, "channel": channel, "filename": file_path.name, "file_path": str( Path("amazon") / "tm" / tm_locale / file_path.name ), } records.append(record) print(f" TM: {file_path.name} -> tm/{tm_locale}/") elif category == "ref": # Reference files go to ref/{sub_type}/ dest = target_dir / "ref" / sub_type / file_path.name if not dry_run: dest.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(file_path, dest) record = { "table": "reference_files", "file_type": sub_type, "locale_scope": tm_locale, "filename": file_path.name, "file_path": str( Path("amazon") / "ref" / sub_type / file_path.name ), } records.append(record) print(f" REF: {file_path.name} -> ref/{sub_type}/") else: print(f" SKIP: {file_path.name} (unrecognized file type)") return records def print_db_summary(records: list[dict]) -> None: """Print a summary of database records that would be inserted.""" tm_records = [r for r in records if r["table"] == "tm_file_registry"] ref_records = [r for r in records if r["table"] == "reference_files"] print("\n" + "=" * 70) print("DATABASE RECORDS (would be inserted)") print("=" * 70) print(f"\n TM File Registry entries: {len(tm_records)}") for rec in tm_records: print( f" INSERT tm_file_registry: " f"locale={rec['locale_code']}, channel={rec['channel']}, " f"file={rec['filename']}" ) print(f"\n Reference File entries: {len(ref_records)}") for rec in ref_records: print( f" INSERT reference_files: " f"type={rec['file_type']}, scope={rec['locale_scope']}, " f"file={rec['filename']}" ) print(f"\n TOTAL: {len(records)} records") print("=" * 70) def main() -> None: parser = argparse.ArgumentParser( description="Import reference and TM files into the storage directory structure.", ) parser.add_argument( "--source", type=Path, default=DEFAULT_SOURCE, help=f"Source directory with JSON REFS and TMs (default: {DEFAULT_SOURCE})", ) parser.add_argument( "--target", type=Path, default=DEFAULT_TARGET, help=f"Target storage directory (default: {DEFAULT_TARGET})", ) parser.add_argument( "--dry-run", action="store_true", help="Show what would be done without copying files", ) args = parser.parse_args() if not args.source.is_dir(): print(f"Error: Source directory not found: {args.source}", file=sys.stderr) print( "Ensure the 'Agent build + supporting JSONs/JSON REFS and TMs/' " "directory exists alongside this project.", file=sys.stderr, ) sys.exit(1) mode = "DRY RUN" if args.dry_run else "IMPORT" print(f"[{mode}] Importing reference files") print(f" Source: {args.source}") print(f" Target: {args.target}") print() if not args.dry_run: args.target.mkdir(parents=True, exist_ok=True) # Import global files print("--- Global Files ---") global_records = import_global_files(args.source, args.target, args.dry_run) # Import per-locale files print("\n--- Per-Locale Files ---") locale_records = import_locale_files(args.source, args.target, args.dry_run) all_records = global_records + locale_records # Summary print(f"\n{'=' * 70}") print("IMPORT SUMMARY") print(f"{'=' * 70}") print(f" Files copied: {len(all_records)}") print(f" TM files: {sum(1 for r in all_records if r['table'] == 'tm_file_registry')}") print(f" Ref files: {sum(1 for r in all_records if r['table'] == 'reference_files')}") # Print what would be inserted in the DB print_db_summary(all_records) if __name__ == "__main__": main()