"""Reference file loader. Loads various reference files used in the transcreation pipeline: - Glossary (JSON): locale-specific term glossary - Blacklist (JSON): forbidden terms and roots - Date/Percentage formats (JSON): approved format patterns - Locale Considerations (JSON): locale-specific rules and notes - TOV (Tone of Voice) files (JSON): global and supplementary voice profiles """ import json from pathlib import Path from typing import Any class RefFileLoadError(Exception): """Raised when a reference file cannot be loaded or parsed.""" pass def load_json_file(file_path: str) -> Any: """Load and parse a JSON file. Args: file_path: Absolute path to the JSON file. Returns: Parsed JSON data. Raises: RefFileLoadError: If file cannot be read or parsed. """ path = Path(file_path) if not path.exists(): raise RefFileLoadError(f"Reference file not found: {file_path}") try: with open(path, "r", encoding="utf-8") as f: return json.load(f) except json.JSONDecodeError as exc: raise RefFileLoadError(f"Invalid JSON in reference file: {exc}") except UnicodeDecodeError as exc: raise RefFileLoadError(f"Encoding error reading reference file: {exc}") def load_glossary(file_path: str) -> list[dict[str, str]]: """Load a glossary file. Expected format: list of dicts with keys like: {"en": "source term", "tx": "translated term", "context": "usage notes"} Args: file_path: Path to glossary JSON file. Returns: List of glossary entry dicts. """ data = load_json_file(file_path) if not isinstance(data, list): raise RefFileLoadError("Glossary file must contain a JSON array") return data def load_blacklist(file_path: str) -> list[dict[str, str]]: """Load a blacklist file. Expected format: list of dicts with keys: {"term": "forbidden term", "root": "optional root", "reason": "why forbidden"} Args: file_path: Path to blacklist JSON file. Returns: List of blacklist entry dicts. """ data = load_json_file(file_path) if not isinstance(data, list): raise RefFileLoadError("Blacklist file must contain a JSON array") return data def load_date_pct_formats(file_path: str) -> dict[str, list[dict[str, str]]]: """Load date/percentage format rules. Expected format: { "date_formats": [{"pattern": "...", "example": "...", "description": "..."}], "percentage_formats": [{"pattern": "...", "example": "...", "description": "..."}] } Args: file_path: Path to date/pct formats JSON file. Returns: Dict with "date_formats" and "percentage_formats" keys. """ data = load_json_file(file_path) if not isinstance(data, dict): raise RefFileLoadError("Date/pct format file must contain a JSON object") return { "date_formats": data.get("date_formats", []), "percentage_formats": data.get("percentage_formats", []), } def load_locale_considerations(file_path: str) -> dict[str, Any]: """Load locale-specific considerations. Expected format: JSON object with locale-specific rules, cultural notes, etc. Args: file_path: Path to locale considerations JSON file. Returns: Dict of locale considerations. """ data = load_json_file(file_path) if not isinstance(data, dict): raise RefFileLoadError( "Locale considerations file must contain a JSON object" ) return data def load_tov(file_path: str) -> dict[str, Any]: """Load a Tone of Voice file (global or supplement). Expected format: JSON object with voice profile data. Args: file_path: Path to TOV JSON file. Returns: Dict of TOV profile data. """ data = load_json_file(file_path) if not isinstance(data, dict): raise RefFileLoadError("TOV file must contain a JSON object") return data def load_all_reference_files( file_manifest: dict[str, str | None], ) -> dict[str, Any]: """Load all reference files from a file manifest. Args: file_manifest: Dict mapping file types to file paths. Keys: glossary_file, blacklist_file, tov_global_file, tov_supplement_file, locale_considerations_file, date_pct_formats_file Returns: Dict mapping file types to loaded data. """ result: dict[str, Any] = {} loaders = { "glossary_file": load_glossary, "blacklist_file": load_blacklist, "date_pct_formats_file": load_date_pct_formats, "locale_considerations_file": load_locale_considerations, "tov_global_file": load_tov, "tov_supplement_file": load_tov, } for key, loader in loaders.items(): path = file_manifest.get(key) if path: try: result[key] = loader(path) except RefFileLoadError: result[key] = None else: result[key] = None return result