diff --git a/backend/app/pipeline/modules/ref_file_loader.py b/backend/app/pipeline/modules/ref_file_loader.py
index cb0d633..2f5371e 100644
--- a/backend/app/pipeline/modules/ref_file_loader.py
+++ b/backend/app/pipeline/modules/ref_file_loader.py
@@ -9,15 +9,38 @@ Loads various reference files used in the transcreation pipeline:
 """
 
 import json
+import logging
 from pathlib import Path
 from typing import Any
 
+logger = logging.getLogger(__name__)
+
 
 class RefFileLoadError(Exception):
     """Raised when a reference file cannot be loaded or parsed."""
     pass
 
 
+def _extract_entries(data: Any, file_kind: str) -> list[dict[str, Any]]:
+    """Pull the entry list out of either a bare-list file or a wrapper object.
+
+    Real on-disk files use the wrapper shape `{"locale": "...", "entries": [...]}`.
+    The original loaders only accepted a bare list, so wrapper-shaped files
+    silently failed to load. This helper handles both.
+    """
+    if isinstance(data, dict) and "entries" in data:
+        entries = data["entries"]
+    elif isinstance(data, list):
+        entries = data
+    else:
+        raise RefFileLoadError(
+            f"{file_kind} must be a JSON array or a {{entries: [...]}} object"
+        )
+    if not isinstance(entries, list):
+        raise RefFileLoadError(f"{file_kind} 'entries' must be a list")
+    return entries
+
+
 def load_json_file(file_path: str) -> Any:
     """Load and parse a JSON file.
 
@@ -43,61 +66,38 @@ def load_json_file(file_path: str) -> Any:
         raise RefFileLoadError(f"Encoding error reading reference file: {exc}")
 
 
-def load_glossary(file_path: str) -> list[dict[str, str]]:
+def load_glossary(file_path: str) -> list[dict[str, Any]]:
     """Load a glossary file.
 
-    Expected format: list of dicts with keys like:
-        {"en": "source term", "tx": "translated term", "context": "usage notes"}
-
-    Args:
-        file_path: Path to glossary JSON file.
-
-    Returns:
-        List of glossary entry dicts.
+    Accepts both the on-disk wrapper shape
+    `{"locale": "...", "entries": [...]}` and a bare JSON list. Returns
+    the entry list either way.
     """
-    data = load_json_file(file_path)
-    if not isinstance(data, list):
-        raise RefFileLoadError("Glossary file must contain a JSON array")
-    return data
+    return _extract_entries(load_json_file(file_path), "Glossary")
 
 
-def load_blacklist(file_path: str) -> list[dict[str, str]]:
+def load_blacklist(file_path: str) -> list[dict[str, Any]]:
     """Load a blacklist file.
 
-    Expected format: list of dicts with keys:
-        {"term": "forbidden term", "root": "optional root", "reason": "why forbidden"}
-
-    Args:
-        file_path: Path to blacklist JSON file.
-
-    Returns:
-        List of blacklist entry dicts.
+    Accepts both the on-disk wrapper shape
+    `{"locale": "...", "entries": [...]}` and a bare JSON list. Returns
+    the entry list either way.
     """
-    data = load_json_file(file_path)
-    if not isinstance(data, list):
-        raise RefFileLoadError("Blacklist file must contain a JSON array")
-    return data
+    return _extract_entries(load_json_file(file_path), "Blacklist")
 
 
-def load_date_pct_formats(file_path: str) -> dict[str, list[dict[str, str]]]:
+def load_date_pct_formats(file_path: str) -> dict[str, Any]:
     """Load date/percentage format rules.
 
-    Expected format:
-        {
-            "date_formats": [{"pattern": "...", "example": "...", "description": "..."}],
-            "percentage_formats": [{"pattern": "...", "example": "...", "description": "..."}]
-        }
-
-    Args:
-        file_path: Path to date/pct formats JSON file.
-
-    Returns:
-        Dict with "date_formats" and "percentage_formats" keys.
+    The on-disk files use a single `entries` list; an older spec used
+    separate `date_formats` and `percentage_formats` keys. Preserve all
+    of them when present so the agent sees the full picture either way.
     """
     data = load_json_file(file_path)
     if not isinstance(data, dict):
         raise RefFileLoadError("Date/pct format file must contain a JSON object")
     return {
+        "entries": data.get("entries", []),
         "date_formats": data.get("date_formats", []),
         "percentage_formats": data.get("percentage_formats", []),
     }
@@ -169,7 +169,10 @@ def load_all_reference_files(
         if path:
             try:
                 result[key] = loader(path)
-            except RefFileLoadError:
+            except RefFileLoadError as exc:
+                logger.warning(
+                    "Skipped reference file %s (%s): %s", key, path, exc
+                )
                 result[key] = None
         else:
             result[key] = None