Round 2.8: ref loaders read the actual on-disk file shape

The agent reported (for an nl-BE job) that glossary and blacklist were "not provided" and date/percentage formats were "provided but empty". The files are on disk with real content — the bug was in the loaders, which expected shapes that didn't match what's actually shipped:

- load_glossary expected a top-level JSON list, but the files use {"locale": "...", "entries": [...]}. RefFileLoadError was raised, silently caught by load_all_reference_files, and the result became None.
- load_blacklist had the same mismatch, with the same outcome.
- load_date_pct_formats accepted the dict shape but only knew about the "date_formats"/"percentage_formats" keys; the files use "entries", so it returned {"date_formats": [], "percentage_formats": []}, which is exactly what the agent reported.

Fix:

- New _extract_entries() helper that accepts both the wrapper shape {entries: [...]} and a bare list. load_glossary / load_blacklist both delegate to it.
- load_date_pct_formats now passes entries through alongside the legacy date_formats / percentage_formats keys (back-compat).
- load_all_reference_files now logs a warning when a loader raises RefFileLoadError instead of silently swallowing it — so any future loader/file-shape drift surfaces in the Celery logs.

Verified inside the backend container against nl-BE, de-DE, fr-FR:

- 58 / 68 / 64 glossary entries respectively (was 0)
- 14 / 9 / 4 blacklist entries (was 0)
- 10 / 10 / 10 date/pct entries (was empty)
- locale_considerations and tov_global still load correctly

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:26:59 -04:00 · 2026-05-18 14:26:59 -04:00 · 2b44c3b4ee
commit 2b44c3b4ee
parent bb8ed2a004
1 changed files with 42 additions and 39 deletions
--- a/backend/app/pipeline/modules/ref_file_loader.py
+++ b/backend/app/pipeline/modules/ref_file_loader.py
@@ -9,15 +9,38 @@ Loads various reference files used in the transcreation pipeline:
 """

 import json
+import logging
 from pathlib import Path
 from typing import Any

+logger = logging.getLogger(__name__)
+

 class RefFileLoadError(Exception):
    """Raised when a reference file cannot be loaded or parsed."""
    pass


+def _extract_entries(data: Any, file_kind: str) -> list[dict[str, Any]]:
+    """Pull the entry list out of either a bare-list file or a wrapper object.
+
+    Real on-disk files use the wrapper shape `{"locale": "...", "entries": [...]}`.
+    The original loaders only accepted a bare list, so wrapper-shaped files
+    silently failed to load. This helper handles both.
+    """
+    if isinstance(data, dict) and "entries" in data:
+        entries = data["entries"]
+    elif isinstance(data, list):
+        entries = data
+    else:
+        raise RefFileLoadError(
+            f"{file_kind} must be a JSON array or a {{entries: [...]}} object"
+        )
+    if not isinstance(entries, list):
+        raise RefFileLoadError(f"{file_kind} 'entries' must be a list")
+    return entries
+
+
 def load_json_file(file_path: str) -> Any:
    """Load and parse a JSON file.

@@ -43,61 +66,38 @@ def load_json_file(file_path: str) -> Any:
        raise RefFileLoadError(f"Encoding error reading reference file: {exc}")


-def load_glossary(file_path: str) -> list[dict[str, str]]:
+def load_glossary(file_path: str) -> list[dict[str, Any]]:
    """Load a glossary file.

-    Expected format: list of dicts with keys like:
-    {"en": "source term", "tx": "translated term", "context": "usage notes"}
-
-    Args:
-        file_path: Path to glossary JSON file.
-
-    Returns:
-        List of glossary entry dicts.
+    Accepts both the on-disk wrapper shape
+    `{"locale": "...", "entries": [...]}` and a bare JSON list. Returns
+    the entry list either way.
    """
-    data = load_json_file(file_path)
-    if not isinstance(data, list):
-        raise RefFileLoadError("Glossary file must contain a JSON array")
-    return data
+    return _extract_entries(load_json_file(file_path), "Glossary")


-def load_blacklist(file_path: str) -> list[dict[str, str]]:
+def load_blacklist(file_path: str) -> list[dict[str, Any]]:
    """Load a blacklist file.

-    Expected format: list of dicts with keys:
-    {"term": "forbidden term", "root": "optional root", "reason": "why forbidden"}
-
-    Args:
-        file_path: Path to blacklist JSON file.
-
-    Returns:
-        List of blacklist entry dicts.
+    Accepts both the on-disk wrapper shape
+    `{"locale": "...", "entries": [...]}` and a bare JSON list. Returns
+    the entry list either way.
    """
-    data = load_json_file(file_path)
-    if not isinstance(data, list):
-        raise RefFileLoadError("Blacklist file must contain a JSON array")
-    return data
+    return _extract_entries(load_json_file(file_path), "Blacklist")


-def load_date_pct_formats(file_path: str) -> dict[str, list[dict[str, str]]]:
+def load_date_pct_formats(file_path: str) -> dict[str, Any]:
    """Load date/percentage format rules.

-    Expected format:
-    {
-        "date_formats": [{"pattern": "...", "example": "...", "description": "..."}],
-        "percentage_formats": [{"pattern": "...", "example": "...", "description": "..."}]
-    }
-
-    Args:
-        file_path: Path to date/pct formats JSON file.
-
-    Returns:
-        Dict with "date_formats" and "percentage_formats" keys.
+    The on-disk files use a single `entries` list; an older spec used
+    separate `date_formats` and `percentage_formats` keys. Preserve all
+    of them when present so the agent sees the full picture either way.
    """
    data = load_json_file(file_path)
    if not isinstance(data, dict):
        raise RefFileLoadError("Date/pct format file must contain a JSON object")
    return {
+        "entries": data.get("entries", []),
        "date_formats": data.get("date_formats", []),
        "percentage_formats": data.get("percentage_formats", []),
    }
@@ -169,7 +169,10 @@ def load_all_reference_files(
        if path:
            try:
                result[key] = loader(path)
-            except RefFileLoadError:
+            except RefFileLoadError as exc:
+                logger.warning(
+                    "Skipped reference file %s (%s): %s", key, path, exc
+                )
                result[key] = None
        else:
            result[key] = None