Round 2.8: ref loaders read the actual on-disk file shape
The agent reported (for an nl-BE job) that glossary and blacklist were
"not provided" and date/percentage formats were "provided but empty".
The files are on disk with real content — the bug was in the loaders,
which expected shapes that didn't match what's actually shipped:
- load_glossary expected a top-level JSON list, but files use
{"locale": "...", "entries": [...]}. RefFileLoadError raised,
silently caught by load_all_reference_files, result became None.
- load_blacklist had the same mismatch, same outcome.
- load_date_pct_formats accepted the dict shape but only knew about
the "date_formats"/"percentage_formats" keys; the files use
"entries" → returned {"date_formats": [], "percentage_formats": []}
which is exactly what the agent reported.
Fix:
- New _extract_entries() helper that accepts both the wrapper shape
{entries: [...]} and a bare list. load_glossary / load_blacklist
both delegate to it.
- load_date_pct_formats now passes entries through alongside the
legacy date_formats / percentage_formats keys (back-compat).
- load_all_reference_files now logs a warning when a loader raises
RefFileLoadError instead of silently swallowing it — so any future
loader/file-shape drift surfaces in the celery logs.
Verified inside the backend container against nl-BE, de-DE, fr-FR:
- 58 / 68 / 64 glossary entries respectively (was 0)
- 14 / 9 / 4 blacklist entries (was 0)
- 10 / 10 / 10 date/pct entries (was empty)
- locale_considerations and tov_global still load correctly
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
bb8ed2a004
commit
2b44c3b4ee
1 changed files with 42 additions and 39 deletions
|
|
@ -9,15 +9,38 @@ Loads various reference files used in the transcreation pipeline:
|
|||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RefFileLoadError(Exception):
    """Raised when a reference file cannot be loaded or parsed."""
|
||||
|
||||
|
||||
def _extract_entries(data: Any, file_kind: str) -> list[dict[str, Any]]:
|
||||
"""Pull the entry list out of either a bare-list file or a wrapper object.
|
||||
|
||||
Real on-disk files use the wrapper shape `{"locale": "...", "entries": [...]}`.
|
||||
The original loaders only accepted a bare list, so wrapper-shaped files
|
||||
silently failed to load. This helper handles both.
|
||||
"""
|
||||
if isinstance(data, dict) and "entries" in data:
|
||||
entries = data["entries"]
|
||||
elif isinstance(data, list):
|
||||
entries = data
|
||||
else:
|
||||
raise RefFileLoadError(
|
||||
f"{file_kind} must be a JSON array or a {{entries: [...]}} object"
|
||||
)
|
||||
if not isinstance(entries, list):
|
||||
raise RefFileLoadError(f"{file_kind} 'entries' must be a list")
|
||||
return entries
|
||||
|
||||
|
||||
def load_json_file(file_path: str) -> Any:
|
||||
"""Load and parse a JSON file.
|
||||
|
||||
|
|
@ -43,61 +66,38 @@ def load_json_file(file_path: str) -> Any:
|
|||
raise RefFileLoadError(f"Encoding error reading reference file: {exc}")
|
||||
|
||||
|
||||
def load_glossary(file_path: str) -> list[dict[str, Any]]:
    """Load a glossary file and return its entry list.

    Accepts both the on-disk wrapper shape
    `{"locale": "...", "entries": [...]}` and a bare JSON list. Returns
    the entry list either way.

    Args:
        file_path: Path to glossary JSON file.

    Returns:
        List of glossary entries.

    Raises:
        RefFileLoadError: If the file cannot be loaded, or its content is
            neither a JSON array nor an object with a list under "entries".
    """
    # Shape handling lives in _extract_entries so glossary and blacklist
    # loading cannot drift apart again.
    return _extract_entries(load_json_file(file_path), "Glossary")
|
||||
|
||||
|
||||
def load_blacklist(file_path: str) -> list[dict[str, Any]]:
    """Load a blacklist file and return its entry list.

    Accepts both the on-disk wrapper shape
    `{"locale": "...", "entries": [...]}` and a bare JSON list. Returns
    the entry list either way.

    Args:
        file_path: Path to blacklist JSON file.

    Returns:
        List of blacklist entries.

    Raises:
        RefFileLoadError: If the file cannot be loaded, or its content is
            neither a JSON array nor an object with a list under "entries".
    """
    # Shape handling lives in _extract_entries so glossary and blacklist
    # loading cannot drift apart again.
    return _extract_entries(load_json_file(file_path), "Blacklist")
|
||||
|
||||
|
||||
def load_date_pct_formats(file_path: str) -> dict[str, Any]:
    """Load date/percentage format rules.

    The on-disk files use a single `entries` list; an older spec used
    separate `date_formats` and `percentage_formats` keys. Preserve all
    of them when present so the agent sees the full picture either way.

    Args:
        file_path: Path to date/pct formats JSON file.

    Returns:
        Dict that always has the keys "entries", "date_formats" and
        "percentage_formats". A key missing from the file maps to an empty
        list; values that are present are passed through unchanged.

    Raises:
        RefFileLoadError: If the file cannot be loaded or its top-level
            JSON value is not an object.
    """
    data = load_json_file(file_path)
    if not isinstance(data, dict):
        raise RefFileLoadError("Date/pct format file must contain a JSON object")
    return {
        # Current on-disk shape.
        "entries": data.get("entries", []),
        # Legacy keys, kept for backward compatibility.
        "date_formats": data.get("date_formats", []),
        "percentage_formats": data.get("percentage_formats", []),
    }
|
||||
|
|
@ -169,7 +169,10 @@ def load_all_reference_files(
|
|||
if path:
|
||||
try:
|
||||
result[key] = loader(path)
|
||||
except RefFileLoadError:
|
||||
except RefFileLoadError as exc:
|
||||
logger.warning(
|
||||
"Skipped reference file %s (%s): %s", key, path, exc
|
||||
)
|
||||
result[key] = None
|
||||
else:
|
||||
result[key] = None
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue