amazon-transcreation/backend/app/pipeline/modules/ref_file_loader.py
DJP 2b44c3b4ee Round 2.8: ref loaders read the actual on-disk file shape
The agent reported (for an nl-BE job) that glossary and blacklist were
"not provided" and date/percentage formats were "provided but empty".
The files are on disk with real content — the bug was in the loaders,
which expected shapes that didn't match what's actually shipped:

- load_glossary expected a top-level JSON list, but files use
  {"locale": "...", "entries": [...]}. RefFileLoadError raised,
  silently caught by load_all_reference_files, result became None.
- load_blacklist had the same mismatch, same outcome.
- load_date_pct_formats accepted the dict shape but only knew about
  the "date_formats"/"percentage_formats" keys. The files use
  "entries" instead, so it returned
  {"date_formats": [], "percentage_formats": []}, which is exactly
  the "provided but empty" result the agent reported.

Fix:
- New _extract_entries() helper that accepts both the wrapper shape
  {entries: [...]} and a bare list. load_glossary / load_blacklist
  both delegate to it.
- load_date_pct_formats now passes entries through alongside the
  legacy date_formats / percentage_formats keys (back-compat).
- load_all_reference_files now logs a warning when a loader raises
  RefFileLoadError instead of silently swallowing it — so any future
  loader/file-shape drift surfaces in the celery logs.

Verified inside the backend container against nl-BE, de-DE, fr-FR:
- 58 / 68 / 64 glossary entries respectively (was 0)
- 14 / 9 / 4 blacklist entries (was 0)
- 10 / 10 / 10 date/pct entries (was empty)
- locale_considerations and tov_global still load correctly

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:26:59 -04:00

180 lines
5.4 KiB
Python

"""Reference file loader.
Loads various reference files used in the transcreation pipeline:
- Glossary (JSON): locale-specific term glossary
- Blacklist (JSON): forbidden terms and roots
- Date/Percentage formats (JSON): approved format patterns
- Locale Considerations (JSON): locale-specific rules and notes
- TOV (Tone of Voice) files (JSON): global and supplementary voice profiles
"""
import json
import logging
from pathlib import Path
from typing import Any
# Module-level logger; load_all_reference_files uses it to warn about
# reference files that were skipped because they failed to load.
logger = logging.getLogger(__name__)
class RefFileLoadError(Exception):
    """Signal that a reference file could not be read, parsed, or validated.

    Raised by the loaders in this module. ``load_all_reference_files``
    catches it, logs a warning, and records ``None`` for the affected file.
    """
def _extract_entries(data: Any, file_kind: str) -> list[dict[str, Any]]:
    """Return the entry list held by a parsed reference file.

    Two layouts are understood: the wrapper object used by the on-disk
    files, `{"locale": "...", "entries": [...]}`, and a bare JSON array.

    Args:
        data: Parsed JSON content of the reference file.
        file_kind: Label for the file type; used as the prefix of error
            messages (e.g. "Glossary").

    Returns:
        The entry list itself. Its elements are not inspected.

    Raises:
        RefFileLoadError: If ``data`` is neither a list nor a dict with an
            "entries" key, or if the "entries" value is not a list.
    """
    # A bare array already is the entry list.
    if isinstance(data, list):
        return data

    has_wrapper = isinstance(data, dict) and "entries" in data
    if not has_wrapper:
        raise RefFileLoadError(
            f"{file_kind} must be a JSON array or a {{entries: [...]}} object"
        )

    wrapped = data["entries"]
    if isinstance(wrapped, list):
        return wrapped
    raise RefFileLoadError(f"{file_kind} 'entries' must be a list")
def load_json_file(file_path: str) -> Any:
    """Load and parse a JSON file.

    Every failure to read or parse the file is reported as
    ``RefFileLoadError`` so that callers only need to handle one exception
    type. The original cause is chained onto the raised error.

    Args:
        file_path: Absolute path to the JSON file.

    Returns:
        Parsed JSON data.

    Raises:
        RefFileLoadError: If the file does not exist, cannot be opened or
            read (e.g. it is a directory, permissions are insufficient, or
            it disappeared after the existence check), is not valid UTF-8,
            or does not contain valid JSON.
    """
    path = Path(file_path)
    if not path.exists():
        raise RefFileLoadError(f"Reference file not found: {file_path}")
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError as exc:
        raise RefFileLoadError(
            f"Invalid JSON in reference file: {exc}"
        ) from exc
    except UnicodeDecodeError as exc:
        raise RefFileLoadError(
            f"Encoding error reading reference file: {exc}"
        ) from exc
    except OSError as exc:
        # exists() does not guarantee the path can be opened: it may be a
        # directory, unreadable, or removed in the meantime. Without this
        # handler such errors would escape as raw OSError.
        raise RefFileLoadError(
            f"Cannot read reference file {file_path}: {exc}"
        ) from exc
def load_glossary(file_path: str) -> list[dict[str, Any]]:
    """Read a glossary reference file and return its entries.

    The file may be either the wrapper object
    `{"locale": "...", "entries": [...]}` or a bare JSON list; in both
    cases only the entry list is returned.

    Args:
        file_path: Path to the glossary JSON file.

    Returns:
        The glossary entry list.

    Raises:
        RefFileLoadError: If the file cannot be loaded or has neither of
            the accepted shapes.
    """
    parsed = load_json_file(file_path)
    return _extract_entries(parsed, "Glossary")
def load_blacklist(file_path: str) -> list[dict[str, Any]]:
    """Read a blacklist reference file and return its entries.

    The file may be either the wrapper object
    `{"locale": "...", "entries": [...]}` or a bare JSON list; in both
    cases only the entry list is returned.

    Args:
        file_path: Path to the blacklist JSON file.

    Returns:
        The blacklist entry list.

    Raises:
        RefFileLoadError: If the file cannot be loaded or has neither of
            the accepted shapes.
    """
    parsed = load_json_file(file_path)
    return _extract_entries(parsed, "Blacklist")
def load_date_pct_formats(file_path: str) -> dict[str, Any]:
    """Load date/percentage format rules.

    The result always carries three keys: `entries` (the single list used
    by the on-disk files) plus `date_formats` and `percentage_formats`
    (the keys of an older spec). Each key is copied from the file when
    present and defaults to an empty list otherwise.

    Args:
        file_path: Path to the date/percentage formats JSON file.

    Returns:
        Dict with the keys "entries", "date_formats" and
        "percentage_formats".

    Raises:
        RefFileLoadError: If the file cannot be loaded or its top-level
            value is not a JSON object.
    """
    data = load_json_file(file_path)
    if not isinstance(data, dict):
        raise RefFileLoadError("Date/pct format file must contain a JSON object")

    passthrough_keys = ("entries", "date_formats", "percentage_formats")
    return {name: data.get(name, []) for name in passthrough_keys}
def load_locale_considerations(file_path: str) -> dict[str, Any]:
    """Load the locale considerations reference file.

    The file must hold a JSON object; it is returned as parsed, without
    any further validation of its contents.

    Args:
        file_path: Path to locale considerations JSON file.

    Returns:
        Dict of locale considerations.

    Raises:
        RefFileLoadError: If the file cannot be loaded or its top-level
            value is not a JSON object.
    """
    considerations = load_json_file(file_path)
    if isinstance(considerations, dict):
        return considerations
    raise RefFileLoadError(
        "Locale considerations file must contain a JSON object"
    )
def load_tov(file_path: str) -> dict[str, Any]:
    """Load a Tone of Voice (TOV) file, either global or supplement.

    The file must hold a JSON object; it is returned as parsed, without
    any further validation of its contents.

    Args:
        file_path: Path to TOV JSON file.

    Returns:
        Dict of TOV profile data.

    Raises:
        RefFileLoadError: If the file cannot be loaded or its top-level
            value is not a JSON object.
    """
    profile = load_json_file(file_path)
    if isinstance(profile, dict):
        return profile
    raise RefFileLoadError("TOV file must contain a JSON object")
def load_all_reference_files(
    file_manifest: dict[str, str | None],
) -> dict[str, Any]:
    """Load every known reference file listed in a manifest.

    Each known manifest key is always present in the result. Its value is
    the loaded data, or ``None`` when the manifest has no path for it or
    when loading raised ``RefFileLoadError`` (which is logged as a warning
    rather than propagated).

    Args:
        file_manifest: Dict mapping file types to file paths.
            Keys: glossary_file, blacklist_file, tov_global_file,
            tov_supplement_file, locale_considerations_file,
            date_pct_formats_file

    Returns:
        Dict mapping file types to loaded data.
    """
    loaders_by_key = {
        "glossary_file": load_glossary,
        "blacklist_file": load_blacklist,
        "date_pct_formats_file": load_date_pct_formats,
        "locale_considerations_file": load_locale_considerations,
        "tov_global_file": load_tov,
        "tov_supplement_file": load_tov,
    }

    # Seed every known key with None; only successful loads overwrite it.
    loaded: dict[str, Any] = dict.fromkeys(loaders_by_key)

    for manifest_key, load in loaders_by_key.items():
        ref_path = file_manifest.get(manifest_key)
        if not ref_path:
            continue
        try:
            loaded[manifest_key] = load(ref_path)
        except RefFileLoadError as exc:
            # Best effort: one bad file must not block the others, but the
            # failure is surfaced in the logs instead of being swallowed.
            logger.warning(
                "Skipped reference file %s (%s): %s", manifest_key, ref_path, exc
            )
    return loaded