amazon-transcreation/backend/app/pipeline/modules/ref_file_loader.py
DJP 98fa16bfc3 feat: complete Phase 1-2 scaffold — backend, frontend, pipeline skeleton
Full-stack Amazon AI Transcreation Platform with:
- FastAPI backend (async, PostgreSQL, Redis, Celery) with 11 DB tables
- JWT auth (SSO-ready abstract provider pattern)
- 6-agent pipeline orchestrator with deterministic modules
- Next.js 14 frontend with Amazon branding (Ember fonts, orange/dark theme)
- Job wizard, monitoring HUD, output review, admin screens
- 154 TM/reference files imported, 12 locales configured
- Docker Compose for all services

Agents 2-5 (TM retrieval, ranker, transcreator, compliance) are stubs
pending Phase 3 LLM integration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 12:31:43 -04:00

177 lines
5 KiB
Python

"""Reference file loader.
Loads various reference files used in the transcreation pipeline:
- Glossary (JSON): locale-specific term glossary
- Blacklist (JSON): forbidden terms and roots
- Date/Percentage formats (JSON): approved format patterns
- Locale Considerations (JSON): locale-specific rules and notes
- TOV (Tone of Voice) files (JSON): global and supplementary voice profiles
"""
import json
import logging
from pathlib import Path
from typing import Any
class RefFileLoadError(Exception):
    """Signals that a reference file could not be read or parsed.

    Raised by the loader functions in this module; carries a
    human-readable message describing the failure.
    """
def load_json_file(file_path: str) -> Any:
    """Load and parse a UTF-8 encoded JSON file.

    Args:
        file_path: Path to the JSON file.

    Returns:
        Parsed JSON data.

    Raises:
        RefFileLoadError: If the file is missing, cannot be read (e.g. it is
            a directory or permission is denied), is not valid UTF-8, or
            does not contain valid JSON. The underlying exception is kept
            as ``__cause__``.
    """
    path = Path(file_path)
    try:
        # Open directly instead of checking exists() first: the check is
        # racy and would still let other OSErrors escape unwrapped.
        with path.open("r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError) as exc:
        raise RefFileLoadError(
            f"Reference file not found: {file_path}"
        ) from exc
    except json.JSONDecodeError as exc:
        raise RefFileLoadError(
            f"Invalid JSON in reference file: {exc}"
        ) from exc
    except UnicodeDecodeError as exc:
        raise RefFileLoadError(
            f"Encoding error reading reference file: {exc}"
        ) from exc
    except OSError as exc:
        # Covers directories, permission problems and other I/O failures so
        # callers only ever need to handle RefFileLoadError.
        raise RefFileLoadError(
            f"Cannot read reference file {file_path}: {exc}"
        ) from exc
def load_glossary(file_path: str) -> list[dict[str, str]]:
    """Read a glossary reference file and return its entries.

    The file is expected to hold a JSON array of entry objects such as:
        {"en": "source term", "tx": "translated term", "context": "usage notes"}
    Only the top-level type is checked; individual entries are not validated.

    Args:
        file_path: Path to glossary JSON file.

    Returns:
        The parsed list of glossary entries.

    Raises:
        RefFileLoadError: If the top-level JSON value is not an array
            (and for any failure surfaced by ``load_json_file``).
    """
    entries = load_json_file(file_path)
    if isinstance(entries, list):
        return entries
    raise RefFileLoadError("Glossary file must contain a JSON array")
def load_blacklist(file_path: str) -> list[dict[str, str]]:
    """Read a blacklist reference file and return its entries.

    The file is expected to hold a JSON array of entry objects such as:
        {"term": "forbidden term", "root": "optional root", "reason": "why forbidden"}
    Only the top-level type is checked; individual entries are not validated.

    Args:
        file_path: Path to blacklist JSON file.

    Returns:
        The parsed list of blacklist entries.

    Raises:
        RefFileLoadError: If the top-level JSON value is not an array
            (and for any failure surfaced by ``load_json_file``).
    """
    entries = load_json_file(file_path)
    if isinstance(entries, list):
        return entries
    raise RefFileLoadError("Blacklist file must contain a JSON array")
def load_date_pct_formats(file_path: str) -> dict[str, list[dict[str, str]]]:
    """Read the date/percentage format rules file.

    Expected file layout:
        {
            "date_formats": [{"pattern": "...", "example": "...", "description": "..."}],
            "percentage_formats": [{"pattern": "...", "example": "...", "description": "..."}]
        }
    Any other top-level keys in the file are dropped from the result.

    Args:
        file_path: Path to date/pct formats JSON file.

    Returns:
        Dict with exactly the keys "date_formats" and "percentage_formats";
        a key absent from the file maps to an empty list.

    Raises:
        RefFileLoadError: If the top-level JSON value is not an object
            (and for any failure surfaced by ``load_json_file``).
    """
    payload = load_json_file(file_path)
    if not isinstance(payload, dict):
        raise RefFileLoadError("Date/pct format file must contain a JSON object")

    sections = ("date_formats", "percentage_formats")
    return {section: payload.get(section, []) for section in sections}
def load_locale_considerations(file_path: str) -> dict[str, Any]:
    """Read a locale considerations file.

    The file is expected to hold a JSON object of locale-specific rules,
    cultural notes, etc. Only the top-level type is checked.

    Args:
        file_path: Path to locale considerations JSON file.

    Returns:
        The parsed locale considerations dict.

    Raises:
        RefFileLoadError: If the top-level JSON value is not an object
            (and for any failure surfaced by ``load_json_file``).
    """
    considerations = load_json_file(file_path)
    if isinstance(considerations, dict):
        return considerations
    raise RefFileLoadError(
        "Locale considerations file must contain a JSON object"
    )
def load_tov(file_path: str) -> dict[str, Any]:
    """Read a Tone of Voice (TOV) file, either global or supplement.

    The file is expected to hold a JSON object with voice profile data.
    Only the top-level type is checked.

    Args:
        file_path: Path to TOV JSON file.

    Returns:
        The parsed TOV profile dict.

    Raises:
        RefFileLoadError: If the top-level JSON value is not an object
            (and for any failure surfaced by ``load_json_file``).
    """
    profile = load_json_file(file_path)
    if isinstance(profile, dict):
        return profile
    raise RefFileLoadError("TOV file must contain a JSON object")
def load_all_reference_files(
    file_manifest: dict[str, str | None],
) -> dict[str, Any]:
    """Load all reference files from a file manifest.

    Loading is best-effort: a file that fails to load does not abort the
    others. The failure is logged as a warning and that entry is set to None.

    Args:
        file_manifest: Dict mapping file types to file paths.
            Keys: glossary_file, blacklist_file, tov_global_file,
            tov_supplement_file, locale_considerations_file,
            date_pct_formats_file. Unknown keys are ignored.

    Returns:
        Dict containing every known file type. Each value is the loaded
        data, or None when the manifest has no (or an empty) path for that
        type or when loading raised RefFileLoadError.
    """
    loaders = {
        "glossary_file": load_glossary,
        "blacklist_file": load_blacklist,
        "date_pct_formats_file": load_date_pct_formats,
        "locale_considerations_file": load_locale_considerations,
        "tov_global_file": load_tov,
        "tov_supplement_file": load_tov,
    }
    logger = logging.getLogger(__name__)
    result: dict[str, Any] = {}
    for key, loader in loaders.items():
        path = file_manifest.get(key)
        if not path:
            # Not configured for this job: report as absent.
            result[key] = None
            continue
        try:
            result[key] = loader(path)
        except RefFileLoadError as exc:
            # Keep the best-effort contract, but leave a trace so a broken
            # file can be told apart from one that was never configured.
            logger.warning(
                "Failed to load reference file %s from %s: %s", key, path, exc
            )
            result[key] = None
    return result