amazon-transcreation/backend/app/pipeline/modules/source_file_parser.py
DJP 98fa16bfc3 feat: complete Phase 1-2 scaffold — backend, frontend, pipeline skeleton
Full-stack Amazon AI Transcreation Platform with:
- FastAPI backend (async, PostgreSQL, Redis, Celery) with 11 DB tables
- JWT auth (SSO-ready abstract provider pattern)
- 6-agent pipeline orchestrator with deterministic modules
- Next.js 14 frontend with Amazon branding (Ember fonts, orange/dark theme)
- Job wizard, monitoring HUD, output review, admin screens
- 154 TM/reference files imported, 12 locales configured
- Docker Compose for all services

Agents 2-5 (TM retrieval, ranker, transcreator, compliance) are stubs
pending Phase 3 LLM integration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 12:31:43 -04:00

97 lines
3.1 KiB
Python

"""Parse source xlsx files into structured source line data.
Validates the expected 5 column headers (case sensitive):
EN_GB, Copy Type, Creative Guidance, Visual Ref, Char Limit
Skips rows where EN_GB is empty. Detects \\n in EN_GB for is_display_format.
"""
from typing import Any
from openpyxl import load_workbook
REQUIRED_HEADERS = ["EN_GB", "Copy Type", "Creative Guidance", "Visual Ref", "Char Limit"]
class SourceFileParseError(Exception):
    """Signal that a source xlsx file failed validation or could not be read.

    Carries only the human-readable message passed to the constructor; it adds
    no behaviour on top of ``Exception``.
    """
def _cell_text(row: tuple[Any, ...], idx: int | None) -> str | None:
    """Return the stripped text of ``row[idx]``, or None if missing or blank."""
    # Rows can be shorter than the header row, so guard the index before use.
    if idx is None or idx >= len(row):
        return None
    value = row[idx]
    if value is None:
        return None
    return str(value).strip() or None


def parse_source_file(file_path: str) -> list[dict[str, Any]]:
    """Parse a source xlsx file and return a list of source line dicts.

    The first row of the active sheet is treated as the header row and must
    contain every name in ``REQUIRED_HEADERS`` (case sensitive, compared after
    stripping surrounding whitespace). Each following row with a non-empty
    EN_GB cell yields one dict; rows with an empty EN_GB cell are skipped.

    Args:
        file_path: Absolute path to the xlsx file.

    Returns:
        List of dicts with keys: en_gb, copy_type, creative_guidance,
        visual_ref, char_limit, is_display_format. All cell values are
        returned as stripped strings (or None when the cell is empty);
        is_display_format is True when the stripped EN_GB text contains a
        newline character.

    Raises:
        SourceFileParseError: If the file cannot be opened, has no active
            sheet, is empty, or is missing a required header.
    """
    try:
        wb = load_workbook(file_path, read_only=True, data_only=True)
    except Exception as exc:
        # Keep the underlying error attached as __cause__ for debugging.
        raise SourceFileParseError(f"Cannot open xlsx file: {exc}") from exc

    # A read-only workbook holds the file open until close() is called, so
    # release it on every exit path, including the validation errors below.
    try:
        ws = wb.active
        if ws is None:
            raise SourceFileParseError("Workbook has no active sheet")

        # Read and validate headers from the first row.
        first_row = next(
            ws.iter_rows(min_row=1, max_row=1, values_only=True), None
        )
        if first_row is None:
            raise SourceFileParseError("File is empty - no header row found")

        headers = [str(cell).strip() if cell else "" for cell in first_row]

        # Validate all required headers exist (case sensitive).
        for required in REQUIRED_HEADERS:
            if required not in headers:
                raise SourceFileParseError(
                    f"Missing required header '{required}'. "
                    f"Found headers: {headers}. "
                    f"Expected: {REQUIRED_HEADERS}"
                )

        # Map header name -> column index; with duplicate header names the
        # right-most column wins.
        col_map = {header: idx for idx, header in enumerate(headers)}

        # Resolve the column positions once instead of once per row.
        en_gb_idx = col_map["EN_GB"]
        copy_type_idx = col_map["Copy Type"]
        guidance_idx = col_map["Creative Guidance"]
        visual_ref_idx = col_map["Visual Ref"]
        char_limit_idx = col_map["Char Limit"]

        source_lines: list[dict[str, Any]] = []
        for row in ws.iter_rows(min_row=2, values_only=True):
            en_gb = _cell_text(row, en_gb_idx)
            # Skip rows where EN_GB is empty or whitespace-only.
            if en_gb is None:
                continue

            source_lines.append({
                "en_gb": en_gb,
                "copy_type": _cell_text(row, copy_type_idx),
                "creative_guidance": _cell_text(row, guidance_idx),
                "visual_ref": _cell_text(row, visual_ref_idx),
                "char_limit": _cell_text(row, char_limit_idx),
                # Display format: a newline inside the (stripped) EN_GB text.
                "is_display_format": "\n" in en_gb,
            })

        return source_lines
    finally:
        wb.close()