amazon-transcreation/backend/app/pipeline/modules/source_file_parser.py
DJP f271343bc0 feat: wire job wizard and dashboard to real backend API
- Job wizard now calls real API: create job → upload source → launch
- Dashboard and monitoring pages use live data instead of mock data
- Monitoring page polls every 3s while job is active
- Backend enriches job responses with client_name, created_by_name,
  source_line_count from eager-loaded relationships
- Frontend response mappers handle backend→frontend type differences
  (lowercase enum values, field name mapping, computed progress/stage)
- Source file parser accepts column aliases (Line type, Context notes)
  with case-insensitive matching for real-world Excel files
- Clients list endpoint accessible to all authenticated users
- Fixed uploadSource to use PUT, uploadSupplementary per-file
- Removed all hardcoded mock data from useJobs hook

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 14:18:47 -04:00

122 lines
4 KiB
Python

"""Parse source xlsx files into structured source line data.
Validates the expected column headers with flexible alias support:
EN_GB (required), Copy Type / Line type, Creative Guidance / Context notes,
Visual Ref, Char Limit.
Skips rows where EN_GB is empty. Detects \\n in EN_GB for is_display_format.
"""
from typing import Any
from openpyxl import load_workbook
# Canonical field name -> header spellings accepted for that field.
# Aliases are written in lowercase because file headers are lowercased and
# stripped before being compared against them. Alias order matters: when a
# file contains several accepted spellings for one field, the alias listed
# first wins.
HEADER_ALIASES: dict[str, list[str]] = {
    "en_gb": [
        "en_gb",
    ],
    "copy_type": [
        "copy type",
        "line type",
        "copy_type",
        "linetype",
    ],
    "creative_guidance": [
        "creative guidance",
        "context notes",
        "creative_guidance",
        "context_notes",
        "guidance",
    ],
    "visual_ref": [
        "visual ref",
        "visual_ref",
        "visual reference",
    ],
    "char_limit": [
        "char limit",
        "char_limit",
        "character limit",
        "charlimit",
    ],
}

# Canonical fields that must resolve to a column for a file to be accepted.
REQUIRED_FIELDS = ["en_gb"]
class SourceFileParseError(Exception):
    """Signal that a source xlsx file could not be opened or failed validation."""
def _resolve_headers(raw_headers: list[str]) -> dict[str, int]:
    """Locate the column index of each canonical field in a header row.

    Headers are lowercased and stripped before being compared with the
    aliases in ``HEADER_ALIASES``. For each field, the first alias (in list
    order) that appears among the headers decides the column; if that header
    text occurs more than once, its first occurrence is used.

    Args:
        raw_headers: Header cell texts in column order.

    Returns:
        Mapping of canonical field name -> zero-based column index. Fields
        with no matching header are left out of the mapping.
    """
    normalised = [header.lower().strip() for header in raw_headers]
    column_for: dict[str, int] = {}
    for canonical, accepted in HEADER_ALIASES.items():
        # Take the position of the first accepted spelling that is present.
        position = next(
            (normalised.index(name) for name in accepted if name in normalised),
            None,
        )
        if position is not None:
            column_for[canonical] = position
    return column_for
def _cell_text(row: tuple[Any, ...], idx: int | None) -> str | None:
    """Return ``row[idx]`` as stripped text, or None if missing or blank."""
    # Rows can be shorter than the header row, so guard the index.
    if idx is None or idx >= len(row):
        return None
    value = row[idx]
    if value is None:
        return None
    return str(value).strip() or None


def parse_source_file(file_path: str) -> list[dict[str, Any]]:
    """Parse a source xlsx file and return a list of source line dicts.

    The first row of the active sheet is treated as the header row and is
    matched against ``HEADER_ALIASES``. Every following row whose EN_GB cell
    is non-blank produces one dict; rows with an empty EN_GB cell are skipped.
    All cell values are converted to stripped strings (blank -> None), so
    ``char_limit`` is returned as text rather than as a number.

    Args:
        file_path: Absolute path to the xlsx file.

    Returns:
        List of dicts with keys: en_gb, copy_type, creative_guidance,
        visual_ref, char_limit, is_display_format.

    Raises:
        SourceFileParseError: If required headers are missing or file cannot be read.
    """
    try:
        wb = load_workbook(file_path, read_only=True, data_only=True)
    except Exception as exc:
        # Chain the cause so the underlying openpyxl/OS error stays visible.
        raise SourceFileParseError(f"Cannot open xlsx file: {exc}") from exc

    # A read-only workbook keeps the file open until close() is called, so
    # release it on every exit path, including the validation errors below.
    try:
        ws = wb.active
        if ws is None:
            raise SourceFileParseError("Workbook has no active sheet")

        # Read and validate headers from the first row.
        first_row = next(
            ws.iter_rows(min_row=1, max_row=1, values_only=True), None
        )
        if first_row is None:
            raise SourceFileParseError("File is empty - no header row found")

        raw_headers = [str(cell).strip() if cell else "" for cell in first_row]
        col_map = _resolve_headers(raw_headers)

        # Validate required fields.
        for field in REQUIRED_FIELDS:
            if field not in col_map:
                raise SourceFileParseError(
                    f"Missing required column matching '{field}'. "
                    f"Found headers: {raw_headers}. "
                    f"Accepted aliases: {HEADER_ALIASES[field]}"
                )

        # Column positions are fixed for the whole sheet; look them up once.
        en_gb_idx = col_map["en_gb"]
        copy_type_idx = col_map.get("copy_type")
        creative_guidance_idx = col_map.get("creative_guidance")
        visual_ref_idx = col_map.get("visual_ref")
        char_limit_idx = col_map.get("char_limit")

        source_lines: list[dict[str, Any]] = []
        for row in ws.iter_rows(min_row=2, values_only=True):
            en_gb = _cell_text(row, en_gb_idx)
            # Skip rows where EN_GB is empty.
            if en_gb is None:
                continue

            source_lines.append({
                "en_gb": en_gb,
                "copy_type": _cell_text(row, copy_type_idx),
                "creative_guidance": _cell_text(row, creative_guidance_idx),
                "visual_ref": _cell_text(row, visual_ref_idx),
                "char_limit": _cell_text(row, char_limit_idx),
                # Display format: a line break inside the (stripped) EN_GB text.
                "is_display_format": "\n" in en_gb,
            })
        return source_lines
    finally:
        wb.close()