"""Parse source xlsx files into structured source line data. Validates the expected column headers with flexible alias support: EN_GB (required), Copy Type / Line type, Creative Guidance / Context notes, Visual Ref, Char Limit. Skips rows where EN_GB is empty. Detects \\n in EN_GB for is_display_format. """ from typing import Any from openpyxl import load_workbook # Map of canonical field name -> list of accepted header variations (case-insensitive) HEADER_ALIASES: dict[str, list[str]] = { "en_gb": ["en_gb"], "copy_type": ["copy type", "line type", "copy_type", "linetype"], "creative_guidance": ["creative guidance", "context notes", "creative_guidance", "context_notes", "guidance"], "visual_ref": ["visual ref", "visual_ref", "visual reference"], "char_limit": ["char limit", "char_limit", "character limit", "charlimit"], } REQUIRED_FIELDS = ["en_gb"] class SourceFileParseError(Exception): """Raised when the source file has validation errors.""" pass def _resolve_headers(raw_headers: list[str]) -> dict[str, int]: """Match raw file headers to canonical field names using aliases. Returns a dict mapping canonical field name -> column index. """ resolved: dict[str, int] = {} lower_headers = [h.lower().strip() for h in raw_headers] for field, aliases in HEADER_ALIASES.items(): for alias in aliases: if alias in lower_headers: resolved[field] = lower_headers.index(alias) break return resolved def parse_source_file(file_path: str) -> list[dict[str, Any]]: """Parse a source xlsx file and return a list of source line dicts. Args: file_path: Absolute path to the xlsx file. Returns: List of dicts with keys: en_gb, copy_type, creative_guidance, visual_ref, char_limit, is_display_format. Raises: SourceFileParseError: If required headers are missing or file cannot be read. """ try: wb = load_workbook(file_path, read_only=True, data_only=True) except Exception as exc: raise SourceFileParseError(f"Cannot open xlsx file: {exc}") ws = wb.active if ws is None: raise SourceFileParseError("Workbook has no active sheet") # Read and validate headers from first row rows = ws.iter_rows(min_row=1, max_row=1, values_only=True) first_row = next(rows, None) if first_row is None: raise SourceFileParseError("File is empty - no header row found") raw_headers = [str(cell).strip() if cell else "" for cell in first_row] col_map = _resolve_headers(raw_headers) # Validate required fields for field in REQUIRED_FIELDS: if field not in col_map: raise SourceFileParseError( f"Missing required column matching '{field}'. " f"Found headers: {raw_headers}. " f"Accepted aliases: {HEADER_ALIASES[field]}" ) # Parse data rows source_lines: list[dict[str, Any]] = [] for row in ws.iter_rows(min_row=2, values_only=True): en_gb_idx = col_map["en_gb"] en_gb_raw = row[en_gb_idx] if en_gb_idx < len(row) else None # Skip rows where EN_GB is empty if en_gb_raw is None or str(en_gb_raw).strip() == "": continue en_gb = str(en_gb_raw).strip() # Detect display format: presence of \n in EN_GB text is_display_format = "\n" in en_gb def _get_cell(field: str) -> str | None: idx = col_map.get(field) if idx is None or idx >= len(row): return None val = row[idx] if val is None: return None return str(val).strip() or None source_lines.append({ "en_gb": en_gb, "copy_type": _get_cell("copy_type"), "creative_guidance": _get_cell("creative_guidance"), "visual_ref": _get_cell("visual_ref"), "char_limit": _get_cell("char_limit"), "is_display_format": is_display_format, }) wb.close() return source_lines