# amazon-transcreation/backend/app/pipeline/modules/source_file_parser.py

"""Parse source xlsx files into structured source line data.

Validates the expected column headers with flexible alias support:
EN_GB (required), Copy Type / Line type, Creative Guidance / Context notes,
Visual Ref, Char Limit.

Skips rows where EN_GB is empty. Detects \\n in EN_GB for is_display_format.
"""

from typing import Any

from openpyxl import load_workbook

# Map of canonical field name -> list of accepted header variations.
# Aliases are written in lowercase because _resolve_headers lowercases and
# strips the file's headers before comparing them. Within each list, earlier
# aliases take precedence when a file contains more than one of them.
HEADER_ALIASES: dict[str, list[str]] = {
    "en_gb": ["en_gb"],
    "copy_type": ["copy type", "line type", "copy_type", "linetype"],
    "creative_guidance": ["creative guidance", "context notes", "creative_guidance", "context_notes", "guidance"],
    "visual_ref": ["visual ref", "visual_ref", "visual reference"],
    "char_limit": ["char limit", "char_limit", "character limit", "charlimit"],
}

# Canonical fields that must resolve to a column; parse_source_file raises
# SourceFileParseError when any of them is missing from the header row.
REQUIRED_FIELDS = ["en_gb"]


class SourceFileParseError(Exception):
    """Signal that a source xlsx file could not be opened or failed validation."""


def _resolve_headers(raw_headers: list[str]) -> dict[str, int]:
    """Resolve the file's header row to canonical field names.

    Headers are compared case-insensitively with surrounding whitespace
    removed. For each canonical field the first alias (in HEADER_ALIASES
    order) that appears in the header row wins; when the same header text
    occurs more than once, the left-most column is used.

    Args:
        raw_headers: Header cell texts in column order.

    Returns:
        Dict mapping canonical field name -> zero-based column index. Fields
        with no matching header are omitted.
    """
    # Left-most column position of every normalised header text.
    positions: dict[str, int] = {}
    for position, header in enumerate(raw_headers):
        positions.setdefault(header.lower().strip(), position)

    matches: dict[str, int] = {}
    for canonical, accepted in HEADER_ALIASES.items():
        hit = next((name for name in accepted if name in positions), None)
        if hit is not None:
            matches[canonical] = positions[hit]

    return matches


def _cell_text(row: tuple[Any, ...], idx: int | None) -> str | None:
    """Return the stripped text of ``row[idx]``, or None if nothing usable is there.

    None is returned when the column was not resolved (``idx`` is None), the
    row is shorter than ``idx``, the cell value is None, or the text is empty
    after stripping.
    """
    if idx is None or idx >= len(row):
        return None
    value = row[idx]
    if value is None:
        return None
    return str(value).strip() or None


def parse_source_file(file_path: str) -> list[dict[str, Any]]:
    """Parse a source xlsx file and return a list of source line dicts.

    The first row of the active sheet is treated as the header row and is
    matched against HEADER_ALIASES. Every following row whose EN_GB cell is
    non-empty becomes one dict; rows with an empty EN_GB cell are skipped.

    Args:
        file_path: Absolute path to the xlsx file.

    Returns:
        List of dicts with keys: en_gb, copy_type, creative_guidance,
        visual_ref, char_limit, is_display_format. Text values are stripped
        strings; optional fields are None when the column is absent or the
        cell is empty. is_display_format is True when the stripped EN_GB
        text contains a newline character.

    Raises:
        SourceFileParseError: If the file cannot be opened, the workbook has
            no active sheet, the sheet has no header row, or a required
            column is missing.
    """
    try:
        wb = load_workbook(file_path, read_only=True, data_only=True)
    except Exception as exc:
        # Chain explicitly so the underlying openpyxl/IO error stays visible.
        raise SourceFileParseError(f"Cannot open xlsx file: {exc}") from exc

    # A read-only workbook keeps the file open until close() is called, so
    # the handle must be released on every exit path, including the
    # validation errors raised below.
    try:
        ws = wb.active
        if ws is None:
            raise SourceFileParseError("Workbook has no active sheet")

        # Read and validate headers from first row
        header_rows = ws.iter_rows(min_row=1, max_row=1, values_only=True)
        first_row = next(header_rows, None)
        if first_row is None:
            raise SourceFileParseError("File is empty - no header row found")

        raw_headers = [str(cell).strip() if cell else "" for cell in first_row]
        col_map = _resolve_headers(raw_headers)

        # Validate required fields
        for field in REQUIRED_FIELDS:
            if field not in col_map:
                raise SourceFileParseError(
                    f"Missing required column matching '{field}'. "
                    f"Found headers: {raw_headers}. "
                    f"Accepted aliases: {HEADER_ALIASES[field]}"
                )

        # Column indices are fixed for the whole sheet; look them up once
        # instead of on every row. Optional columns resolve to None.
        en_gb_idx = col_map["en_gb"]
        copy_type_idx = col_map.get("copy_type")
        creative_guidance_idx = col_map.get("creative_guidance")
        visual_ref_idx = col_map.get("visual_ref")
        char_limit_idx = col_map.get("char_limit")

        # Parse data rows
        source_lines: list[dict[str, Any]] = []
        for row in ws.iter_rows(min_row=2, values_only=True):
            en_gb = _cell_text(row, en_gb_idx)

            # Skip rows where EN_GB is empty
            if en_gb is None:
                continue

            source_lines.append({
                "en_gb": en_gb,
                "copy_type": _cell_text(row, copy_type_idx),
                "creative_guidance": _cell_text(row, creative_guidance_idx),
                "visual_ref": _cell_text(row, visual_ref_idx),
                "char_limit": _cell_text(row, char_limit_idx),
                # Display format: presence of a newline in the EN_GB text
                "is_display_format": "\n" in en_gb,
            })

        return source_lines
    finally:
        wb.close()