# loreal-utilisation-dept/backend/app/services/deliverable_parse.py

"""Deliverable Summary parser.

Decisions:
- Mirrors `parseDeliverableBuffer` from the original SPA. The CSV contains
  both "D" (Deliverable) and "B" (Brief) rows; we only emit D rows because
  those are the asset-level records the dashboard reasons about.
- Date fields are emitted as ISO strings (YYYY-MM-DD) so JSON serialisation
  is straightforward downstream — matches the original.
- Header lookup is case-insensitive and trim-stripped, first occurrence
  wins (the source CSV doesn't repeat headers but we keep the rule).
"""

from __future__ import annotations

import csv
import hashlib
import io
from typing import Any, Iterable

from openpyxl import load_workbook

from app.services.zoho_parse import _parse_date


def _norm(s: Any) -> str:
    return str(s or "").strip().lower()


def _str(v: Any) -> str:
    if v is None:
        return ""
    return str(v).strip()


def _build_header_index(headers: Iterable[Any]) -> dict[str, int]:
    """Map each normalised header name to its column position.

    Headers that normalise to an empty string are skipped. When the same
    normalised name occurs more than once, the first position is kept.
    """
    positions: dict[str, int] = {}
    for position, header in enumerate(headers):
        key = _norm(header)
        if not key:
            continue
        # setdefault leaves an existing entry alone, so the first column wins.
        positions.setdefault(key, position)
    return positions


def _col(row: list[Any], idx: dict[str, int], *names: str) -> str:
    for n in names:
        i = idx.get(n.lower())
        if i is not None and i < len(row):
            v = row[i]
            if v is None:
                continue
            s = str(v).strip()
            if s:
                return s
    return ""


def _col_raw(row: list[Any], idx: dict[str, int], *names: str) -> Any:
    for n in names:
        i = idx.get(n.lower())
        if i is not None and i < len(row):
            v = row[i]
            if v is None or (isinstance(v, str) and v.strip() == ""):
                continue
            return v
    return None


def _date_iso(v: Any) -> str:
    """Return ``_parse_date(v).isoformat()``, or ``""`` when parsing yields nothing.

    NOTE(review): the result is YYYY-MM-DD only if ``_parse_date`` returns a
    ``date`` rather than a ``datetime`` -- confirm against ``zoho_parse``.
    """
    parsed = _parse_date(v)
    if not parsed:
        return ""
    return parsed.isoformat()


# Expected canonical headers — anything outside this set goes into
# unrecognised_columns.
_KNOWN_HEADERS = {
    "component", "project number", "project status", "deliverable number",
    "deliverable status", "project type (from omg)", "project type",
    "deliverable start date", "deliverable end date",
    "project start date", "project end date",
    "brand", "business division", "business area - lv 2", "business area",
    "market", "deliverable title",
}


def _build_rows(headers: list[Any], data_rows: Iterable[list[Any]]) -> tuple[list[dict[str, Any]], list[str]]:
    """Turn a header row plus data rows into deliverable records.

    Args:
        headers: Cells of the header row, in column order.
        data_rows: The remaining rows; each is a sequence of cell values.
            Iterated exactly once, so a generator is acceptable.

    Returns:
        ``(records, unrecognised)``. ``records`` holds one dict per accepted
        row. ``unrecognised`` lists, in first-seen order and without exact
        duplicates, the non-blank header labels whose lower-cased form is
        not in ``_KNOWN_HEADERS``.

    A row is dropped when it is empty (no cells, or every cell is ``None``
    or ``""``), when its "component" cell is filled with anything other
    than "D" (case-insensitive), when it has no project number, or when
    either deliverable date comes back empty from ``_date_iso``.
    """
    idx = _build_header_index(headers)

    unrecognised: list[str] = []
    seen: set[str] = set()
    for h in headers:
        label = _str(h)
        if not label or label.lower() in _KNOWN_HEADERS or label in seen:
            continue
        seen.add(label)
        unrecognised.append(label)

    out: list[dict[str, Any]] = []
    for row in data_rows:
        if not row or all(c in (None, "") for c in row):
            continue
        # Copy the row to a list once; every lookup below reuses it instead
        # of re-copying the row per field.
        cells = list(row)

        # D-row filter (per original): a blank or absent component is kept.
        component = _col(cells, idx, "component")
        if component and component.upper() != "D":
            continue
        project_number = _col(cells, idx, "project number")
        if not project_number:
            continue

        # Both deliverable dates are mandatory for a record to be emitted.
        d_start = _date_iso(_col_raw(cells, idx, "deliverable start date"))
        d_end = _date_iso(_col_raw(cells, idx, "deliverable end date"))
        if not d_start or not d_end:
            continue

        out.append({
            "projectNumber": project_number,
            "projectStatus": _col(cells, idx, "project status").upper(),
            "deliverableNumber": _col(cells, idx, "deliverable number"),
            "deliverableStatus": _col(cells, idx, "deliverable status").upper(),
            # The "(from omg)" column takes precedence when it has a value.
            "projectType": _col(cells, idx, "project type (from omg)", "project type"),
            "deliverableStartDate": d_start,
            "deliverableEndDate": d_end,
            "projectStartDate": _date_iso(_col_raw(cells, idx, "project start date")),
            "projectEndDate": _date_iso(_col_raw(cells, idx, "project end date")),
            "brand": _col(cells, idx, "brand") or "Unknown",
            "businessDivision": _col(cells, idx, "business division") or "Unknown",
            "businessArea": _col(cells, idx, "business area - lv 2", "business area") or "Unknown",
            "market": _col(cells, idx, "market") or "Unknown",
            "deliverableTitle": _col(cells, idx, "deliverable title"),
        })
    return out, unrecognised


def parse(filename: str, content: bytes) -> dict[str, Any]:
    """Parse an uploaded Deliverable Summary file.

    Args:
        filename: Original file name; only its extension is inspected
            (case-insensitively). ``.xlsx`` / ``.xlsm`` select the workbook
            parser, anything else (including an empty name) is read as CSV.
        content: Raw bytes of the file.

    Returns:
        A dict with ``rows`` (parsed records), ``unrecognised_columns``
        (header labels outside the known set) and ``content_hash``
        (``"sha256:"`` followed by the hex SHA-256 digest of *content*).
    """
    lowered = (filename or "").lower()
    parser = _parse_xlsx if lowered.endswith((".xlsx", ".xlsm")) else _parse_csv
    rows, unknown = parser(content)
    digest = hashlib.sha256(content).hexdigest()
    return {
        "rows": rows,
        "unrecognised_columns": unknown,
        "content_hash": f"sha256:{digest}",
    }


def _parse_csv(content: bytes) -> tuple[list[dict[str, Any]], list[str]]:
    """Parse CSV bytes into ``(records, unrecognised_columns)``.

    The bytes are decoded as UTF-8 (a leading BOM is dropped; undecodable
    bytes become U+FFFD rather than raising). The first CSV row is the
    header row. Empty input yields ``([], [])``.
    """
    text = content.decode("utf-8-sig", errors="replace")
    # newline="" hands line endings to the csv module untranslated, as its
    # documentation asks. With StringIO's default ("\n"), a file using bare
    # CR line endings arrives as one giant line and csv.reader raises
    # "new-line character seen in unquoted field".
    reader = csv.reader(io.StringIO(text, newline=""))
    headers = next(reader, None)
    if headers is None:
        return [], []
    # Remaining rows are streamed; _build_rows consumes them in one pass.
    return _build_rows(headers, reader)


def _parse_xlsx(content: bytes) -> tuple[list[dict[str, Any]], list[str]]:
    """Parse workbook bytes into ``(records, unrecognised_columns)``.

    Only the active worksheet is read; its first row is the header row.
    A workbook without an active sheet, or an active sheet without rows,
    yields ``([], [])``.
    """
    wb = load_workbook(io.BytesIO(content), read_only=True, data_only=True)
    try:
        ws = wb.active
        if ws is None:
            return [], []
        rows_iter = ws.iter_rows(values_only=True)
        first = next(rows_iter, None)
        if first is None:
            return [], []
        # _build_rows drains the generator before returning, so closing the
        # workbook in ``finally`` cannot cut the lazy row stream short.
        return _build_rows(list(first), (list(r) for r in rows_iter))
    finally:
        # Read-only workbooks keep the underlying archive open until closed.
        wb.close()