loreal-utilisation-dept/backend/app/services/project_summary_parse.py

"""Project Summary parser.

Decisions:
- Mirrors `parseProjectSummaryBuffer` in the original SPA. One row per
  project; the dashboard uses these as the authoritative project list
  when present (it overrides the time-log-derived project set).
- Emits ISO date strings ("YYYY-MM-DD") for projectStartDate/EndDate so
  the rest of the API speaks the same shape across deliverable / summary
  / forecast endpoints.
"""

from __future__ import annotations

import csv
import hashlib
import io
from typing import Any, Iterable

from openpyxl import load_workbook

from app.services.zoho_parse import _parse_date


_KNOWN_HEADERS = {
    "project number", "project title", "project name",
    "project status", "status",
    "project type (from omg)", "project type",
    "project start date", "project end date",
    "no. of assets", "no of assets", "assets", "number of assets",
    "business division", "division", "brand",
}


def _norm(s: Any) -> str:
    return str(s or "").strip().lower()


def _str(v: Any) -> str:
    if v is None:
        return ""
    return str(v).strip()


def _header_idx(headers: Iterable[Any]) -> dict[str, int]:
    """Map each normalised header label to the position of its first occurrence.

    Headers that normalise to an empty string are skipped; when a label is
    repeated, the earliest column wins.
    """
    positions: dict[str, int] = {}
    for pos, header in enumerate(headers):
        key = _norm(header)
        if key:
            # setdefault keeps the index already stored for a repeated label.
            positions.setdefault(key, pos)
    return positions


def _col(row: list[Any], idx: dict[str, int], *names: str) -> str:
    for n in names:
        i = idx.get(n.lower())
        if i is not None and i < len(row):
            v = row[i]
            if v is None:
                continue
            s = str(v).strip()
            if s:
                return s
    return ""


def _col_raw(row: list[Any], idx: dict[str, int], *names: str) -> Any:
    for n in names:
        i = idx.get(n.lower())
        if i is not None and i < len(row):
            v = row[i]
            if v is None or (isinstance(v, str) and v.strip() == ""):
                continue
            return v
    return None


def _asset_count(v: Any) -> float | None:
    if v is None or v == "":
        return None
    if isinstance(v, (int, float)):
        return float(v) if v > 0 else None
    s = str(v).strip().replace(",", "")
    try:
        n = float(s)
        return n if n > 0 else None
    except ValueError:
        return None


def _date_iso(v: Any) -> str | None:
    """Return ``v`` as an ISO date string, or ``None`` if it cannot be parsed.

    Parsing is delegated to ``_parse_date`` (imported from
    ``app.services.zoho_parse``); a falsy result from it maps to ``None``,
    anything else is rendered with its ``isoformat()`` method.
    """
    parsed = _parse_date(v)
    if not parsed:
        return None
    return parsed.isoformat()


def _build_rows(headers: list[Any], data_rows: Iterable[list[Any]]) -> tuple[list[dict[str, Any]], list[str]]:
    """Turn a header row plus data rows into project dicts.

    Args:
        headers: Cells of the header row; labels are matched
            case-insensitively after trimming.
        data_rows: The remaining rows, each a sequence of cell values. The
            iterable is consumed exactly once, so a generator is fine.

    Returns:
        A ``(rows, unrecognised)`` tuple. ``rows`` holds one dict per data
        row that has a non-empty "project number" cell. ``unrecognised``
        lists, in first-seen order and without exact duplicates, the
        non-empty header labels that are not in ``_KNOWN_HEADERS``.
    """
    idx = _header_idx(headers)

    unrecognised: list[str] = []
    seen: set[str] = set()
    for h in headers:
        label = _str(h)
        if not label or label.lower() in _KNOWN_HEADERS or label in seen:
            continue
        seen.add(label)
        unrecognised.append(label)

    out: list[dict[str, Any]] = []
    for row in data_rows:
        # Skip rows that are missing or contain only empty cells.
        if not row or all(c in (None, "") for c in row):
            continue
        # Copy the row once (tolerates tuple rows) instead of once per field.
        cells = list(row)
        project_number = _col(cells, idx, "project number")
        if not project_number:
            # A row without a project number cannot be keyed; drop it.
            continue
        out.append({
            "projectNumber": project_number,
            "projectTitle": _col(cells, idx, "project title", "project name"),
            "projectStatus": _col(cells, idx, "project status", "status").upper(),
            "projectType": _col(cells, idx, "project type (from omg)", "project type"),
            "projectStartDate": _date_iso(_col_raw(cells, idx, "project start date")),
            "projectEndDate": _date_iso(_col_raw(cells, idx, "project end date")),
            "assetCount": _asset_count(
                _col_raw(cells, idx, "no. of assets", "no of assets", "assets", "number of assets")
            ),
            "division": _col(cells, idx, "business division", "division") or "Unknown",
            "brand": _col(cells, idx, "brand") or "Unknown",
        })
    return out, unrecognised


def parse(filename: str, content: bytes) -> dict[str, Any]:
    """Parse an uploaded Project Summary file.

    The parser is chosen from the file extension: ``.xlsx`` / ``.xlsm``
    (case-insensitive) are read as workbooks, everything else as CSV.

    Returns:
        A dict with ``rows`` (parsed project dicts), ``unrecognised_columns``
        (header labels the parser does not know) and ``content_hash`` (the
        SHA-256 hex digest of the raw bytes, prefixed with ``sha256:``).
    """
    lowered = (filename or "").lower()
    parser = _parse_xlsx if lowered.endswith((".xlsx", ".xlsm")) else _parse_csv
    rows, unknown = parser(content)
    return {
        "rows": rows,
        "unrecognised_columns": unknown,
        "content_hash": "sha256:" + hashlib.sha256(content).hexdigest(),
    }


def _parse_csv(content: bytes) -> tuple[list[dict[str, Any]], list[str]]:
    """Parse CSV bytes into ``(rows, unrecognised_columns)``.

    The bytes are decoded as UTF-8 (a leading BOM is dropped, undecodable
    bytes are replaced rather than raising). The first record is the header
    row; an input with no records yields ``([], [])``.
    """
    text = content.decode("utf-8-sig", errors="replace")
    # newline="" is what the csv module expects of its input: line endings are
    # left untranslated and the stream splits on "\n", "\r\n" and bare "\r"
    # alike. With the default, a file using bare-CR line endings reaches the
    # reader as a single physical line and raises csv.Error.
    reader = csv.reader(io.StringIO(text, newline=""))
    headers = next(reader, None)
    if headers is None:
        return [], []
    # Remaining records are streamed straight into the row builder.
    return _build_rows(headers, reader)


def _parse_xlsx(content: bytes) -> tuple[list[dict[str, Any]], list[str]]:
    """Parse workbook bytes into ``(rows, unrecognised_columns)``.

    Only the active worksheet is read, with cached values rather than
    formulas (``data_only=True``). Its first row is the header row; a
    workbook with no active sheet or an empty sheet yields ``([], [])``.
    """
    wb = load_workbook(io.BytesIO(content), read_only=True, data_only=True)
    try:
        ws = wb.active
        if ws is None:
            return [], []
        rows_iter = ws.iter_rows(values_only=True)
        first = next(rows_iter, None)
        if first is None:
            return [], []
        headers = list(first)
        data = (list(r) for r in rows_iter)
        # _build_rows drains the lazy row generator before it returns, so the
        # workbook is no longer needed once this call completes.
        return _build_rows(headers, data)
    finally:
        # Read-only workbooks load lazily and must be closed explicitly.
        wb.close()