# loreal-utilisation-dept/backend/app/services/zoho_parse.py

"""Zoho timelog parser.

Decisions:
- Header matching is case-insensitive and trim-stripped. Unknown headers
  are surfaced in `unrecognised_columns` so the operator notices when
  Zoho silently renames a column.
- Billable detection: the "Billable", "Is Billable" and "Billing Type"
  columns all go through one coercion. Booleans and numbers count by
  truthiness; strings are matched case-insensitively against an allow-list
  ("Client Related", "Fee Related", "true", "yes", "1", "billable").
  Everything else → False.
- Date parsing tries ISO first, then dateutil for the messy formats Zoho
  occasionally emits ("01/05/2026", "1-May-2026", etc.).
- For .xlsx we use openpyxl read-only mode — keeps memory low on big files.
"""

from __future__ import annotations

import csv
import hashlib
import io
import logging
from datetime import date, datetime
from typing import Any, Iterable

from dateutil import parser as dateparser
from openpyxl import load_workbook


# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


# Canonical name → set of accepted aliases (compared after .strip().lower()).
# `_canonicalise_header` walks this dict in insertion order and returns the
# first canonical key whose alias set contains the normalised header, so an
# alias listed under two keys would always resolve to the earlier key.
# Headers that match no alias are reported back as unrecognised columns.
HEADER_ALIASES: dict[str, set[str]] = {
    "date": {"date", "log date", "time log start", "start date"},
    "employee": {"resource name", "resource", "employee", "user", "name"},
    "project": {"project title", "project name", "project"},
    "task": {"task description", "task", "description"},
    "hours": {"hours logged", "total hours", "hours", "time logged", "actual logged"},
    "billable": {"billable", "is billable", "billing type"},
}

# Stripped, lower-cased cell strings that `_parse_billable` treats as billable.
# Any other non-empty string is treated as non-billable.
BILLABLE_TRUE_VALUES = {"client related", "fee related", "true", "yes", "1", "billable"}


def _canonicalise_header(raw: str) -> str | None:
    if raw is None:
        return None
    key = str(raw).strip().lower()
    if not key:
        return None
    for canonical, aliases in HEADER_ALIASES.items():
        if key in aliases:
            return canonical
    return None


def _parse_date(v: Any) -> date | None:
    if v is None or v == "":
        return None
    if isinstance(v, date) and not isinstance(v, datetime):
        return v
    if isinstance(v, datetime):
        return v.date()
    try:
        # ISO short-circuit
        return date.fromisoformat(str(v)[:10])
    except ValueError:
        pass
    try:
        # dayfirst=True because Zoho regional defaults are commonly DD/MM.
        return dateparser.parse(str(v), dayfirst=True).date()
    except (ValueError, TypeError, OverflowError):
        return None


def _parse_hours(v: Any) -> float:
    """Coerce a cell value into a number of hours; unparseable input → ``0.0``.

    Accepted inputs:
    - ``None`` / ``""`` → ``0.0``
    - ``int`` / ``float`` → converted to ``float``
    - ``"H:MM"`` or ``"H:MM:SS"`` → decimal hours (seconds are included)
    - numeric strings, with either thousands commas (``"1,234.5"``) or a
      decimal comma (``"7,5"``)

    Non-finite results (NaN / ±inf) are reported as ``0.0`` so a single
    bad cell cannot poison downstream sums.
    """
    if v is None or v == "":
        return 0.0
    inf = float("inf")
    if isinstance(v, (int, float)):
        hours = float(v)
        # The chained comparison is False for NaN and for ±inf.
        return hours if -inf < hours < inf else 0.0

    s = str(v).strip()

    # Zoho sometimes outputs "7:30" (HH:MM) or "7:30:00". Convert.
    if ":" in s:
        parts = s.split(":")
        if all(p.isdigit() for p in parts if p):
            try:
                h = int(parts[0])
                m = int(parts[1])
                sec = int(parts[2]) if len(parts) > 2 and parts[2] else 0
            except ValueError:
                # An empty hour or minute part (":30", "7:"); fall through.
                pass
            else:
                return h + m / 60.0 + sec / 3600.0

    # A lone comma followed by a digit group that is not exactly three digits
    # cannot be a thousands separator, so it is a decimal comma ("7,5").
    # Anything else keeps the thousands-separator reading ("1,234" → 1234).
    head, sep, tail = s.partition(",")
    if sep and "." not in s and "," not in tail and tail.isdigit() and len(tail) != 3:
        s = f"{head}.{tail}"
    else:
        s = s.replace(",", "")

    try:
        hours = float(s)
    except ValueError:
        return 0.0
    return hours if -inf < hours < inf else 0.0


def _parse_billable(v: Any, *, source_header_canonical: str | None = None) -> bool:
    # source_header_canonical only matters for "billable" — both columns
    # canonicalise to that key but we want different semantics. We accept
    # either bool, the special billing-type strings, or generic yes/no.
    if v is None:
        return False
    if isinstance(v, bool):
        return v
    if isinstance(v, (int, float)):
        return bool(v)
    s = str(v).strip().lower()
    if not s:
        return False
    return s in BILLABLE_TRUE_VALUES


# ----------------------------------------------------------------------
# Public API
# ----------------------------------------------------------------------

def parse(filename: str, content: bytes) -> dict[str, Any]:
    """Parse an uploaded timelog file.

    The parser is chosen by file extension: ``.xlsx`` / ``.xlsm`` go to the
    workbook parser, ``.csv`` / ``.txt`` to the CSV parser. For any other
    (or missing) extension the content itself is sniffed: data starting with
    the ZIP local-file signature is treated as a workbook, everything else
    as CSV.

    Returns a dict with:
    - ``rows``: the parsed row dicts
    - ``unrecognised_columns``: header names that matched no known alias
    - ``content_hash``: ``"sha256:<hex digest>"`` of the raw bytes

    Errors raised by the selected parser propagate to the caller.
    """
    fn = (filename or "").lower()
    if fn.endswith((".xlsx", ".xlsm")):
        rows, unknown = _parse_xlsx(content)
    elif fn.endswith((".csv", ".txt")):
        rows, unknown = _parse_csv(content)
    elif content[:4] == b"PK\x03\x04":
        # .xlsx files are ZIP archives. Sniffing the signature is reliable,
        # whereas "try CSV and fall back on error" is not: the CSV path
        # decodes with errors="replace" and may turn binary data into
        # garbage rows without raising.
        rows, unknown = _parse_xlsx(content)
    else:
        rows, unknown = _parse_csv(content)

    digest = hashlib.sha256(content).hexdigest()
    return {
        "rows": rows,
        "unrecognised_columns": unknown,
        "content_hash": f"sha256:{digest}",
    }


def _is_blank_cell(cell: Any) -> bool:
    """Return True for ``None`` and for empty or whitespace-only strings."""
    return cell is None or (isinstance(cell, str) and not cell.strip())


def _build_rows(raw_rows: Iterable[list[Any]], headers: list[Any]) -> tuple[list[dict[str, Any]], list[str]]:
    """Turn raw cell rows into canonical row dicts.

    Args:
        raw_rows: data rows (header row excluded), each a list of cells.
        headers: the header row; each cell is mapped to a canonical key.

    Returns:
        ``(rows, unrecognised)``. Each row dict has the keys ``date``,
        ``employee``, ``project``, ``task``, ``hours`` and ``billable``.
        ``unrecognised`` lists the stripped text of every non-blank header
        that matched no alias.

    Rows whose cells are all blank (including whitespace-only) are dropped.
    Blank cells never overwrite a field. When several columns map to the
    same canonical key (e.g. "Task" and "Description"), the last
    *non-blank* one wins, so a trailing empty column cannot wipe a value
    parsed from an earlier one.
    """
    # Map column index → canonical key. Track unknown ones.
    canonical_by_idx: dict[int, str] = {}
    unrecognised: list[str] = []
    for idx, raw in enumerate(headers):
        if raw is None or str(raw).strip() == "":
            continue
        canon = _canonicalise_header(raw)
        if canon:
            canonical_by_idx[idx] = canon
        else:
            unrecognised.append(str(raw).strip())

    out: list[dict[str, Any]] = []
    for raw_row in raw_rows:
        if not raw_row or all(_is_blank_cell(c) for c in raw_row):
            continue
        row: dict[str, Any] = {
            "date": None,
            "employee": None,
            "project": None,
            "task": None,
            "hours": 0.0,
            "billable": False,
        }
        for idx, canon in canonical_by_idx.items():
            # Short rows simply lack the trailing columns.
            if idx >= len(raw_row):
                continue
            v = raw_row[idx]
            # The defaults above already encode "blank" for every field.
            if _is_blank_cell(v):
                continue
            if canon == "date":
                row["date"] = _parse_date(v)
            elif canon == "hours":
                row["hours"] = _parse_hours(v)
            elif canon == "billable":
                row["billable"] = _parse_billable(v)
            else:
                row[canon] = str(v).strip() or None
        out.append(row)
    return out, unrecognised


def _parse_csv(content: bytes) -> tuple[list[dict[str, Any]], list[str]]:
    """Parse CSV bytes into ``(rows, unrecognised_columns)``.

    The first record is the header row; the rest are data rows. Empty
    content (or a bare BOM) yields ``([], [])``.
    """
    # Decode permissively; Zoho exports are usually utf-8 or utf-8-sig.
    text = content.decode("utf-8-sig", errors="replace")
    # newline="" hands line endings to the csv module untouched, as its
    # documentation requires. Without it, CR-only line endings arrive as a
    # single line and the reader raises csv.Error.
    reader = csv.reader(io.StringIO(text, newline=""))
    headers = next(reader, None)
    if headers is None:
        return [], []
    # Stream the remaining records instead of materialising the whole file.
    return _build_rows(reader, headers)


def _parse_xlsx(content: bytes) -> tuple[list[dict[str, Any]], list[str]]:
    """Parse workbook bytes into ``(rows, unrecognised_columns)``.

    Only the active worksheet is read. Its first row is the header row and
    the rest are data rows. A workbook with no active sheet, or an empty
    sheet, yields ``([], [])``.
    """
    wb = load_workbook(io.BytesIO(content), read_only=True, data_only=True)
    try:
        ws = wb.active
        if ws is None:
            return [], []
        rows_iter = ws.iter_rows(values_only=True)
        first = next(rows_iter, None)
        if first is None:
            return [], []
        data = (list(r) for r in rows_iter)
        # _build_rows consumes the generator fully before returning, so the
        # workbook is still open while the rows are being read.
        return _build_rows(data, list(first))
    finally:
        # Read-only workbooks load lazily and must be closed explicitly.
        wb.close()