# loreal-utilisation-dept/backend/app/services/zoho_parse.py

"""Zoho timelog parser.

Decisions:
- Header matching is case-insensitive and trim-stripped. Unknown headers
  are surfaced in `unrecognised_columns` so the operator notices when
  Zoho silently renames a column.
- Billable detection: we keep TWO canonical fields. `billable` accepts
  literal "Billable" / "Is Billable" columns (boolean-ish). `billingType`
  accepts a "Billing Type" column whose values look like
  "Client Related" / "Fee Related" / "Idle Time" / "Leave Hours".
  When only one of the two is present we cross-fill the other: a
  billingType of client/fee implies billable=True; leave implies False.
- Date parsing tries ISO first, then dateutil for the messy formats Zoho
  occasionally emits ("01/05/2026", "1-May-2026", etc.).
- For .xlsx we use openpyxl read-only mode — keeps memory low on big files.
"""

from __future__ import annotations

import csv
import hashlib
import io
import logging
import math
from datetime import date, datetime
from typing import Any, Iterable

from dateutil import parser as dateparser
from openpyxl import load_workbook


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Header lookup table: canonical field name → header spellings accepted for
# it. Incoming headers are stripped and lower-cased before the membership
# test, so every alias listed here must already be lower-case.
HEADER_ALIASES: dict[str, set[str]] = {
    "date": {
        "date",
        "log date",
        "time log start",
        "start date",
    },
    "employee": {
        "resource name",
        "resource",
        "employee",
        "user",
        "name",
    },
    "project": {
        "project title",
        "project name",
        "project",
    },
    "task": {
        "task description",
        "task",
        "description",
    },
    "hours": {
        "hours logged",
        "total hours",
        "hours",
        "time logged",
        "actual logged",
    },
    "billable": {
        "billable",
        "is billable",
    },
    "billingType": {
        "billing type",
    },
}

# Lower-cased cell values of a literal "Billable" column that count as True;
# any other string is treated as False.
BILLABLE_TRUE_VALUES = {
    "true",
    "yes",
    "1",
    "billable",
}

# Lower-cased "Billing Type" values that imply billable=True.
BILLING_TYPE_BILLABLE = {
    "client related",
    "fee related",
}

# Lower-cased "Billing Type" values that are leave-coded and imply
# billable=False.
BILLING_TYPE_LEAVE = {
    "leave hours",
    "leave",
}


def _canonicalise_header(raw: str) -> str | None:
    """Map a raw header cell to its canonical field name.

    The header is stringified, stripped and lower-cased, then looked up in
    ``HEADER_ALIASES``. Returns the first canonical name whose alias set
    contains it, or ``None`` when the header is missing, blank, or matches
    no known alias.
    """
    normalised = "" if raw is None else str(raw).strip().lower()
    if not normalised:
        return None
    return next(
        (
            name
            for name, accepted in HEADER_ALIASES.items()
            if normalised in accepted
        ),
        None,
    )


def _parse_date(v: Any) -> date | None:
    """Coerce a cell value to a ``date``; return ``None`` if it cannot be parsed.

    Accepted inputs:
    - ``None`` / empty / whitespace-only string → ``None``.
    - ``datetime`` → its date part; plain ``date`` → returned unchanged.
    - Anything else is stringified and stripped. The first 10 characters are
      tried as an ISO ``YYYY-MM-DD`` date; on failure the whole string is
      handed to dateutil with ``dayfirst=True``.

    The string is stripped *before* the ISO attempt so that a padded ISO
    value such as ``" 2026-05-01"`` still takes the ISO path. Without that
    it would reach dateutil, whose ``dayfirst=True`` handling reads a
    year-first string as year-day-month and would return 5 January.
    """
    if v is None or v == "":
        return None
    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(v, datetime):
        return v.date()
    if isinstance(v, date):
        return v

    text = str(v).strip()
    if not text:
        return None

    try:
        # ISO short-circuit; the slice drops any trailing time component.
        return date.fromisoformat(text[:10])
    except ValueError:
        pass
    try:
        # dayfirst=True because Zoho regional defaults are commonly DD/MM.
        return dateparser.parse(text, dayfirst=True).date()
    except (ValueError, TypeError, OverflowError):
        return None


def _parse_hours(v: Any) -> float:
    """Coerce a cell value to a number of hours; unparseable input yields 0.0.

    Accepted inputs:
    - ``None`` / empty string → 0.0.
    - ``int`` / ``float`` → converted with ``float()``.
    - Clock-style strings ``H:MM`` or ``H:MM:SS`` (all-digit parts) →
      ``H + MM/60 + SS/3600``. The seconds part is optional.
    - Any other string → ``float()`` after removing ``,`` thousands
      separators.

    Non-finite results (NaN, ±inf) are mapped to 0.0 like any other
    unparseable value, so a stray "nan" cell cannot poison downstream sums.
    """
    if v is None or v == "":
        return 0.0

    if isinstance(v, (int, float)):
        try:
            hours = float(v)
        except OverflowError:
            # An int too large for a float is not a meaningful hour count.
            return 0.0
        return hours if math.isfinite(hours) else 0.0

    s = str(v).strip()

    # Zoho sometimes outputs "7:30" (HH:MM) or "7:30:00" (HH:MM:SS). Convert.
    if ":" in s:
        parts = s.split(":")
        if all(p.isdigit() for p in parts if p):
            try:
                h = int(parts[0])
                m = int(parts[1])
                # Seconds are optional; a missing or empty part counts as 0.
                sec = int(parts[2]) if len(parts) > 2 and parts[2] else 0
                return h + m / 60.0 + sec / 3600.0
            except ValueError:
                # An empty hour/minute part (e.g. ":30") falls through to the
                # generic float parse below, which rejects it.
                pass

    try:
        hours = float(s.replace(",", ""))
    except ValueError:
        return 0.0
    return hours if math.isfinite(hours) else 0.0


def _parse_billable(v: Any) -> bool:
    """Interpret a literal Billable / Is Billable cell as a boolean.

    Booleans and numbers go through ``bool()``. ``None`` and blank strings
    are False. Any other value is stringified, stripped and lower-cased, and
    is True only when it appears in ``BILLABLE_TRUE_VALUES``.
    """
    if isinstance(v, (bool, int, float)):
        return bool(v)
    if v is None:
        return False
    text = str(v).strip().lower()
    # Short-circuit on blank text so the lookup only runs for real values.
    return bool(text) and text in BILLABLE_TRUE_VALUES


def _parse_billing_type(v: Any) -> str | None:
    """Normalise a Billing Type cell to a stripped, lower-cased string.

    Returns ``None`` for ``None`` and for values that are blank after
    stripping.
    """
    if v is None:
        return None
    cleaned = str(v).strip().lower()
    return cleaned if cleaned else None


# ----------------------------------------------------------------------
# Public API
# ----------------------------------------------------------------------

def parse(filename: str, content: bytes) -> dict[str, Any]:
    """Parse an uploaded timelog file.

    The parser is chosen from the (case-insensitive) file extension:
    ``.xlsx`` / ``.xlsm`` use the xlsx parser, ``.csv`` / ``.txt`` the CSV
    parser. For any other or missing extension the content is sniffed: a
    payload starting with the ZIP local-file signature is treated as xlsx
    (xlsx files are ZIP archives), everything else as CSV.

    Args:
        filename: Original upload name; may be empty or ``None``.
        content: Raw file bytes.

    Returns:
        A dict with ``rows`` (parsed row dicts), ``unrecognised_columns``
        (header names that matched no alias) and ``content_hash``
        (``"sha256:<hex digest of content>"``).

    Raises:
        Whatever the selected parser raises for malformed input.
    """
    fn = (filename or "").lower()
    if fn.endswith((".xlsx", ".xlsm")):
        rows, unknown = _parse_xlsx(content)
    elif fn.endswith((".csv", ".txt")):
        rows, unknown = _parse_csv(content)
    elif content[:4] == b"PK\x03\x04":
        # Sniff by magic bytes rather than "try CSV first": the CSV path
        # decodes with errors="replace" and so rarely raises on binary data,
        # which could turn an xlsx upload into garbage rows.
        rows, unknown = _parse_xlsx(content)
    else:
        try:
            rows, unknown = _parse_csv(content)
        except Exception:
            # Best-effort last resort kept from the original behaviour; log
            # it so the fallback is no longer silent.
            logger.warning(
                "CSV parse failed for %r; retrying as xlsx", filename, exc_info=True
            )
            rows, unknown = _parse_xlsx(content)

    digest = hashlib.sha256(content).hexdigest()
    return {
        "rows": rows,
        "unrecognised_columns": unknown,
        "content_hash": f"sha256:{digest}",
    }


def _build_rows(raw_rows: Iterable[list[Any]], headers: list[Any]) -> tuple[list[dict[str, Any]], list[str]]:
    """Turn raw cell rows into canonical row dicts.

    Args:
        raw_rows: Data rows (header row excluded), each a list of cell values.
        headers: The header row; blank/``None`` headers are ignored.

    Returns:
        ``(rows, unrecognised)``. Each row dict has the keys ``date``,
        ``employee``, ``project``, ``task``, ``hours``, ``billable`` and
        ``billingType``. ``unrecognised`` lists the stripped text of every
        non-blank header that matched no alias, in column order.

    Notes:
        - Rows that are empty, or whose cells are all ``None`` / ``""``, are
          skipped.
        - Cells beyond the end of a short row keep the field's default.
        - If several columns map to the same canonical field, the right-most
          one wins.
        - Cross-fill: ``billable`` is derived from ``billingType`` only when
          the row has no explicit (non-blank) billable cell, so a
          user-supplied Billable value is never clobbered. ``billingType`` is
          never derived from ``billable``.
    """
    # Map column index → canonical key. Track unknown ones.
    canonical_by_idx: dict[int, str] = {}
    unrecognised: list[str] = []
    for idx, raw in enumerate(headers):
        if raw is None or str(raw).strip() == "":
            continue
        canon = _canonicalise_header(raw)
        if canon:
            canonical_by_idx[idx] = canon
        else:
            unrecognised.append(str(raw).strip())

    out: list[dict[str, Any]] = []
    for raw_row in raw_rows:
        if not raw_row or all(c in (None, "") for c in raw_row):
            continue
        row: dict[str, Any] = {
            "date": None,
            "employee": None,
            "project": None,
            "task": None,
            "hours": 0.0,
            "billable": False,
            "billingType": None,
        }
        # True once this row carried a non-blank value in a billable column.
        billable_supplied = False
        n_cells = len(raw_row)
        for idx, canon in canonical_by_idx.items():
            if idx >= n_cells:
                continue
            v = raw_row[idx]
            if canon == "date":
                row["date"] = _parse_date(v)
            elif canon == "hours":
                row["hours"] = _parse_hours(v)
            elif canon == "billable":
                row["billable"] = _parse_billable(v)
                billable_supplied = v is not None and str(v).strip() != ""
            elif canon == "billingType":
                row["billingType"] = _parse_billing_type(v)
            else:
                # Free-text fields: stripped string, with blank → None.
                row[canon] = (str(v).strip() if v is not None else None) or None

        # Cross-fill billable from billingType, but only when the row did not
        # state billable explicitly.
        bt = row["billingType"]
        if bt is not None and not billable_supplied:
            if bt in BILLING_TYPE_BILLABLE:
                row["billable"] = True
            elif bt in BILLING_TYPE_LEAVE:
                row["billable"] = False

        out.append(row)
    return out, unrecognised


def _parse_csv(content: bytes) -> tuple[list[dict[str, Any]], list[str]]:
    """Parse CSV bytes into ``(rows, unrecognised_columns)``.

    The first record is the header row. Empty content yields ``([], [])``.
    Raises ``csv.Error`` if the csv module rejects the input.
    """
    # Decode permissively; Zoho exports are usually utf-8 or utf-8-sig.
    text = content.decode("utf-8-sig", errors="replace")
    # newline="" leaves line endings untranslated and lets the csv module do
    # its own record splitting, as the csv documentation requires. Without it
    # a file with bare "\r" line endings arrives as one giant line.
    reader = csv.reader(io.StringIO(text, newline=""))
    headers = next(reader, None)
    # Compare against None: a blank first line is a valid (empty) header row.
    if headers is None:
        return [], []
    # _build_rows consumes the reader fully, so rows are streamed rather than
    # materialised in a second list.
    return _build_rows(reader, headers)


def _parse_xlsx(content: bytes) -> tuple[list[dict[str, Any]], list[str]]:
    """Parse xlsx bytes into ``(rows, unrecognised_columns)``.

    Only the workbook's active sheet is read; its first row is the header
    row. A workbook with no active sheet, or an empty sheet, yields
    ``([], [])``. The workbook is opened read-only with ``data_only=True``,
    so cells are read as values rather than formulas.
    """
    wb = load_workbook(io.BytesIO(content), read_only=True, data_only=True)
    try:
        ws = wb.active
        if ws is None:
            return [], []
        rows_iter = ws.iter_rows(values_only=True)
        header_row = next(rows_iter, None)
        if header_row is None:
            return [], []
        data = (list(r) for r in rows_iter)
        # _build_rows drains `data` before returning, so closing the workbook
        # in `finally` cannot cut the lazy row iterator short.
        return _build_rows(data, list(header_row))
    finally:
        # Read-only workbooks load lazily and must be closed explicitly.
        wb.close()