loreal-utilisation-dept/backend/app/services/zoho_parse.py
DJP 04edbfdd2c Initial commit: dockerised FastAPI backend + React/Vite frontend rewrite
Replaces a static SPA that shipped an Airtable PAT in the JS bundle.
The new architecture holds all secrets server-side, fronts the app
behind Apache on optical-dev with the shared-vhost split-build pattern,
and is designed for a later Azure AD/MSAL swap-in.

- backend/   FastAPI + uvicorn, local auth (Azure AD stub), Airtable
             proxy with TTL cache, Zoho .xlsx/.csv parser, merge
             service for utilisation summaries. 28 pytest tests.
- frontend/  React + Vite + TS + Tailwind + Recharts SPA. Login entry
             chunk 12.83 KB gzipped; Recharts lazy-loaded. No tokens
             or Airtable URLs in the built bundle.
- deploy/    Idempotent deploy.sh (port auto-pick 8200-8299,
             .env-persisted) + split-build Apache include template.
- docker-compose.yml pins name: utilisation-dept and binds 127.0.0.1.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 12:37:04 -04:00

202 lines
6.5 KiB
Python

"""Zoho timelog parser.
Decisions:
- Header matching is case-insensitive and trim-stripped. Unknown headers
are surfaced in `unrecognised_columns` so the operator notices when
Zoho silently renames a column.
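- Billable detection: the "Billable", "Is Billable" and "Billing Type" columns
all map to the canonical `billable` key and share one rule: booleans and
numbers are taken by truthiness, and a string is billable when, trimmed and
lower-cased, it is one of "client related", "fee related", "true", "yes",
"1" or "billable"; everything else → False.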
- Date parsing tries ISO first, then dateutil for the messy formats Zoho
occasionally emits ("01/05/2026", "1-May-2026", etc.).
- For .xlsx we use openpyxl read-only mode — keeps memory low on big files.
"""
from __future__ import annotations
import csv
import hashlib
import io
import logging
from datetime import date, datetime
from typing import Any, Iterable
from dateutil import parser as dateparser
from openpyxl import load_workbook
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Canonical name → set of accepted aliases (compared after .strip().lower()).
# Every alias must therefore be written trimmed and lower-case. A header that
# matches no alias is reported back to the caller as unrecognised.
HEADER_ALIASES: dict[str, set[str]] = {
"date": {"date", "log date", "time log start", "start date"},
"employee": {"resource name", "resource", "employee", "user", "name"},
"project": {"project title", "project name", "project"},
"task": {"task description", "task", "description"},
"hours": {"hours logged", "total hours", "hours", "time logged", "actual logged"},
# Boolean-style and billing-type columns share the same canonical key.
"billable": {"billable", "is billable", "billing type"},
}
# Trimmed, lower-cased cell values that count as billable; any other string
# value is treated as non-billable by _parse_billable.
BILLABLE_TRUE_VALUES = {"client related", "fee related", "true", "yes", "1", "billable"}
def _canonicalise_header(raw: str) -> str | None:
if raw is None:
return None
key = str(raw).strip().lower()
if not key:
return None
for canonical, aliases in HEADER_ALIASES.items():
if key in aliases:
return canonical
return None
def _parse_date(v: Any) -> date | None:
if v is None or v == "":
return None
if isinstance(v, date) and not isinstance(v, datetime):
return v
if isinstance(v, datetime):
return v.date()
try:
# ISO short-circuit
return date.fromisoformat(str(v)[:10])
except ValueError:
pass
try:
# dayfirst=True because Zoho regional defaults are commonly DD/MM.
return dateparser.parse(str(v), dayfirst=True).date()
except (ValueError, TypeError, OverflowError):
return None
def _parse_hours(v: Any) -> float:
if v is None or v == "":
return 0.0
if isinstance(v, (int, float)):
return float(v)
s = str(v).strip()
# Zoho sometimes outputs "7:30" (HH:MM). Convert.
if ":" in s and all(p.isdigit() for p in s.split(":") if p):
parts = s.split(":")
try:
h = int(parts[0])
m = int(parts[1]) if len(parts) > 1 else 0
return h + m / 60.0
except ValueError:
pass
try:
return float(s.replace(",", ""))
except ValueError:
return 0.0
def _parse_billable(v: Any, *, source_header_canonical: str | None = None) -> bool:
# source_header_canonical only matters for "billable" — both columns
# canonicalise to that key but we want different semantics. We accept
# either bool, the special billing-type strings, or generic yes/no.
if v is None:
return False
if isinstance(v, bool):
return v
if isinstance(v, (int, float)):
return bool(v)
s = str(v).strip().lower()
if not s:
return False
return s in BILLABLE_TRUE_VALUES
# ----------------------------------------------------------------------
# Public API
# ----------------------------------------------------------------------
def parse(filename: str, content: bytes) -> dict[str, Any]:
    """Parse an uploaded timelog file.

    The format is chosen from the file extension (case-insensitive):
    ``.xlsx`` / ``.xlsm`` are read as workbooks, ``.csv`` / ``.txt`` as CSV.
    For any other (or missing) name the content itself is sniffed: an
    ``.xlsx`` file is a ZIP archive, so content starting with the ZIP
    local-file signature is read as a workbook and everything else as CSV.
    Relying on the CSV parser to fail is not enough, because CSV decoding
    replaces undecodable bytes rather than raising, so binary workbook data
    can come back as garbage rows.

    Args:
        filename: Original upload name; may be empty or ``None``.
        content: Raw file bytes.

    Returns:
        A dict with:
        - ``"rows"``: list of normalised row dicts,
        - ``"unrecognised_columns"``: header names that matched no alias,
        - ``"content_hash"``: ``"sha256:<hex digest>"`` of ``content``.

    Raises:
        Whatever the underlying CSV / workbook parser raises for content it
        cannot read.
    """
    fn = (filename or "").lower()
    if fn.endswith((".xlsx", ".xlsm")):
        rows, unknown = _parse_xlsx(content)
    elif fn.endswith((".csv", ".txt")):
        rows, unknown = _parse_csv(content)
    elif content.startswith(b"PK\x03\x04"):
        # ZIP local-file header: treat as an Office Open XML workbook.
        rows, unknown = _parse_xlsx(content)
    else:
        # Best-effort: try CSV first, fall back to xlsx.
        try:
            rows, unknown = _parse_csv(content)
        except Exception:
            logger.warning(
                "CSV parse failed for %r; retrying as xlsx", filename, exc_info=True
            )
            rows, unknown = _parse_xlsx(content)
    digest = hashlib.sha256(content).hexdigest()
    return {
        "rows": rows,
        "unrecognised_columns": unknown,
        "content_hash": f"sha256:{digest}",
    }
def _build_rows(raw_rows: Iterable[list[Any]], headers: list[Any]) -> tuple[list[dict[str, Any]], list[str]]:
# Map column index → canonical key. Track unknown ones.
canonical_by_idx: dict[int, str] = {}
unrecognised: list[str] = []
for idx, raw in enumerate(headers):
if raw is None or str(raw).strip() == "":
continue
canon = _canonicalise_header(raw)
if canon:
canonical_by_idx[idx] = canon
else:
unrecognised.append(str(raw).strip())
out: list[dict[str, Any]] = []
for raw_row in raw_rows:
if not raw_row or all(c in (None, "") for c in raw_row):
continue
row: dict[str, Any] = {
"date": None,
"employee": None,
"project": None,
"task": None,
"hours": 0.0,
"billable": False,
}
for idx, canon in canonical_by_idx.items():
if idx >= len(raw_row):
continue
v = raw_row[idx]
if canon == "date":
row["date"] = _parse_date(v)
elif canon == "hours":
row["hours"] = _parse_hours(v)
elif canon == "billable":
row["billable"] = _parse_billable(v)
else:
row[canon] = (str(v).strip() if v is not None else None) or None
out.append(row)
return out, unrecognised
def _parse_csv(content: bytes) -> tuple[list[dict[str, Any]], list[str]]:
# Decode permissively; Zoho exports are usually utf-8 or utf-8-sig.
text = content.decode("utf-8-sig", errors="replace")
reader = csv.reader(io.StringIO(text))
rows = list(reader)
if not rows:
return [], []
headers = rows[0]
data = rows[1:]
return _build_rows(data, headers)
def _parse_xlsx(content: bytes) -> tuple[list[dict[str, Any]], list[str]]:
    """Parse workbook bytes into normalised rows.

    The workbook is opened in openpyxl read-only mode with ``data_only=True``
    (cell values rather than formulas). Only the active sheet is read; its
    first row is the header row.

    Returns:
        ``(rows, unrecognised_columns)``; ``([], [])`` when there is no
        active sheet or the sheet has no rows.
    """
    wb = load_workbook(io.BytesIO(content), read_only=True, data_only=True)
    try:
        ws = wb.active
        if ws is None:
            return [], []
        rows_iter = ws.iter_rows(values_only=True)
        first = next(rows_iter, None)
        if first is None:
            return [], []
        headers = list(first)
        data = (list(r) for r in rows_iter)
        # _build_rows drains the lazy row generator before returning, so the
        # workbook can be closed safely once it comes back.
        return _build_rows(data, headers)
    finally:
        # Read-only workbooks load lazily and must be closed explicitly to
        # release the underlying archive.
        wb.close()