Replaces a static SPA that shipped an Airtable PAT in the JS bundle.
The new architecture holds all secrets server-side, fronts the app
behind Apache on optical-dev with the shared-vhost split-build pattern,
and is designed for a later Azure AD/MSAL swap-in.
- backend/ FastAPI + uvicorn, local auth (Azure AD stub), Airtable
proxy with TTL cache, Zoho .xlsx/.csv parser, merge
service for utilisation summaries. 28 pytest tests.
- frontend/ React + Vite + TS + Tailwind + Recharts SPA. Login entry
chunk 12.83 KB gzipped; Recharts lazy-loaded. No tokens
or Airtable URLs in the built bundle.
- deploy/ Idempotent deploy.sh (port auto-pick 8200-8299,
.env-persisted) + split-build Apache include template.
- docker-compose.yml pins name: utilisation-dept and binds 127.0.0.1.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
202 lines
6.5 KiB
Python
202 lines
6.5 KiB
Python
"""Zoho timelog parser.
|
|
|
|
Decisions:
|
|
- Header matching is case-insensitive and trim-stripped. Unknown headers
|
|
are surfaced in `unrecognised_columns` so the operator notices when
|
|
Zoho silently renames a column.
|
|
- Billable detection: the "Billable", "Is Billable" and "Billing Type" columns
  all map to the same canonical key and share one truthy set. After trimming
  and lower-casing, "Client Related" / "Fee Related" / "true" / "yes" / "1" /
  "billable" → True, everything else → False.
|
|
- Date parsing tries ISO first, then dateutil for the messy formats Zoho
|
|
occasionally emits ("01/05/2026", "1-May-2026", etc.).
|
|
- For .xlsx we use openpyxl read-only mode — keeps memory low on big files.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
import hashlib
|
|
import io
|
|
import logging
|
|
from datetime import date, datetime
|
|
from typing import Any, Iterable
|
|
|
|
from dateutil import parser as dateparser
|
|
from openpyxl import load_workbook
|
|
|
|
|
|
logger = logging.getLogger(__name__)


# Lookup table used to recognise spreadsheet headers.  Each canonical field
# name maps to the header spellings accepted for it; headers are compared
# after .strip().lower(), so every alias here is written in lower case.
HEADER_ALIASES: dict[str, set[str]] = {
    "date": {
        "date",
        "log date",
        "time log start",
        "start date",
    },
    "employee": {
        "resource name",
        "resource",
        "employee",
        "user",
        "name",
    },
    "project": {
        "project title",
        "project name",
        "project",
    },
    "task": {
        "task description",
        "task",
        "description",
    },
    "hours": {
        "hours logged",
        "total hours",
        "hours",
        "time logged",
        "actual logged",
    },
    "billable": {
        "billable",
        "is billable",
        "billing type",
    },
}

# Cell values (lower-cased, trimmed) that mark a time entry as billable.
BILLABLE_TRUE_VALUES = {
    "client related",
    "fee related",
    "true",
    "yes",
    "1",
    "billable",
}
|
|
|
|
|
|
def _canonicalise_header(raw: str) -> str | None:
|
|
if raw is None:
|
|
return None
|
|
key = str(raw).strip().lower()
|
|
if not key:
|
|
return None
|
|
for canonical, aliases in HEADER_ALIASES.items():
|
|
if key in aliases:
|
|
return canonical
|
|
return None
|
|
|
|
|
|
def _parse_date(v: Any) -> date | None:
|
|
if v is None or v == "":
|
|
return None
|
|
if isinstance(v, date) and not isinstance(v, datetime):
|
|
return v
|
|
if isinstance(v, datetime):
|
|
return v.date()
|
|
try:
|
|
# ISO short-circuit
|
|
return date.fromisoformat(str(v)[:10])
|
|
except ValueError:
|
|
pass
|
|
try:
|
|
# dayfirst=True because Zoho regional defaults are commonly DD/MM.
|
|
return dateparser.parse(str(v), dayfirst=True).date()
|
|
except (ValueError, TypeError, OverflowError):
|
|
return None
|
|
|
|
|
|
def _parse_hours(v: Any) -> float:
|
|
if v is None or v == "":
|
|
return 0.0
|
|
if isinstance(v, (int, float)):
|
|
return float(v)
|
|
s = str(v).strip()
|
|
# Zoho sometimes outputs "7:30" (HH:MM). Convert.
|
|
if ":" in s and all(p.isdigit() for p in s.split(":") if p):
|
|
parts = s.split(":")
|
|
try:
|
|
h = int(parts[0])
|
|
m = int(parts[1]) if len(parts) > 1 else 0
|
|
return h + m / 60.0
|
|
except ValueError:
|
|
pass
|
|
try:
|
|
return float(s.replace(",", ""))
|
|
except ValueError:
|
|
return 0.0
|
|
|
|
|
|
def _parse_billable(v: Any, *, source_header_canonical: str | None = None) -> bool:
|
|
# source_header_canonical only matters for "billable" — both columns
|
|
# canonicalise to that key but we want different semantics. We accept
|
|
# either bool, the special billing-type strings, or generic yes/no.
|
|
if v is None:
|
|
return False
|
|
if isinstance(v, bool):
|
|
return v
|
|
if isinstance(v, (int, float)):
|
|
return bool(v)
|
|
s = str(v).strip().lower()
|
|
if not s:
|
|
return False
|
|
return s in BILLABLE_TRUE_VALUES
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Public API
|
|
# ----------------------------------------------------------------------
|
|
|
|
def parse(filename: str, content: bytes) -> dict[str, Any]:
    """Parse an uploaded timelog export.

    The parser is chosen from the (case-insensitive) filename extension:
    ``.xlsx`` / ``.xlsm`` use the workbook parser, ``.csv`` / ``.txt`` use the
    CSV parser.  For any other or missing filename the content is sniffed:
    data starting with the ZIP local-file-header signature is treated as a
    workbook (xlsx files are ZIP archives); everything else is tried as CSV,
    with the workbook parser as a last resort if the csv module rejects it.

    Args:
        filename: Name of the uploaded file; may be empty or ``None``.
        content: Raw bytes of the upload.

    Returns:
        A dict with keys ``"rows"`` (list of parsed row dicts),
        ``"unrecognised_columns"`` (header labels that matched no alias) and
        ``"content_hash"`` (``"sha256:<hex digest>"`` of ``content``).

    Raises:
        Whatever the selected parser raises for malformed input.
    """
    fn = (filename or "").lower()
    if fn.endswith((".xlsx", ".xlsm")):
        rows, unknown = _parse_xlsx(content)
    elif fn.endswith((".csv", ".txt")):
        rows, unknown = _parse_csv(content)
    elif content.startswith(b"PK\x03\x04"):
        # Sniff by magic bytes rather than "try CSV and see if it throws":
        # the CSV path decodes with errors="replace", so binary workbook data
        # can parse "successfully" as garbage rows and never reach the xlsx
        # fallback.
        rows, unknown = _parse_xlsx(content)
    else:
        try:
            rows, unknown = _parse_csv(content)
        except csv.Error:
            # Best-effort last resort, kept from the original behaviour.
            rows, unknown = _parse_xlsx(content)

    digest = hashlib.sha256(content).hexdigest()
    return {
        "rows": rows,
        "unrecognised_columns": unknown,
        "content_hash": f"sha256:{digest}",
    }
|
|
|
|
|
|
def _build_rows(raw_rows: Iterable[list[Any]], headers: list[Any]) -> tuple[list[dict[str, Any]], list[str]]:
|
|
# Map column index → canonical key. Track unknown ones.
|
|
canonical_by_idx: dict[int, str] = {}
|
|
unrecognised: list[str] = []
|
|
for idx, raw in enumerate(headers):
|
|
if raw is None or str(raw).strip() == "":
|
|
continue
|
|
canon = _canonicalise_header(raw)
|
|
if canon:
|
|
canonical_by_idx[idx] = canon
|
|
else:
|
|
unrecognised.append(str(raw).strip())
|
|
|
|
out: list[dict[str, Any]] = []
|
|
for raw_row in raw_rows:
|
|
if not raw_row or all(c in (None, "") for c in raw_row):
|
|
continue
|
|
row: dict[str, Any] = {
|
|
"date": None,
|
|
"employee": None,
|
|
"project": None,
|
|
"task": None,
|
|
"hours": 0.0,
|
|
"billable": False,
|
|
}
|
|
for idx, canon in canonical_by_idx.items():
|
|
if idx >= len(raw_row):
|
|
continue
|
|
v = raw_row[idx]
|
|
if canon == "date":
|
|
row["date"] = _parse_date(v)
|
|
elif canon == "hours":
|
|
row["hours"] = _parse_hours(v)
|
|
elif canon == "billable":
|
|
row["billable"] = _parse_billable(v)
|
|
else:
|
|
row[canon] = (str(v).strip() if v is not None else None) or None
|
|
out.append(row)
|
|
return out, unrecognised
|
|
|
|
|
|
def _parse_csv(content: bytes) -> tuple[list[dict[str, Any]], list[str]]:
|
|
# Decode permissively; Zoho exports are usually utf-8 or utf-8-sig.
|
|
text = content.decode("utf-8-sig", errors="replace")
|
|
reader = csv.reader(io.StringIO(text))
|
|
rows = list(reader)
|
|
if not rows:
|
|
return [], []
|
|
headers = rows[0]
|
|
data = rows[1:]
|
|
return _build_rows(data, headers)
|
|
|
|
|
|
def _parse_xlsx(content: bytes) -> tuple[list[dict[str, Any]], list[str]]:
    """Parse the active sheet of an .xlsx workbook into canonical rows.

    The workbook is opened in read-only mode with ``data_only=True``.  The
    first row of the active sheet is the header row; the rest are data.

    Args:
        content: Raw bytes of the workbook file.

    Returns:
        ``(rows, unrecognised_columns)`` as produced by ``_build_rows``, or
        ``([], [])`` when there is no active sheet or the sheet has no rows.

    Raises:
        Whatever ``load_workbook`` raises for content it cannot open.
    """
    wb = load_workbook(io.BytesIO(content), read_only=True, data_only=True)
    try:
        ws = wb.active
        if ws is None:
            return [], []
        rows_iter = ws.iter_rows(values_only=True)
        first = next(rows_iter, None)
        if first is None:
            return [], []
        headers = list(first)
        data = (list(r) for r in rows_iter)
        # _build_rows drains this lazy generator into a list before it
        # returns, so closing the workbook in ``finally`` is safe.
        return _build_rows(data, headers)
    finally:
        # Read-only workbooks must be closed explicitly.
        wb.close()
|