gmal-scope-builder/backend/app/services/excel_parser.py
DJP 26d3435be0 Improve matching, upload UX, collapse fix, full catalog approach
- Upload now shows live stage progress (uploading -> extracting -> AI parsing -> done)
- Fix match group collapse: proper React state instead of DOM manipulation
- Replace pre-filter with full GMAL catalog sent to Claude (~3k tokens, <$0.01)
  - FTS and keyword matching missed too many semantic matches
  - Claude now sees all 243 assets and uses semantic understanding
- Improved system prompt with terminology bridges for better scoring
- Per-project AI cost tracking persisted to DB
- Parallel matching with cancel support
- Auto-select matches >= 80%, YOLO button for rest
- Debug panel for AI call inspection

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 19:22:08 -04:00

346 lines
13 KiB
Python

"""Parse the GMAL Excel workbook into database records."""
import logging
from pathlib import Path
import openpyxl
from sqlalchemy import text
from sqlalchemy.orm import Session
from app.models.gmal import GmalAsset, Role, GmalHours, GmalServiceLine, RoleLevelMapping, MODEL_TYPE_MAP, ModelType
logger = logging.getLogger(__name__)
PROGRAMME_ROLE_KEYWORDS = [
"programme director", "global programme director",
"regional / national operations director",
"head of creative services", "head of project management",
"creative services director",
]
def parse_gmal_workbook(filepath: str, db: Session) -> dict:
"""Parse the full GMAL Excel workbook and load all data into the database.
Returns a dict with counts of loaded records.
"""
wb = openpyxl.load_workbook(filepath, data_only=True)
result = {
"assets_loaded": 0,
"roles_loaded": 0,
"hours_loaded": 0,
"service_lines_loaded": 0,
"role_mappings_loaded": 0,
}
# Clear existing data (full re-ingest)
db.execute(text("DELETE FROM gmal_hours"))
db.execute(text("DELETE FROM gmal_service_lines"))
db.execute(text("DELETE FROM role_level_mappings"))
db.execute(text("DELETE FROM roles"))
db.execute(text("DELETE FROM ratecard_lines"))
db.execute(text("DELETE FROM matches"))
db.execute(text("DELETE FROM client_assets"))
db.execute(text("DELETE FROM projects"))
db.execute(text("DELETE FROM gmal_assets"))
db.flush()
# Step 1: Load asset catalog from "Unilever_GMAL Asset List"
asset_map = _load_asset_list(wb, db)
result["assets_loaded"] = len(asset_map)
# Step 2: Load roles and hours from "GMAL Routes_Jan25"
roles_loaded, hours_loaded, extra_assets = _load_routes(wb, db, asset_map)
result["roles_loaded"] = roles_loaded
result["hours_loaded"] = hours_loaded
result["assets_loaded"] += extra_assets
# Step 3: Load service lines
result["service_lines_loaded"] = _load_service_lines(wb, db)
# Step 4: Load role-level mappings
result["role_mappings_loaded"] = _load_role_mappings(wb, db)
# Populate full-text search vectors
db.execute(text("""
UPDATE gmal_assets SET search_vector =
setweight(to_tsvector('english', coalesce(asset_name, '')), 'A') ||
setweight(to_tsvector('english', coalesce(unique_name, '')), 'A') ||
setweight(to_tsvector('english', coalesce(sub_category, '')), 'B') ||
setweight(to_tsvector('english', coalesce(asset_description, '')), 'C') ||
setweight(to_tsvector('english', coalesce(complexity_description, '')), 'C')
"""))
db.commit()
logger.info(f"Ingestion complete: {result}")
return result
def _load_asset_list(wb: openpyxl.Workbook, db: Session) -> dict[str, GmalAsset]:
"""Load from 'Unilever_GMAL Asset List' sheet. Returns {gmal_id: GmalAsset}."""
ws = wb["Unilever_GMAL Asset List"]
asset_map = {}
for row_idx in range(2, ws.max_row + 1):
gmal_id = ws.cell(row=row_idx, column=2).value
if not gmal_id:
continue
gmal_id = str(gmal_id).strip()
complexity_level = ws.cell(row=row_idx, column=8).value
complexity_name_val = ws.cell(row=row_idx, column=9).value
asset = GmalAsset(
gmal_id=gmal_id,
swop_asset_id=_str(ws.cell(row=row_idx, column=3).value),
region=_str(ws.cell(row=row_idx, column=1).value),
category=_str(ws.cell(row=row_idx, column=4).value),
sub_category=_str(ws.cell(row=row_idx, column=5).value),
sub_category_description=_str(ws.cell(row=row_idx, column=6).value),
asset_name=_str(ws.cell(row=row_idx, column=7).value),
complexity_level=int(complexity_level) if complexity_level else None,
complexity_name=_str(complexity_name_val),
unique_name=_str(ws.cell(row=row_idx, column=10).value),
asset_description=_str(ws.cell(row=row_idx, column=11).value),
complexity_description=_str(ws.cell(row=row_idx, column=12).value),
caveats=_str(ws.cell(row=row_idx, column=13).value),
has_hour_routes=False,
)
db.add(asset)
asset_map[gmal_id] = asset
db.flush()
logger.info(f"Loaded {len(asset_map)} assets from Asset List")
return asset_map
def _load_routes(wb: openpyxl.Workbook, db: Session, asset_map: dict[str, GmalAsset]) -> tuple[int, int, int]:
"""Load roles and hours from 'GMAL Routes_Jan25' sheet.
Returns (roles_loaded, hours_loaded, extra_assets_created).
"""
ws = wb["GMAL Routes_Jan25"]
# Detect model type column boundaries from row 1
model_sections = [] # [(start_col, end_col, ModelType)]
current_header = None
section_start = None
for col_idx in range(13, ws.max_column + 1):
header = ws.cell(row=1, column=col_idx).value
if header and header != current_header:
if current_header and section_start:
mt = MODEL_TYPE_MAP.get(current_header)
if mt:
model_sections.append((section_start, col_idx - 1, mt))
current_header = header
section_start = col_idx
# Don't forget the last section
if current_header and section_start:
mt = MODEL_TYPE_MAP.get(current_header)
if mt:
model_sections.append((section_start, ws.max_column, mt))
logger.info(f"Detected {len(model_sections)} model type sections")
for start, end, mt in model_sections:
logger.info(f" {mt.value}: cols {start}-{end}")
# Load roles from rows 13-137 (column B=discipline, C=title, D=entity, E=location, F=unique)
# Map row_idx -> Role for hour lookups; use unique_name as dedup key
roles = {} # {(role_title, entity): Role}
row_to_role = {} # {row_idx: Role} for hour lookups
for row_idx in range(13, 138):
discipline = ws.cell(row=row_idx, column=2).value
role_title = ws.cell(row=row_idx, column=3).value
if not role_title:
continue
entity = _str(ws.cell(row=row_idx, column=4).value)
location = _str(ws.cell(row=row_idx, column=5).value)
unique_name = _str(ws.cell(row=row_idx, column=6).value)
key = (str(role_title).strip(), entity)
# Skip duplicates - reuse existing role for hour lookups
if key in roles:
row_to_role[row_idx] = roles[key]
continue
is_prog = any(kw in str(role_title).lower() for kw in PROGRAMME_ROLE_KEYWORDS)
role = Role(
discipline=_str(discipline) or "Other",
role_title=_str(role_title),
entity=entity,
resource_location=location,
unique_name=unique_name,
sort_order=row_idx - 12,
is_programme_role=is_prog,
)
db.add(role)
roles[key] = role
row_to_role[row_idx] = role
db.flush()
logger.info(f"Loaded {len(roles)} roles")
# For each model type section, build a mapping of column -> gmal_id
# Then load hours for each (role, gmal, model_type) combo
hours_count = 0
extra_assets = 0
# Track which (gmal_id, role_id, model_type) combos we've already inserted
# to handle duplicate columns (Pro vs non-Pro) for same GMAL within a section
seen_hours = set()
for section_start, section_end, model_type in model_sections:
# Build column -> gmal_id map for this section
# Include Pro variant columns - they contain the actual AI model data
# Extract the base GMAL ID from row 2 (e.g. "GMAL101" from both base and Pro cols)
col_gmal_map = {}
for col_idx in range(section_start, section_end + 1):
gmal_id = ws.cell(row=2, column=col_idx).value
if gmal_id and str(gmal_id).startswith("GMAL"):
base_id = str(gmal_id).strip()
col_gmal_map[col_idx] = base_id
# Enrich asset_map with metadata from Routes if not in Asset List
for col_idx, gmal_id in col_gmal_map.items():
if gmal_id not in asset_map:
# Create asset from Routes metadata
asset = GmalAsset(
gmal_id=gmal_id,
sub_category=_str(ws.cell(row=4, column=col_idx).value),
asset_name=_str(ws.cell(row=5, column=col_idx).value),
asset_description=_str(ws.cell(row=6, column=col_idx).value),
complexity_description=_str(ws.cell(row=7, column=col_idx).value),
caveats=_str(ws.cell(row=8, column=col_idx).value),
complexity_level=_int(ws.cell(row=9, column=col_idx).value),
master_adapt=_str(ws.cell(row=10, column=col_idx).value),
ai_efficiency_pct=_float(ws.cell(row=11, column=col_idx).value),
has_hour_routes=True,
)
db.add(asset)
db.flush()
asset_map[gmal_id] = asset
extra_assets += 1
else:
# Update Routes-specific fields on existing asset
asset = asset_map[gmal_id]
asset.has_hour_routes = True
if not asset.master_adapt:
asset.master_adapt = _str(ws.cell(row=10, column=col_idx).value)
if not asset.ai_efficiency_pct:
asset.ai_efficiency_pct = _float(ws.cell(row=11, column=col_idx).value)
if not asset.sub_category:
asset.sub_category = _str(ws.cell(row=4, column=col_idx).value)
# Now load hours for each role x gmal in this section
for row_idx in range(13, 138):
role = row_to_role.get(row_idx)
if not role:
continue
for col_idx, gmal_id in col_gmal_map.items():
hours_val = ws.cell(row=row_idx, column=col_idx).value
if hours_val and _float(hours_val) and _float(hours_val) != 0:
asset = asset_map.get(gmal_id)
if not asset:
continue
# Deduplicate: skip if we already have hours for this combo
key = (asset.id, role.id, model_type)
if key in seen_hours:
continue
seen_hours.add(key)
gh = GmalHours(
gmal_asset_id=asset.id,
role_id=role.id,
model_type=model_type,
hours=round(_float(hours_val), 2),
)
db.add(gh)
hours_count += 1
# Flush every 5000 to manage memory
if hours_count % 5000 == 0:
db.flush()
db.flush()
logger.info(f"Loaded {hours_count} hour records across {len(model_sections)} model types")
return len(roles), hours_count, extra_assets
def _load_service_lines(wb: openpyxl.Workbook, db: Session) -> int:
"""Load from 'GMAL SERVICE LINES' sheet."""
ws = wb["GMAL SERVICE LINES"]
count = 0
for row_idx in range(2, ws.max_row + 1):
number = ws.cell(row=row_idx, column=1).value
name = ws.cell(row=row_idx, column=2).value
if not name:
continue
sl = GmalServiceLine(
number=_str(number),
name=_str(name),
type=_str(ws.cell(row=row_idx, column=3).value),
gmal_id=_str(ws.cell(row=row_idx, column=4).value),
)
db.add(sl)
count += 1
db.flush()
logger.info(f"Loaded {count} service lines")
return count
def _load_role_mappings(wb: openpyxl.Workbook, db: Session) -> int:
"""Load from 'Job role to Level mapping' sheet."""
ws = wb["Job role to Level mapping"]
count = 0
for row_idx in range(3, ws.max_row + 1):
role_name = ws.cell(row=row_idx, column=1).value
if not role_name:
continue
rlm = RoleLevelMapping(
role_name=_str(role_name),
number=_str(ws.cell(row=row_idx, column=3).value),
level_name=_str(ws.cell(row=row_idx, column=4).value),
type=_str(ws.cell(row=row_idx, column=5).value),
)
db.add(rlm)
count += 1
db.flush()
logger.info(f"Loaded {count} role-level mappings")
return count
def _str(val) -> str | None:
if val is None:
return None
s = str(val).strip()
return s if s else None
def _int(val) -> int | None:
if val is None:
return None
try:
return int(val)
except (ValueError, TypeError):
return None
def _float(val) -> float | None:
if val is None:
return None
try:
return float(val)
except (ValueError, TypeError):
return None