"""Parse the GMAL Excel workbook into database records.""" import logging from pathlib import Path import openpyxl from sqlalchemy import text from sqlalchemy.orm import Session from app.models.gmal import GmalAsset, Role, GmalHours, GmalServiceLine, RoleLevelMapping, MODEL_TYPE_MAP, ModelType logger = logging.getLogger(__name__) PROGRAMME_ROLE_KEYWORDS = [ "programme director", "global programme director", "regional / national operations director", "head of creative services", "head of project management", "creative services director", ] def parse_gmal_workbook(filepath: str, db: Session) -> dict: """Parse the full GMAL Excel workbook and load all data into the database. Returns a dict with counts of loaded records. """ wb = openpyxl.load_workbook(filepath, data_only=True) result = { "assets_loaded": 0, "roles_loaded": 0, "hours_loaded": 0, "service_lines_loaded": 0, "role_mappings_loaded": 0, } # Clear existing data (full re-ingest) db.execute(text("DELETE FROM gmal_hours")) db.execute(text("DELETE FROM gmal_service_lines")) db.execute(text("DELETE FROM role_level_mappings")) db.execute(text("DELETE FROM roles")) db.execute(text("DELETE FROM ratecard_lines")) db.execute(text("DELETE FROM matches")) db.execute(text("DELETE FROM client_assets")) db.execute(text("DELETE FROM projects")) db.execute(text("DELETE FROM gmal_assets")) db.flush() # Step 1: Load asset catalog from "Unilever_GMAL Asset List" asset_map = _load_asset_list(wb, db) result["assets_loaded"] = len(asset_map) # Step 2: Load roles and hours from "GMAL Routes_Jan25" roles_loaded, hours_loaded, extra_assets = _load_routes(wb, db, asset_map) result["roles_loaded"] = roles_loaded result["hours_loaded"] = hours_loaded result["assets_loaded"] += extra_assets # Step 3: Load service lines result["service_lines_loaded"] = _load_service_lines(wb, db) # Step 4: Load role-level mappings result["role_mappings_loaded"] = _load_role_mappings(wb, db) # Populate full-text search vectors db.execute(text(""" UPDATE gmal_assets SET search_vector = setweight(to_tsvector('english', coalesce(asset_name, '')), 'A') || setweight(to_tsvector('english', coalesce(unique_name, '')), 'A') || setweight(to_tsvector('english', coalesce(sub_category, '')), 'B') || setweight(to_tsvector('english', coalesce(asset_description, '')), 'C') || setweight(to_tsvector('english', coalesce(complexity_description, '')), 'C') """)) db.commit() logger.info(f"Ingestion complete: {result}") return result def _load_asset_list(wb: openpyxl.Workbook, db: Session) -> dict[str, GmalAsset]: """Load from 'Unilever_GMAL Asset List' sheet. Returns {gmal_id: GmalAsset}.""" ws = wb["Unilever_GMAL Asset List"] asset_map = {} for row_idx in range(2, ws.max_row + 1): gmal_id = ws.cell(row=row_idx, column=2).value if not gmal_id: continue gmal_id = str(gmal_id).strip() complexity_level = ws.cell(row=row_idx, column=8).value complexity_name_val = ws.cell(row=row_idx, column=9).value asset = GmalAsset( gmal_id=gmal_id, swop_asset_id=_str(ws.cell(row=row_idx, column=3).value), region=_str(ws.cell(row=row_idx, column=1).value), category=_str(ws.cell(row=row_idx, column=4).value), sub_category=_str(ws.cell(row=row_idx, column=5).value), sub_category_description=_str(ws.cell(row=row_idx, column=6).value), asset_name=_str(ws.cell(row=row_idx, column=7).value), complexity_level=int(complexity_level) if complexity_level else None, complexity_name=_str(complexity_name_val), unique_name=_str(ws.cell(row=row_idx, column=10).value), asset_description=_str(ws.cell(row=row_idx, column=11).value), complexity_description=_str(ws.cell(row=row_idx, column=12).value), caveats=_str(ws.cell(row=row_idx, column=13).value), has_hour_routes=False, ) db.add(asset) asset_map[gmal_id] = asset db.flush() logger.info(f"Loaded {len(asset_map)} assets from Asset List") return asset_map def _load_routes(wb: openpyxl.Workbook, db: Session, asset_map: dict[str, GmalAsset]) -> tuple[int, int, int]: """Load roles and hours from 'GMAL Routes_Jan25' sheet. Returns (roles_loaded, hours_loaded, extra_assets_created). """ ws = wb["GMAL Routes_Jan25"] # Detect model type column boundaries from row 1 model_sections = [] # [(start_col, end_col, ModelType)] current_header = None section_start = None for col_idx in range(13, ws.max_column + 1): header = ws.cell(row=1, column=col_idx).value if header and header != current_header: if current_header and section_start: mt = MODEL_TYPE_MAP.get(current_header) if mt: model_sections.append((section_start, col_idx - 1, mt)) current_header = header section_start = col_idx # Don't forget the last section if current_header and section_start: mt = MODEL_TYPE_MAP.get(current_header) if mt: model_sections.append((section_start, ws.max_column, mt)) logger.info(f"Detected {len(model_sections)} model type sections") for start, end, mt in model_sections: logger.info(f" {mt.value}: cols {start}-{end}") # Load roles from rows 13-137 (column B=discipline, C=title, D=entity, E=location, F=unique) # Map row_idx -> Role for hour lookups; use unique_name as dedup key roles = {} # {(role_title, entity): Role} row_to_role = {} # {row_idx: Role} for hour lookups for row_idx in range(13, 138): discipline = ws.cell(row=row_idx, column=2).value role_title = ws.cell(row=row_idx, column=3).value if not role_title: continue entity = _str(ws.cell(row=row_idx, column=4).value) location = _str(ws.cell(row=row_idx, column=5).value) unique_name = _str(ws.cell(row=row_idx, column=6).value) key = (str(role_title).strip(), entity) # Skip duplicates - reuse existing role for hour lookups if key in roles: row_to_role[row_idx] = roles[key] continue is_prog = any(kw in str(role_title).lower() for kw in PROGRAMME_ROLE_KEYWORDS) role = Role( discipline=_str(discipline) or "Other", role_title=_str(role_title), entity=entity, resource_location=location, unique_name=unique_name, sort_order=row_idx - 12, is_programme_role=is_prog, ) db.add(role) roles[key] = role row_to_role[row_idx] = role db.flush() logger.info(f"Loaded {len(roles)} roles") # For each model type section, build a mapping of column -> gmal_id # Then load hours for each (role, gmal, model_type) combo hours_count = 0 extra_assets = 0 # Track which (gmal_id, role_id, model_type) combos we've already inserted # to handle duplicate columns (Pro vs non-Pro) for same GMAL within a section seen_hours = set() for section_start, section_end, model_type in model_sections: # Build column -> gmal_id map for this section # Include Pro variant columns - they contain the actual AI model data # Extract the base GMAL ID from row 2 (e.g. "GMAL101" from both base and Pro cols) col_gmal_map = {} for col_idx in range(section_start, section_end + 1): gmal_id = ws.cell(row=2, column=col_idx).value if gmal_id and str(gmal_id).startswith("GMAL"): base_id = str(gmal_id).strip() col_gmal_map[col_idx] = base_id # Enrich asset_map with metadata from Routes if not in Asset List for col_idx, gmal_id in col_gmal_map.items(): if gmal_id not in asset_map: # Create asset from Routes metadata asset = GmalAsset( gmal_id=gmal_id, sub_category=_str(ws.cell(row=4, column=col_idx).value), asset_name=_str(ws.cell(row=5, column=col_idx).value), asset_description=_str(ws.cell(row=6, column=col_idx).value), complexity_description=_str(ws.cell(row=7, column=col_idx).value), caveats=_str(ws.cell(row=8, column=col_idx).value), complexity_level=_int(ws.cell(row=9, column=col_idx).value), master_adapt=_str(ws.cell(row=10, column=col_idx).value), ai_efficiency_pct=_float(ws.cell(row=11, column=col_idx).value), has_hour_routes=True, ) db.add(asset) db.flush() asset_map[gmal_id] = asset extra_assets += 1 else: # Update Routes-specific fields on existing asset asset = asset_map[gmal_id] asset.has_hour_routes = True if not asset.master_adapt: asset.master_adapt = _str(ws.cell(row=10, column=col_idx).value) if not asset.ai_efficiency_pct: asset.ai_efficiency_pct = _float(ws.cell(row=11, column=col_idx).value) if not asset.sub_category: asset.sub_category = _str(ws.cell(row=4, column=col_idx).value) # Now load hours for each role x gmal in this section for row_idx in range(13, 138): role = row_to_role.get(row_idx) if not role: continue for col_idx, gmal_id in col_gmal_map.items(): hours_val = ws.cell(row=row_idx, column=col_idx).value if hours_val and _float(hours_val) and _float(hours_val) != 0: asset = asset_map.get(gmal_id) if not asset: continue # Deduplicate: skip if we already have hours for this combo key = (asset.id, role.id, model_type) if key in seen_hours: continue seen_hours.add(key) gh = GmalHours( gmal_asset_id=asset.id, role_id=role.id, model_type=model_type, hours=round(_float(hours_val), 2), ) db.add(gh) hours_count += 1 # Flush every 5000 to manage memory if hours_count % 5000 == 0: db.flush() db.flush() logger.info(f"Loaded {hours_count} hour records across {len(model_sections)} model types") return len(roles), hours_count, extra_assets def _load_service_lines(wb: openpyxl.Workbook, db: Session) -> int: """Load from 'GMAL SERVICE LINES' sheet.""" ws = wb["GMAL SERVICE LINES"] count = 0 for row_idx in range(2, ws.max_row + 1): number = ws.cell(row=row_idx, column=1).value name = ws.cell(row=row_idx, column=2).value if not name: continue sl = GmalServiceLine( number=_str(number), name=_str(name), type=_str(ws.cell(row=row_idx, column=3).value), gmal_id=_str(ws.cell(row=row_idx, column=4).value), ) db.add(sl) count += 1 db.flush() logger.info(f"Loaded {count} service lines") return count def _load_role_mappings(wb: openpyxl.Workbook, db: Session) -> int: """Load from 'Job role to Level mapping' sheet.""" ws = wb["Job role to Level mapping"] count = 0 for row_idx in range(3, ws.max_row + 1): role_name = ws.cell(row=row_idx, column=1).value if not role_name: continue rlm = RoleLevelMapping( role_name=_str(role_name), number=_str(ws.cell(row=row_idx, column=3).value), level_name=_str(ws.cell(row=row_idx, column=4).value), type=_str(ws.cell(row=row_idx, column=5).value), ) db.add(rlm) count += 1 db.flush() logger.info(f"Loaded {count} role-level mappings") return count def _str(val) -> str | None: if val is None: return None s = str(val).strip() return s if s else None def _int(val) -> int | None: if val is None: return None try: return int(val) except (ValueError, TypeError): return None def _float(val) -> float | None: if val is None: return None try: return float(val) except (ValueError, TypeError): return None