# gmal-scope-builder/backend/app/services/excel_parser.py

"""Parse the GMAL Excel workbook into database records."""

import logging
from pathlib import Path

import openpyxl
from sqlalchemy import text
from sqlalchemy.orm import Session

from app.models.gmal import GmalAsset, Role, GmalHours, GmalServiceLine, RoleLevelMapping, MODEL_TYPE_MAP, ModelType

# Module-level logger shared by every loader in this file.
logger = logging.getLogger(__name__)

# Lower-case fragments that mark a role title as a programme-level role.
# _load_routes tests each one with `in` against the lower-cased title, so an
# entry also matches any longer title that contains it.
PROGRAMME_ROLE_KEYWORDS = [
    "programme director",
    "global programme director",
    "regional / national operations director",
    "head of creative services",
    "head of project management",
    "creative services director",
]


def parse_gmal_workbook(filepath: str, db: Session) -> dict:
    """Parse the full GMAL Excel workbook and load all data into the database.

    This is a destructive full re-ingest: every table listed below is emptied
    before the workbook is loaded. All work happens in the caller's session
    and is committed once at the end; if anything fails, the session is rolled
    back so the deletes are not left pending, and the exception is re-raised.

    Args:
        filepath: Location of the workbook, handed to ``openpyxl.load_workbook``.
        db: SQLAlchemy session used for every delete and insert.

    Returns:
        A dict with the keys ``assets_loaded``, ``roles_loaded``,
        ``hours_loaded``, ``service_lines_loaded`` and ``role_mappings_loaded``.

    Raises:
        Exception: whatever the workbook read or a loader raised (for example
            ``KeyError`` for a missing sheet), re-raised after the rollback.
    """
    # data_only=True makes openpyxl return cached formula results instead of
    # the formula text. The workbook is opened before anything is deleted so
    # an unreadable file leaves the database untouched.
    wb = openpyxl.load_workbook(filepath, data_only=True)
    result = {
        "assets_loaded": 0,
        "roles_loaded": 0,
        "hours_loaded": 0,
        "service_lines_loaded": 0,
        "role_mappings_loaded": 0,
    }

    # Tables emptied for the re-ingest, in the order the deletes are issued.
    # NOTE(review): the order presumably respects foreign keys between these
    # tables - confirm against the schema before reordering.
    tables_to_clear = (
        "gmal_hours",
        "gmal_service_lines",
        "role_level_mappings",
        "roles",
        "ratecard_lines",
        "matches",
        "client_assets",
        "projects",
        "gmal_assets",
    )

    try:
        # Clear existing data (full re-ingest). The table names are the fixed
        # identifiers above, never user input.
        for table in tables_to_clear:
            db.execute(text(f"DELETE FROM {table}"))
        db.flush()

        # Step 1: Load asset catalog from "Unilever_GMAL Asset List"
        asset_map = _load_asset_list(wb, db)
        result["assets_loaded"] = len(asset_map)

        # Step 2: Load roles and hours from "GMAL Routes_Jan25"; assets that
        # only appear on the Routes sheet are added to the asset count.
        roles_loaded, hours_loaded, extra_assets = _load_routes(wb, db, asset_map)
        result["roles_loaded"] = roles_loaded
        result["hours_loaded"] = hours_loaded
        result["assets_loaded"] += extra_assets

        # Step 3: Load service lines
        result["service_lines_loaded"] = _load_service_lines(wb, db)

        # Step 4: Load role-level mappings
        result["role_mappings_loaded"] = _load_role_mappings(wb, db)

        # Populate full-text search vectors
        db.execute(text("""
        UPDATE gmal_assets SET search_vector =
          setweight(to_tsvector('english', coalesce(asset_name, '')), 'A') ||
          setweight(to_tsvector('english', coalesce(unique_name, '')), 'A') ||
          setweight(to_tsvector('english', coalesce(sub_category, '')), 'B') ||
          setweight(to_tsvector('english', coalesce(asset_description, '')), 'C') ||
          setweight(to_tsvector('english', coalesce(complexity_description, '')), 'C')
    """))

        db.commit()
    except Exception:
        # Undo the pending deletes and partial inserts so the caller's session
        # is usable again, then let the original error propagate.
        db.rollback()
        logger.exception("GMAL ingestion failed; transaction rolled back")
        raise

    logger.info("Ingestion complete: %s", result)
    return result


def _load_asset_list(wb: openpyxl.Workbook, db: Session) -> dict[str, GmalAsset]:
    """Load the asset catalogue from the 'Unilever_GMAL Asset List' sheet.

    Rows are read from row 2 to the sheet's last row. Columns, 1-based:
    1 region, 2 GMAL id, 3 SWOP asset id, 4 category, 5 sub-category,
    6 sub-category description, 7 asset name, 8 complexity level,
    9 complexity name, 10 unique name, 11 asset description,
    12 complexity description, 13 caveats.

    Rows whose GMAL id cell is empty (or blank once stripped) are skipped.
    Every asset starts with ``has_hour_routes=False``.

    Args:
        wb: Open workbook containing the sheet.
        db: Session the new ``GmalAsset`` rows are added to and flushed on.

    Returns:
        Mapping of stripped GMAL id to the ``GmalAsset`` created for it. If an
        id occurs on more than one row, the mapping keeps the last one.

    Raises:
        KeyError: if the workbook has no 'Unilever_GMAL Asset List' sheet.
    """
    ws = wb["Unilever_GMAL Asset List"]
    asset_map: dict[str, GmalAsset] = {}

    def value(row: int, column: int):
        return ws.cell(row=row, column=column).value

    for row_idx in range(2, ws.max_row + 1):
        raw_id = value(row_idx, 2)
        if not raw_id:
            continue
        gmal_id = str(raw_id).strip()
        if not gmal_id:
            # Whitespace-only cell: there is no usable id on this row.
            continue

        asset = GmalAsset(
            gmal_id=gmal_id,
            swop_asset_id=_str(value(row_idx, 3)),
            region=_str(value(row_idx, 1)),
            category=_str(value(row_idx, 4)),
            sub_category=_str(value(row_idx, 5)),
            sub_category_description=_str(value(row_idx, 6)),
            asset_name=_str(value(row_idx, 7)),
            # _int (as used for the same field in _load_routes) yields None
            # for a non-numeric cell instead of aborting the whole ingest.
            complexity_level=_int(value(row_idx, 8)),
            complexity_name=_str(value(row_idx, 9)),
            unique_name=_str(value(row_idx, 10)),
            asset_description=_str(value(row_idx, 11)),
            complexity_description=_str(value(row_idx, 12)),
            caveats=_str(value(row_idx, 13)),
            has_hour_routes=False,
        )
        db.add(asset)
        asset_map[gmal_id] = asset

    # Flush so the hours loader can read asset.id from these objects.
    db.flush()
    logger.info("Loaded %d assets from Asset List", len(asset_map))
    return asset_map


def _load_routes(wb: openpyxl.Workbook, db: Session, asset_map: dict[str, GmalAsset]) -> tuple[int, int, int]:
    """Load roles and hours from the 'GMAL Routes_Jan25' sheet.

    Sheet layout as read by this loader (1-based):

    * Row 1, from column 13 on: model type headers. A header applies to its
      own column and the following columns until the next different header.
      Headers with no entry in ``MODEL_TYPE_MAP`` are ignored.
    * Row 2: GMAL id per column; only values starting with "GMAL" are used.
    * Rows 4-11: per-column asset metadata (4 sub-category, 5 asset name,
      6 asset description, 7 complexity description, 8 caveats,
      9 complexity level, 10 master/adapt, 11 AI efficiency pct).
    * Rows 13-137: one role per row (B discipline, C title, D entity,
      E location, F unique name); hours sit at the role row / GMAL column
      intersection.

    Args:
        wb: Open workbook containing the sheet.
        db: Session that roles, extra assets and hours are added to.
        asset_map: GMAL id -> asset from the Asset List. Mutated in place:
            assets found only on this sheet are added, and existing ones get
            ``has_hour_routes`` set and missing Routes fields filled in.

    Returns:
        (roles_loaded, hours_loaded, extra_assets_created).

    Raises:
        KeyError: if the workbook has no 'GMAL Routes_Jan25' sheet.
    """
    ws = wb["GMAL Routes_Jan25"]

    first_model_col = 13
    role_rows = range(13, 138)  # rows 13-137 inclusive
    flush_batch_size = 5000
    last_col = ws.max_column

    def value(row: int, column: int):
        return ws.cell(row=row, column=column).value

    # ---- Detect model type column boundaries from row 1 -------------------
    model_sections = []  # [(start_col, end_col, ModelType)]

    def add_section(header, start_col: int, end_col: int) -> None:
        model_type = MODEL_TYPE_MAP.get(header)
        if model_type:
            model_sections.append((start_col, end_col, model_type))
        else:
            logger.warning(
                "Ignoring unrecognised model type header %r (cols %d-%d)",
                header, start_col, end_col,
            )

    current_header = None
    section_start = None
    for col_idx in range(first_model_col, last_col + 1):
        header = value(1, col_idx)
        # Empty cells and repeats of the current header extend the open
        # section; only a different header starts a new one.
        if not header or header == current_header:
            continue
        if current_header is not None:
            add_section(current_header, section_start, col_idx - 1)
        current_header = header
        section_start = col_idx

    # The last section runs to the sheet's final column.
    if current_header is not None:
        add_section(current_header, section_start, last_col)

    logger.info("Detected %d model type sections", len(model_sections))
    for start, end, mt in model_sections:
        logger.info("  %s: cols %d-%d", mt.value, start, end)

    # ---- Load roles ---------------------------------------------------------
    # Roles are de-duplicated on (stripped title, entity). A repeated row
    # creates no new Role but still maps to the existing one, so its hours
    # are read too.
    roles = {}  # {(role_title, entity): Role}
    row_to_role = {}  # {row_idx: Role} for hour lookups
    for row_idx in role_rows:
        role_title = value(row_idx, 3)
        if not role_title:
            continue

        title_text = str(role_title)
        entity = _str(value(row_idx, 4))
        key = (title_text.strip(), entity)

        existing = roles.get(key)
        if existing is not None:
            row_to_role[row_idx] = existing
            continue

        lowered_title = title_text.lower()
        role = Role(
            discipline=_str(value(row_idx, 2)) or "Other",
            role_title=_str(role_title),
            entity=entity,
            resource_location=_str(value(row_idx, 5)),
            unique_name=_str(value(row_idx, 6)),
            # 1-based position of the row within the role block.
            sort_order=row_idx - role_rows.start + 1,
            is_programme_role=any(kw in lowered_title for kw in PROGRAMME_ROLE_KEYWORDS),
        )
        db.add(role)
        roles[key] = role
        row_to_role[row_idx] = role

    # Flush so role.id is populated before it is used for hours.
    db.flush()
    logger.info("Loaded %d roles", len(roles))

    # ---- Load hours per (role, asset, model type) ---------------------------
    hours_count = 0
    extra_assets = 0
    pending_hours = 0  # GmalHours added since the last flush

    # (asset id, role id, model type) combos already inserted. The same GMAL
    # id can head several columns within a section (e.g. base and Pro
    # variants); the first non-zero value met for a combo wins and later
    # ones are skipped.
    seen_hours = set()

    for section_start, section_end, model_type in model_sections:
        # Column -> GMAL id for this section, taken from row 2.
        col_gmal_map = {}
        for col_idx in range(section_start, section_end + 1):
            raw_id = value(2, col_idx)
            if not raw_id:
                continue
            # Strip first so surrounding whitespace cannot defeat the prefix test.
            gmal_id = str(raw_id).strip()
            if gmal_id.startswith("GMAL"):
                col_gmal_map[col_idx] = gmal_id

        # Make sure every referenced GMAL id has an asset, creating it from
        # the Routes metadata rows when the Asset List did not contain it.
        created_any = False
        for col_idx, gmal_id in col_gmal_map.items():
            asset = asset_map.get(gmal_id)
            if asset is None:
                asset = GmalAsset(
                    gmal_id=gmal_id,
                    sub_category=_str(value(4, col_idx)),
                    asset_name=_str(value(5, col_idx)),
                    asset_description=_str(value(6, col_idx)),
                    complexity_description=_str(value(7, col_idx)),
                    caveats=_str(value(8, col_idx)),
                    complexity_level=_int(value(9, col_idx)),
                    master_adapt=_str(value(10, col_idx)),
                    ai_efficiency_pct=_float(value(11, col_idx)),
                    has_hour_routes=True,
                )
                db.add(asset)
                asset_map[gmal_id] = asset
                extra_assets += 1
                created_any = True
            else:
                # Only fill Routes-specific fields that are still empty. A
                # stored 0 counts as empty here, so a later column may
                # replace it.
                asset.has_hour_routes = True
                if not asset.master_adapt:
                    asset.master_adapt = _str(value(10, col_idx))
                if not asset.ai_efficiency_pct:
                    asset.ai_efficiency_pct = _float(value(11, col_idx))
                if not asset.sub_category:
                    asset.sub_category = _str(value(4, col_idx))

        if created_any:
            # One flush per section is enough: asset.id is first needed in
            # the hours loop below.
            db.flush()

        for row_idx in role_rows:
            role = row_to_role.get(row_idx)
            if role is None:
                continue

            for col_idx, gmal_id in col_gmal_map.items():
                hours = _float(value(row_idx, col_idx))
                if not hours:
                    # Empty, non-numeric or zero cell: nothing to record.
                    continue

                # Every id in col_gmal_map was added to asset_map above.
                asset = asset_map[gmal_id]

                key = (asset.id, role.id, model_type)
                if key in seen_hours:
                    continue
                seen_hours.add(key)

                db.add(GmalHours(
                    gmal_asset_id=asset.id,
                    role_id=role.id,
                    model_type=model_type,
                    hours=round(hours, 2),
                ))
                hours_count += 1
                pending_hours += 1

            # Flush in batches to bound the number of pending objects. A
            # separate counter is used because the running total rarely
            # lands exactly on a multiple of the batch size at a row end.
            if pending_hours >= flush_batch_size:
                db.flush()
                pending_hours = 0

    db.flush()
    logger.info(
        "Loaded %d hour records across %d model types",
        hours_count, len(model_sections),
    )
    return len(roles), hours_count, extra_assets


def _load_service_lines(wb: openpyxl.Workbook, db: Session) -> int:
    """Read the 'GMAL SERVICE LINES' sheet into GmalServiceLine rows.

    Data starts on row 2. Columns, 1-based: 1 number, 2 name, 3 type,
    4 GMAL id. Rows with an empty name cell are ignored.

    Returns the number of service lines added to the session.
    """
    sheet = wb["GMAL SERVICE LINES"]
    loaded = 0

    for r in range(2, sheet.max_row + 1):
        number, name = (sheet.cell(row=r, column=c).value for c in (1, 2))
        if name:
            db.add(
                GmalServiceLine(
                    number=_str(number),
                    name=_str(name),
                    type=_str(sheet.cell(row=r, column=3).value),
                    gmal_id=_str(sheet.cell(row=r, column=4).value),
                )
            )
            loaded += 1

    db.flush()
    logger.info(f"Loaded {loaded} service lines")
    return loaded


def _load_role_mappings(wb: openpyxl.Workbook, db: Session) -> int:
    """Read the 'Job role to Level mapping' sheet into RoleLevelMapping rows.

    Data starts on row 3. Columns, 1-based: 1 role name, 3 number,
    4 level name, 5 type (column 2 is not read). Rows with an empty role
    name cell are ignored.

    Returns the number of mappings added to the session.
    """
    sheet = wb["Job role to Level mapping"]
    loaded = 0

    for r in range(3, sheet.max_row + 1):
        role_name = sheet.cell(row=r, column=1).value
        if role_name:
            db.add(
                RoleLevelMapping(
                    role_name=_str(role_name),
                    number=_str(sheet.cell(row=r, column=3).value),
                    level_name=_str(sheet.cell(row=r, column=4).value),
                    type=_str(sheet.cell(row=r, column=5).value),
                )
            )
            loaded += 1

    db.flush()
    logger.info(f"Loaded {loaded} role-level mappings")
    return loaded


def _str(val) -> str | None:
    if val is None:
        return None
    s = str(val).strip()
    return s if s else None


def _int(val) -> int | None:
    if val is None:
        return None
    try:
        return int(val)
    except (ValueError, TypeError):
        return None


def _float(val) -> float | None:
    if val is None:
        return None
    try:
        return float(val)
    except (ValueError, TypeError):
        return None