- Upload now shows live stage progress (uploading -> extracting -> AI parsing -> done) - Fix match group collapse: proper React state instead of DOM manipulation - Replace pre-filter with full GMAL catalog sent to Claude (~3k tokens, <$0.01) - FTS and keyword matching missed too many semantic matches - Claude now sees all 243 assets and uses semantic understanding - Improved system prompt with terminology bridges for better scoring - Per-project AI cost tracking persisted to DB - Parallel matching with cancel support - Auto-select matches >= 80%, YOLO button for rest - Debug panel for AI call inspection Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
346 lines
13 KiB
Python
346 lines
13 KiB
Python
"""Parse the GMAL Excel workbook into database records."""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import openpyxl
|
|
from sqlalchemy import text
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.models.gmal import GmalAsset, Role, GmalHours, GmalServiceLine, RoleLevelMapping, MODEL_TYPE_MAP, ModelType
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
PROGRAMME_ROLE_KEYWORDS = [
|
|
"programme director", "global programme director",
|
|
"regional / national operations director",
|
|
"head of creative services", "head of project management",
|
|
"creative services director",
|
|
]
|
|
|
|
|
|
def parse_gmal_workbook(filepath: str, db: Session) -> dict:
|
|
"""Parse the full GMAL Excel workbook and load all data into the database.
|
|
|
|
Returns a dict with counts of loaded records.
|
|
"""
|
|
wb = openpyxl.load_workbook(filepath, data_only=True)
|
|
result = {
|
|
"assets_loaded": 0,
|
|
"roles_loaded": 0,
|
|
"hours_loaded": 0,
|
|
"service_lines_loaded": 0,
|
|
"role_mappings_loaded": 0,
|
|
}
|
|
|
|
# Clear existing data (full re-ingest)
|
|
db.execute(text("DELETE FROM gmal_hours"))
|
|
db.execute(text("DELETE FROM gmal_service_lines"))
|
|
db.execute(text("DELETE FROM role_level_mappings"))
|
|
db.execute(text("DELETE FROM roles"))
|
|
db.execute(text("DELETE FROM ratecard_lines"))
|
|
db.execute(text("DELETE FROM matches"))
|
|
db.execute(text("DELETE FROM client_assets"))
|
|
db.execute(text("DELETE FROM projects"))
|
|
db.execute(text("DELETE FROM gmal_assets"))
|
|
db.flush()
|
|
|
|
# Step 1: Load asset catalog from "Unilever_GMAL Asset List"
|
|
asset_map = _load_asset_list(wb, db)
|
|
result["assets_loaded"] = len(asset_map)
|
|
|
|
# Step 2: Load roles and hours from "GMAL Routes_Jan25"
|
|
roles_loaded, hours_loaded, extra_assets = _load_routes(wb, db, asset_map)
|
|
result["roles_loaded"] = roles_loaded
|
|
result["hours_loaded"] = hours_loaded
|
|
result["assets_loaded"] += extra_assets
|
|
|
|
# Step 3: Load service lines
|
|
result["service_lines_loaded"] = _load_service_lines(wb, db)
|
|
|
|
# Step 4: Load role-level mappings
|
|
result["role_mappings_loaded"] = _load_role_mappings(wb, db)
|
|
|
|
# Populate full-text search vectors
|
|
db.execute(text("""
|
|
UPDATE gmal_assets SET search_vector =
|
|
setweight(to_tsvector('english', coalesce(asset_name, '')), 'A') ||
|
|
setweight(to_tsvector('english', coalesce(unique_name, '')), 'A') ||
|
|
setweight(to_tsvector('english', coalesce(sub_category, '')), 'B') ||
|
|
setweight(to_tsvector('english', coalesce(asset_description, '')), 'C') ||
|
|
setweight(to_tsvector('english', coalesce(complexity_description, '')), 'C')
|
|
"""))
|
|
|
|
db.commit()
|
|
logger.info(f"Ingestion complete: {result}")
|
|
return result
|
|
|
|
|
|
def _load_asset_list(wb: openpyxl.Workbook, db: Session) -> dict[str, GmalAsset]:
|
|
"""Load from 'Unilever_GMAL Asset List' sheet. Returns {gmal_id: GmalAsset}."""
|
|
ws = wb["Unilever_GMAL Asset List"]
|
|
asset_map = {}
|
|
|
|
for row_idx in range(2, ws.max_row + 1):
|
|
gmal_id = ws.cell(row=row_idx, column=2).value
|
|
if not gmal_id:
|
|
continue
|
|
gmal_id = str(gmal_id).strip()
|
|
|
|
complexity_level = ws.cell(row=row_idx, column=8).value
|
|
complexity_name_val = ws.cell(row=row_idx, column=9).value
|
|
|
|
asset = GmalAsset(
|
|
gmal_id=gmal_id,
|
|
swop_asset_id=_str(ws.cell(row=row_idx, column=3).value),
|
|
region=_str(ws.cell(row=row_idx, column=1).value),
|
|
category=_str(ws.cell(row=row_idx, column=4).value),
|
|
sub_category=_str(ws.cell(row=row_idx, column=5).value),
|
|
sub_category_description=_str(ws.cell(row=row_idx, column=6).value),
|
|
asset_name=_str(ws.cell(row=row_idx, column=7).value),
|
|
complexity_level=int(complexity_level) if complexity_level else None,
|
|
complexity_name=_str(complexity_name_val),
|
|
unique_name=_str(ws.cell(row=row_idx, column=10).value),
|
|
asset_description=_str(ws.cell(row=row_idx, column=11).value),
|
|
complexity_description=_str(ws.cell(row=row_idx, column=12).value),
|
|
caveats=_str(ws.cell(row=row_idx, column=13).value),
|
|
has_hour_routes=False,
|
|
)
|
|
db.add(asset)
|
|
asset_map[gmal_id] = asset
|
|
|
|
db.flush()
|
|
logger.info(f"Loaded {len(asset_map)} assets from Asset List")
|
|
return asset_map
|
|
|
|
|
|
def _load_routes(wb: openpyxl.Workbook, db: Session, asset_map: dict[str, GmalAsset]) -> tuple[int, int, int]:
|
|
"""Load roles and hours from 'GMAL Routes_Jan25' sheet.
|
|
|
|
Returns (roles_loaded, hours_loaded, extra_assets_created).
|
|
"""
|
|
ws = wb["GMAL Routes_Jan25"]
|
|
|
|
# Detect model type column boundaries from row 1
|
|
model_sections = [] # [(start_col, end_col, ModelType)]
|
|
current_header = None
|
|
section_start = None
|
|
|
|
for col_idx in range(13, ws.max_column + 1):
|
|
header = ws.cell(row=1, column=col_idx).value
|
|
if header and header != current_header:
|
|
if current_header and section_start:
|
|
mt = MODEL_TYPE_MAP.get(current_header)
|
|
if mt:
|
|
model_sections.append((section_start, col_idx - 1, mt))
|
|
current_header = header
|
|
section_start = col_idx
|
|
|
|
# Don't forget the last section
|
|
if current_header and section_start:
|
|
mt = MODEL_TYPE_MAP.get(current_header)
|
|
if mt:
|
|
model_sections.append((section_start, ws.max_column, mt))
|
|
|
|
logger.info(f"Detected {len(model_sections)} model type sections")
|
|
for start, end, mt in model_sections:
|
|
logger.info(f" {mt.value}: cols {start}-{end}")
|
|
|
|
# Load roles from rows 13-137 (column B=discipline, C=title, D=entity, E=location, F=unique)
|
|
# Map row_idx -> Role for hour lookups; use unique_name as dedup key
|
|
roles = {} # {(role_title, entity): Role}
|
|
row_to_role = {} # {row_idx: Role} for hour lookups
|
|
for row_idx in range(13, 138):
|
|
discipline = ws.cell(row=row_idx, column=2).value
|
|
role_title = ws.cell(row=row_idx, column=3).value
|
|
if not role_title:
|
|
continue
|
|
|
|
entity = _str(ws.cell(row=row_idx, column=4).value)
|
|
location = _str(ws.cell(row=row_idx, column=5).value)
|
|
unique_name = _str(ws.cell(row=row_idx, column=6).value)
|
|
key = (str(role_title).strip(), entity)
|
|
|
|
# Skip duplicates - reuse existing role for hour lookups
|
|
if key in roles:
|
|
row_to_role[row_idx] = roles[key]
|
|
continue
|
|
|
|
is_prog = any(kw in str(role_title).lower() for kw in PROGRAMME_ROLE_KEYWORDS)
|
|
|
|
role = Role(
|
|
discipline=_str(discipline) or "Other",
|
|
role_title=_str(role_title),
|
|
entity=entity,
|
|
resource_location=location,
|
|
unique_name=unique_name,
|
|
sort_order=row_idx - 12,
|
|
is_programme_role=is_prog,
|
|
)
|
|
db.add(role)
|
|
roles[key] = role
|
|
row_to_role[row_idx] = role
|
|
|
|
db.flush()
|
|
logger.info(f"Loaded {len(roles)} roles")
|
|
|
|
# For each model type section, build a mapping of column -> gmal_id
|
|
# Then load hours for each (role, gmal, model_type) combo
|
|
hours_count = 0
|
|
extra_assets = 0
|
|
|
|
# Track which (gmal_id, role_id, model_type) combos we've already inserted
|
|
# to handle duplicate columns (Pro vs non-Pro) for same GMAL within a section
|
|
seen_hours = set()
|
|
|
|
for section_start, section_end, model_type in model_sections:
|
|
# Build column -> gmal_id map for this section
|
|
# Include Pro variant columns - they contain the actual AI model data
|
|
# Extract the base GMAL ID from row 2 (e.g. "GMAL101" from both base and Pro cols)
|
|
col_gmal_map = {}
|
|
for col_idx in range(section_start, section_end + 1):
|
|
gmal_id = ws.cell(row=2, column=col_idx).value
|
|
if gmal_id and str(gmal_id).startswith("GMAL"):
|
|
base_id = str(gmal_id).strip()
|
|
col_gmal_map[col_idx] = base_id
|
|
|
|
# Enrich asset_map with metadata from Routes if not in Asset List
|
|
for col_idx, gmal_id in col_gmal_map.items():
|
|
if gmal_id not in asset_map:
|
|
# Create asset from Routes metadata
|
|
asset = GmalAsset(
|
|
gmal_id=gmal_id,
|
|
sub_category=_str(ws.cell(row=4, column=col_idx).value),
|
|
asset_name=_str(ws.cell(row=5, column=col_idx).value),
|
|
asset_description=_str(ws.cell(row=6, column=col_idx).value),
|
|
complexity_description=_str(ws.cell(row=7, column=col_idx).value),
|
|
caveats=_str(ws.cell(row=8, column=col_idx).value),
|
|
complexity_level=_int(ws.cell(row=9, column=col_idx).value),
|
|
master_adapt=_str(ws.cell(row=10, column=col_idx).value),
|
|
ai_efficiency_pct=_float(ws.cell(row=11, column=col_idx).value),
|
|
has_hour_routes=True,
|
|
)
|
|
db.add(asset)
|
|
db.flush()
|
|
asset_map[gmal_id] = asset
|
|
extra_assets += 1
|
|
else:
|
|
# Update Routes-specific fields on existing asset
|
|
asset = asset_map[gmal_id]
|
|
asset.has_hour_routes = True
|
|
if not asset.master_adapt:
|
|
asset.master_adapt = _str(ws.cell(row=10, column=col_idx).value)
|
|
if not asset.ai_efficiency_pct:
|
|
asset.ai_efficiency_pct = _float(ws.cell(row=11, column=col_idx).value)
|
|
if not asset.sub_category:
|
|
asset.sub_category = _str(ws.cell(row=4, column=col_idx).value)
|
|
|
|
# Now load hours for each role x gmal in this section
|
|
for row_idx in range(13, 138):
|
|
role = row_to_role.get(row_idx)
|
|
if not role:
|
|
continue
|
|
|
|
for col_idx, gmal_id in col_gmal_map.items():
|
|
hours_val = ws.cell(row=row_idx, column=col_idx).value
|
|
if hours_val and _float(hours_val) and _float(hours_val) != 0:
|
|
asset = asset_map.get(gmal_id)
|
|
if not asset:
|
|
continue
|
|
|
|
# Deduplicate: skip if we already have hours for this combo
|
|
key = (asset.id, role.id, model_type)
|
|
if key in seen_hours:
|
|
continue
|
|
seen_hours.add(key)
|
|
|
|
gh = GmalHours(
|
|
gmal_asset_id=asset.id,
|
|
role_id=role.id,
|
|
model_type=model_type,
|
|
hours=round(_float(hours_val), 2),
|
|
)
|
|
db.add(gh)
|
|
hours_count += 1
|
|
|
|
# Flush every 5000 to manage memory
|
|
if hours_count % 5000 == 0:
|
|
db.flush()
|
|
|
|
db.flush()
|
|
logger.info(f"Loaded {hours_count} hour records across {len(model_sections)} model types")
|
|
return len(roles), hours_count, extra_assets
|
|
|
|
|
|
def _load_service_lines(wb: openpyxl.Workbook, db: Session) -> int:
|
|
"""Load from 'GMAL SERVICE LINES' sheet."""
|
|
ws = wb["GMAL SERVICE LINES"]
|
|
count = 0
|
|
|
|
for row_idx in range(2, ws.max_row + 1):
|
|
number = ws.cell(row=row_idx, column=1).value
|
|
name = ws.cell(row=row_idx, column=2).value
|
|
if not name:
|
|
continue
|
|
|
|
sl = GmalServiceLine(
|
|
number=_str(number),
|
|
name=_str(name),
|
|
type=_str(ws.cell(row=row_idx, column=3).value),
|
|
gmal_id=_str(ws.cell(row=row_idx, column=4).value),
|
|
)
|
|
db.add(sl)
|
|
count += 1
|
|
|
|
db.flush()
|
|
logger.info(f"Loaded {count} service lines")
|
|
return count
|
|
|
|
|
|
def _load_role_mappings(wb: openpyxl.Workbook, db: Session) -> int:
|
|
"""Load from 'Job role to Level mapping' sheet."""
|
|
ws = wb["Job role to Level mapping"]
|
|
count = 0
|
|
|
|
for row_idx in range(3, ws.max_row + 1):
|
|
role_name = ws.cell(row=row_idx, column=1).value
|
|
if not role_name:
|
|
continue
|
|
|
|
rlm = RoleLevelMapping(
|
|
role_name=_str(role_name),
|
|
number=_str(ws.cell(row=row_idx, column=3).value),
|
|
level_name=_str(ws.cell(row=row_idx, column=4).value),
|
|
type=_str(ws.cell(row=row_idx, column=5).value),
|
|
)
|
|
db.add(rlm)
|
|
count += 1
|
|
|
|
db.flush()
|
|
logger.info(f"Loaded {count} role-level mappings")
|
|
return count
|
|
|
|
|
|
def _str(val) -> str | None:
|
|
if val is None:
|
|
return None
|
|
s = str(val).strip()
|
|
return s if s else None
|
|
|
|
|
|
def _int(val) -> int | None:
|
|
if val is None:
|
|
return None
|
|
try:
|
|
return int(val)
|
|
except (ValueError, TypeError):
|
|
return None
|
|
|
|
|
|
def _float(val) -> float | None:
|
|
if val is None:
|
|
return None
|
|
try:
|
|
return float(val)
|
|
except (ValueError, TypeError):
|
|
return None
|