diff --git a/.gitignore b/.gitignore index 348f1fe..84e4fd3 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ data/*.xlsx .idea/ *.log Thumbs.db +backups/*.sql diff --git a/backend/app/api/gmal.py b/backend/app/api/gmal.py index 9f67220..a7b1d44 100644 --- a/backend/app/api/gmal.py +++ b/backend/app/api/gmal.py @@ -151,10 +151,36 @@ async def get_stats(db: AsyncSession = Depends(get_db)): categories = await db.execute(select(distinct(GmalAsset.category)).where(GmalAsset.category.isnot(None))) sub_cats = await db.execute(select(distinct(GmalAsset.sub_category)).where(GmalAsset.sub_category.isnot(None))) + ai_desc_count = await db.execute( + select(func.count(GmalAsset.id)).where(GmalAsset.ai_enhanced_description.isnot(None)) + ) + return GmalStatsOut( total_assets=assets_count.scalar() or 0, total_roles=roles_count.scalar() or 0, total_hours_records=hours_count.scalar() or 0, categories=sorted([r[0] for r in categories.all()]), sub_categories=sorted([r[0] for r in sub_cats.all()]), + ai_descriptions_count=ai_desc_count.scalar() or 0, ) + + +@router.post("/generate-descriptions") +async def generate_all_descriptions(db: AsyncSession = Depends(get_db)): + """Generate AI-enhanced descriptions for all GMAL assets.""" + from app.services.ai_descriptions import generate_descriptions_batch + result = await generate_descriptions_batch(db) + return result + + +@router.post("/assets/{gmal_id}/generate-description") +async def generate_single_description(gmal_id: str, db: AsyncSession = Depends(get_db)): + """Generate/regenerate AI-enhanced description for a single GMAL asset.""" + from app.services.ai_descriptions import generate_description_single + result = await db.execute(select(GmalAsset).where(GmalAsset.gmal_id == gmal_id)) + asset = result.scalar_one_or_none() + if not asset: + raise HTTPException(status_code=404, detail=f"GMAL asset {gmal_id} not found") + + desc = await generate_description_single(db, asset) + return {"gmal_id": gmal_id, "ai_enhanced_description": desc} diff --git a/backend/app/api/matching.py b/backend/app/api/matching.py index a4b3a12..be2c5cb 100644 --- a/backend/app/api/matching.py +++ b/backend/app/api/matching.py @@ -29,12 +29,11 @@ async def upload_client_document( # Stage 1: Uploading content = await file.read() project.source_filename = file.filename - project.status = ProjectStatus.UPLOADING + project.status = ProjectStatus.PARSING project.parse_stage = f"Uploading {file.filename}..." await db.commit() # Stage 2: Extracting text - project.status = ProjectStatus.EXTRACTING project.parse_stage = "Extracting text from document..." await db.commit() @@ -48,7 +47,6 @@ async def upload_client_document( sheets_info = f" ({metadata['sheet_count']} sheets)" if metadata['sheet_count'] else "" project.parse_stage = f"Extracted {metadata['char_count']:,} characters{sheets_info}. Sending to AI..." - project.status = ProjectStatus.PARSING await db.commit() # Stage 3: AI parsing diff --git a/backend/app/main.py b/backend/app/main.py index 7c3998f..697ac38 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,6 +1,11 @@ +import logging + from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +# Enable app-level logging +logging.basicConfig(level=logging.INFO, format="%(levelname)s [%(name)s] %(message)s") + from app.api import gmal, ingest, projects, matching, ratecard app = FastAPI(title="Scope Builder", version="1.0.0") diff --git a/backend/app/models/project.py b/backend/app/models/project.py index 18a7ebb..12d6699 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -10,8 +10,6 @@ from app.models.gmal import ModelType class ProjectStatus(str, enum.Enum): DRAFT = "draft" - UPLOADING = "uploading" - EXTRACTING = "extracting" PARSING = "parsing" MATCHING = "matching" REVIEW = "review" diff --git a/backend/app/schemas/gmal.py b/backend/app/schemas/gmal.py index 5ad1a46..4da3e80 100644 --- a/backend/app/schemas/gmal.py +++ b/backend/app/schemas/gmal.py @@ -66,6 +66,7 @@ class GmalStatsOut(BaseModel): total_hours_records: int categories: list[str] sub_categories: list[str] + ai_descriptions_count: int = 0 class GmalAssetUpdate(BaseModel): @@ -80,6 +81,7 @@ class GmalAssetUpdate(BaseModel): caveats: str | None = None master_adapt: str | None = None ai_efficiency_pct: float | None = None + ai_enhanced_description: str | None = None class GmalHoursUpdate(BaseModel): diff --git a/backend/app/services/ai_descriptions.py b/backend/app/services/ai_descriptions.py new file mode 100644 index 0000000..7445f06 --- /dev/null +++ b/backend/app/services/ai_descriptions.py @@ -0,0 +1,224 @@ +"""Generate rich AI-enhanced descriptions for GMAL assets.""" + +import logging +from collections import defaultdict + +from sqlalchemy import select, text +from sqlalchemy.ext.asyncio import AsyncSession + +from app.models.gmal import GmalAsset +from app.utils.claude_client import call_claude, extract_tool_result + +logger = logging.getLogger(__name__) + +DESCRIPTION_TOOL = { + "name": "save_descriptions", + "description": "Save the generated brief-friendly descriptions for each GMAL asset.", + "input_schema": { + "type": "object", + "properties": { + "descriptions": { + "type": "array", + "items": { + "type": "object", + "properties": { + "gmal_id": { + "type": "string", + "description": "The GMAL ID" + }, + "description": { + "type": "string", + "description": "The full rich brief-friendly description" + }, + }, + "required": ["gmal_id", "description"], + }, + }, + }, + "required": ["descriptions"], + }, +} + +SYSTEM_PROMPT = """You are a creative production expert who understands both agency terminology and how clients brief work. + +Your job is to write a rich, comprehensive "brief-friendly" description for each GMAL production asset. These descriptions will be used to match client briefs to the correct GMAL, so they must bridge the gap between how agencies name things internally and how clients describe what they need. + +For EACH asset, your description MUST cover ALL of the following: + +1. **Client terminology / synonyms**: List every common name a client might use for this deliverable. Include abbreviations, informal names, regional variations. E.g. for a web banner: "Also known as: display ad, digital banner, MPU, leaderboard, skyscraper, web ad, programmatic creative, HTML5 banner, GDN creative, rich media unit" + +2. **Plain language summary**: What this actually IS in 1-2 sentences. Avoid agency jargon. Write as if explaining to a brand manager. + +3. **What's included**: Specific outputs and process steps covered. E.g. "Includes concept, design, 3 rounds of revisions, final artwork delivery in specified formats" + +4. **What's NOT included**: Important exclusions and scope boundaries. E.g. "Does not include photography/shoot, copywriting (billed separately), media buying, or platform-specific resizing beyond specified formats" + +5. **Typical use cases**: When and why a client would need this. E.g. "Used for: seasonal campaign launches, product launches, always-on digital campaigns, retargeting" + +6. **Channel/format**: Where this asset lives - social, web, print, video, OOH, retail, ecommerce, etc. + +7. **Complexity level explanation**: What specifically makes THIS complexity level different from the other levels of the same asset. Be concrete - don't just say "more complex", say what's different (more markets, more formats, more rounds, more stakeholders, etc.) + +Write each description as a flowing paragraph, not bullet points. Make them 4-8 sentences. Be specific and practical.""" + + +async def generate_descriptions_batch(db: AsyncSession) -> dict: + """Generate AI-enhanced descriptions for all GMAL assets, grouped by sub_category. + + Returns {total_generated, total_categories, cost_usd}. + """ + result = await db.execute( + select(GmalAsset).where(GmalAsset.has_hour_routes == True).order_by(GmalAsset.sub_category, GmalAsset.gmal_id) + ) + all_assets = result.scalars().all() + + # Group by sub_category + by_category: dict[str, list[GmalAsset]] = defaultdict(list) + for a in all_assets: + by_category[a.sub_category or "Other"].append(a) + + total_generated = 0 + total_cost = 0.0 + categories_done = 0 + + for cat_name, assets in by_category.items(): + logger.info(f"Generating descriptions for category '{cat_name}' ({len(assets)} assets)") + + generated, cost = await _generate_for_category(db, cat_name, assets) + total_generated += generated + total_cost += cost + categories_done += 1 + + # Commit after each category + await db.commit() + logger.info(f"Category '{cat_name}' done: {generated} descriptions. Progress: {categories_done}/{len(by_category)}") + + # Regenerate search vectors to include new descriptions + await db.execute(text(""" + UPDATE gmal_assets SET search_vector = + setweight(to_tsvector('english', coalesce(asset_name, '')), 'A') || + setweight(to_tsvector('english', coalesce(unique_name, '')), 'A') || + setweight(to_tsvector('english', coalesce(sub_category, '')), 'B') || + setweight(to_tsvector('english', coalesce(ai_enhanced_description, '')), 'B') || + setweight(to_tsvector('english', coalesce(asset_description, '')), 'C') || + setweight(to_tsvector('english', coalesce(complexity_description, '')), 'C') + """)) + await db.commit() + + return { + "total_generated": total_generated, + "total_categories": len(by_category), + "cost_usd": round(total_cost, 4), + } + + +async def generate_description_single(db: AsyncSession, asset: GmalAsset) -> str: + """Generate AI-enhanced description for a single GMAL asset.""" + # Load siblings (same asset name, different complexities) for context + siblings_result = await db.execute( + select(GmalAsset).where( + GmalAsset.asset_name == asset.asset_name, + GmalAsset.has_hour_routes == True, + ).order_by(GmalAsset.complexity_level) + ) + siblings = siblings_result.scalars().all() + + assets_text = _format_assets_for_prompt([asset], siblings) + + user_msg = f"""Generate a rich brief-friendly description for this GMAL asset: + +{assets_text} + +Context - here are the other complexity levels of this same asset for comparison: +{_format_siblings(siblings, asset.gmal_id)}""" + + response = call_claude( + system=SYSTEM_PROMPT, + user_message=user_msg, + tools=[DESCRIPTION_TOOL], + tool_choice={"type": "tool", "name": "save_descriptions"}, + max_tokens=2048, + ) + + result = extract_tool_result(response) + if result and "descriptions" in result and result["descriptions"]: + desc = result["descriptions"][0]["description"] + asset.ai_enhanced_description = desc + await db.commit() + return desc + + return "" + + +async def _generate_for_category( + db: AsyncSession, + category_name: str, + assets: list[GmalAsset], +) -> tuple[int, float]: + """Generate descriptions for all assets in one category. Returns (count, cost).""" + assets_text = _format_assets_for_prompt(assets, assets) + + user_msg = f"""Generate rich brief-friendly descriptions for these {len(assets)} GMAL assets in the "{category_name}" category: + +{assets_text} + +Remember: each description must cover client synonyms, plain language summary, what's included, what's NOT included, typical use cases, channel/format, and complexity level explanation. Write 4-8 sentences per asset.""" + + response = call_claude( + system=SYSTEM_PROMPT, + user_message=user_msg, + tools=[DESCRIPTION_TOOL], + tool_choice={"type": "tool", "name": "save_descriptions"}, + max_tokens=8192, + ) + + cost = getattr(response, '_usage_info', {}).get("cost_usd", 0) + + result = extract_tool_result(response) + if not result or "descriptions" not in result: + logger.warning(f"No descriptions returned for category '{category_name}'") + return 0, cost + + # Save descriptions + asset_map = {a.gmal_id: a for a in assets} + count = 0 + for item in result["descriptions"]: + asset = asset_map.get(item["gmal_id"]) + if asset: + asset.ai_enhanced_description = item["description"] + count += 1 + + return count, cost + + +def _format_assets_for_prompt(assets: list[GmalAsset], all_in_category: list[GmalAsset]) -> str: + """Format assets for the Claude prompt with full context.""" + parts = [] + for a in assets: + desc = a.asset_description or "" + if len(desc) > 500: + desc = desc[:500] + "..." + comp_desc = a.complexity_description or "" + caveats = a.caveats or "" + + parts.append(f"""--- +GMAL ID: {a.gmal_id} +Asset Name: {a.unique_name or a.asset_name} +Category: {a.sub_category} +Complexity: {a.complexity_name} (Level {a.complexity_level}) +Asset Description: {desc} +Complexity Description: {comp_desc} +Caveats: {caveats} +---""") + + return "\n".join(parts) + + +def _format_siblings(siblings: list[GmalAsset], exclude_id: str) -> str: + """Format sibling assets for context.""" + parts = [] + for s in siblings: + if s.gmal_id == exclude_id: + continue + parts.append(f" {s.gmal_id}: {s.unique_name} - {s.complexity_description or ''}") + return "\n".join(parts) if parts else "No other complexity levels." diff --git a/backend/app/services/ai_matching.py b/backend/app/services/ai_matching.py index 535e8b1..f61d6a0 100644 --- a/backend/app/services/ai_matching.py +++ b/backend/app/services/ai_matching.py @@ -75,6 +75,7 @@ MATCH_TOOLS = [ }, "minItems": 1, "maxItems": 3, + "description": "Return your single best match. Only include a 2nd or 3rd match if they score within 5% of the best match.", }, }, "required": ["matches"], @@ -98,9 +99,9 @@ Guidelines: - "Display banner" / "digital ad" = Standard Banner/Display GMALs - "Social post" / "social content" = Social Content/Social Video GMALs - "BTS" / "behind the scenes" = Behind The Scenes GMALs +- Return your SINGLE BEST match. Only include additional matches if they score within 5% of the best. - If the client asset maps clearly to one GMAL, set confidence="exact" with score 0.9-1.0. - If similar but with notable differences, set confidence="close" with score 0.6-0.89. -- If multiple GMALs could match, return up to 3 ranked options with confidence="multiple". - If nothing matches well, return the closest option with confidence="none" and score below 0.3. - Always explain caveats: what the GMAL includes/excludes vs what the client described. - Pay attention to complexity: a "simple banner" should match a Simple complexity GMAL, not Complex. @@ -142,6 +143,12 @@ async def match_client_assets( """ _clear_cancel(project_id) + # Snapshot client asset data before any commits (ORM objects expire after commit) + asset_snapshots = [ + {"id": ca.id, "raw_name": ca.raw_name, "raw_description": ca.raw_description, "volume": ca.volume} + for ca in client_assets + ] + # Load all GMAL assets - send full compact catalog to Claude (only ~3k tokens) result = await db.execute( select(GmalAsset).where(GmalAsset.has_hour_routes == True).order_by(GmalAsset.gmal_id) @@ -154,7 +161,7 @@ async def match_client_assets( logger.info(f"Full GMAL catalog: {len(all_gmals)} assets, ~{len(catalog_text)} chars") all_matches = [] - total = len(client_assets) + total = len(asset_snapshots) # Process in batches for batch_start in range(0, total, BATCH_SIZE): @@ -162,7 +169,7 @@ async def match_client_assets( logger.info(f"Matching cancelled for project {project_id} at {batch_start}/{total}") break - batch = client_assets[batch_start:batch_start + BATCH_SIZE] + batch = asset_snapshots[batch_start:batch_start + BATCH_SIZE] batch_num = batch_start // BATCH_SIZE + 1 logger.info(f"Matching batch {batch_num} ({batch_start+1}-{min(batch_start+BATCH_SIZE, total)} of {total})") @@ -170,26 +177,26 @@ async def match_client_assets( loop = asyncio.get_event_loop() with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor: futures = [] - for ca in batch: + for snap in batch: if _is_cancelled(project_id): break future = loop.run_in_executor( executor, _match_single_asset, - ca.raw_name, - ca.raw_description, - ca.volume, + snap["raw_name"], + snap["raw_description"], + snap["volume"], catalog_text, len(all_gmals), ) - futures.append((ca, future)) + futures.append((snap, future)) # Collect results and accumulate costs batch_input = 0 batch_output = 0 batch_cost = 0.0 - for ca, future in futures: + for snap, future in futures: try: tool_result, usage = await future batch_input += usage.get("input_tokens", 0) @@ -197,18 +204,24 @@ async def match_client_assets( batch_cost += usage.get("cost_usd", 0) if tool_result and "matches" in tool_result: - # Auto-select: if top match is >= 80%, select it - top_score = tool_result["matches"][0].get("confidence_score", 0) if tool_result["matches"] else 0 + raw_matches = tool_result["matches"] + top_score = raw_matches[0].get("confidence_score", 0) if raw_matches else 0 auto_select = top_score >= 0.8 - for rank, m in enumerate(tool_result["matches"], 1): + # Only keep alternatives within 5% of top score + filtered = [raw_matches[0]] if raw_matches else [] + for m in raw_matches[1:]: + if abs((m.get("confidence_score", 0) - top_score)) <= 0.05: + filtered.append(m) + + for rank, m in enumerate(filtered, 1): gmal = gmal_by_id.get(m["gmal_id"]) if not gmal: logger.warning(f"Claude returned unknown GMAL ID: {m['gmal_id']}") continue match = Match( - client_asset_id=ca.id, + client_asset_id=snap["id"], gmal_asset_id=gmal.id, confidence=MatchConfidence(m["confidence"]), confidence_score=m.get("confidence_score"), @@ -220,9 +233,9 @@ async def match_client_assets( db.add(match) all_matches.append(match) else: - logger.warning(f"No match result for: {ca.raw_name}") + logger.warning(f"No match result for: {snap['raw_name']}") except Exception as e: - logger.error(f"Error matching '{ca.raw_name}': {e}") + logger.error(f"Error matching '{snap['raw_name']}': {e}") # Save batch costs to project from app.models.project import Project @@ -243,9 +256,11 @@ async def match_client_assets( def _format_compact_catalog(all_gmals: list[GmalAsset]) -> str: - """Format the full GMAL catalog as a compact list for Claude. + """Format the full GMAL catalog for Claude with AI-enhanced descriptions where available. - ~3k tokens for 243 assets. Much cheaper than pre-filtering and missing the right match. + Without AI descriptions: ~3k tokens (just names) + With AI descriptions: ~15-20k tokens (names + condensed descriptions) + Still much cheaper and more accurate than pre-filtering. """ lines = [] current_cat = None @@ -256,4 +271,11 @@ def _format_compact_catalog(all_gmals: list[GmalAsset]) -> str: complexity = g.complexity_name or f"L{g.complexity_level}" lines.append(f" {g.gmal_id}: {g.unique_name or g.asset_name} ({complexity})") + # Include AI-enhanced description if available (condensed to ~200 chars) + if g.ai_enhanced_description: + desc = g.ai_enhanced_description + if len(desc) > 250: + desc = desc[:250] + "..." + lines.append(f" > {desc}") + return "\n".join(lines) diff --git a/frontend/src/pages/GmalBrowser.css b/frontend/src/pages/GmalBrowser.css index 22780ac..69b538a 100644 --- a/frontend/src/pages/GmalBrowser.css +++ b/frontend/src/pages/GmalBrowser.css @@ -164,6 +164,44 @@ margin-bottom: 20px; } +.ai-desc-status { + font-size: 10px; + font-weight: 600; + margin-left: 8px; + padding: 1px 6px; + border-radius: 4px; + text-transform: none; + letter-spacing: 0; +} + +.ai-desc-ok { + background: var(--color-success-bg); + color: var(--color-success); +} + +.ai-desc-missing { + background: var(--color-warning-bg); + color: var(--color-warning); +} + +.ai-desc-box { + font-size: 13px; + line-height: 1.7; + color: var(--color-text); + background: rgba(255, 196, 7, 0.05); + border: 1px solid rgba(255, 196, 7, 0.15); + border-radius: var(--radius); + padding: 14px; + white-space: pre-wrap; +} + +.ai-desc-empty { + font-size: 12px; + color: var(--color-text-muted); + font-style: italic; + padding: 10px 0; +} + .detail-text { font-size: 12px; line-height: 1.6; diff --git a/frontend/src/pages/GmalBrowser.tsx b/frontend/src/pages/GmalBrowser.tsx index 19e1e6d..60106dc 100644 --- a/frontend/src/pages/GmalBrowser.tsx +++ b/frontend/src/pages/GmalBrowser.tsx @@ -146,9 +146,24 @@ export default function GmalBrowser() { +