gmal-scope-builder/backend/app/services/ai_descriptions.py
DJP a1bbd330c6 AI-enhanced GMAL descriptions + matching fixes
- New ai_descriptions service: generates rich brief-friendly descriptions
  per GMAL asset via Claude, grouped by category (135/243 generated)
- Descriptions include client synonyms, inclusions/exclusions, use cases,
  channel/format info, complexity differentiators
- GMAL Browser shows AI descriptions with green/amber status indicators
- GMAL Editor: editable AI descriptions, per-asset regenerate, batch generate all
- Matching catalog now includes AI descriptions for better semantic matching
- Fixed ORM session expiry bug: snapshot asset data before batch commits
- Fixed enum issue: removed unused UPLOADING/EXTRACTING statuses
- Added app-level logging (basicConfig) so service logs show in docker
- YOLO now batches 20 selections in parallel
- Matching returns 1 best match by default, extras only within 5% of top

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 10:12:04 -04:00

224 lines
8.9 KiB
Python

"""Generate rich AI-enhanced descriptions for GMAL assets."""
import logging
from collections import defaultdict
from sqlalchemy import select, text
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.gmal import GmalAsset
from app.utils.claude_client import call_claude, extract_tool_result
logger = logging.getLogger(__name__)
# JSON schema for a single generated entry: the asset's GMAL ID plus the
# description text written for it. Both fields are mandatory.
_DESCRIPTION_ITEM_SCHEMA = {
    "type": "object",
    "properties": {
        "gmal_id": {
            "type": "string",
            "description": "The GMAL ID",
        },
        "description": {
            "type": "string",
            "description": "The full rich brief-friendly description",
        },
    },
    "required": ["gmal_id", "description"],
}

# Tool definition handed to Claude. The generation functions in this module
# force this tool via ``tool_choice`` so results come back as structured input
# (a list of {gmal_id, description} objects).
DESCRIPTION_TOOL = {
    "name": "save_descriptions",
    "description": "Save the generated brief-friendly descriptions for each GMAL asset.",
    "input_schema": {
        "type": "object",
        "properties": {
            "descriptions": {
                "type": "array",
                "items": _DESCRIPTION_ITEM_SCHEMA,
            },
        },
        "required": ["descriptions"],
    },
}
# System prompt shared by the single-asset and per-category generation calls.
# It enumerates the seven elements every description must cover (synonyms,
# plain-language summary, inclusions, exclusions, use cases, channel/format,
# complexity differentiators) and asks for one 4-8 sentence paragraph per asset.
# This text is sent to the model verbatim, so edits here change model output.
SYSTEM_PROMPT = """You are a creative production expert who understands both agency terminology and how clients brief work.
Your job is to write a rich, comprehensive "brief-friendly" description for each GMAL production asset. These descriptions will be used to match client briefs to the correct GMAL, so they must bridge the gap between how agencies name things internally and how clients describe what they need.
For EACH asset, your description MUST cover ALL of the following:
1. **Client terminology / synonyms**: List every common name a client might use for this deliverable. Include abbreviations, informal names, regional variations. E.g. for a web banner: "Also known as: display ad, digital banner, MPU, leaderboard, skyscraper, web ad, programmatic creative, HTML5 banner, GDN creative, rich media unit"
2. **Plain language summary**: What this actually IS in 1-2 sentences. Avoid agency jargon. Write as if explaining to a brand manager.
3. **What's included**: Specific outputs and process steps covered. E.g. "Includes concept, design, 3 rounds of revisions, final artwork delivery in specified formats"
4. **What's NOT included**: Important exclusions and scope boundaries. E.g. "Does not include photography/shoot, copywriting (billed separately), media buying, or platform-specific resizing beyond specified formats"
5. **Typical use cases**: When and why a client would need this. E.g. "Used for: seasonal campaign launches, product launches, always-on digital campaigns, retargeting"
6. **Channel/format**: Where this asset lives - social, web, print, video, OOH, retail, ecommerce, etc.
7. **Complexity level explanation**: What specifically makes THIS complexity level different from the other levels of the same asset. Be concrete - don't just say "more complex", say what's different (more markets, more formats, more rounds, more stakeholders, etc.)
Write each description as a flowing paragraph, not bullet points. Make them 4-8 sentences. Be specific and practical."""
async def generate_descriptions_batch(db: AsyncSession) -> dict:
    """Generate AI-enhanced descriptions for all GMAL assets, grouped by sub_category.

    Only assets whose ``has_hour_routes`` flag is true are processed. One
    generation request is made per sub_category (assets without one are grouped
    under "Other"). The session is committed after every category so finished
    work is kept if a later category fails. Afterwards ``search_vector`` is
    rebuilt for every row of ``gmal_assets`` so full-text search includes the
    new descriptions.

    Args:
        db: Async session used to load assets and persist the results.

    Returns:
        ``{"total_generated": int, "total_categories": int, "cost_usd": float}``
        where ``cost_usd`` is rounded to four decimal places.
    """
    # Snapshot plain (id, category) values up front instead of holding ORM
    # instances across commits. With the default ``expire_on_commit=True`` every
    # loaded instance is expired by the first per-category commit, and touching
    # an expired attribute afterwards triggers implicit lazy-load IO, which an
    # AsyncSession cannot perform.
    rows = await db.execute(
        select(GmalAsset.gmal_id, GmalAsset.sub_category)
        .where(GmalAsset.has_hour_routes.is_(True))
        .order_by(GmalAsset.sub_category, GmalAsset.gmal_id)
    )
    ids_by_category: dict[str, list] = defaultdict(list)
    for gmal_id, sub_category in rows.all():
        ids_by_category[sub_category or "Other"].append(gmal_id)

    total_generated = 0
    total_cost = 0.0
    total_categories = len(ids_by_category)

    for categories_done, (cat_name, gmal_ids) in enumerate(ids_by_category.items(), start=1):
        # Load this category's entities fresh so every attribute the prompt
        # formatter reads is populated, regardless of earlier commits.
        loaded = await db.execute(
            select(GmalAsset)
            .where(
                GmalAsset.gmal_id.in_(gmal_ids),
                GmalAsset.has_hour_routes.is_(True),
            )
            .order_by(GmalAsset.gmal_id)
        )
        assets = list(loaded.scalars().all())

        logger.info(
            "Generating descriptions for category '%s' (%d assets)",
            cat_name,
            len(assets),
        )
        generated, cost = await _generate_for_category(db, cat_name, assets)
        total_generated += generated
        total_cost += cost

        # Commit after each category
        await db.commit()
        logger.info(
            "Category '%s' done: %d descriptions. Progress: %d/%d",
            cat_name,
            generated,
            categories_done,
            total_categories,
        )

    # Regenerate search vectors to include new descriptions
    await db.execute(text("""
        UPDATE gmal_assets SET search_vector =
            setweight(to_tsvector('english', coalesce(asset_name, '')), 'A') ||
            setweight(to_tsvector('english', coalesce(unique_name, '')), 'A') ||
            setweight(to_tsvector('english', coalesce(sub_category, '')), 'B') ||
            setweight(to_tsvector('english', coalesce(ai_enhanced_description, '')), 'B') ||
            setweight(to_tsvector('english', coalesce(asset_description, '')), 'C') ||
            setweight(to_tsvector('english', coalesce(complexity_description, '')), 'C')
    """))
    await db.commit()

    return {
        "total_generated": total_generated,
        "total_categories": total_categories,
        "cost_usd": round(total_cost, 4),
    }
async def generate_description_single(db: AsyncSession, asset: GmalAsset) -> str:
    """Generate AI-enhanced description for a single GMAL asset.

    The other complexity levels of the same asset (same ``asset_name``, with
    ``has_hour_routes`` true) are loaded and supplied to the model as context.
    On success the description is stored on ``asset``, the asset's
    ``search_vector`` is rebuilt with the same weighting the batch job uses,
    and the session is committed.

    Args:
        db: Async session used to load siblings and persist the result.
        asset: The asset to describe; its ``ai_enhanced_description`` is
            overwritten on success.

    Returns:
        The new description, or ``""`` when the model returned no usable
        description (in which case nothing is written or committed).
    """
    # Load siblings (same asset name, different complexities) for context
    siblings_result = await db.execute(
        select(GmalAsset)
        .where(
            GmalAsset.asset_name == asset.asset_name,
            GmalAsset.has_hour_routes.is_(True),
        )
        .order_by(GmalAsset.complexity_level)
    )
    siblings = siblings_result.scalars().all()

    # Keep the ID in a local: the ORM instance may be expired after commit.
    gmal_id = asset.gmal_id
    assets_text = _format_assets_for_prompt([asset], siblings)
    user_msg = (
        "Generate a rich brief-friendly description for this GMAL asset:\n"
        f"{assets_text}\n"
        "Context - here are the other complexity levels of this same asset for comparison:\n"
        f"{_format_siblings(siblings, gmal_id)}"
    )

    # NOTE(review): call_claude is not awaited, so it presumably runs
    # synchronously and would hold the event loop for the whole request --
    # confirm, and consider offloading if it does blocking network IO.
    response = call_claude(
        system=SYSTEM_PROMPT,
        user_message=user_msg,
        tools=[DESCRIPTION_TOOL],
        tool_choice={"type": "tool", "name": "save_descriptions"},
        max_tokens=2048,
    )
    result = extract_tool_result(response)

    items = [
        item
        for item in ((result or {}).get("descriptions") or [])
        if isinstance(item, dict)
    ]
    # Prefer the entry for the requested asset: sibling context is in the
    # prompt, so the model may return more than one entry. Fall back to the
    # first entry when no ID matches.
    chosen = next((item for item in items if item.get("gmal_id") == gmal_id), None)
    if chosen is None and items:
        chosen = items[0]

    desc = (chosen or {}).get("description") or ""
    if not desc:
        # Do not clobber an existing description with an empty one.
        logger.warning("No description returned for GMAL asset '%s'", gmal_id)
        return ""

    asset.ai_enhanced_description = desc
    # Flush so the raw UPDATE below sees the new description text.
    await db.flush()
    # Keep full-text search in step with the new description, mirroring the
    # expression used by the batch regeneration.
    await db.execute(
        text("""
            UPDATE gmal_assets SET search_vector =
                setweight(to_tsvector('english', coalesce(asset_name, '')), 'A') ||
                setweight(to_tsvector('english', coalesce(unique_name, '')), 'A') ||
                setweight(to_tsvector('english', coalesce(sub_category, '')), 'B') ||
                setweight(to_tsvector('english', coalesce(ai_enhanced_description, '')), 'B') ||
                setweight(to_tsvector('english', coalesce(asset_description, '')), 'C') ||
                setweight(to_tsvector('english', coalesce(complexity_description, '')), 'C')
            WHERE gmal_id = :gmal_id
        """),
        {"gmal_id": gmal_id},
    )
    await db.commit()
    return desc
async def _generate_for_category(
db: AsyncSession,
category_name: str,
assets: list[GmalAsset],
) -> tuple[int, float]:
"""Generate descriptions for all assets in one category. Returns (count, cost)."""
assets_text = _format_assets_for_prompt(assets, assets)
user_msg = f"""Generate rich brief-friendly descriptions for these {len(assets)} GMAL assets in the "{category_name}" category:
{assets_text}
Remember: each description must cover client synonyms, plain language summary, what's included, what's NOT included, typical use cases, channel/format, and complexity level explanation. Write 4-8 sentences per asset."""
response = call_claude(
system=SYSTEM_PROMPT,
user_message=user_msg,
tools=[DESCRIPTION_TOOL],
tool_choice={"type": "tool", "name": "save_descriptions"},
max_tokens=8192,
)
cost = getattr(response, '_usage_info', {}).get("cost_usd", 0)
result = extract_tool_result(response)
if not result or "descriptions" not in result:
logger.warning(f"No descriptions returned for category '{category_name}'")
return 0, cost
# Save descriptions
asset_map = {a.gmal_id: a for a in assets}
count = 0
for item in result["descriptions"]:
asset = asset_map.get(item["gmal_id"])
if asset:
asset.ai_enhanced_description = item["description"]
count += 1
return count, cost
def _format_assets_for_prompt(assets: list[GmalAsset], all_in_category: list[GmalAsset]) -> str:
"""Format assets for the Claude prompt with full context."""
parts = []
for a in assets:
desc = a.asset_description or ""
if len(desc) > 500:
desc = desc[:500] + "..."
comp_desc = a.complexity_description or ""
caveats = a.caveats or ""
parts.append(f"""---
GMAL ID: {a.gmal_id}
Asset Name: {a.unique_name or a.asset_name}
Category: {a.sub_category}
Complexity: {a.complexity_name} (Level {a.complexity_level})
Asset Description: {desc}
Complexity Description: {comp_desc}
Caveats: {caveats}
---""")
return "\n".join(parts)
def _format_siblings(siblings: list[GmalAsset], exclude_id: str) -> str:
"""Format sibling assets for context."""
parts = []
for s in siblings:
if s.gmal_id == exclude_id:
continue
parts.append(f" {s.gmal_id}: {s.unique_name} - {s.complexity_description or ''}")
return "\n".join(parts) if parts else "No other complexity levels."