gmal-scope-builder/backend/app/services/doc_parser.py
DJP 3cb1973f57 Fix tier matching: use client tier to pick correct complexity variant
- Doc parser now extracts tier labels (Tier A, A, Gold, etc.) per asset
- Matching uses tier to find the correct GMAL complexity variant:
  - Claude matches to the GMAL family (asset type)
  - Post-match lookup: (asset_name + target_complexity_level) finds exact variant
  - e.g. "Banner - Tier A" with A=Complex → finds Complex variant by asset_name query
- Tier hint passed to Claude prompt for better matching
- No blind expansion - only the tier-appropriate GMAL is matched
- Expand to Tiers button still available for when client doesn't specify tiers

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 15:17:20 -04:00

153 lines
6.3 KiB
Python

"""Parse uploaded client documents (Word/Excel) to extract asset lists."""
import logging
import io
from pathlib import Path
import openpyxl
import docx
from app.utils.claude_client import call_claude, extract_tool_result, extract_text
logger = logging.getLogger(__name__)

# JSON-schema description of the fields reported for each extracted asset.
_ASSET_ITEM_PROPERTIES = {
    "name": {
        "type": "string",
        "description": "The asset/deliverable name as described by the client",
    },
    "description": {
        "type": "string",
        "description": "Description of what this asset involves, including any complexity or format details",
    },
    "complexity_hint": {
        "type": "string",
        "enum": ["simple", "medium", "complex", "unknown"],
        "description": "Estimated complexity based on the brief",
    },
    "volume": {
        "type": "integer",
        "description": "Number of this asset needed (default 1 if not specified)",
    },
    "tier": {
        "type": "string",
        "description": "The client's tier/complexity label if specified (e.g. 'Tier A', 'A', 'Gold', '1', 'Premium'). Leave empty string if no tier is specified.",
    },
}

# Tool definitions handed to the model: a single "extract_assets" tool whose
# input is an object holding an "assets" array of per-asset records.
EXTRACT_TOOLS = [
    {
        "name": "extract_assets",
        "description": "Extract a structured list of deliverable assets from a client brief or scope document.",
        "input_schema": {
            "type": "object",
            "properties": {
                "assets": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": _ASSET_ITEM_PROPERTIES,
                        "required": ["name", "description", "complexity_hint", "volume", "tier"],
                    },
                },
            },
            "required": ["assets"],
        },
    }
]
# System prompt for the asset-extraction request. It describes the same five
# per-asset fields (name, description, complexity_hint, volume, tier) that the
# "extract_assets" tool schema requires.
SYSTEM_PROMPT = """You are a creative agency asset specialist who understands production scoping.
Your job is to extract every distinct deliverable/asset from the client brief or scope document provided.
For each asset, provide:
- name: The asset name as the client describes it (e.g., "Social Media Banner", "TV Commercial Edit", "Brand Book")
- description: What this asset involves based on the document context. Include format, size, channel, and any other relevant details.
- complexity_hint: Your best estimate of complexity (simple/medium/complex) based on the description. Use "unknown" if unclear.
- volume: How many of this asset are needed. Default to 1 if not specified.
- tier: If the client specifies a tier, grade, or complexity label for this asset (e.g. "Tier A", "A", "Gold", "Premium", "1"), include it exactly as written. If the document has columns like A/B/C or Tier 1/2/3, extract those labels. Leave empty string if no tier is specified.
Be thorough - extract every distinct asset type mentioned. If the same asset appears at different tiers or complexity levels, list them as SEPARATE entries with their respective tier labels.
Do NOT combine different asset types into one entry."""
def extract_text_from_file(
    file_content: bytes,
    filename: str,
    *,
    max_chars: int = 50000,
) -> tuple[str, dict]:
    """Extract plain text from an uploaded document.

    The parser is chosen from the lower-cased filename extension: ``.docx``
    is handled by ``_extract_docx_text``, ``.xlsx``/``.xls`` by
    ``_extract_excel_text``, and ``.txt`` is decoded as UTF-8 with
    undecodable bytes replaced.

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Original filename; only its extension is inspected.
        max_chars: Maximum number of characters of text to return before a
            truncation marker is appended. Defaults to 50000.

    Returns:
        A ``(text, metadata)`` tuple. ``metadata`` holds ``char_count`` (the
        length of the text *before* truncation), ``sheet_count`` (number of
        sheets for Excel files, otherwise 0) and ``file_type`` (the
        lower-cased extension).

    Raises:
        ValueError: If the extension is not supported, or if the extracted
            text is empty or shorter than 20 non-whitespace-trimmed
            characters.
    """
    ext = Path(filename).suffix.lower()
    sheet_count = 0

    if ext == ".docx":
        text = _extract_docx_text(file_content)
    elif ext in (".xlsx", ".xls"):
        text = _extract_excel_text(file_content)
        # Only the sheet names are needed here, so open the workbook in
        # read-only (lazy) mode instead of parsing every cell a second time,
        # and release it explicitly as read-only workbooks require.
        wb = openpyxl.load_workbook(io.BytesIO(file_content), read_only=True)
        try:
            sheet_count = len(wb.sheetnames)
        finally:
            wb.close()
    elif ext == ".txt":
        text = file_content.decode("utf-8", errors="replace")
    else:
        raise ValueError(f"Unsupported file type: {ext}. Use .docx, .xlsx, or .txt")

    if not text or len(text.strip()) < 20:
        raise ValueError("Document appears to be empty or too short to extract assets from.")

    # Metadata is captured before truncation so char_count reflects the
    # full document size.
    metadata = {
        "char_count": len(text),
        "sheet_count": sheet_count,
        "file_type": ext,
    }

    # Truncate very long documents to manage token usage
    if len(text) > max_chars:
        text = text[:max_chars] + "\n\n[Document truncated...]"

    return text, metadata
def parse_text_with_ai(text: str) -> tuple[list[dict], dict]:
    """Ask Claude to list the assets found in already-extracted document text.

    The request forces the ``extract_assets`` tool so the reply is expected
    to be structured. Returns ``(assets, usage_info)``; ``assets`` is an
    empty list when no structured ``assets`` payload comes back, and
    ``usage_info`` falls back to zeroed counters when the response carries
    no ``_usage_info`` attribute.
    """
    prompt = f"Extract all deliverable assets from this client document:\n\n{text}"
    forced_tool = {"type": "tool", "name": "extract_assets"}
    response = call_claude(
        system=SYSTEM_PROMPT,
        user_message=prompt,
        tools=EXTRACT_TOOLS,
        tool_choice=forced_tool,
        max_tokens=16000,
    )

    zero_usage = {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0}
    usage_info = getattr(response, '_usage_info', zero_usage)

    payload = extract_tool_result(response)
    if payload and "assets" in payload:
        return payload["assets"], usage_info

    # No usable tool output: log whatever free text came back and report
    # an empty asset list.
    logger.warning("Claude did not return structured asset data, response: %s", extract_text(response))
    return [], usage_info
def _extract_docx_text(content: bytes) -> str:
    """Return the text of a .docx file as newline-separated lines.

    Non-blank body paragraphs come first, followed by one line per table
    row in which the row's non-empty cell texts are joined with " | ".
    """
    document = docx.Document(io.BytesIO(content))

    lines = []
    for para in document.paragraphs:
        if para.text.strip():
            lines.append(para.text)

    # Also extract text from tables
    for table in document.tables:
        for row in table.rows:
            stripped = (cell.text.strip() for cell in row.cells)
            non_empty = [value for value in stripped if value]
            if non_empty:
                lines.append(" | ".join(non_empty))

    return "\n".join(lines)
def _extract_excel_text(content: bytes) -> str:
    """Extract text from an Excel file, converting all worksheets to text.

    Each worksheet contributes a ``=== Sheet: <title> ===`` header line
    followed by one line per non-empty row, with the row's non-``None``
    cell values stringified and joined with " | ". The workbook is loaded
    with ``data_only=True``.

    Args:
        content: Raw bytes of the workbook.

    Returns:
        The concatenated text of all worksheets, newline-separated.
    """
    wb = openpyxl.load_workbook(io.BytesIO(content), data_only=True)
    parts = []
    # Iterate wb.worksheets rather than wb.sheetnames: the latter also lists
    # chart sheets, which have no iter_rows() and would raise AttributeError.
    for ws in wb.worksheets:
        parts.append(f"\n=== Sheet: {ws.title} ===")
        for row in ws.iter_rows(values_only=True):
            cells = [str(c) for c in row if c is not None]
            if cells:
                parts.append(" | ".join(cells))
    return "\n".join(parts)