gmal-scope-builder/backend/app/services/doc_parser.py
DJP 26d3435be0 Improve matching, upload UX, collapse fix, full catalog approach
- Upload now shows live stage progress (uploading -> extracting -> AI parsing -> done)
- Fix match group collapse: proper React state instead of DOM manipulation
- Replace pre-filter with full GMAL catalog sent to Claude (~3k tokens, <$0.01)
  - FTS and keyword matching missed too many semantic matches
  - Claude now sees all 243 assets and uses semantic understanding
- Improved system prompt with terminology bridges for better scoring
- Per-project AI cost tracking persisted to DB
- Parallel matching with cancel support
- Auto-select matches >= 80%, YOLO button for rest
- Debug panel for AI call inspection

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 19:22:08 -04:00

147 lines
5.7 KiB
Python

"""Parse uploaded client documents (Word/Excel) to extract asset lists."""
import logging
import io
from pathlib import Path
import openpyxl
import docx
from app.utils.claude_client import call_claude, extract_tool_result, extract_text
logger = logging.getLogger(__name__)
# JSON-Schema-style description of one extracted asset. Every field listed
# here is also marked as required in the tool definition below.
_ASSET_FIELDS = {
    "name": {
        "type": "string",
        "description": "The asset/deliverable name as described by the client",
    },
    "description": {
        "type": "string",
        "description": "Description of what this asset involves, including any complexity or format details",
    },
    "complexity_hint": {
        "type": "string",
        "enum": ["simple", "medium", "complex", "unknown"],
        "description": "Estimated complexity based on the brief",
    },
    "volume": {
        "type": "integer",
        "description": "Number of this asset needed (default 1 if not specified)",
    },
}

# Tool list passed to call_claude() by parse_text_with_ai(); the single
# "extract_assets" tool returns an object with one "assets" array.
EXTRACT_TOOLS = [
    {
        "name": "extract_assets",
        "description": "Extract a structured list of deliverable assets from a client brief or scope document.",
        "input_schema": {
            "type": "object",
            "properties": {
                "assets": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": _ASSET_FIELDS,
                        # Dict iteration order == declaration order above.
                        "required": list(_ASSET_FIELDS),
                    },
                },
            },
            "required": ["assets"],
        },
    }
]
# System prompt for the asset-extraction call. The four bullet points describe
# the same fields (name, description, complexity_hint, volume) that the
# extract_assets tool schema requires.
SYSTEM_PROMPT = (
    "You are a creative agency asset specialist who understands production scoping.\n"
    "Your job is to extract every distinct deliverable/asset from the client brief or scope document provided.\n"
    "For each asset, provide:\n"
    '- name: The asset name as the client describes it (e.g., "Social Media Banner", "TV Commercial Edit", "Brand Book")\n'
    "- description: What this asset involves based on the document context. Include format, size, channel, and any other relevant details.\n"
    '- complexity_hint: Your best estimate of complexity (simple/medium/complex) based on the description. Use "unknown" if unclear.\n'
    "- volume: How many of this asset are needed. Default to 1 if not specified.\n"
    "Be thorough - extract every distinct asset type mentioned. If the same asset appears at different complexity levels, list them separately.\n"
    "Do NOT combine different asset types into one entry."
)
def extract_text_from_file(file_content: bytes, filename: str) -> tuple[str, dict]:
    """Extract plain text from an uploaded client document.

    The extractor is chosen from the lower-cased suffix of ``filename``:
    ``.docx`` uses ``_extract_docx_text``, ``.xlsx``/``.xls`` use
    ``_extract_excel_text`` and ``.txt`` is decoded as UTF-8 (undecodable
    bytes are replaced, a leading byte-order mark is dropped).

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Name of the uploaded file; only its suffix is inspected.

    Returns:
        A ``(text, metadata)`` tuple. ``text`` is cut to 50,000 characters
        followed by a truncation marker when the document is longer.
        ``metadata`` contains ``char_count`` (length *before* truncation),
        ``sheet_count`` (0 for non-Excel files) and ``file_type`` (the suffix).

    Raises:
        ValueError: If the suffix is not supported, or if fewer than 20
            characters remain once surrounding whitespace is stripped.
    """
    ext = Path(filename).suffix.lower()
    sheet_count = 0  # only the Excel branch overrides this

    if ext == ".docx":
        text = _extract_docx_text(file_content)
    elif ext in (".xlsx", ".xls"):
        # NOTE(review): openpyxl documents that it cannot read the legacy
        # binary .xls format, so a genuine .xls upload will fail inside
        # load_workbook — confirm whether ".xls" should stay accepted here.
        text = _extract_excel_text(file_content)
        # Only the sheet names are needed for the count, so open read-only
        # instead of fully parsing every worksheet a second time.
        wb = openpyxl.load_workbook(
            io.BytesIO(file_content), read_only=True, data_only=True
        )
        try:
            sheet_count = len(wb.sheetnames)
        finally:
            wb.close()  # read-only workbooks keep the archive open until closed
    elif ext == ".txt":
        # "utf-8-sig" is UTF-8 that also swallows a leading BOM, which would
        # otherwise end up as U+FEFF at the start of the prompt text.
        text = file_content.decode("utf-8-sig", errors="replace")
    else:
        raise ValueError(f"Unsupported file type: {ext}. Use .docx, .xlsx, or .txt")

    if not text or len(text.strip()) < 20:
        raise ValueError("Document appears to be empty or too short to extract assets from.")

    # Captured before truncation so char_count reflects the real document size.
    metadata = {
        "char_count": len(text),
        "sheet_count": sheet_count,
        "file_type": ext,
    }

    # Truncate very long documents to manage token usage
    if len(text) > 50000:
        text = text[:50000] + "\n\n[Document truncated...]"

    return text, metadata
def parse_text_with_ai(text: str) -> tuple[list[dict], dict]:
    """Ask Claude to list the deliverable assets found in ``text``.

    The request uses ``SYSTEM_PROMPT`` and ``EXTRACT_TOOLS`` and pins
    ``tool_choice`` to the ``extract_assets`` tool.

    Args:
        text: Document text to analyse; embedded verbatim in the user message.

    Returns:
        ``(assets, usage_info)``. ``assets`` is the ``"assets"`` value of the
        tool result, or an empty list when no such result came back (a warning
        with the response text is logged in that case). ``usage_info`` is the
        response's ``_usage_info`` attribute, or an all-zero dict when the
        attribute is missing.
    """
    prompt = f"Extract all deliverable assets from this client document:\n\n{text}"
    response = call_claude(
        system=SYSTEM_PROMPT,
        user_message=prompt,
        tools=EXTRACT_TOOLS,
        tool_choice={"type": "tool", "name": "extract_assets"},
        max_tokens=16000,
    )

    zero_usage = {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0}
    usage_info = getattr(response, "_usage_info", zero_usage)

    payload = extract_tool_result(response)
    if payload and "assets" in payload:
        return payload["assets"], usage_info

    # No usable tool output: log whatever text came back and report no assets.
    logger.warning(
        "Claude did not return structured asset data, response: %s",
        extract_text(response),
    )
    return [], usage_info
def _extract_docx_text(content: bytes) -> str:
    """Flatten a .docx document into newline-separated text.

    Non-blank paragraphs are emitted first, followed by one line per table
    row in which the stripped, non-empty cell texts are joined with " | ".
    Rows without any text are skipped.
    """
    document = docx.Document(io.BytesIO(content))

    lines = []
    for para in document.paragraphs:
        if para.text.strip():
            lines.append(para.text)

    # Also extract text from tables
    for table in document.tables:
        for row in table.rows:
            row_texts = []
            for cell in row.cells:
                stripped = cell.text.strip()
                if stripped:
                    row_texts.append(stripped)
            if row_texts:
                lines.append(" | ".join(row_texts))

    return "\n".join(lines)
def _extract_excel_text(content: bytes) -> str:
    """Extract text from an Excel workbook, one line per non-empty row.

    Each worksheet that contains at least one value contributes a
    ``=== Sheet: <title> ===`` header followed by its rows; within a row the
    non-``None`` cell values are converted with ``str`` and joined by " | ".
    The workbook is loaded with ``data_only=True``.

    Worksheets without any values produce no output at all, so a blank
    workbook yields an empty string instead of a header-only text that would
    look like real content to a length-based emptiness check.

    Args:
        content: Raw bytes of the workbook file.

    Returns:
        The concatenated text, or ``""`` when no sheet holds any value.
    """
    wb = openpyxl.load_workbook(io.BytesIO(content), data_only=True)
    parts = []
    # wb.worksheets excludes chart sheets, which have no iter_rows().
    for ws in wb.worksheets:
        rows = []
        for row in ws.iter_rows(values_only=True):
            cells = [str(c) for c in row if c is not None]
            if cells:
                rows.append(" | ".join(cells))
        if rows:
            # Header is only emitted once the sheet is known to have content.
            parts.append(f"\n=== Sheet: {ws.title} ===")
            parts.extend(rows)
    return "\n".join(parts)