- Upload now shows live stage progress (uploading -> extracting -> AI parsing -> done) - Fix match group collapse: proper React state instead of DOM manipulation - Replace pre-filter with full GMAL catalog sent to Claude (~3k tokens, <$0.01) - FTS and keyword matching missed too many semantic matches - Claude now sees all 243 assets and uses semantic understanding - Improved system prompt with terminology bridges for better scoring - Per-project AI cost tracking persisted to DB - Parallel matching with cancel support - Auto-select matches >= 80%, YOLO button for rest - Debug panel for AI call inspection Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
147 lines
5.7 KiB
Python
147 lines
5.7 KiB
Python
"""Parse uploaded client documents (Word/Excel) to extract asset lists."""
|
|
|
|
import logging
|
|
import io
|
|
from pathlib import Path
|
|
|
|
import openpyxl
|
|
import docx
|
|
|
|
from app.utils.claude_client import call_claude, extract_tool_result, extract_text
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Per-asset fields the model must fill in. Insertion order matters: it is
# reused below as the "required" list.
_ASSET_FIELDS = {
    "name": {
        "type": "string",
        "description": "The asset/deliverable name as described by the client",
    },
    "description": {
        "type": "string",
        "description": "Description of what this asset involves, including any complexity or format details",
    },
    "complexity_hint": {
        "type": "string",
        "enum": ["simple", "medium", "complex", "unknown"],
        "description": "Estimated complexity based on the brief",
    },
    "volume": {
        "type": "integer",
        "description": "Number of this asset needed (default 1 if not specified)",
    },
}

# Schema of a single entry in the "assets" array; every declared field is required.
_ASSET_SCHEMA = {
    "type": "object",
    "properties": _ASSET_FIELDS,
    "required": list(_ASSET_FIELDS),
}

# Tool definitions handed to call_claude: a single "extract_assets" tool whose
# input is an object holding the list of extracted assets.
EXTRACT_TOOLS = [
    {
        "name": "extract_assets",
        "description": "Extract a structured list of deliverable assets from a client brief or scope document.",
        "input_schema": {
            "type": "object",
            "properties": {
                "assets": {
                    "type": "array",
                    "items": _ASSET_SCHEMA,
                },
            },
            "required": ["assets"],
        },
    }
]
|
|
|
|
# System prompt for the asset-extraction call. It describes the four fields
# declared in the extract_assets tool schema and asks for one entry per
# distinct asset type. Written as adjacent literals, one per prompt line.
SYSTEM_PROMPT = (
    "You are a creative agency asset specialist who understands production scoping.\n"
    "Your job is to extract every distinct deliverable/asset from the client brief or scope document provided.\n"
    "\n"
    "For each asset, provide:\n"
    '- name: The asset name as the client describes it (e.g., "Social Media Banner", "TV Commercial Edit", "Brand Book")\n'
    "- description: What this asset involves based on the document context. Include format, size, channel, and any other relevant details.\n"
    '- complexity_hint: Your best estimate of complexity (simple/medium/complex) based on the description. Use "unknown" if unclear.\n'
    "- volume: How many of this asset are needed. Default to 1 if not specified.\n"
    "\n"
    "Be thorough - extract every distinct asset type mentioned. If the same asset appears at different complexity levels, list them separately.\n"
    "Do NOT combine different asset types into one entry."
)
|
|
|
|
|
|
def extract_text_from_file(
    file_content: bytes,
    filename: str,
    *,
    max_chars: int = 50000,
) -> tuple[str, dict]:
    """Extract plain text from an uploaded document.

    The handler is chosen from the (case-insensitive) extension of *filename*.

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Original file name; only its suffix is used.
        max_chars: Maximum number of characters of extracted text to return.
            Longer text is cut at this length and a truncation marker is
            appended.

    Returns:
        ``(text, metadata)`` where ``metadata`` has the keys ``char_count``
        (length of the extracted text *before* truncation), ``sheet_count``
        (number of workbook sheets, 0 for non-spreadsheets) and ``file_type``
        (the lower-cased extension including the dot).

    Raises:
        ValueError: If the extension is not supported, or the extracted text
            has fewer than 20 non-whitespace-trimmed characters.
    """
    ext = Path(filename).suffix.lower()
    sheet_count = 0

    if ext == ".docx":
        text = _extract_docx_text(file_content)
    elif ext in (".xlsx", ".xls"):
        # NOTE(review): openpyxl targets the OOXML formats; a genuine legacy
        # binary .xls will most likely fail to load here — confirm whether
        # ".xls" should stay in the accepted list.
        text = _extract_excel_text(file_content)
        # Only the sheet names are needed, so open read-only rather than
        # materialising every cell a second time. Read-only workbooks keep
        # the archive open and must be closed explicitly.
        wb = openpyxl.load_workbook(io.BytesIO(file_content), read_only=True)
        try:
            sheet_count = len(wb.sheetnames)
        finally:
            wb.close()
    elif ext == ".txt":
        # "utf-8-sig" behaves like UTF-8 but drops a leading byte-order mark,
        # so a stray U+FEFF does not end up in the text or the char count.
        text = file_content.decode("utf-8-sig", errors="replace")
    else:
        raise ValueError(f"Unsupported file type: {ext}. Use .docx, .xlsx, or .txt")

    if len(text.strip()) < 20:
        raise ValueError("Document appears to be empty or too short to extract assets from.")

    # Built before truncation on purpose: char_count reports the full size.
    metadata = {
        "char_count": len(text),
        "sheet_count": sheet_count,
        "file_type": ext,
    }

    # Truncate very long documents to manage token usage
    if len(text) > max_chars:
        text = text[:max_chars] + "\n\n[Document truncated...]"

    return text, metadata
|
|
|
|
|
|
def parse_text_with_ai(text: str) -> tuple[list[dict], dict]:
    """Ask Claude to identify the assets in already-extracted document text.

    The request carries SYSTEM_PROMPT, the EXTRACT_TOOLS definitions and a
    tool_choice naming the ``extract_assets`` tool.

    Returns:
        ``(assets, usage_info)``. ``assets`` is the "assets" value of the tool
        result, or an empty list when no tool result containing that key came
        back (a warning with the response text is logged in that case).
        ``usage_info`` is the response's ``_usage_info`` attribute, or a dict
        of zeroed counters when the attribute is absent.
    """
    prompt = f"Extract all deliverable assets from this client document:\n\n{text}"
    forced_tool = {"type": "tool", "name": "extract_assets"}
    response = call_claude(
        system=SYSTEM_PROMPT,
        user_message=prompt,
        tools=EXTRACT_TOOLS,
        tool_choice=forced_tool,
        max_tokens=16000,
    )

    zero_usage = {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0}
    usage_info = getattr(response, "_usage_info", zero_usage)

    payload = extract_tool_result(response)
    if payload and "assets" in payload:
        return payload["assets"], usage_info

    # No usable structured output: record what came back and report no assets.
    logger.warning(
        "Claude did not return structured asset data, response: %s",
        extract_text(response),
    )
    return [], usage_info
|
|
|
|
|
|
def _extract_docx_text(content: bytes) -> str:
    """Return the text of a .docx file as newline-separated lines.

    Non-blank body paragraphs come first (text kept as-is), followed by one
    line per non-empty table row, with the row's non-empty, stripped cell
    texts joined by " | ".
    """
    document = docx.Document(io.BytesIO(content))

    lines = []
    for para in document.paragraphs:
        if para.text.strip():
            lines.append(para.text)

    # Table rows are appended after all paragraphs, not at their position in
    # the document.
    # NOTE(review): python-docx may report a horizontally merged cell once per
    # spanned column, which would repeat its text in the row — confirm.
    for table in document.tables:
        for row in table.rows:
            filled = list(filter(None, (cell.text.strip() for cell in row.cells)))
            if filled:
                lines.append(" | ".join(filled))

    return "\n".join(lines)
|
|
|
|
|
|
def _extract_excel_text(content: bytes) -> str:
    """Render every sheet of an Excel workbook as newline-separated text.

    Each sheet contributes a "=== Sheet: <name> ===" header followed by one
    line per row that has at least one non-None cell; the non-None values are
    converted with str() and joined by " | ".
    """
    workbook = openpyxl.load_workbook(io.BytesIO(content), data_only=True)

    lines = []
    for title in workbook.sheetnames:
        lines.append(f"\n=== Sheet: {title} ===")
        for values in workbook[title].iter_rows(values_only=True):
            present = [str(value) for value in values if value is not None]
            # Test the list, not the joined string: a row holding only an
            # empty-string cell still produces an (empty) output line.
            if present:
                lines.append(" | ".join(present))

    return "\n".join(lines)
|