"""Parse uploaded client documents (Word/Excel) to extract asset lists.""" import logging import io from pathlib import Path import openpyxl import docx from app.utils.claude_client import call_claude, extract_tool_result, extract_text logger = logging.getLogger(__name__) EXTRACT_TOOLS = [ { "name": "extract_assets", "description": "Extract a structured list of deliverable assets from a client brief or scope document.", "input_schema": { "type": "object", "properties": { "assets": { "type": "array", "items": { "type": "object", "properties": { "name": { "type": "string", "description": "The asset/deliverable name as described by the client" }, "description": { "type": "string", "description": "Description of what this asset involves, including any complexity or format details" }, "complexity_hint": { "type": "string", "enum": ["simple", "medium", "complex", "unknown"], "description": "Estimated complexity based on the brief" }, "volume": { "type": "integer", "description": "Number of this asset needed (default 1 if not specified)" }, }, "required": ["name", "description", "complexity_hint", "volume"], }, }, }, "required": ["assets"], }, } ] SYSTEM_PROMPT = """You are a creative agency asset specialist who understands production scoping. Your job is to extract every distinct deliverable/asset from the client brief or scope document provided. For each asset, provide: - name: The asset name as the client describes it (e.g., "Social Media Banner", "TV Commercial Edit", "Brand Book") - description: What this asset involves based on the document context. Include format, size, channel, and any other relevant details. - complexity_hint: Your best estimate of complexity (simple/medium/complex) based on the description. Use "unknown" if unclear. - volume: How many of this asset are needed. Default to 1 if not specified. Be thorough - extract every distinct asset type mentioned. If the same asset appears at different complexity levels, list them separately. Do NOT combine different asset types into one entry.""" def extract_text_from_file(file_content: bytes, filename: str) -> tuple[str, dict]: """Extract text from a file. Returns (text, metadata).""" ext = Path(filename).suffix.lower() if ext == ".docx": text = _extract_docx_text(file_content) sheet_count = 0 elif ext in (".xlsx", ".xls"): text = _extract_excel_text(file_content) wb = openpyxl.load_workbook(io.BytesIO(file_content), data_only=True) sheet_count = len(wb.sheetnames) elif ext == ".txt": text = file_content.decode("utf-8", errors="replace") sheet_count = 0 else: raise ValueError(f"Unsupported file type: {ext}. Use .docx, .xlsx, or .txt") if not text or len(text.strip()) < 20: raise ValueError("Document appears to be empty or too short to extract assets from.") metadata = { "char_count": len(text), "sheet_count": sheet_count, "file_type": ext, } # Truncate very long documents to manage token usage if len(text) > 50000: text = text[:50000] + "\n\n[Document truncated...]" return text, metadata def parse_text_with_ai(text: str) -> tuple[list[dict], dict]: """Send extracted text to Claude to identify assets. Returns (assets, usage_info).""" response = call_claude( system=SYSTEM_PROMPT, user_message=f"Extract all deliverable assets from this client document:\n\n{text}", tools=EXTRACT_TOOLS, tool_choice={"type": "tool", "name": "extract_assets"}, max_tokens=16000, ) usage_info = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0}) result = extract_tool_result(response) if not result or "assets" not in result: logger.warning("Claude did not return structured asset data, response: %s", extract_text(response)) return [], usage_info return result["assets"], usage_info def _extract_docx_text(content: bytes) -> str: """Extract text from a .docx file.""" doc = docx.Document(io.BytesIO(content)) paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] # Also extract text from tables for table in doc.tables: for row in table.rows: cells = [cell.text.strip() for cell in row.cells if cell.text.strip()] if cells: paragraphs.append(" | ".join(cells)) return "\n".join(paragraphs) def _extract_excel_text(content: bytes) -> str: """Extract text from an Excel file, converting all sheets to text.""" wb = openpyxl.load_workbook(io.BytesIO(content), data_only=True) parts = [] for sheet_name in wb.sheetnames: ws = wb[sheet_name] parts.append(f"\n=== Sheet: {sheet_name} ===") for row in ws.iter_rows(values_only=True): cells = [str(c) for c in row if c is not None] if cells: parts.append(" | ".join(cells)) return "\n".join(parts)