Root cause: "name 'name' is not defined" error on line 300
The f-string example {name:"KV 360", tier:"Tier B"} was interpreted
as Python set literal, not as JSON text. Changed to parentheses.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
333 lines
14 KiB
Python
333 lines
14 KiB
Python
"""Parse uploaded client documents (Word/Excel) to extract asset lists."""
|
||
|
||
import logging
|
||
import io
|
||
from pathlib import Path
|
||
|
||
import openpyxl
|
||
import docx
|
||
|
||
from app.utils.claude_client import call_claude, extract_tool_result, extract_text
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
EXTRACT_TOOLS = [
|
||
{
|
||
"name": "extract_assets",
|
||
"description": "Extract a structured list of deliverable assets from a client brief or scope document.",
|
||
"input_schema": {
|
||
"type": "object",
|
||
"properties": {
|
||
"assets": {
|
||
"type": "array",
|
||
"items": {
|
||
"type": "object",
|
||
"properties": {
|
||
"name": {
|
||
"type": "string",
|
||
"description": "The asset/deliverable name as described by the client"
|
||
},
|
||
"description": {
|
||
"type": "string",
|
||
"description": "Description of what this asset involves, including any complexity or format details"
|
||
},
|
||
"complexity_hint": {
|
||
"type": "string",
|
||
"enum": ["simple", "medium", "complex", "unknown"],
|
||
"description": "Estimated complexity based on the brief"
|
||
},
|
||
"volume": {
|
||
"type": "integer",
|
||
"description": "Number of this asset needed (default 1 if not specified)"
|
||
},
|
||
"tier": {
|
||
"type": "string",
|
||
"description": "The client's tier/complexity label if specified (e.g. 'Tier A', 'A', 'Gold', '1', 'Premium'). Leave empty string if no tier is specified."
|
||
},
|
||
},
|
||
"required": ["name", "description", "complexity_hint", "volume", "tier"],
|
||
},
|
||
},
|
||
},
|
||
"required": ["assets"],
|
||
},
|
||
}
|
||
]
|
||
|
||
SYSTEM_PROMPT = """You are a creative agency asset specialist who understands production scoping.
|
||
Your job is to extract every distinct deliverable/asset from the client brief or scope document provided.
|
||
|
||
For each asset, provide:
|
||
- name: The asset name as the client describes it (e.g., "Social Media Banner", "TV Commercial Edit", "Brand Book")
|
||
- description: What this asset involves based on the document context. Include format, size, channel, and any other relevant details.
|
||
- complexity_hint: Your best estimate of complexity (simple/medium/complex) based on the description. Use "unknown" if unclear.
|
||
- volume: How many of this asset are needed. Default to 1 if not specified.
|
||
- tier: If the client specifies a tier, grade, or complexity label for this asset (e.g. "Tier A", "A", "Gold", "Premium", "1"), include it exactly as written. If the document has columns like A/B/C or Tier 1/2/3, extract those labels. Leave empty string if no tier is specified.
|
||
|
||
Be thorough - extract every distinct deliverable.
|
||
|
||
CRITICAL TIER RULES:
|
||
- If the document has tier columns (Tier A/B/C, or similar), create a SEPARATE entry for EACH tier where the volume is greater than 0 or status is "Yes".
|
||
- Example: "KV 360" with Tier A=No/0, Tier B=Yes/1, Tier C=Yes/1 → create TWO entries:
|
||
1. name="KV 360", tier="Tier B", volume=1
|
||
2. name="KV 360", tier="Tier C", volume=1
|
||
- Do NOT create an entry for a tier where volume is 0 or status is "No".
|
||
- The tier field must match the column header exactly (e.g. "Tier A", "Tier B", "Tier C").
|
||
- Do NOT combine different asset types into one entry."""
|
||
|
||
|
||
def extract_text_from_file(file_content: bytes, filename: str) -> tuple[str, dict]:
|
||
"""Extract text from a file. Returns (text, metadata)."""
|
||
ext = Path(filename).suffix.lower()
|
||
|
||
if ext == ".docx":
|
||
text = _extract_docx_text(file_content)
|
||
sheet_count = 0
|
||
elif ext in (".xlsx", ".xls"):
|
||
text = _extract_excel_text(file_content)
|
||
wb = openpyxl.load_workbook(io.BytesIO(file_content), data_only=True)
|
||
sheet_count = len(wb.sheetnames)
|
||
elif ext == ".txt":
|
||
text = file_content.decode("utf-8", errors="replace")
|
||
sheet_count = 0
|
||
else:
|
||
raise ValueError(f"Unsupported file type: {ext}. Use .docx, .xlsx, or .txt")
|
||
|
||
if not text or len(text.strip()) < 20:
|
||
raise ValueError("Document appears to be empty or too short to extract assets from.")
|
||
|
||
metadata = {
|
||
"char_count": len(text),
|
||
"sheet_count": sheet_count,
|
||
"file_type": ext,
|
||
}
|
||
|
||
# Truncate very long documents to manage token usage
|
||
if len(text) > 50000:
|
||
text = text[:50000] + "\n\n[Document truncated...]"
|
||
|
||
return text, metadata
|
||
|
||
|
||
def parse_text_with_ai(text: str, user_context: str = "") -> tuple[list[dict], dict]:
|
||
"""Send extracted text to Claude to identify assets. Returns (assets, usage_info)."""
|
||
prompt_text = f"Extract all deliverable assets from this client document:\n\n{text}"
|
||
if user_context:
|
||
prompt_text = f"USER CONTEXT/INSTRUCTIONS:\n{user_context}\n\n{prompt_text}"
|
||
|
||
response = call_claude(
|
||
system=SYSTEM_PROMPT,
|
||
user_message=prompt_text,
|
||
tools=EXTRACT_TOOLS,
|
||
tool_choice={"type": "tool", "name": "extract_assets"},
|
||
max_tokens=32000,
|
||
)
|
||
|
||
usage_info = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0})
|
||
|
||
stop_reason = getattr(response, 'stop_reason', 'unknown')
|
||
result = extract_tool_result(response)
|
||
if not result or "assets" not in result:
|
||
logger.warning(f"Claude did not return structured asset data. stop_reason={stop_reason}, response: %s", extract_text(response)[:500])
|
||
return [], usage_info
|
||
|
||
return result["assets"], usage_info
|
||
|
||
|
||
def _extract_docx_text(content: bytes) -> str:
|
||
"""Extract text from a .docx file."""
|
||
doc = docx.Document(io.BytesIO(content))
|
||
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
|
||
|
||
# Also extract text from tables
|
||
for table in doc.tables:
|
||
for row in table.rows:
|
||
cells = [cell.text.strip() for cell in row.cells if cell.text.strip()]
|
||
if cells:
|
||
paragraphs.append(" | ".join(cells))
|
||
|
||
return "\n".join(paragraphs)
|
||
|
||
|
||
def _extract_excel_text(content: bytes) -> str:
|
||
"""Extract text from an Excel file with header-aware labelling.
|
||
|
||
Detects header rows, labels data with column names, handles merged cells.
|
||
"""
|
||
wb = openpyxl.load_workbook(io.BytesIO(content), data_only=True)
|
||
parts = []
|
||
|
||
for sheet_name in wb.sheetnames:
|
||
ws = wb[sheet_name]
|
||
|
||
# Skip sheets with very little data
|
||
if ws.max_row is None or ws.max_row < 3:
|
||
continue
|
||
if ws.max_column is None or ws.max_column < 2:
|
||
continue
|
||
|
||
# Count non-empty cells to skip truly empty sheets
|
||
sample_count = sum(
|
||
1 for row in ws.iter_rows(min_row=1, max_row=min(10, ws.max_row), values_only=True)
|
||
for c in row if c is not None
|
||
)
|
||
if sample_count < 3:
|
||
continue
|
||
|
||
parts.append(f"\n=== Sheet: {sheet_name} ({ws.max_row} rows × {ws.max_column} cols) ===")
|
||
|
||
# Build combined headers from first few rows (many spreadsheets have multi-row headers)
|
||
# Strategy: scan rows 1-5, combine all text-like values as column labels
|
||
# A "header" cell contains text (not just numbers, Yes/No, 0/1)
|
||
headers = {}
|
||
data_start_row = 1
|
||
|
||
def _is_header_value(val):
|
||
if val is None:
|
||
return False
|
||
s = str(val).strip()
|
||
if not s:
|
||
return False
|
||
# Pure numbers, Yes/No, single chars like ü are likely data, not headers
|
||
if s.replace('.', '').replace(',', '').isdigit():
|
||
return False
|
||
if s.lower() in ('yes', 'no', 'true', 'false', 'x', 'ü', "ü'", '0', '1', '-'):
|
||
return False
|
||
return True
|
||
|
||
found_data = False
|
||
for r in range(1, min(6, ws.max_row + 1)):
|
||
row_vals = [(c, ws.cell(row=r, column=c).value) for c in range(1, ws.max_column + 1)]
|
||
header_cells = sum(1 for _, v in row_vals if _is_header_value(v))
|
||
data_cells = sum(1 for _, v in row_vals if v is not None and not _is_header_value(v))
|
||
|
||
# If this row has more header-like text than data values, it's a header row
|
||
# But stop looking once we've hit a data row
|
||
if not found_data and header_cells >= 2 and header_cells >= data_cells:
|
||
for c, v in row_vals:
|
||
if _is_header_value(v) and c not in headers:
|
||
headers[c] = str(v).strip()[:50]
|
||
data_start_row = r + 1
|
||
elif data_cells > 0:
|
||
found_data = True
|
||
|
||
if headers:
|
||
parts.append(f"[Column headers]: {' | '.join(headers.values())}")
|
||
|
||
# Track last non-empty value per column for merged cell carry-forward
|
||
last_vals = {}
|
||
|
||
# Extract data rows with labelled fields
|
||
for row_idx in range(data_start_row, min(ws.max_row + 1, data_start_row + 200)):
|
||
fields = []
|
||
has_data = False
|
||
for c in range(1, ws.max_column + 1):
|
||
val = ws.cell(row=row_idx, column=c).value
|
||
if val is not None:
|
||
last_vals[c] = str(val).strip()
|
||
has_data = True
|
||
|
||
display_val = str(val).strip() if val is not None else None
|
||
if display_val and c in headers:
|
||
fields.append(f"{headers[c]}: {display_val[:100]}")
|
||
elif display_val:
|
||
fields.append(display_val[:100])
|
||
|
||
if has_data and fields:
|
||
parts.append(f"[Row {row_idx}] {' | '.join(fields)}")
|
||
|
||
return "\n".join(parts)
|
||
|
||
|
||
# ── Deep Extraction (two-pass AI) ──────────────────────────────────────────
|
||
|
||
STRUCTURE_ANALYSIS_PROMPT = """You are an expert at understanding complex spreadsheet layouts.
|
||
|
||
Analyze this spreadsheet data and describe its structure. Your analysis will be used by another AI to extract deliverable assets accurately.
|
||
|
||
For each sheet with meaningful data, describe:
|
||
1. Which row contains the column headers
|
||
2. What each column represents (asset name, description, tier, volume, status, GMAL code, caveats, questions, etc.)
|
||
3. Where are the actual asset/deliverable names located (which column)?
|
||
4. Are there tier columns (A/B/C, 1/2/3, Gold/Silver/Bronze)? Which columns?
|
||
5. Where are volume numbers?
|
||
6. Are there Q&A columns mixed in? Which ones are questions vs actual data?
|
||
7. Are there merged cells creating category groupings? How is the hierarchy structured?
|
||
8. Which sheets contain deliverables vs metadata/reference data?
|
||
|
||
Be specific — reference actual column names and row numbers."""
|
||
|
||
|
||
def deep_pass1_structure_analysis(text: str, user_context: str = "") -> tuple[str, dict]:
|
||
"""Pass 1 of deep extraction: analyze spreadsheet structure.
|
||
|
||
Returns (structure_analysis_text, usage_info).
|
||
"""
|
||
logger.info("Deep extraction Pass 1: Analyzing spreadsheet structure...")
|
||
response = call_claude(
|
||
system=STRUCTURE_ANALYSIS_PROMPT,
|
||
user_message=f"{'USER CONTEXT: ' + user_context + chr(10) + chr(10) if user_context else ''}Analyze the structure of this spreadsheet data:\n\n{text[:40000]}",
|
||
max_tokens=4096,
|
||
)
|
||
|
||
usage = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0})
|
||
analysis = extract_text(response)
|
||
logger.info(f"Deep extraction Pass 1 complete: {len(analysis)} chars of analysis")
|
||
return analysis, usage
|
||
|
||
|
||
def deep_pass2_guided_extraction(text: str, structure_analysis: str, user_context: str = "") -> tuple[list[dict], dict]:
|
||
"""Pass 2 of deep extraction: extract assets using structural understanding.
|
||
|
||
Returns (assets, usage_info).
|
||
"""
|
||
# Truncate structure analysis if very long to leave room for data
|
||
if len(structure_analysis) > 4000:
|
||
structure_analysis = structure_analysis[:4000] + "\n[Analysis truncated]"
|
||
|
||
logger.info(f"Deep extraction Pass 2: structure={len(structure_analysis)} chars, data={len(text)} chars")
|
||
|
||
context_block = f"USER CONTEXT/INSTRUCTIONS:\n{user_context}\n\n" if user_context else ""
|
||
guided_prompt = f"""{context_block}You have been given a structural analysis of a complex client spreadsheet.
|
||
Use this understanding to extract every deliverable asset accurately.
|
||
|
||
STRUCTURAL ANALYSIS:
|
||
{structure_analysis}
|
||
|
||
IMPORTANT GUIDELINES:
|
||
- Use the column mapping from the analysis to identify asset names, descriptions, tiers, and volumes
|
||
- CRITICAL: If the document has tier columns (Tier A/B/C etc.), create a SEPARATE entry for EACH tier where volume > 0 or status = "Yes"
|
||
- Example: "KV 360" with Tier A=No/0, Tier B=Yes/1, Tier C=Yes/1 → TWO entries: (name="KV 360", tier="Tier B", volume=1) and (name="KV 360", tier="Tier C", volume=1)
|
||
- Do NOT create entries for tiers with volume=0 or status="No"
|
||
- Skip rows that are questions, metadata, or caveats — those are not deliverables
|
||
- Carry forward category names from merged cells (the analysis explains the hierarchy)
|
||
- You MUST call the extract_assets tool with at least one asset
|
||
- Keep descriptions SHORT (1 sentence max) to stay within output limits
|
||
|
||
Now extract all deliverable assets from this data:
|
||
|
||
{text[:35000]}"""
|
||
|
||
response = call_claude(
|
||
system=SYSTEM_PROMPT,
|
||
user_message=guided_prompt,
|
||
tools=EXTRACT_TOOLS,
|
||
tool_choice={"type": "tool", "name": "extract_assets"},
|
||
max_tokens=32000,
|
||
)
|
||
|
||
usage = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0})
|
||
|
||
# Log response details for debugging
|
||
stop_reason = getattr(response, 'stop_reason', 'unknown')
|
||
logger.info(f"Deep extraction Pass 2 response: stop_reason={stop_reason}")
|
||
|
||
result = extract_tool_result(response)
|
||
if not result or "assets" not in result:
|
||
# Log what we got instead
|
||
response_text = extract_text(response)
|
||
logger.warning(f"Deep extraction Pass 2 returned no assets. stop_reason={stop_reason}, text_response={response_text[:500] if response_text else 'none'}")
|
||
return [], usage
|
||
|
||
logger.info(f"Deep extraction complete: {len(result['assets'])} assets found")
|
||
return result["assets"], usage
|