# gmal-scope-builder/backend/app/services/doc_parser.py

"""Parse uploaded client documents (Word/Excel) to extract asset lists."""

import logging
import io
from pathlib import Path

import openpyxl
import docx

from app.utils.claude_client import call_claude, extract_tool_result, extract_text

# Module-level logger, named after this module so records can be filtered by source.
logger = logging.getLogger(__name__)

# Tool definitions passed to call_claude() by parse_text_with_ai(). There is a
# single tool, "extract_assets", whose input schema describes the structured
# result we want back: an "assets" array of objects with four required fields.
EXTRACT_TOOLS = [
    {
        "name": "extract_assets",
        "description": "Extract a structured list of deliverable assets from a client brief or scope document.",
        "input_schema": {
            "type": "object",
            "properties": {
                "assets": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": {
                                "type": "string",
                                "description": "The asset/deliverable name as described by the client"
                            },
                            "description": {
                                "type": "string",
                                "description": "Description of what this asset involves, including any complexity or format details"
                            },
                            # Closed set of values; "unknown" is the fallback
                            # when the brief gives no basis for an estimate.
                            "complexity_hint": {
                                "type": "string",
                                "enum": ["simple", "medium", "complex", "unknown"],
                                "description": "Estimated complexity based on the brief"
                            },
                            "volume": {
                                "type": "integer",
                                "description": "Number of this asset needed (default 1 if not specified)"
                            },
                        },
                        # Every field is mandatory for each extracted asset.
                        "required": ["name", "description", "complexity_hint", "volume"],
                    },
                },
            },
            "required": ["assets"],
        },
    }
]

# System prompt sent by parse_text_with_ai() with every extraction request.
# The field names and complexity values it lists mirror the "extract_assets"
# schema in EXTRACT_TOOLS; keep the two in sync when either changes.
SYSTEM_PROMPT = """You are a creative agency asset specialist who understands production scoping.
Your job is to extract every distinct deliverable/asset from the client brief or scope document provided.

For each asset, provide:
- name: The asset name as the client describes it (e.g., "Social Media Banner", "TV Commercial Edit", "Brand Book")
- description: What this asset involves based on the document context. Include format, size, channel, and any other relevant details.
- complexity_hint: Your best estimate of complexity (simple/medium/complex) based on the description. Use "unknown" if unclear.
- volume: How many of this asset are needed. Default to 1 if not specified.

Be thorough - extract every distinct asset type mentioned. If the same asset appears at different complexity levels, list them separately.
Do NOT combine different asset types into one entry."""


def extract_text_from_file(
    file_content: bytes,
    filename: str,
    *,
    max_chars: int = 50000,
) -> tuple[str, dict]:
    """Extract plain text from an uploaded document.

    The handler is chosen from the lower-cased extension of *filename*:
    ``.docx`` is read by ``_extract_docx_text``, ``.xlsx``/``.xls`` by
    ``_extract_excel_text``, and ``.txt`` is decoded as UTF-8 with
    undecodable bytes replaced.

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Original file name; only its extension is used. A missing
            (``None`` or empty) name is reported as an unsupported file type.
        max_chars: Maximum number of characters of extracted text to return.
            Longer text is cut at this length and a truncation marker is
            appended, to keep downstream token usage bounded.

    Returns:
        A ``(text, metadata)`` tuple. *metadata* contains ``char_count``
        (length of the text before truncation), ``sheet_count`` (number of
        sheets for Excel files, otherwise 0) and ``file_type`` (the
        lower-cased extension, including the dot).

    Raises:
        ValueError: If the extension is not supported, or if the extracted
            text is empty or has fewer than 20 characters after stripping
            surrounding whitespace.
    """
    # `filename or ""` turns a missing name into the same ValueError as any
    # other unsupported type instead of a TypeError from Path(None).
    ext = Path(filename or "").suffix.lower()
    sheet_count = 0

    if ext == ".docx":
        text = _extract_docx_text(file_content)
    elif ext in (".xlsx", ".xls"):
        text = _extract_excel_text(file_content)
        # Only the sheet names are needed here, so open the workbook in
        # read-only mode (no cell data is parsed) rather than fully loading
        # it a second time. Read-only workbooks must be closed explicitly.
        wb = openpyxl.load_workbook(io.BytesIO(file_content), read_only=True)
        try:
            sheet_count = len(wb.sheetnames)
        finally:
            wb.close()
    elif ext == ".txt":
        text = file_content.decode("utf-8", errors="replace")
    else:
        raise ValueError(f"Unsupported file type: {ext}. Use .docx, .xlsx, or .txt")

    if not text or len(text.strip()) < 20:
        raise ValueError("Document appears to be empty or too short to extract assets from.")

    # char_count deliberately reflects the full document, not the truncated text.
    metadata = {
        "char_count": len(text),
        "sheet_count": sheet_count,
        "file_type": ext,
    }

    # Truncate very long documents to manage token usage
    if len(text) > max_chars:
        text = text[:max_chars] + "\n\n[Document truncated...]"

    return text, metadata


def parse_text_with_ai(text: str) -> tuple[list[dict], dict]:
    """Ask Claude to identify the deliverable assets described in *text*.

    The request forces the ``extract_assets`` tool so the reply comes back as
    structured data rather than free text.

    Returns:
        ``(assets, usage_info)``. *assets* is the ``"assets"`` value of the
        tool result, or an empty list when no usable tool result was
        returned. *usage_info* is read from the response's ``_usage_info``
        attribute, falling back to zeroed counters when it is absent.
    """
    prompt = f"Extract all deliverable assets from this client document:\n\n{text}"
    forced_tool = {"type": "tool", "name": "extract_assets"}

    response = call_claude(
        system=SYSTEM_PROMPT,
        user_message=prompt,
        tools=EXTRACT_TOOLS,
        tool_choice=forced_tool,
        max_tokens=16000,
    )

    zero_usage = {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0}
    usage_info = getattr(response, '_usage_info', zero_usage)

    tool_output = extract_tool_result(response)
    if tool_output and "assets" in tool_output:
        return tool_output["assets"], usage_info

    # No structured payload: log whatever plain text came back to aid debugging.
    logger.warning("Claude did not return structured asset data, response: %s", extract_text(response))
    return [], usage_info


def _extract_docx_text(content: bytes) -> str:
    """Return the text of a .docx file as newline-separated lines.

    Non-blank body paragraphs come first, followed by one line per table row
    in which the row's non-empty cell texts are joined with " | ".
    """
    document = docx.Document(io.BytesIO(content))

    lines = []
    for para in document.paragraphs:
        if para.text.strip():
            lines.append(para.text)

    # Table content is appended after all paragraphs, one line per row.
    for table in document.tables:
        for row in table.rows:
            stripped = (cell.text.strip() for cell in row.cells)
            filled = [value for value in stripped if value]
            if filled:
                lines.append(" | ".join(filled))

    return "\n".join(lines)


def _extract_excel_text(content: bytes) -> str:
    """Extract text from an Excel workbook, converting every worksheet to text.

    Each worksheet contributes a ``=== Sheet: <title> ===`` header line
    followed by one line per non-empty row, with the row's non-``None`` cell
    values stringified and joined by " | ". The workbook is opened with
    ``data_only=True``, so formula cells yield their stored values rather
    than the formula text.

    Args:
        content: Raw bytes of the workbook file.

    Returns:
        The concatenated text of all worksheets, newline-separated.
    """
    wb = openpyxl.load_workbook(io.BytesIO(content), data_only=True)
    parts = []

    # Iterate wb.worksheets rather than wb.sheetnames: the latter also lists
    # chart sheets, which have no cell grid (no iter_rows) and would crash.
    for ws in wb.worksheets:
        parts.append(f"\n=== Sheet: {ws.title} ===")
        for row in ws.iter_rows(values_only=True):
            cells = [str(c) for c in row if c is not None]
            if cells:
                parts.append(" | ".join(cells))

    return "\n".join(parts)