ai_qc/backend/excel_processor.py

#!/usr/bin/env python3
"""Excel reference-asset processor for HP Source Messaging files.

Mirrors pdf_processor.py: openpyxl extracts raw cell content from every
sheet, Gemini 2.5 Pro summarises the result into structured Markdown
under brand_guidelines/files/{file_id}_summary.md. The hp_copy_review
check pulls that Markdown into its prompt at QC time.

Public surface:
    process_excel_file(file_path, file_id) -> (summary_text, summary_path)

Behaviour mirrors pdf_processor.summarize_brand_guidelines: on Gemini
failure we write a degraded summary containing the raw extraction so
the reference asset stays usable downstream. The function does not
raise — failures are logged and surfaced via the degraded payload.
"""

import os
from typing import Tuple

from openpyxl import load_workbook


BRAND_GUIDELINES_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'brand_guidelines', 'files'
)

# Cap raw extraction at ~50K chars to keep the summary prompt bounded.
# A 30-row, 12-column workbook is ~10-15K chars in practice; this leaves
# headroom for HP's larger source files without blowing the prompt budget.
_RAW_EXTRACTION_CAP = 50_000


_SYSTEM_PROMPT = """You're processing an HP Source Messaging Excel into a structured Markdown reference. Output these sections exactly, in this order:

## Product / Variant
(brand, product line, variant if any — e.g. "HP OmniDesk Mini — Core")

## Key Selling Points (KSPs)
For each KSP: heading, value proposition, supporting body copy, message-length variants (ultra-short / short / medium / long if present in the source).

## Disclaimers / Footnotes
Numbered list, exact wording, what claim each footnote anchors to.

## Approved Brand and Product Names
Exact spellings, including trademark glyphs (™, ®, ©).

## Variant Notes / Watch-outs
Anything explicitly marked variant-specific (e.g. "Mainstream only", "Core only", "must not appear in entry tier").

## Verboten Phrasing
Any explicitly disallowed or deprecated phrasing called out in the source.

Be exhaustive but concise. Quote exactly where the source is explicit. If a section has no content in this source, write 'None specified' under it — do not omit the section heading."""


def process_excel_file(file_path: str, file_id: str) -> Tuple[str, str]:
    """Extract + summarise an HP Source Messaging Excel.

    Args:
        file_path: Path to the .xlsx file on disk.
        file_id: Stable identifier used for the output filename.

    Returns:
        Tuple of (summary_text, summary_path). Summary is written to
        BRAND_GUIDELINES_DIR/{file_id}_summary.md.

    Never raises. On Gemini failure, writes a degraded summary that
    embeds the raw extraction so the reference asset stays usable.
    """
    try:
        raw_text = _extract_workbook_text(file_path)
    except Exception as e:
        print(f"  Excel extraction failed for {file_id}: {type(e).__name__}: {e}")
        summary = (
            f"# {os.path.basename(file_path)} (degraded — extraction failed)\n\n"
            f"openpyxl extraction failed: {type(e).__name__}: {e}\n"
        )
        raw_text = ''
    else:
        try:
            summary = _summarise_with_gemini(raw_text, os.path.basename(file_path))
        except Exception as e:
            print(f"  Gemini summarisation failed for {file_id}: {type(e).__name__}: {e}")
            summary = (
                f"# {os.path.basename(file_path)} (degraded — summary failed)\n\n"
                f"Gemini summarisation failed: {type(e).__name__}: {e}\n\n"
                f"## Raw extraction\n\n```\n{raw_text}\n```\n"
            )

    os.makedirs(BRAND_GUIDELINES_DIR, exist_ok=True)
    summary_path = os.path.join(BRAND_GUIDELINES_DIR, f"{file_id}_summary.md")
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write(summary)
    return summary, summary_path


def _extract_workbook_text(file_path: str) -> str:
    """Read every sheet, dump as 'Sheet: <name>\\n<tab-aligned rows>\\n\\n'.

    Empty rows are skipped. Output is capped at _RAW_EXTRACTION_CAP chars;
    when exceeded, a truncation marker is appended and the rest is dropped.
    """
    wb = load_workbook(file_path, data_only=True, read_only=True)
    try:
        parts = []
        total_chars = 0
        for sheet in wb.worksheets:
            header = f"Sheet: {sheet.title}\n"
            parts.append(header)
            total_chars += len(header)
            for row in sheet.iter_rows(values_only=True):
                if not any((c is not None and str(c).strip()) for c in row):
                    continue
                line = '\t'.join(('' if c is None else str(c)) for c in row)
                parts.append(line + '\n')
                total_chars += len(line) + 1
                if total_chars >= _RAW_EXTRACTION_CAP:
                    parts.append(
                        f"\n[truncated — exceeded {_RAW_EXTRACTION_CAP}-char cap]\n"
                    )
                    return ''.join(parts)
            parts.append('\n')
            total_chars += 1
        return ''.join(parts)
    finally:
        wb.close()


def _summarise_with_gemini(raw_text: str, source_filename: str) -> str:
    """Send the extracted workbook text to Gemini 2.5 Pro for summarisation.

    Mirrors pdf_processor.summarize_brand_guidelines: uses
    google.generativeai directly with MODEL_VERSIONS.gemini_vision
    (currently gemini-2.5-pro). Raises on any failure; the caller
    converts failures into a degraded summary.
    """
    import google.generativeai as genai
    from llm_config import MODEL_VERSIONS

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError("GOOGLE_API_KEY not configured")

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(MODEL_VERSIONS.gemini_vision)

    prompt = (
        f"{_SYSTEM_PROMPT}\n\n"
        f"Source filename: {source_filename}\n\n"
        f"Raw cell content:\n\n```\n{raw_text}\n```"
    )
    response = model.generate_content(prompt)

    # Mirror pdf_processor's safety-block handling: surface a useful error.
    if not getattr(response, 'parts', None):
        feedback = getattr(response, 'prompt_feedback', 'No specific feedback provided.')
        raise RuntimeError(
            f"Gemini response blocked or empty. Feedback: {feedback}"
        )

    return response.text