ai_qc/backend/excel_processor.py
nickviljoen c51e0729ce fix(excel-processor): wrap extraction in try/except to honour 'never raises'
Code review found that _extract_workbook_text was unwrapped — a
corrupt/locked .xlsx or InvalidFileException would leak out of
process_excel_file despite the docstring promising 'Never raises'.
Wrap the extraction call too; on extraction failure, write a
degraded summary explaining the failure and return cleanly.

Verified by passing a non-existent file: the function returns a
degraded summary instead of raising FileNotFoundError.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 20:55:54 +02:00

162 lines
6.2 KiB
Python

#!/usr/bin/env python3
"""Excel reference-asset processor for HP Source Messaging files.
Mirrors pdf_processor.py: openpyxl extracts raw cell content from every
sheet, Gemini 2.5 Pro summarises the result into structured Markdown
under brand_guidelines/files/{file_id}_summary.md. The hp_copy_review
check pulls that Markdown into its prompt at QC time.
Public surface:
process_excel_file(file_path, file_id) -> (summary_text, summary_path)
Behaviour mirrors pdf_processor.summarize_brand_guidelines: on Gemini
failure we write a degraded summary containing the raw extraction so
the reference asset stays usable downstream. The function does not
raise — failures are logged and surfaced via the degraded payload.
"""
import os
from typing import Tuple
from openpyxl import load_workbook
BRAND_GUIDELINES_DIR = os.path.join(
os.path.dirname(os.path.abspath(__file__)), 'brand_guidelines', 'files'
)
# Cap raw extraction at ~50K chars to keep the summary prompt bounded.
# A 30-row, 12-column workbook is ~10-15K chars in practice; this leaves
# headroom for HP's larger source files without blowing the prompt budget.
_RAW_EXTRACTION_CAP = 50_000
_SYSTEM_PROMPT = """You're processing an HP Source Messaging Excel into a structured Markdown reference. Output these sections exactly, in this order:
## Product / Variant
(brand, product line, variant if any — e.g. "HP OmniDesk Mini — Core")
## Key Selling Points (KSPs)
For each KSP: heading, value proposition, supporting body copy, message-length variants (ultra-short / short / medium / long if present in the source).
## Disclaimers / Footnotes
Numbered list, exact wording, what claim each footnote anchors to.
## Approved Brand and Product Names
Exact spellings, including trademark glyphs (™, ®, ©).
## Variant Notes / Watch-outs
Anything explicitly marked variant-specific (e.g. "Mainstream only", "Core only", "must not appear in entry tier").
## Verboten Phrasing
Any explicitly disallowed or deprecated phrasing called out in the source.
Be exhaustive but concise. Quote exactly where the source is explicit. If a section has no content in this source, write 'None specified' under it — do not omit the section heading."""
def process_excel_file(file_path: str, file_id: str) -> Tuple[str, str]:
"""Extract + summarise an HP Source Messaging Excel.
Args:
file_path: Path to the .xlsx file on disk.
file_id: Stable identifier used for the output filename.
Returns:
Tuple of (summary_text, summary_path). Summary is written to
BRAND_GUIDELINES_DIR/{file_id}_summary.md.
Never raises. On Gemini failure, writes a degraded summary that
embeds the raw extraction so the reference asset stays usable.
"""
try:
raw_text = _extract_workbook_text(file_path)
except Exception as e:
print(f" Excel extraction failed for {file_id}: {type(e).__name__}: {e}")
summary = (
f"# {os.path.basename(file_path)} (degraded — extraction failed)\n\n"
f"openpyxl extraction failed: {type(e).__name__}: {e}\n"
)
raw_text = ''
else:
try:
summary = _summarise_with_gemini(raw_text, os.path.basename(file_path))
except Exception as e:
print(f" Gemini summarisation failed for {file_id}: {type(e).__name__}: {e}")
summary = (
f"# {os.path.basename(file_path)} (degraded — summary failed)\n\n"
f"Gemini summarisation failed: {type(e).__name__}: {e}\n\n"
f"## Raw extraction\n\n```\n{raw_text}\n```\n"
)
os.makedirs(BRAND_GUIDELINES_DIR, exist_ok=True)
summary_path = os.path.join(BRAND_GUIDELINES_DIR, f"{file_id}_summary.md")
with open(summary_path, 'w', encoding='utf-8') as f:
f.write(summary)
return summary, summary_path
def _extract_workbook_text(file_path: str) -> str:
"""Read every sheet, dump as 'Sheet: <name>\\n<tab-aligned rows>\\n\\n'.
Empty rows are skipped. Output is capped at _RAW_EXTRACTION_CAP chars;
when exceeded, a truncation marker is appended and the rest is dropped.
"""
wb = load_workbook(file_path, data_only=True, read_only=True)
try:
parts = []
total_chars = 0
for sheet in wb.worksheets:
header = f"Sheet: {sheet.title}\n"
parts.append(header)
total_chars += len(header)
for row in sheet.iter_rows(values_only=True):
if not any((c is not None and str(c).strip()) for c in row):
continue
line = '\t'.join(('' if c is None else str(c)) for c in row)
parts.append(line + '\n')
total_chars += len(line) + 1
if total_chars >= _RAW_EXTRACTION_CAP:
parts.append(
f"\n[truncated — exceeded {_RAW_EXTRACTION_CAP}-char cap]\n"
)
return ''.join(parts)
parts.append('\n')
total_chars += 1
return ''.join(parts)
finally:
wb.close()
def _summarise_with_gemini(raw_text: str, source_filename: str) -> str:
"""Send the extracted workbook text to Gemini 2.5 Pro for summarisation.
Mirrors pdf_processor.summarize_brand_guidelines: uses
google.generativeai directly with MODEL_VERSIONS.gemini_vision
(currently gemini-2.5-pro). Raises on any failure; the caller
converts failures into a degraded summary.
"""
import google.generativeai as genai
from llm_config import MODEL_VERSIONS
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
raise RuntimeError("GOOGLE_API_KEY not configured")
genai.configure(api_key=api_key)
model = genai.GenerativeModel(MODEL_VERSIONS.gemini_vision)
prompt = (
f"{_SYSTEM_PROMPT}\n\n"
f"Source filename: {source_filename}\n\n"
f"Raw cell content:\n\n```\n{raw_text}\n```"
)
response = model.generate_content(prompt)
# Mirror pdf_processor's safety-block handling: surface a useful error.
if not getattr(response, 'parts', None):
feedback = getattr(response, 'prompt_feedback', 'No specific feedback provided.')
raise RuntimeError(
f"Gemini response blocked or empty. Feedback: {feedback}"
)
return response.text