Code review found that _extract_workbook_text was unwrapped — a corrupt/locked .xlsx or InvalidFileException would leak out of process_excel_file despite the docstring promising 'Never raises'. Wrap the extraction call too; on extraction failure, write a degraded summary explaining the failure and return cleanly. Verified by passing a non-existent file: the function returns a degraded summary instead of raising FileNotFoundError. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
162 lines
6.2 KiB
Python
162 lines
6.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Excel reference-asset processor for HP Source Messaging files.
|
|
|
|
Mirrors pdf_processor.py: openpyxl extracts raw cell content from every
|
|
sheet, Gemini 2.5 Pro summarises the result into structured Markdown
|
|
under brand_guidelines/files/{file_id}_summary.md. The hp_copy_review
|
|
check pulls that Markdown into its prompt at QC time.
|
|
|
|
Public surface:
|
|
process_excel_file(file_path, file_id) -> (summary_text, summary_path)
|
|
|
|
Behaviour mirrors pdf_processor.summarize_brand_guidelines: on Gemini
|
|
failure we write a degraded summary containing the raw extraction so
|
|
the reference asset stays usable downstream. The function does not
|
|
raise — failures are logged and surfaced via the degraded payload.
|
|
"""
|
|
|
|
import os
|
|
from typing import Tuple
|
|
|
|
from openpyxl import load_workbook
|
|
|
|
|
|
BRAND_GUIDELINES_DIR = os.path.join(
|
|
os.path.dirname(os.path.abspath(__file__)), 'brand_guidelines', 'files'
|
|
)
|
|
|
|
# Cap raw extraction at ~50K chars to keep the summary prompt bounded.
|
|
# A 30-row, 12-column workbook is ~10-15K chars in practice; this leaves
|
|
# headroom for HP's larger source files without blowing the prompt budget.
|
|
_RAW_EXTRACTION_CAP = 50_000
|
|
|
|
|
|
_SYSTEM_PROMPT = """You're processing an HP Source Messaging Excel into a structured Markdown reference. Output these sections exactly, in this order:
|
|
|
|
## Product / Variant
|
|
(brand, product line, variant if any — e.g. "HP OmniDesk Mini — Core")
|
|
|
|
## Key Selling Points (KSPs)
|
|
For each KSP: heading, value proposition, supporting body copy, message-length variants (ultra-short / short / medium / long if present in the source).
|
|
|
|
## Disclaimers / Footnotes
|
|
Numbered list, exact wording, what claim each footnote anchors to.
|
|
|
|
## Approved Brand and Product Names
|
|
Exact spellings, including trademark glyphs (™, ®, ©).
|
|
|
|
## Variant Notes / Watch-outs
|
|
Anything explicitly marked variant-specific (e.g. "Mainstream only", "Core only", "must not appear in entry tier").
|
|
|
|
## Verboten Phrasing
|
|
Any explicitly disallowed or deprecated phrasing called out in the source.
|
|
|
|
Be exhaustive but concise. Quote exactly where the source is explicit. If a section has no content in this source, write 'None specified' under it — do not omit the section heading."""
|
|
|
|
|
|
def process_excel_file(file_path: str, file_id: str) -> Tuple[str, str]:
|
|
"""Extract + summarise an HP Source Messaging Excel.
|
|
|
|
Args:
|
|
file_path: Path to the .xlsx file on disk.
|
|
file_id: Stable identifier used for the output filename.
|
|
|
|
Returns:
|
|
Tuple of (summary_text, summary_path). Summary is written to
|
|
BRAND_GUIDELINES_DIR/{file_id}_summary.md.
|
|
|
|
Never raises. On Gemini failure, writes a degraded summary that
|
|
embeds the raw extraction so the reference asset stays usable.
|
|
"""
|
|
try:
|
|
raw_text = _extract_workbook_text(file_path)
|
|
except Exception as e:
|
|
print(f" Excel extraction failed for {file_id}: {type(e).__name__}: {e}")
|
|
summary = (
|
|
f"# {os.path.basename(file_path)} (degraded — extraction failed)\n\n"
|
|
f"openpyxl extraction failed: {type(e).__name__}: {e}\n"
|
|
)
|
|
raw_text = ''
|
|
else:
|
|
try:
|
|
summary = _summarise_with_gemini(raw_text, os.path.basename(file_path))
|
|
except Exception as e:
|
|
print(f" Gemini summarisation failed for {file_id}: {type(e).__name__}: {e}")
|
|
summary = (
|
|
f"# {os.path.basename(file_path)} (degraded — summary failed)\n\n"
|
|
f"Gemini summarisation failed: {type(e).__name__}: {e}\n\n"
|
|
f"## Raw extraction\n\n```\n{raw_text}\n```\n"
|
|
)
|
|
|
|
os.makedirs(BRAND_GUIDELINES_DIR, exist_ok=True)
|
|
summary_path = os.path.join(BRAND_GUIDELINES_DIR, f"{file_id}_summary.md")
|
|
with open(summary_path, 'w', encoding='utf-8') as f:
|
|
f.write(summary)
|
|
return summary, summary_path
|
|
|
|
|
|
def _extract_workbook_text(file_path: str) -> str:
|
|
"""Read every sheet, dump as 'Sheet: <name>\\n<tab-aligned rows>\\n\\n'.
|
|
|
|
Empty rows are skipped. Output is capped at _RAW_EXTRACTION_CAP chars;
|
|
when exceeded, a truncation marker is appended and the rest is dropped.
|
|
"""
|
|
wb = load_workbook(file_path, data_only=True, read_only=True)
|
|
try:
|
|
parts = []
|
|
total_chars = 0
|
|
for sheet in wb.worksheets:
|
|
header = f"Sheet: {sheet.title}\n"
|
|
parts.append(header)
|
|
total_chars += len(header)
|
|
for row in sheet.iter_rows(values_only=True):
|
|
if not any((c is not None and str(c).strip()) for c in row):
|
|
continue
|
|
line = '\t'.join(('' if c is None else str(c)) for c in row)
|
|
parts.append(line + '\n')
|
|
total_chars += len(line) + 1
|
|
if total_chars >= _RAW_EXTRACTION_CAP:
|
|
parts.append(
|
|
f"\n[truncated — exceeded {_RAW_EXTRACTION_CAP}-char cap]\n"
|
|
)
|
|
return ''.join(parts)
|
|
parts.append('\n')
|
|
total_chars += 1
|
|
return ''.join(parts)
|
|
finally:
|
|
wb.close()
|
|
|
|
|
|
def _summarise_with_gemini(raw_text: str, source_filename: str) -> str:
|
|
"""Send the extracted workbook text to Gemini 2.5 Pro for summarisation.
|
|
|
|
Mirrors pdf_processor.summarize_brand_guidelines: uses
|
|
google.generativeai directly with MODEL_VERSIONS.gemini_vision
|
|
(currently gemini-2.5-pro). Raises on any failure; the caller
|
|
converts failures into a degraded summary.
|
|
"""
|
|
import google.generativeai as genai
|
|
from llm_config import MODEL_VERSIONS
|
|
|
|
api_key = os.getenv("GOOGLE_API_KEY")
|
|
if not api_key:
|
|
raise RuntimeError("GOOGLE_API_KEY not configured")
|
|
|
|
genai.configure(api_key=api_key)
|
|
model = genai.GenerativeModel(MODEL_VERSIONS.gemini_vision)
|
|
|
|
prompt = (
|
|
f"{_SYSTEM_PROMPT}\n\n"
|
|
f"Source filename: {source_filename}\n\n"
|
|
f"Raw cell content:\n\n```\n{raw_text}\n```"
|
|
)
|
|
response = model.generate_content(prompt)
|
|
|
|
# Mirror pdf_processor's safety-block handling: surface a useful error.
|
|
if not getattr(response, 'parts', None):
|
|
feedback = getattr(response, 'prompt_feedback', 'No specific feedback provided.')
|
|
raise RuntimeError(
|
|
f"Gemini response blocked or empty. Feedback: {feedback}"
|
|
)
|
|
|
|
return response.text
|