#!/usr/bin/env python3 """Excel reference-asset processor for HP Source Messaging files. Mirrors pdf_processor.py: openpyxl extracts raw cell content from every sheet, Gemini 2.5 Pro summarises the result into structured Markdown under brand_guidelines/files/{file_id}_summary.md. The hp_copy_review check pulls that Markdown into its prompt at QC time. Public surface: process_excel_file(file_path, file_id) -> (summary_text, summary_path) Behaviour mirrors pdf_processor.summarize_brand_guidelines: on Gemini failure we write a degraded summary containing the raw extraction so the reference asset stays usable downstream. The function does not raise — failures are logged and surfaced via the degraded payload. """ import os from typing import Tuple from openpyxl import load_workbook BRAND_GUIDELINES_DIR = os.path.join( os.path.dirname(os.path.abspath(__file__)), 'brand_guidelines', 'files' ) # Cap raw extraction at ~50K chars to keep the summary prompt bounded. # A 30-row, 12-column workbook is ~10-15K chars in practice; this leaves # headroom for HP's larger source files without blowing the prompt budget. _RAW_EXTRACTION_CAP = 50_000 _SYSTEM_PROMPT = """You're processing an HP Source Messaging Excel into a structured Markdown reference. Output these sections exactly, in this order: ## Product / Variant (brand, product line, variant if any — e.g. "HP OmniDesk Mini — Core") ## Key Selling Points (KSPs) For each KSP: heading, value proposition, supporting body copy, message-length variants (ultra-short / short / medium / long if present in the source). ## Disclaimers / Footnotes Numbered list, exact wording, what claim each footnote anchors to. ## Approved Brand and Product Names Exact spellings, including trademark glyphs (™, ®, ©). ## Variant Notes / Watch-outs Anything explicitly marked variant-specific (e.g. "Mainstream only", "Core only", "must not appear in entry tier"). ## Verboten Phrasing Any explicitly disallowed or deprecated phrasing called out in the source. Be exhaustive but concise. Quote exactly where the source is explicit. If a section has no content in this source, write 'None specified' under it — do not omit the section heading.""" def process_excel_file(file_path: str, file_id: str) -> Tuple[str, str]: """Extract + summarise an HP Source Messaging Excel. Args: file_path: Path to the .xlsx file on disk. file_id: Stable identifier used for the output filename. Returns: Tuple of (summary_text, summary_path). Summary is written to BRAND_GUIDELINES_DIR/{file_id}_summary.md. Never raises. On Gemini failure, writes a degraded summary that embeds the raw extraction so the reference asset stays usable. """ try: raw_text = _extract_workbook_text(file_path) except Exception as e: print(f" Excel extraction failed for {file_id}: {type(e).__name__}: {e}") summary = ( f"# {os.path.basename(file_path)} (degraded — extraction failed)\n\n" f"openpyxl extraction failed: {type(e).__name__}: {e}\n" ) raw_text = '' else: try: summary = _summarise_with_gemini(raw_text, os.path.basename(file_path)) except Exception as e: print(f" Gemini summarisation failed for {file_id}: {type(e).__name__}: {e}") summary = ( f"# {os.path.basename(file_path)} (degraded — summary failed)\n\n" f"Gemini summarisation failed: {type(e).__name__}: {e}\n\n" f"## Raw extraction\n\n```\n{raw_text}\n```\n" ) os.makedirs(BRAND_GUIDELINES_DIR, exist_ok=True) summary_path = os.path.join(BRAND_GUIDELINES_DIR, f"{file_id}_summary.md") with open(summary_path, 'w', encoding='utf-8') as f: f.write(summary) return summary, summary_path def _extract_workbook_text(file_path: str) -> str: """Read every sheet, dump as 'Sheet: \\n\\n\\n'. Empty rows are skipped. Output is capped at _RAW_EXTRACTION_CAP chars; when exceeded, a truncation marker is appended and the rest is dropped. """ wb = load_workbook(file_path, data_only=True, read_only=True) try: parts = [] total_chars = 0 for sheet in wb.worksheets: header = f"Sheet: {sheet.title}\n" parts.append(header) total_chars += len(header) for row in sheet.iter_rows(values_only=True): if not any((c is not None and str(c).strip()) for c in row): continue line = '\t'.join(('' if c is None else str(c)) for c in row) parts.append(line + '\n') total_chars += len(line) + 1 if total_chars >= _RAW_EXTRACTION_CAP: parts.append( f"\n[truncated — exceeded {_RAW_EXTRACTION_CAP}-char cap]\n" ) return ''.join(parts) parts.append('\n') total_chars += 1 return ''.join(parts) finally: wb.close() def _summarise_with_gemini(raw_text: str, source_filename: str) -> str: """Send the extracted workbook text to Gemini 2.5 Pro for summarisation. Mirrors pdf_processor.summarize_brand_guidelines: uses google.generativeai directly with MODEL_VERSIONS.gemini_vision (currently gemini-2.5-pro). Raises on any failure; the caller converts failures into a degraded summary. """ import google.generativeai as genai from llm_config import MODEL_VERSIONS api_key = os.getenv("GOOGLE_API_KEY") if not api_key: raise RuntimeError("GOOGLE_API_KEY not configured") genai.configure(api_key=api_key) model = genai.GenerativeModel(MODEL_VERSIONS.gemini_vision) prompt = ( f"{_SYSTEM_PROMPT}\n\n" f"Source filename: {source_filename}\n\n" f"Raw cell content:\n\n```\n{raw_text}\n```" ) response = model.generate_content(prompt) # Mirror pdf_processor's safety-block handling: surface a useful error. if not getattr(response, 'parts', None): feedback = getattr(response, 'prompt_feedback', 'No specific feedback provided.') raise RuntimeError( f"Gemini response blocked or empty. Feedback: {feedback}" ) return response.text