#!/usr/bin/env python3
"""
PDF Reference Asset Processor

Extracts text from all pages of PDF brand guidelines, generates a structured
summary using Gemini, and stores processed artifacts for use in QC checks.
"""

import os
import fitz  # PyMuPDF
from datetime import datetime
from PIL import Image
from typing import Optional, Dict

# Upper bound on the number of characters of extracted text sent to the LLM.
# ~200K tokens, well within Gemini's 1M context.
_MAX_SUMMARY_INPUT_CHARS = 800000

# Raw text at or below this length is treated as "image-heavy PDF" and is not
# sent to the LLM at all.
_MIN_TEXT_CHARS_FOR_SUMMARY = 100

# Number of raw-text characters kept when the LLM call fails and the raw text
# is stored as a fallback summary.
_FALLBACK_RAW_TEXT_CHARS = 8000


def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from ALL pages of a PDF using PyMuPDF.

    Pages whose text is empty after stripping whitespace are skipped, so the
    page delimiters in the output may not be contiguous.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Concatenated text from all pages with page delimiters, or an empty
        string if the PDF could not be read (the error is printed, not raised).
    """
    try:
        # The context manager guarantees the document is closed even when a
        # page fails to load or decode part-way through.
        with fitz.open(pdf_path) as doc:
            all_text = []
            for page_num, page in enumerate(doc, start=1):
                page_text = page.get_text().strip()
                if page_text:
                    all_text.append(f"--- Page {page_num} ---\n{page_text}")
        return "\n\n".join(all_text)
    except Exception as e:
        # Best-effort by design: callers treat "" as "no extractable text".
        print(f"Error extracting text from PDF {pdf_path}: {e}")
        return ""


def extract_cover_image(pdf_path: str, output_path: str) -> Optional[str]:
    """
    Extract page 1 of a PDF as a PNG image.

    The page is rendered at 2x zoom and then shrunk (never enlarged) to fit
    within 1024x1024 while keeping its aspect ratio.

    Args:
        pdf_path: Path to the PDF file
        output_path: Path to save the cover image

    Returns:
        Path to the saved image, or None on failure (including a PDF with no
        pages). Errors are printed, not raised.
    """
    try:
        # The context manager closes the document on every exit path,
        # including the early return and any rendering exception.
        with fitz.open(pdf_path) as doc:
            if doc.page_count == 0:
                return None

            page = doc.load_page(0)
            zoom = 2.0  # 2x the 72 DPI PDF base resolution, i.e. 144 DPI
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            # frombytes copies the pixel data, so the image stays valid after
            # the document is closed.
            pil_image = Image.frombytes(
                "RGB", (pix.width, pix.height), pix.samples
            )

        # Resize to max 1024x1024 maintaining aspect ratio
        pil_image.thumbnail((1024, 1024), Image.LANCZOS)
        pil_image.save(output_path, "PNG")
        return output_path
    except Exception as e:
        print(f"Error extracting cover image from PDF {pdf_path}: {e}")
        return None


def summarize_brand_guidelines(raw_text: str, brand_name: str, page_count: int = 0) -> str:
    """
    Use Gemini to create a structured brand guidelines summary.

    Args:
        raw_text: Extracted text from the PDF
        brand_name: Name of the brand
        page_count: Number of pages in the PDF

    Returns:
        Structured summary text

    Raises:
        RuntimeError: If GOOGLE_API_KEY is not set in the environment.
        Exception: Any error raised while importing the client, calling the
            model, or reading the response is printed and then re-raised.
    """
    try:
        # Imported lazily so the rest of the module works without the Gemini
        # client or the project's llm_config being importable.
        import google.generativeai as genai
        from llm_config import MODEL_VERSIONS

        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise RuntimeError("GOOGLE_API_KEY not configured")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(MODEL_VERSIONS.gemini_vision)

        # Truncate very large texts to stay within token limits
        if len(raw_text) > _MAX_SUMMARY_INPUT_CHARS:
            raw_text = (
                raw_text[:_MAX_SUMMARY_INPUT_CHARS]
                + "\n\n... [remaining content truncated due to length]"
            )

        prompt = f"""You are a brand guidelines analyst. Below is the full text extracted from a {page_count}-page brand guidelines PDF for "{brand_name}".

Create a structured summary that captures ALL quality-control-relevant information. This summary will be used by an AI system performing visual QC checks on marketing materials, so focus on concrete, measurable specifications.

Structure your summary as follows:

## Brand Identity
- Primary and secondary colors (with hex codes, RGB, CMYK values if available)
- Logo specifications (minimum sizes, exclusion zones, acceptable variations)
- Visual identity elements

## Typography
- Primary and secondary fonts (exact names)
- Font sizes, weights, leading, tracking specifications
- Hierarchy rules (headline, subhead, body text, legal, etc.)

## Layout & Composition
- Grid systems, margins, safe areas
- Element positioning rules
- Aspect ratios and format specifications

## Imagery & Photography
- Photography style guidelines
- Image treatment rules (filters, overlays, etc.)
- Illustration style if applicable

## Do's and Don'ts
- Explicit rules about what is/isn't allowed
- Common mistakes to avoid

## QC-Critical Specifications
- Minimum sizes for any elements
- Required elements that must always be present
- Spacing and clearance rules
- Color usage restrictions

Keep the summary factual and specific. Include exact values (hex codes, pixel sizes, percentages) wherever they appear in the source material. Target length: 2000-4000 words.

If the PDF text is sparse (image-heavy PDF), note that explicitly and summarize whatever text is available.

--- BEGIN EXTRACTED PDF TEXT ---
{raw_text}
--- END EXTRACTED PDF TEXT ---"""

        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        # Log for operators, then let the caller decide on a fallback.
        print(f"Error summarizing brand guidelines with LLM: {e}")
        raise


def _write_summary_file(summary_path: str, text: str) -> None:
    """Write *text* to *summary_path* as UTF-8, replacing any existing file."""
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write(text)


def process_pdf_guideline(pdf_path: str, file_id: str, brand_name: str, files_dir: str) -> Dict:
    """
    Full PDF processing pipeline: extract text, extract cover, summarize.

    Args:
        pdf_path: Path to the stored PDF file
        file_id: Unique file identifier
        brand_name: Brand name for context
        files_dir: Directory to store processed files (created if missing)

    Returns:
        Dict with processing results and file paths. Keys always present:
        'processed' (True, or the string 'partial' when the LLM summary
        failed and raw text was stored instead), 'processed_at',
        'page_count', 'extracted_text_length', 'summary_path',
        'summary_length'. 'cover_image_path' is present only when the cover
        was extracted; 'processing_error' only when 'processed' is 'partial'.

    Raises:
        OSError: If the output directory cannot be created or the summary
            file cannot be written.
    """
    print(f"Processing PDF guideline: {file_id} ({brand_name})")

    result = {
        'processed': True,
        'processed_at': datetime.now().isoformat(),
    }

    # Make sure the artifacts have somewhere to go; without this the cover
    # extraction fails silently and the summary write raises. An empty
    # files_dir means "current directory", which needs no creation.
    if files_dir:
        os.makedirs(files_dir, exist_ok=True)

    # Step 1: Extract text from all pages
    try:
        with fitz.open(pdf_path) as doc:
            page_count = doc.page_count
        result['page_count'] = page_count
        print(f" PDF has {page_count} pages")
    except Exception as e:
        result['page_count'] = 0
        print(f" Could not determine page count: {e}")

    raw_text = extract_text_from_pdf(pdf_path)
    result['extracted_text_length'] = len(raw_text)
    print(f" Extracted {len(raw_text)} characters of text")

    # Step 2: Extract cover image
    cover_path = os.path.join(files_dir, f"{file_id}_cover.png")
    cover_result = extract_cover_image(pdf_path, cover_path)
    if cover_result:
        result['cover_image_path'] = cover_path
        print(f" Cover image saved to: {cover_path}")
    else:
        print(" Could not extract cover image")

    # Step 3: Summarize with LLM
    summary_path = os.path.join(files_dir, f"{file_id}_summary.txt")

    if len(raw_text) > _MIN_TEXT_CHARS_FOR_SUMMARY:
        # Only the LLM call is guarded: a disk error while writing must not
        # be misreported as an LLM failure and retried on the same path.
        try:
            summary_text = summarize_brand_guidelines(
                raw_text, brand_name, result.get('page_count', 0)
            )
            llm_error = None
        except Exception as e:
            print(f" LLM summarization failed: {e}. Saving raw text as fallback.")
            # Fallback: save truncated raw text
            summary_text = (
                "[LLM summarization failed - raw extracted text below]\n\n"
                f"{raw_text[:_FALLBACK_RAW_TEXT_CHARS]}"
            )
            llm_error = e

        _write_summary_file(summary_path, summary_text)
        result['summary_path'] = summary_path
        result['summary_length'] = len(summary_text)

        if llm_error is None:
            print(f" Summary saved ({len(summary_text)} chars) to: {summary_path}")
        else:
            result['processed'] = 'partial'
            result['processing_error'] = str(llm_error)
    else:
        # Image-heavy PDF with little text
        fallback_text = f"[This PDF for {brand_name} contains mostly images with limited extractable text ({len(raw_text)} characters). Visual reference via cover image is recommended.]"
        _write_summary_file(summary_path, fallback_text)
        result['summary_path'] = summary_path
        result['summary_length'] = len(fallback_text)
        print(" Image-heavy PDF, saved minimal summary")

    print(f" Processing complete for {file_id}")
    return result