ai_qc/backend/pdf_processor.py
nickviljoen 3f4cc149ad Process multi-page PDF reference assets with LLM summarization
PDF brand guidelines were previously ignored - QC checks received no
content from uploaded PDFs. Now on upload, all pages are text-extracted,
summarized by Gemini into a structured brand guidelines summary, and
a cover image is extracted. QC checks receive the full summary in their
prompt and the cover image as visual reference.

- New backend/pdf_processor.py: text extraction, cover image, LLM summary
- brand_guidelines_db.py: summary/cover path tracking, cleanup on delete
- api_server.py: background processing on upload, summary-aware content
  retrieval, PDF cover image support, status/reprocess endpoints, startup
  backfill for existing unprocessed PDFs
- web_ui.html: processing status badges and upload feedback for PDFs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 22:02:47 +02:00

231 lines
7.9 KiB
Python

#!/usr/bin/env python3
"""
PDF Reference Asset Processor
Extracts text from all pages of PDF brand guidelines, generates a structured
summary using Gemini, and stores processed artifacts for use in QC checks.
"""
import os
import fitz # PyMuPDF
from datetime import datetime
from PIL import Image
from typing import Optional, Dict
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from ALL pages of a PDF using PyMuPDF.

    Pages whose text is empty after stripping whitespace are skipped, so
    they contribute neither text nor a page delimiter.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Text of every non-empty page, each prefixed with a
        "--- Page N ---" delimiter (N is 1-based) and joined by blank lines.
        Returns an empty string if the PDF cannot be opened or read; the
        error is printed, not raised.
    """
    try:
        # The context manager closes the document even when a page fails to
        # load or decode part-way through the loop.
        with fitz.open(pdf_path) as doc:
            all_text = []
            for page_num in range(doc.page_count):
                page_text = doc.load_page(page_num).get_text().strip()
                if page_text:
                    all_text.append(f"--- Page {page_num + 1} ---\n{page_text}")
        return "\n\n".join(all_text)
    except Exception as e:
        # Best-effort by design: any failure is reported and mapped to "".
        print(f"Error extracting text from PDF {pdf_path}: {e}")
        return ""
def extract_cover_image(pdf_path: str, output_path: str) -> Optional[str]:
    """
    Extract page 1 of a PDF as a PNG image.

    The page is rendered at 2x zoom, then resized to fit within 1024x1024
    while keeping its aspect ratio, and written to ``output_path``.

    Args:
        pdf_path: Path to the PDF file
        output_path: Path to save the cover image

    Returns:
        Path to the saved image, or None if the PDF has no pages or any
        step fails (the error is printed, not raised)
    """
    try:
        # The context manager closes the document on every exit path,
        # including the early return and rendering failures.
        with fitz.open(pdf_path) as doc:
            if doc.page_count == 0:
                return None
            page = doc.load_page(0)
            zoom = 2.0  # 2x PyMuPDF's 72 DPI base resolution = 144 DPI
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            # Build the PIL image while the document is still open.
            pil_image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

        # Resize to max 1024x1024 maintaining aspect ratio
        pil_image.thumbnail((1024, 1024), Image.LANCZOS)
        pil_image.save(output_path, "PNG")
        return output_path
    except Exception as e:
        # Best-effort by design: any failure is reported and mapped to None.
        print(f"Error extracting cover image from PDF {pdf_path}: {e}")
        return None
def summarize_brand_guidelines(raw_text: str, brand_name: str, page_count: int = 0) -> str:
    """
    Ask Gemini for a structured, QC-oriented summary of brand guideline text.

    Args:
        raw_text: Text extracted from the PDF; only the first 800,000
            characters are sent, followed by a truncation notice
        brand_name: Brand name quoted in the prompt
        page_count: Page count quoted in the prompt

    Returns:
        The text of the model's response

    Raises:
        RuntimeError: If GOOGLE_API_KEY is not set in the environment
        Exception: Any other failure (import, API call, response access)
            is printed and then re-raised unchanged
    """
    try:
        import google.generativeai as genai
        from llm_config import MODEL_VERSIONS

        gemini_key = os.getenv("GOOGLE_API_KEY")
        if not gemini_key:
            raise RuntimeError("GOOGLE_API_KEY not configured")

        genai.configure(api_key=gemini_key)
        summarizer = genai.GenerativeModel(MODEL_VERSIONS.gemini_vision)

        # Cap the text embedded in the prompt. The original author's estimate
        # is that 800K characters is ~200K tokens, well inside Gemini's 1M
        # context window.
        char_limit = 800000
        guideline_text = raw_text
        if len(guideline_text) > char_limit:
            guideline_text = (
                guideline_text[:char_limit]
                + "\n\n... [remaining content truncated due to length]"
            )

        prompt = f"""You are a brand guidelines analyst. Below is the full text extracted from a {page_count}-page brand guidelines PDF for "{brand_name}".
Create a structured summary that captures ALL quality-control-relevant information. This summary will be used by an AI system performing visual QC checks on marketing materials, so focus on concrete, measurable specifications.
Structure your summary as follows:
## Brand Identity
- Primary and secondary colors (with hex codes, RGB, CMYK values if available)
- Logo specifications (minimum sizes, exclusion zones, acceptable variations)
- Visual identity elements
## Typography
- Primary and secondary fonts (exact names)
- Font sizes, weights, leading, tracking specifications
- Hierarchy rules (headline, subhead, body text, legal, etc.)
## Layout & Composition
- Grid systems, margins, safe areas
- Element positioning rules
- Aspect ratios and format specifications
## Imagery & Photography
- Photography style guidelines
- Image treatment rules (filters, overlays, etc.)
- Illustration style if applicable
## Do's and Don'ts
- Explicit rules about what is/isn't allowed
- Common mistakes to avoid
## QC-Critical Specifications
- Minimum sizes for any elements
- Required elements that must always be present
- Spacing and clearance rules
- Color usage restrictions
Keep the summary factual and specific. Include exact values (hex codes, pixel sizes, percentages) wherever they appear in the source material. Target length: 2000-4000 words.
If the PDF text is sparse (image-heavy PDF), note that explicitly and summarize whatever text is available.
--- BEGIN EXTRACTED PDF TEXT ---
{guideline_text}
--- END EXTRACTED PDF TEXT ---"""

        reply = summarizer.generate_content(prompt)
        return reply.text
    except Exception as e:
        # Report here, then let the caller decide how to recover.
        print(f"Error summarizing brand guidelines with LLM: {e}")
        raise
def _write_summary(summary_path: str, text: str, result: Dict) -> None:
    """Write ``text`` to ``summary_path`` and record its path and length in ``result``."""
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write(text)
    result['summary_path'] = summary_path
    result['summary_length'] = len(text)


def process_pdf_guideline(pdf_path: str, file_id: str, brand_name: str, files_dir: str) -> Dict:
    """
    Full PDF processing pipeline: extract text, extract cover, summarize.

    Args:
        pdf_path: Path to the stored PDF file
        file_id: Unique file identifier; used as the prefix of the
            generated "<file_id>_cover.png" and "<file_id>_summary.txt"
        brand_name: Brand name for context
        files_dir: Directory to store processed files (created if missing)

    Returns:
        Dict with processing results and file paths:
            processed: True, or the string 'partial' when LLM summarization
                failed and truncated raw text was saved instead
            processed_at: ISO timestamp taken when processing started
            page_count: Number of pages (0 if it could not be determined)
            extracted_text_length: Character count of the extracted text
            cover_image_path: Present only if the cover image was saved
            summary_path / summary_length: Summary file and its length
            processing_error: Present only when summarization failed

    Raises:
        OSError: If the output directory or summary file cannot be written
    """
    print(f"Processing PDF guideline: {file_id} ({brand_name})")
    result = {
        'processed': True,
        'processed_at': datetime.now().isoformat(),
    }

    # Without the output directory the cover image would be silently lost and
    # the summary write below would fail. An empty files_dir means the
    # current directory, which os.makedirs would reject.
    if files_dir:
        os.makedirs(files_dir, exist_ok=True)

    # Step 1: Extract text from all pages
    try:
        # Context manager so the probe never leaks the document handle.
        with fitz.open(pdf_path) as doc:
            page_count = doc.page_count
        result['page_count'] = page_count
        print(f" PDF has {page_count} pages")
    except Exception as e:
        result['page_count'] = 0
        print(f" Could not determine page count: {e}")

    raw_text = extract_text_from_pdf(pdf_path)
    result['extracted_text_length'] = len(raw_text)
    print(f" Extracted {len(raw_text)} characters of text")

    # Step 2: Extract cover image
    cover_path = os.path.join(files_dir, f"{file_id}_cover.png")
    if extract_cover_image(pdf_path, cover_path):
        result['cover_image_path'] = cover_path
        print(f" Cover image saved to: {cover_path}")
    else:
        print(" Could not extract cover image")

    # Step 3: Summarize with LLM
    summary_path = os.path.join(files_dir, f"{file_id}_summary.txt")
    if len(raw_text) > 100:
        try:
            summary = summarize_brand_guidelines(raw_text, brand_name, result.get('page_count', 0))
            _write_summary(summary_path, summary, result)
            print(f" Summary saved ({len(summary)} chars) to: {summary_path}")
        except Exception as e:
            print(f" LLM summarization failed: {e}. Saving raw text as fallback.")
            # Fallback: save truncated raw text so QC checks still get content
            fallback_text = f"[LLM summarization failed - raw extracted text below]\n\n{raw_text[:8000]}"
            _write_summary(summary_path, fallback_text, result)
            result['processed'] = 'partial'
            result['processing_error'] = str(e)
    else:
        # Image-heavy PDF with little text (100 characters or fewer)
        fallback_text = f"[This PDF for {brand_name} contains mostly images with limited extractable text ({len(raw_text)} characters). Visual reference via cover image is recommended.]"
        _write_summary(summary_path, fallback_text, result)
        print(" Image-heavy PDF, saved minimal summary")

    print(f" Processing complete for {file_id}")
    return result