PDF brand guidelines were previously ignored - QC checks received no content from uploaded PDFs. Now on upload, all pages are text-extracted, summarized by Gemini into a structured brand guidelines summary, and a cover image is extracted. QC checks receive the full summary in their prompt and the cover image as visual reference. - New backend/pdf_processor.py: text extraction, cover image, LLM summary - brand_guidelines_db.py: summary/cover path tracking, cleanup on delete - api_server.py: background processing on upload, summary-aware content retrieval, PDF cover image support, status/reprocess endpoints, startup backfill for existing unprocessed PDFs - web_ui.html: processing status badges and upload feedback for PDFs Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
231 lines
7.9 KiB
Python
231 lines
7.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
PDF Reference Asset Processor
|
|
Extracts text from all pages of PDF brand guidelines, generates a structured
|
|
summary using Gemini, and stores processed artifacts for use in QC checks.
|
|
"""
|
|
|
|
import os
|
|
import fitz # PyMuPDF
|
|
from datetime import datetime
|
|
from PIL import Image
|
|
from typing import Optional, Dict
|
|
|
|
|
|
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from ALL pages of a PDF using PyMuPDF.

    Pages whose stripped text is empty are skipped. Every remaining page is
    prefixed with a ``--- Page N ---`` delimiter (N is 1-based) and the pages
    are joined with a blank line.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Concatenated text from all pages with page delimiters, or an empty
        string if the PDF could not be opened or read (the error is printed,
        not raised).
    """
    try:
        # The context manager closes the document even when a page fails to
        # load or extract part-way through the loop.
        with fitz.open(pdf_path) as doc:
            all_text = []
            for page_num in range(doc.page_count):
                page_text = doc.load_page(page_num).get_text().strip()
                if page_text:
                    all_text.append(f"--- Page {page_num + 1} ---\n{page_text}")

        return "\n\n".join(all_text)

    except Exception as e:
        # Deliberate best-effort: callers treat "" as "no extractable text".
        print(f"Error extracting text from PDF {pdf_path}: {e}")
        return ""
|
|
|
|
|
|
def extract_cover_image(pdf_path: str, output_path: str) -> Optional[str]:
    """
    Extract page 1 of a PDF as a PNG image.

    The first page is rendered at 2x zoom, shrunk to fit within 1024x1024
    pixels (aspect ratio preserved) and written to ``output_path``.

    Args:
        pdf_path: Path to the PDF file
        output_path: Path to save the cover image

    Returns:
        Path to the saved image, or None on failure (including a PDF that
        has no pages). Errors are printed, not raised.
    """
    try:
        # The context manager guarantees the document is closed even when
        # rendering or pixel conversion raises.
        with fitz.open(pdf_path) as doc:
            if doc.page_count == 0:
                return None

            page = doc.load_page(0)
            zoom = 2.0  # PyMuPDF renders at 72 DPI by default, so 2x is 144 DPI
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            # Convert to a PIL image before leaving the `with` block.
            pil_image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

        # Resize to max 1024x1024 maintaining aspect ratio
        pil_image.thumbnail((1024, 1024), Image.LANCZOS)
        pil_image.save(output_path, "PNG")

        return output_path

    except Exception as e:
        # Deliberate best-effort: a missing cover must not abort processing.
        print(f"Error extracting cover image from PDF {pdf_path}: {e}")
        return None
|
|
|
|
|
|
def _build_summary_prompt(raw_text: str, brand_name: str, page_count: int) -> str:
    """Return the Gemini prompt that wraps the (already truncated) PDF text."""
    return f"""You are a brand guidelines analyst. Below is the full text extracted from a {page_count}-page brand guidelines PDF for "{brand_name}".

Create a structured summary that captures ALL quality-control-relevant information. This summary will be used by an AI system performing visual QC checks on marketing materials, so focus on concrete, measurable specifications.

Structure your summary as follows:

## Brand Identity
- Primary and secondary colors (with hex codes, RGB, CMYK values if available)
- Logo specifications (minimum sizes, exclusion zones, acceptable variations)
- Visual identity elements

## Typography
- Primary and secondary fonts (exact names)
- Font sizes, weights, leading, tracking specifications
- Hierarchy rules (headline, subhead, body text, legal, etc.)

## Layout & Composition
- Grid systems, margins, safe areas
- Element positioning rules
- Aspect ratios and format specifications

## Imagery & Photography
- Photography style guidelines
- Image treatment rules (filters, overlays, etc.)
- Illustration style if applicable

## Do's and Don'ts
- Explicit rules about what is/isn't allowed
- Common mistakes to avoid

## QC-Critical Specifications
- Minimum sizes for any elements
- Required elements that must always be present
- Spacing and clearance rules
- Color usage restrictions

Keep the summary factual and specific. Include exact values (hex codes, pixel sizes, percentages) wherever they appear in the source material. Target length: 2000-4000 words.
If the PDF text is sparse (image-heavy PDF), note that explicitly and summarize whatever text is available.

--- BEGIN EXTRACTED PDF TEXT ---
{raw_text}
--- END EXTRACTED PDF TEXT ---"""


def summarize_brand_guidelines(raw_text: str, brand_name: str, page_count: int = 0) -> str:
    """
    Use Gemini to create a structured brand guidelines summary.

    Text longer than 800,000 characters is cut at that length and a
    truncation notice is appended before it is sent to the model.

    Args:
        raw_text: Extracted text from the PDF
        brand_name: Name of the brand
        page_count: Number of pages in the PDF

    Returns:
        Structured summary text (never empty)

    Raises:
        RuntimeError: If GOOGLE_API_KEY is not set, or the model returns an
            empty summary.
        Exception: Any other error (import failure, client/API error) is
            printed and re-raised unchanged so the caller can fall back.
    """
    try:
        # Imported lazily so the module can be loaded without the Gemini
        # client or the project LLM config being importable.
        import google.generativeai as genai
        from llm_config import MODEL_VERSIONS

        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise RuntimeError("GOOGLE_API_KEY not configured")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(MODEL_VERSIONS.gemini_vision)

        # Truncate very large texts to stay within token limits
        max_chars = 800000  # ~200K tokens, well within Gemini's 1M context
        if len(raw_text) > max_chars:
            raw_text = raw_text[:max_chars] + "\n\n... [remaining content truncated due to length]"

        prompt = _build_summary_prompt(raw_text, brand_name, page_count)
        response = model.generate_content(prompt)
        summary = response.text

        # An empty reply would otherwise be stored as a valid summary and give
        # QC checks nothing to work with; raising lets the caller fall back.
        if not summary or not summary.strip():
            raise RuntimeError("Gemini returned an empty summary")
        return summary

    except Exception as e:
        print(f"Error summarizing brand guidelines with LLM: {e}")
        raise
|
|
|
|
|
|
def _write_summary_file(summary_path: str, text: str) -> int:
    """Write ``text`` to ``summary_path`` as UTF-8 and return its character count."""
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write(text)
    return len(text)


def process_pdf_guideline(pdf_path: str, file_id: str, brand_name: str, files_dir: str) -> Dict:
    """
    Full PDF processing pipeline: extract text, extract cover, summarize.

    Artifacts are written into ``files_dir`` as ``<file_id>_cover.png`` and
    ``<file_id>_summary.txt``. A summary file is always written: the LLM
    summary when it succeeds, the first 8000 characters of raw text when the
    LLM fails, or a short notice when the PDF has 100 or fewer characters of
    extractable text.

    Args:
        pdf_path: Path to the stored PDF file
        file_id: Unique file identifier
        brand_name: Brand name for context
        files_dir: Directory to store processed files

    Returns:
        Dict with processing results and file paths. Keys: ``processed``
        (True, or the string 'partial' when the LLM fallback was used),
        ``processed_at``, ``page_count``, ``extracted_text_length``,
        ``summary_path``, ``summary_length``, plus ``cover_image_path`` when
        a cover was extracted and ``processing_error`` on LLM failure.
    """
    print(f"Processing PDF guideline: {file_id} ({brand_name})")
    result = {
        'processed': True,
        'processed_at': datetime.now().isoformat(),
    }

    # Make sure the artifact directory exists; otherwise the cover extraction
    # fails silently and the summary write raises.
    if files_dir:
        os.makedirs(files_dir, exist_ok=True)

    # Step 1: Extract text from all pages
    try:
        # Context manager closes the document even if reading the count fails.
        with fitz.open(pdf_path) as doc:
            page_count = doc.page_count
        result['page_count'] = page_count
        print(f" PDF has {page_count} pages")
    except Exception as e:
        result['page_count'] = 0
        print(f" Could not determine page count: {e}")

    raw_text = extract_text_from_pdf(pdf_path)
    result['extracted_text_length'] = len(raw_text)
    print(f" Extracted {len(raw_text)} characters of text")

    # Step 2: Extract cover image
    cover_path = os.path.join(files_dir, f"{file_id}_cover.png")
    cover_result = extract_cover_image(pdf_path, cover_path)
    if cover_result:
        result['cover_image_path'] = cover_path
        print(f" Cover image saved to: {cover_path}")
    else:
        print(" Could not extract cover image")

    # Step 3: Summarize with LLM
    summary_path = os.path.join(files_dir, f"{file_id}_summary.txt")
    result['summary_path'] = summary_path

    if len(raw_text) > 100:
        try:
            summary = summarize_brand_guidelines(raw_text, brand_name, result.get('page_count', 0))
            result['summary_length'] = _write_summary_file(summary_path, summary)
            print(f" Summary saved ({len(summary)} chars) to: {summary_path}")
        except Exception as e:
            print(f" LLM summarization failed: {e}. Saving raw text as fallback.")
            # Fallback: save truncated raw text
            fallback_text = f"[LLM summarization failed - raw extracted text below]\n\n{raw_text[:8000]}"
            result['summary_length'] = _write_summary_file(summary_path, fallback_text)
            result['processed'] = 'partial'
            result['processing_error'] = str(e)
    else:
        # Image-heavy PDF with little text
        fallback_text = f"[This PDF for {brand_name} contains mostly images with limited extractable text ({len(raw_text)} characters). Visual reference via cover image is recommended.]"
        result['summary_length'] = _write_summary_file(summary_path, fallback_text)
        print(" Image-heavy PDF, saved minimal summary")

    print(f" Processing complete for {file_id}")
    return result
|