# ai_qc/backend/pdf_processor.py

#!/usr/bin/env python3
"""
PDF Reference Asset Processor
Extracts text from all pages of PDF brand guidelines, generates a structured
summary using Gemini, and stores processed artifacts for use in QC checks.
"""

import os
import fitz  # PyMuPDF
from datetime import datetime
from PIL import Image
from typing import Optional, Dict


def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from ALL pages of a PDF using PyMuPDF.

    Pages whose text is empty after stripping whitespace are skipped. Every
    retained page is prefixed with a ``--- Page N ---`` header (1-based page
    number) and pages are separated by a blank line.

    Extraction is best-effort: any error is printed and an empty string is
    returned, so an empty result can mean either "no extractable text" or
    "the PDF could not be read".

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Concatenated text from all pages with page delimiters, or "" if the
        PDF has no extractable text or could not be processed
    """
    try:
        # The context manager guarantees the document is closed even when a
        # page fails to load or extract part-way through the loop.
        with fitz.open(pdf_path) as doc:
            all_text = []
            for page_num, page in enumerate(doc, start=1):
                page_text = page.get_text().strip()
                if page_text:
                    all_text.append(f"--- Page {page_num} ---\n{page_text}")

        return "\n\n".join(all_text)

    except Exception as e:
        # Deliberate catch-all: callers treat "" as "nothing usable".
        print(f"Error extracting text from PDF {pdf_path}: {e}")
        return ""


def extract_cover_image(pdf_path: str, output_path: str) -> Optional[str]:
    """
    Extract page 1 of a PDF as a PNG image.

    The first page is rendered at 2x zoom, converted to an RGB PIL image,
    shrunk to fit within 1024x1024 (aspect ratio preserved) and written to
    ``output_path`` as PNG.

    Extraction is best-effort: any error is printed and None is returned.

    Args:
        pdf_path: Path to the PDF file
        output_path: Path to save the cover image

    Returns:
        Path to the saved image, or None on failure or if the PDF has no pages
    """
    try:
        # The context manager closes the document on every exit path,
        # including the early return and any rendering error.
        with fitz.open(pdf_path) as doc:
            if doc.page_count == 0:
                return None

            page = doc.load_page(0)
            # PDF user space is 72 DPI, so a 2x zoom renders at 144 DPI.
            zoom = 2.0
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            pil_image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

        # Resize to max 1024x1024 maintaining aspect ratio
        pil_image.thumbnail((1024, 1024), Image.LANCZOS)
        pil_image.save(output_path, "PNG")

        return output_path

    except Exception as e:
        # Deliberate catch-all: a missing cover image is non-fatal for callers.
        print(f"Error extracting cover image from PDF {pdf_path}: {e}")
        return None


def summarize_brand_guidelines(raw_text: str, brand_name: str, page_count: int = 0) -> str:
    """
    Ask Gemini for a structured, QC-oriented summary of brand guideline text.

    The Gemini client and model configuration are imported lazily inside the
    function. Input longer than 800,000 characters is cut to that length and
    a truncation notice is appended before the prompt is built.

    Args:
        raw_text: Extracted text from the PDF
        brand_name: Name of the brand (interpolated into the prompt)
        page_count: Number of pages in the PDF (interpolated into the prompt)

    Returns:
        The text of the model's response

    Raises:
        RuntimeError: If the GOOGLE_API_KEY environment variable is not set
        Exception: Any other failure (import, configuration, generation) is
            printed and then re-raised unchanged
    """
    try:
        import google.generativeai as genai
        from llm_config import MODEL_VERSIONS

        gemini_key = os.getenv("GOOGLE_API_KEY")
        if not gemini_key:
            raise RuntimeError("GOOGLE_API_KEY not configured")

        genai.configure(api_key=gemini_key)
        llm = genai.GenerativeModel(MODEL_VERSIONS.gemini_vision)

        # Cap oversized input to stay within token limits
        # (~200K tokens, well within Gemini's 1M context).
        char_limit = 800000
        raw_text = (
            raw_text
            if len(raw_text) <= char_limit
            else raw_text[:char_limit] + "\n\n... [remaining content truncated due to length]"
        )

        prompt = f"""You are a brand guidelines analyst. Below is the full text extracted from a {page_count}-page brand guidelines PDF for "{brand_name}".

Create a structured summary that captures ALL quality-control-relevant information. This summary will be used by an AI system performing visual QC checks on marketing materials, so focus on concrete, measurable specifications.

Structure your summary as follows:

## Brand Identity
- Primary and secondary colors (with hex codes, RGB, CMYK values if available)
- Logo specifications (minimum sizes, exclusion zones, acceptable variations)
- Visual identity elements

## Typography
- Primary and secondary fonts (exact names)
- Font sizes, weights, leading, tracking specifications
- Hierarchy rules (headline, subhead, body text, legal, etc.)

## Layout & Composition
- Grid systems, margins, safe areas
- Element positioning rules
- Aspect ratios and format specifications

## Imagery & Photography
- Photography style guidelines
- Image treatment rules (filters, overlays, etc.)
- Illustration style if applicable

## Do's and Don'ts
- Explicit rules about what is/isn't allowed
- Common mistakes to avoid

## QC-Critical Specifications
- Minimum sizes for any elements
- Required elements that must always be present
- Spacing and clearance rules
- Color usage restrictions

Keep the summary factual and specific. Include exact values (hex codes, pixel sizes, percentages) wherever they appear in the source material. Target length: 2000-4000 words.
If the PDF text is sparse (image-heavy PDF), note that explicitly and summarize whatever text is available.

--- BEGIN EXTRACTED PDF TEXT ---
{raw_text}
--- END EXTRACTED PDF TEXT ---"""

        return llm.generate_content(prompt).text

    except Exception as err:
        # Log for operators, then let the caller decide how to fall back.
        print(f"Error summarizing brand guidelines with LLM: {err}")
        raise


def _write_summary(summary_path: str, text: str, result: Dict) -> None:
    """Write *text* to *summary_path* as UTF-8 and record its path and length in *result*."""
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write(text)
    result['summary_path'] = summary_path
    result['summary_length'] = len(text)


def process_pdf_guideline(pdf_path: str, file_id: str, brand_name: str, files_dir: str) -> Dict:
    """
    Full PDF processing pipeline: extract text, extract cover, summarize.

    Artifacts are written into ``files_dir`` (created if missing) as
    ``<file_id>_cover.png`` and ``<file_id>_summary.txt``. A summary file is
    always written: the LLM summary when more than 100 characters of text
    were extracted, the first 8000 characters of raw text if the LLM call
    fails, or a short placeholder note when the PDF has little or no text.

    Args:
        pdf_path: Path to the stored PDF file
        file_id: Unique file identifier (used as the artifact filename prefix)
        brand_name: Brand name for context
        files_dir: Directory to store processed files

    Returns:
        Dict with processing results and file paths:
            processed: True, or the string 'partial' if LLM summarization failed
            processed_at: ISO-8601 timestamp from ``datetime.now()``
            page_count: Number of pages (0 if it could not be determined)
            extracted_text_length: Character count of the extracted text
            cover_image_path: Present only if the cover image was saved
            summary_path / summary_length: Path and character count of the
                summary file that was written
            processing_error: Present only if LLM summarization failed

    Raises:
        OSError: If ``files_dir`` cannot be created or the summary file
            cannot be written
    """
    print(f"Processing PDF guideline: {file_id} ({brand_name})")
    result = {
        'processed': True,
        'processed_at': datetime.now().isoformat(),
    }

    # Step 1: Extract text from all pages
    try:
        # Context manager closes the document even if reading page_count fails.
        with fitz.open(pdf_path) as doc:
            page_count = doc.page_count
    except Exception as e:
        page_count = 0
        print(f"  Could not determine page count: {e}")
    else:
        print(f"  PDF has {page_count} pages")
    result['page_count'] = page_count

    raw_text = extract_text_from_pdf(pdf_path)
    result['extracted_text_length'] = len(raw_text)
    print(f"  Extracted {len(raw_text)} characters of text")

    # Make sure the output directory exists before writing any artifact.
    # Otherwise the cover extraction fails silently and the summary write
    # below aborts the whole pipeline. An empty files_dir means "current
    # directory" for os.path.join, so there is nothing to create then.
    if files_dir:
        os.makedirs(files_dir, exist_ok=True)

    # Step 2: Extract cover image
    cover_path = os.path.join(files_dir, f"{file_id}_cover.png")
    if extract_cover_image(pdf_path, cover_path):
        result['cover_image_path'] = cover_path
        print(f"  Cover image saved to: {cover_path}")
    else:
        print("  Could not extract cover image")

    # Step 3: Summarize with LLM
    summary_path = os.path.join(files_dir, f"{file_id}_summary.txt")

    # Below this many characters the PDF is treated as image-heavy and the
    # LLM call is skipped.
    min_text_chars = 100

    if len(raw_text) > min_text_chars:
        try:
            summary = summarize_brand_guidelines(raw_text, brand_name, result.get('page_count', 0))
            _write_summary(summary_path, summary, result)
            print(f"  Summary saved ({len(summary)} chars) to: {summary_path}")
        except Exception as e:
            print(f"  LLM summarization failed: {e}. Saving raw text as fallback.")
            # Fallback: save truncated raw text
            fallback_text = f"[LLM summarization failed - raw extracted text below]\n\n{raw_text[:8000]}"
            _write_summary(summary_path, fallback_text, result)
            result['processed'] = 'partial'
            result['processing_error'] = str(e)
    else:
        # Image-heavy PDF with little text
        fallback_text = f"[This PDF for {brand_name} contains mostly images with limited extractable text ({len(raw_text)} characters). Visual reference via cover image is recommended.]"
        _write_summary(summary_path, fallback_text, result)
        print("  Image-heavy PDF, saved minimal summary")

    print(f"  Processing complete for {file_id}")
    return result