modcomms/backend/app/services/pdf_service.py
michael c1030ee292 Add PDF rasterization support for reliable preview and analysis
PDFs are now converted to PNG images at 200 DPI before being sent to
Gemini for analysis. This fixes the unreliable iframe-based PDF preview
and ensures all pages are properly analyzed.

- Add PyMuPDF dependency for PDF rasterization
- Create pdf_service.py with rasterize() and get_page_count()
- Update agent interfaces to accept list of images for multi-page support
- Add analyze_with_images() to Gemini service for multi-image analysis
- Return rasterized PDF pages via WebSocket for frontend display
- Add page navigation UI for multi-page PDFs in preview components

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 09:36:56 -06:00

106 lines
3.3 KiB
Python

"""
PDF Rasterization Service.
Converts PDF pages to PNG images for analysis and display.
Uses PyMuPDF (fitz) for high-quality rasterization.
"""
import logging
from typing import List, Tuple
import fitz # PyMuPDF
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# Resolution, in dots per inch, that pages are rendered at.
# 150 DPI is the required minimum; 200 is used for better quality.
TARGET_DPI = 200

# A PDF's default resolution is 72 DPI, so the zoom factor needed to reach
# the target resolution is target / 72.
SCALE_FACTOR = TARGET_DPI / 72.0
class PDFService:
    """Service for PDF rasterization operations.

    Each method opens the document from raw bytes with PyMuPDF (``fitz``)
    and closes it again before returning or raising. No state is kept on
    the instance.
    """

    def rasterize(
        self, pdf_data: bytes, max_pages: int = 10
    ) -> List[Tuple[bytes, int, int]]:
        """
        Convert PDF pages to PNG images.

        Args:
            pdf_data: Raw PDF file bytes
            max_pages: Maximum number of pages to rasterize (default 10).
                Values below 1 yield an empty list.

        Returns:
            List of tuples containing (png_bytes, width, height) for each page

        Raises:
            ValueError: If the PDF cannot be opened, cannot be rendered, or
                is password-protected
        """
        logger.info("[PDF] Starting rasterization, max_pages=%s", max_pages)

        encrypted = False
        pages: List[Tuple[bytes, int, int]] = []
        try:
            # The context manager closes the document on every exit path,
            # including a failure while rendering one of the pages.
            with fitz.open(stream=pdf_data, filetype="pdf") as doc:
                encrypted = bool(doc.is_encrypted)
                if not encrypted:
                    pages = self._render_pages(doc, max_pages)
        except fitz.FileDataError as e:
            logger.error("[PDF] Invalid or corrupted PDF: %s", e)
            raise ValueError(f"Invalid or corrupted PDF file: {str(e)}") from e
        except Exception as e:
            logger.error("[PDF] Rasterization error: %s", e)
            raise ValueError(f"Failed to rasterize PDF: {str(e)}") from e

        # Raised outside the try block so the generic handler above does not
        # re-wrap it as "Failed to rasterize PDF: ...".
        if encrypted:
            logger.error("[PDF] Password-protected PDFs are not supported")
            raise ValueError("Password-protected PDFs are not supported")

        logger.info("[PDF] Rasterization complete, %d pages processed", len(pages))
        return pages

    def _render_pages(self, doc, max_pages: int) -> List[Tuple[bytes, int, int]]:
        """Render up to max_pages leading pages of an open document to PNG."""
        # Clamp at zero so a negative max_pages is reported as 0, not -N.
        num_pages = max(0, min(doc.page_count, max_pages))
        logger.info(
            "[PDF] Document has %s pages, processing %s", doc.page_count, num_pages
        )

        # Transformation matrix for the desired DPI; identical for every
        # page, so it is built once outside the loop.
        mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)

        pages: List[Tuple[bytes, int, int]] = []
        for page_num in range(num_pages):
            page = doc.load_page(page_num)
            # Render page to pixmap (RGB, no alpha channel)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            # Convert to PNG bytes
            pages.append((pix.tobytes("png"), pix.width, pix.height))
            logger.info(
                "[PDF] Rasterized page %d/%d: %dx%dpx at %s DPI",
                page_num + 1,
                num_pages,
                pix.width,
                pix.height,
                TARGET_DPI,
            )
        return pages

    def get_page_count(self, pdf_data: bytes) -> int:
        """
        Get the number of pages in a PDF.

        Args:
            pdf_data: Raw PDF file bytes

        Returns:
            Number of pages in the PDF

        Raises:
            ValueError: If the PDF cannot be opened
        """
        try:
            # Context manager guarantees the document is closed even if
            # reading the page count fails.
            with fitz.open(stream=pdf_data, filetype="pdf") as doc:
                return doc.page_count
        except Exception as e:
            logger.error("[PDF] Failed to get page count: %s", e)
            raise ValueError(f"Failed to read PDF: {str(e)}") from e
# Singleton instance: created once when this module is imported and shared by
# every importer. PDFService defines no instance attributes, so the shared
# object carries no state between calls.
pdf_service = PDFService()