"""
PDF Rasterization Service.

Converts PDF pages to PNG images for analysis and display.
Uses PyMuPDF (fitz) for high-quality rasterization.
"""

import logging
from typing import List, Tuple

import fitz  # PyMuPDF

logger = logging.getLogger(__name__)

# Target DPI for rasterization (150 DPI minimum required, using 200 for quality)
TARGET_DPI = 200

# Default PDF resolution is 72 DPI, so scale factor = target_dpi / 72
SCALE_FACTOR = TARGET_DPI / 72


class PDFService:
    """Service for PDF rasterization operations."""

    def rasterize(
        self, pdf_data: bytes, max_pages: int = 10
    ) -> List[Tuple[bytes, int, int]]:
        """
        Convert PDF pages to PNG images.

        The document is always closed before this method returns or raises,
        including when rendering a page fails part-way through.

        Args:
            pdf_data: Raw PDF file bytes
            max_pages: Maximum number of pages to rasterize (default 10).
                Values of zero or less produce an empty list.

        Returns:
            List of tuples containing (png_bytes, width, height) for each page

        Raises:
            ValueError: If the PDF cannot be opened, is password-protected,
                or fails during rendering. The original exception is attached
                as ``__cause__``. Note that the password-protected error is
                raised inside the guarded region, so its message reaches the
                caller prefixed with "Failed to rasterize PDF: ".
        """
        try:
            logger.info("[PDF] Starting rasterization, max_pages=%s", max_pages)

            doc = fitz.open(stream=pdf_data, filetype="pdf")
            try:
                if doc.is_encrypted:
                    raise ValueError("Password-protected PDFs are not supported")
                pages = self._render_pages(doc, max_pages)
            finally:
                # Release the document on every path, not only on success.
                doc.close()

            logger.info(
                "[PDF] Rasterization complete, %s pages processed", len(pages)
            )
            return pages

        except fitz.FileDataError as e:
            logger.error("[PDF] Invalid or corrupted PDF: %s", e)
            raise ValueError(f"Invalid or corrupted PDF file: {e}") from e
        except Exception as e:
            logger.error("[PDF] Rasterization error: %s", e)
            raise ValueError(f"Failed to rasterize PDF: {e}") from e

    @staticmethod
    def _render_pages(doc, max_pages: int) -> List[Tuple[bytes, int, int]]:
        """Render up to ``max_pages`` leading pages of an open document to PNG."""
        # Clamp at zero so a negative limit reads sensibly in the log below;
        # the resulting range is empty either way.
        num_pages = max(0, min(doc.page_count, max_pages))
        logger.info(
            "[PDF] Document has %s pages, processing %s", doc.page_count, num_pages
        )

        # Transformation matrix for the desired DPI; identical for every page,
        # so it is built once rather than per iteration.
        mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)

        pages: List[Tuple[bytes, int, int]] = []
        for page_num in range(num_pages):
            page = doc.load_page(page_num)

            # Render page to pixmap (RGB, no alpha channel)
            pix = page.get_pixmap(matrix=mat, alpha=False)

            # Convert to PNG bytes
            png_data = pix.tobytes("png")
            pages.append((png_data, pix.width, pix.height))

            logger.info(
                "[PDF] Rasterized page %s/%s: %sx%spx at %s DPI",
                page_num + 1,
                num_pages,
                pix.width,
                pix.height,
                TARGET_DPI,
            )

        return pages

    def get_page_count(self, pdf_data: bytes) -> int:
        """
        Get the number of pages in a PDF.

        Args:
            pdf_data: Raw PDF file bytes

        Returns:
            Number of pages in the PDF

        Raises:
            ValueError: If the PDF cannot be opened or its page count cannot
                be read. The original exception is attached as ``__cause__``.
        """
        try:
            doc = fitz.open(stream=pdf_data, filetype="pdf")
            try:
                return doc.page_count
            finally:
                # Close the document even if reading the page count fails.
                doc.close()
        except Exception as e:
            logger.error("[PDF] Failed to get page count: %s", e)
            raise ValueError(f"Failed to read PDF: {e}") from e


# Singleton instance
pdf_service = PDFService()