# modcomms/backend/app/services/pdf_service.py

"""
PDF Rasterization Service.

Converts PDF pages to PNG images for analysis and display.
Uses PyMuPDF (fitz) for high-quality rasterization.
"""

import logging
from typing import List, Tuple

import fitz  # PyMuPDF

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

# Resolution pages are rendered at. 150 DPI is the minimum required;
# 200 is used for better quality.
TARGET_DPI = 200
# A PDF rendered without scaling comes out at 72 DPI.
_PDF_BASE_DPI = 72
# Zoom to apply on both axes so the rendered output lands at TARGET_DPI.
SCALE_FACTOR = TARGET_DPI / _PDF_BASE_DPI


class PDFService:
    """Service for PDF rasterization operations.

    The class defines no ``__init__`` and keeps no per-instance state; each
    method opens the PDF from the bytes it is given and closes it before
    returning or raising.
    """

    def rasterize(
        self, pdf_data: bytes, max_pages: int = 10
    ) -> List[Tuple[bytes, int, int]]:
        """
        Convert PDF pages to PNG images.

        Pages are rendered in document order, starting from the first page,
        using the module-level ``SCALE_FACTOR`` on both axes.

        Args:
            pdf_data: Raw PDF file bytes
            max_pages: Maximum number of pages to rasterize (default 10).
                A value of zero or less results in an empty list.

        Returns:
            List of tuples containing (png_bytes, width, height) for each page

        Raises:
            ValueError: If the PDF cannot be opened, is password-protected,
                or rendering fails. The underlying exception is attached as
                ``__cause__``. Note that the password-protected error passes
                through the generic handler, so its message is prefixed with
                "Failed to rasterize PDF: ".
        """
        try:
            logger.info(f"[PDF] Starting rasterization, max_pages={max_pages}")

            # The context manager closes the document on every exit path,
            # including exceptions raised while rendering a page.
            with fitz.open(stream=pdf_data, filetype="pdf") as doc:
                if doc.is_encrypted:
                    raise ValueError("Password-protected PDFs are not supported")

                pages: List[Tuple[bytes, int, int]] = []
                num_pages = min(doc.page_count, max_pages)

                logger.info(
                    f"[PDF] Document has {doc.page_count} pages, processing {num_pages}"
                )

                # Transformation matrix for the desired DPI; identical for
                # every page, so it is built once outside the loop.
                mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)

                for page_num in range(num_pages):
                    page = doc.load_page(page_num)

                    # Render page to pixmap without an alpha channel
                    pix = page.get_pixmap(matrix=mat, alpha=False)

                    # Convert to PNG bytes
                    png_data = pix.tobytes("png")

                    pages.append((png_data, pix.width, pix.height))
                    logger.info(
                        f"[PDF] Rasterized page {page_num + 1}/{num_pages}: "
                        f"{pix.width}x{pix.height}px at {TARGET_DPI} DPI"
                    )

            logger.info(f"[PDF] Rasterization complete, {len(pages)} pages processed")
            return pages

        except fitz.FileDataError as e:
            logger.error(f"[PDF] Invalid or corrupted PDF: {e}")
            raise ValueError(f"Invalid or corrupted PDF file: {e}") from e
        except Exception as e:
            logger.error(f"[PDF] Rasterization error: {e}")
            raise ValueError(f"Failed to rasterize PDF: {e}") from e

    def get_page_count(self, pdf_data: bytes) -> int:
        """
        Get the number of pages in a PDF.

        Args:
            pdf_data: Raw PDF file bytes

        Returns:
            Number of pages in the PDF

        Raises:
            ValueError: If the PDF cannot be opened or its page count cannot
                be read. The underlying exception is attached as
                ``__cause__``.
        """
        try:
            # The context manager closes the document even if reading the
            # page count raises.
            with fitz.open(stream=pdf_data, filetype="pdf") as doc:
                return doc.page_count
        except Exception as e:
            logger.error(f"[PDF] Failed to get page count: {e}")
            raise ValueError(f"Failed to read PDF: {e}") from e


# Shared module-level instance, created once when this module is imported.
pdf_service = PDFService()