# ai_qc/backend/document_mode/ingest.py

"""PDF ingestion for document-mode QC.

Renders each page of a multi-page PDF to a PNG, extracts per-page text spans
with font name + weight + size, and returns a structured list the dispatcher
loops over. Phase-1 LLM checks consume only the rendered page image; the
text-span data is captured here so Phase-2 deterministic checks (font
compliance, bold-words) can plug in without a re-ingest pass.
"""

import os
from typing import Dict, List, Optional

import fitz  # PyMuPDF
from PIL import Image


# Bit masks tested against the integer span['flags'] value reported by PyMuPDF.
PYMUPDF_ITALIC_FLAG = 1 << 1  # bit 1 of span['flags'] (value 2)
PYMUPDF_BOLD_FLAG = 1 << 4  # bit 4 of span['flags'] (value 16)

# Ingestion defaults.
DEFAULT_PAGE_LIMIT = 200  # safety cap; AXA policy docs are ~80 pages
DEFAULT_MAX_DIMENSION = 1600  # px — slightly larger than reference-asset thumbnails so per-page text stays legible to the LLM
DEFAULT_RENDER_ZOOM = 2.0  # ≈150 DPI — matches pdf_processor.extract_cover_image


def _span_is_bold(span: Dict) -> bool:
    """Report whether a span dict should be treated as bold.

    Two signals are accepted: the bold bit being set in the span's 'flags'
    value, or one of the weight keywords 'bold', 'black' or 'heavy' appearing
    anywhere in the lower-cased font name. A missing or empty font name simply
    never matches a keyword.
    """
    if span.get('flags', 0) & PYMUPDF_BOLD_FLAG:
        return True

    font_name = (span.get('font') or '').lower()
    for keyword in ('bold', 'black', 'heavy'):
        if keyword in font_name:
            return True
    return False


def _span_is_italic(span: Dict) -> bool:
    """Report whether a span dict should be treated as italic.

    True when the italic bit is set in the span's 'flags' value, or when the
    lower-cased font name contains 'italic' or 'oblique'.
    """
    if span.get('flags', 0) & PYMUPDF_ITALIC_FLAG:
        return True

    font_name = (span.get('font') or '').lower()
    return any(marker in font_name for marker in ('italic', 'oblique'))


def _extract_page_spans(page: fitz.Page) -> List[Dict]:
    """Collect every non-blank text span on a page as a flat list of dicts.

    Walks the blocks -> lines -> spans hierarchy returned by
    ``page.get_text("dict")``, ignoring non-text blocks and spans whose text
    is empty after stripping whitespace.

    Returns:
        One dict per kept span with keys 'text' (stripped), 'font', 'size'
        (rounded to 2 decimals), 'bold', 'italic', 'bbox' and 'flags'.
        If text extraction raises, the failure is printed and an empty list
        is returned.
    """
    try:
        layout = page.get_text("dict")
    except Exception as e:
        print(f"  [ingest] get_text(dict) failed on page {page.number + 1}: {e}")
        return []

    collected: List[Dict] = []
    # Block type 0 is text; anything else (e.g. 1 = image) carries no spans we want.
    text_blocks = (blk for blk in layout.get('blocks', []) if blk.get('type') == 0)
    for blk in text_blocks:
        for text_line in blk.get('lines', []):
            for raw_span in text_line.get('spans', []):
                content = (raw_span.get('text') or '').strip()
                if content:
                    collected.append({
                        'text': content,
                        'font': raw_span.get('font'),
                        'size': round(raw_span.get('size', 0), 2),
                        'bold': _span_is_bold(raw_span),
                        'italic': _span_is_italic(raw_span),
                        'bbox': raw_span.get('bbox'),  # (x0, y0, x1, y1) in PDF points
                        'flags': raw_span.get('flags', 0),
                    })
    return collected


def _render_page(page: fitz.Page, output_path: str, zoom: float, max_dim: int) -> Optional[str]:
    """Rasterise one page and write it to ``output_path`` as a PNG.

    The page is rendered without an alpha channel at ``zoom`` times its
    native size on both axes, then shrunk with LANCZOS resampling to fit
    inside a ``max_dim`` x ``max_dim`` box before saving.

    Returns:
        ``output_path`` when the file was written, or None if any step
        raised (the failure is printed, not propagated).
    """
    try:
        scale = fitz.Matrix(zoom, zoom)
        pixmap = page.get_pixmap(matrix=scale, alpha=False)
        dimensions = [pixmap.width, pixmap.height]
        image = Image.frombytes("RGB", dimensions, pixmap.samples)
        image.thumbnail((max_dim, max_dim), Image.LANCZOS)
        image.save(output_path, "PNG")
    except Exception as e:
        print(f"  [ingest] render failed on page {page.number + 1}: {e}")
        return None
    return output_path


def get_page_count(pdf_path: str) -> int:
    """Return the number of pages in a PDF without rendering anything.

    Args:
        pdf_path: path of the PDF to inspect.

    Returns:
        The document's page count, or 0 if the file cannot be opened or its
        page count cannot be read (the failure is printed, not raised).
    """
    try:
        # The context manager closes the document even when reading the page
        # count raises, so a failed probe never leaks the open handle.
        with fitz.open(pdf_path) as doc:
            return doc.page_count
    except Exception as e:
        print(f"  [ingest] page count failed for {pdf_path}: {e}")
        return 0


def ingest_pdf(
    pdf_path: str,
    output_dir: str,
    page_limit: int = DEFAULT_PAGE_LIMIT,
    progress_callback=None,
    *,
    render_zoom: float = DEFAULT_RENDER_ZOOM,
    max_dimension: int = DEFAULT_MAX_DIMENSION,
) -> Dict:
    """Render every page of a PDF and capture per-page structured text.

    Per-page work is best-effort: a page whose render, span extraction or
    plain-text extraction fails is still listed in the result, with
    ``image_path`` set to None, an empty ``spans`` list or an empty
    ``raw_text`` respectively. Failures to create ``output_dir``, open the
    PDF or load a page are not caught here and propagate to the caller; the
    document is closed in every case.

    Args:
        pdf_path: source PDF path.
        output_dir: directory to write page PNGs into. Created if missing.
        page_limit: hard cap on pages processed. Pages beyond the cap are
            skipped; a negative value is treated as 0.
        progress_callback: optional callable(page_num, total) for live
            progress. Exceptions it raises are printed and ignored.
        render_zoom: scale factor applied on both axes when rasterising a page.
        max_dimension: maximum width/height in pixels of the saved PNG.

    Returns:
        {
            'pdf_path': str,                    # the path that was ingested
            'page_count': int,                  # total pages in source PDF
            'pages_processed': int,             # pages attempted (== len(pages))
            'truncated': bool,                  # True if page_count > page_limit
            'pages': [
                {
                    'page_num': 1-indexed int,
                    'image_path': str or None,  # None if rendering failed
                    'raw_text': str,
                    'spans': [{ text, font, size, bold, italic, bbox, flags }, ...],
                    'fonts_used': sorted list of unique font names,
                },
                ...
            ],
        }
    """
    os.makedirs(output_dir, exist_ok=True)

    pages: List[Dict] = []

    # The context manager guarantees the document is closed even if a page
    # fails to load part-way through the loop.
    with fitz.open(pdf_path) as doc:
        total_pages = doc.page_count
        # Clamp at zero so a negative limit processes nothing instead of
        # reporting a negative 'pages_processed'.
        pages_to_process = max(0, min(total_pages, page_limit))
        truncated = total_pages > page_limit
        if truncated:
            print(f"  [ingest] PDF has {total_pages} pages, processing first {pages_to_process} only")

        for index in range(pages_to_process):
            page_num = index + 1  # 1-indexed
            page = doc.load_page(index)

            image_path = os.path.join(output_dir, f"page_{page_num:04d}.png")
            rendered = _render_page(page, image_path, render_zoom, max_dimension)

            spans = _extract_page_spans(page)
            # Guard plain-text extraction the same way rendering and span
            # extraction are guarded, so one bad page cannot abort the run.
            try:
                raw_text = page.get_text().strip()
            except Exception as e:
                print(f"  [ingest] get_text failed on page {page_num}: {e}")
                raw_text = ''
            fonts_used = sorted({s['font'] for s in spans if s.get('font')})

            pages.append({
                'page_num': page_num,
                'image_path': rendered,
                'raw_text': raw_text,
                'spans': spans,
                'fonts_used': fonts_used,
            })

            if progress_callback is not None:
                # Progress reporting is advisory; a broken callback must not
                # stop ingestion.
                try:
                    progress_callback(page_num, pages_to_process)
                except Exception as e:
                    print(f"  [ingest] progress callback raised on page {page_num}: {e}")

    return {
        'pdf_path': pdf_path,
        'page_count': total_pages,
        'pages_processed': pages_to_process,
        'truncated': truncated,
        'pages': pages,
    }