"""PDF ingestion for document-mode QC.

Renders each page of a multi-page PDF to a PNG, extracts per-page text spans
with font name + weight + size, and returns a structured list the dispatcher
loops over.

Phase-1 LLM checks consume only the rendered page image; the text-span data is
captured here so Phase-2 deterministic checks (font compliance, bold-words) can
plug in without a re-ingest pass.
"""
import os
from typing import Callable, Dict, List, Optional

import fitz  # PyMuPDF
from PIL import Image


PYMUPDF_BOLD_FLAG = 16  # bit 4 of span['flags']
PYMUPDF_ITALIC_FLAG = 2  # bit 1 of span['flags']

DEFAULT_RENDER_ZOOM = 2.0  # ≈150 DPI — matches pdf_processor.extract_cover_image
DEFAULT_MAX_DIMENSION = 1600  # px — slightly larger than reference-asset thumbnails so per-page text stays legible to the LLM
DEFAULT_PAGE_LIMIT = 200  # safety cap; AXA policy docs are ~80 pages


def _span_is_bold(span: Dict) -> bool:
    """A span counts as bold if PyMuPDF's flags say so OR the font name signals it.

    Args:
        span: a span dict; only its 'flags' and 'font' keys are read, and both
            may be missing.

    Returns:
        True when the bold flag bit is set, or when the lower-cased font name
        contains 'bold', 'black' or 'heavy'.
    """
    flags = span.get('flags', 0)
    if flags & PYMUPDF_BOLD_FLAG:
        return True
    # Fall back to the font name for spans whose flag bit is not set.
    font = (span.get('font') or '').lower()
    return any(token in font for token in ('bold', 'black', 'heavy'))


def _span_is_italic(span: Dict) -> bool:
    """A span counts as italic if PyMuPDF's flags say so OR the font name signals it.

    Args:
        span: a span dict; only its 'flags' and 'font' keys are read, and both
            may be missing.

    Returns:
        True when the italic flag bit is set, or when the lower-cased font name
        contains 'italic' or 'oblique'.
    """
    flags = span.get('flags', 0)
    if flags & PYMUPDF_ITALIC_FLAG:
        return True
    font = (span.get('font') or '').lower()
    return any(token in font for token in ('italic', 'oblique'))


def _extract_page_spans(page: fitz.Page) -> List[Dict]:
    """Flatten PyMuPDF's blocks→lines→spans into a list of QC-relevant span dicts.

    Args:
        page: the page to read text from.

    Returns:
        One dict per non-blank span with keys 'text' (stripped), 'font',
        'size' (rounded to 2 decimals), 'bold', 'italic', 'bbox', 'flags' and
        'color' (a '#rrggbb' string). Returns an empty list when text
        extraction fails; the failure is printed rather than raised.
    """
    spans: List[Dict] = []
    try:
        text_dict = page.get_text("dict")
    except Exception as e:
        # Best effort: one unreadable page must not abort the whole ingest.
        print(f" [ingest] get_text(dict) failed on page {page.number + 1}: {e}")
        return spans

    for block in text_dict.get('blocks', []):
        if block.get('type') != 0:  # 0 = text block, 1 = image
            continue
        for line in block.get('lines', []):
            for span in line.get('spans', []):
                text = (span.get('text') or '').strip()
                if not text:
                    # Whitespace-only spans carry nothing to check.
                    continue
                color_int = span.get('color', 0) or 0
                spans.append({
                    'text': text,
                    'font': span.get('font'),
                    'size': round(span.get('size', 0), 2),
                    'bold': _span_is_bold(span),
                    'italic': _span_is_italic(span),
                    'bbox': span.get('bbox'),  # (x0, y0, x1, y1) in PDF points
                    'flags': span.get('flags', 0),
                    # Mask to 24 bits so the value always formats as #rrggbb.
                    'color': f'#{color_int & 0xFFFFFF:06x}',
                })
    return spans


def _render_page(page: fitz.Page, output_path: str, zoom: float, max_dim: int) -> Optional[str]:
    """Render a single page to PNG. Returns saved path or None on failure.

    Args:
        page: the page to rasterise.
        output_path: destination file path for the PNG.
        zoom: scale factor applied on both axes when rasterising.
        max_dim: the image is shrunk in place (aspect ratio kept) so that
            neither side exceeds this many pixels.

    Returns:
        ``output_path`` when the file was written, otherwise None. Failures
        are printed rather than raised.
    """
    try:
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat, alpha=False)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        img.thumbnail((max_dim, max_dim), Image.LANCZOS)
        img.save(output_path, "PNG")
        return output_path
    except Exception as e:
        print(f" [ingest] render failed on page {page.number + 1}: {e}")
        return None


def get_page_count(pdf_path: str) -> int:
    """Page count without rendering anything. Returns 0 on failure.

    Args:
        pdf_path: source PDF path.

    Returns:
        The document's page count, or 0 when the file cannot be opened or
        read (the failure is printed).
    """
    try:
        # Context manager closes the document even if reading the count fails.
        with fitz.open(pdf_path) as doc:
            return doc.page_count
    except Exception as e:
        print(f" [ingest] page count failed for {pdf_path}: {e}")
        return 0


def ingest_pdf(
    pdf_path: str,
    output_dir: str,
    page_limit: int = DEFAULT_PAGE_LIMIT,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> Dict:
    """Render every page of a PDF and capture per-page structured text.

    Args:
        pdf_path: source PDF path.
        output_dir: directory to write page PNGs into. Created if missing.
        page_limit: hard cap on pages processed. Pages beyond the cap are
            skipped. Values below zero are treated as zero.
        progress_callback: optional callable(page_num, total) for live
            progress. Exceptions it raises are printed and ignored.

    Returns:
        {
            'pdf_path': str,             # the path that was passed in
            'page_count': int,           # total pages in source PDF
            'pages_processed': int,      # pages we attempted (<= page_limit)
            'truncated': bool,           # True if page_count > page_limit
            'pages': [
                {
                    'page_num': 1-indexed int,
                    'image_path': str, or None if rendering failed,
                    'raw_text': str,
                    'spans': [{ text, font, size, bold, italic, color, bbox, flags }, ...],
                    'fonts_used': sorted list of unique font names,
                    'page_type': whatever classify_page returns for the record,
                },
                ...
            ],
        }

    Raises:
        Any exception from opening the PDF, from plain-text extraction, or
        from ``classify_page`` propagates to the caller; the document is
        closed first.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Page classifier is optional — only used by document-mode profiles that
    # need a strict-grade exemption for non-artwork pages (e.g. Boots PPack).
    # Importing locally keeps it out of the hot path for AXA-style profiles.
    # NOTE(review): the import and the per-page call are unconditional here —
    # confirm whether they should be gated on the profile.
    from .page_classifier import classify_page

    pages: List[Dict] = []

    # The context manager guarantees the document is closed even when a page
    # raises part-way through the loop.
    with fitz.open(pdf_path) as doc:
        total_pages = doc.page_count
        # Clamp at zero so a negative limit cannot yield a negative count.
        pages_to_process = max(0, min(total_pages, page_limit))
        truncated = total_pages > page_limit
        if truncated:
            print(f" [ingest] PDF has {total_pages} pages, processing first {page_limit} only")

        for i in range(pages_to_process):
            page_num = i + 1  # 1-indexed
            page = doc.load_page(i)

            image_filename = f"page_{page_num:04d}.png"
            image_path = os.path.join(output_dir, image_filename)
            # None when rendering failed; the page record is still emitted.
            rendered = _render_page(page, image_path, DEFAULT_RENDER_ZOOM, DEFAULT_MAX_DIMENSION)

            spans = _extract_page_spans(page)
            raw_text = page.get_text().strip()
            fonts_used = sorted({s['font'] for s in spans if s.get('font')})

            page_record = {
                'page_num': page_num,
                'image_path': rendered,
                'raw_text': raw_text,
                'spans': spans,
                'fonts_used': fonts_used,
            }
            # The classifier receives the record before 'page_type' is added.
            page_record['page_type'] = classify_page(page_record)
            pages.append(page_record)

            if progress_callback:
                try:
                    progress_callback(page_num, pages_to_process)
                except Exception as e:
                    # A broken progress hook must not abort ingestion.
                    print(f" [ingest] progress callback raised on page {page_num}: {e}")

    return {
        'pdf_path': pdf_path,
        'page_count': total_pages,
        'pages_processed': pages_to_process,
        'truncated': truncated,
        'pages': pages,
    }