ai_qc/backend/document_mode/ingest.py
nickviljoen 98679e7329 feat(document_mode): add deterministic span formatting diff
New formatting_diff module compares span-level bold/italic/font/size/
color attributes between aligned page-pairs. Pure-Python; reads
PyMuPDF metadata already captured during ingest. Aggregates identical
flips into single findings and flags page-wide style shifts.

Powers the AXA document_diff fix for missed formatting changes that
the vision-LLM does not reliably detect.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 09:56:34 +02:00

181 lines
6.4 KiB
Python

"""PDF ingestion for document-mode QC.
Renders each page of a multi-page PDF to a PNG, extracts per-page text spans
with font name + weight + size, and returns a structured list the dispatcher
loops over. Phase-1 LLM checks consume only the rendered page image; the
text-span data is captured here so Phase-2 deterministic checks (font
compliance, bold-words) can plug in without a re-ingest pass.
"""
import os
from typing import Dict, List, Optional
import fitz # PyMuPDF
from PIL import Image
# Bit masks tested against the integer span['flags'] value when deciding
# whether a span is bold / italic.
PYMUPDF_BOLD_FLAG = 16 # bit 4 of span['flags']
PYMUPDF_ITALIC_FLAG = 2 # bit 1 of span['flags']
# Defaults used by ingest_pdf: the first two are passed to _render_page for
# every page, the third is the default value of its page_limit parameter.
DEFAULT_RENDER_ZOOM = 2.0 # ≈150 DPI — matches pdf_processor.extract_cover_image
DEFAULT_MAX_DIMENSION = 1600 # px — slightly larger than reference-asset thumbnails so per-page text stays legible to the LLM
DEFAULT_PAGE_LIMIT = 200 # safety cap; AXA policy docs are ~80 pages
def _span_is_bold(span: Dict) -> bool:
    """Report whether a span dict should be treated as bold.

    A span is bold when the bold bit is set in its ``flags`` value or,
    failing that, when its lower-cased font name contains one of the
    heavy-weight keywords ``bold``, ``black`` or ``heavy``.
    """
    if span.get('flags', 0) & PYMUPDF_BOLD_FLAG:
        return True
    # Fall back to the font name; a missing or None font is treated as ''.
    font_name = (span.get('font') or '').lower()
    for keyword in ('bold', 'black', 'heavy'):
        if keyword in font_name:
            return True
    return False
def _span_is_italic(span: Dict) -> bool:
    """Report whether a span dict should be treated as italic.

    A span is italic when the italic bit is set in its ``flags`` value or,
    failing that, when its lower-cased font name contains ``italic`` or
    ``oblique``.
    """
    if span.get('flags', 0) & PYMUPDF_ITALIC_FLAG:
        return True
    # Fall back to the font name; a missing or None font is treated as ''.
    font_name = (span.get('font') or '').lower()
    return any(marker in font_name for marker in ('italic', 'oblique'))
def _extract_page_spans(page: fitz.Page) -> List[Dict]:
    """Collect the non-blank text spans of a page as a flat list of dicts.

    Walks the blocks -> lines -> spans structure returned by
    ``page.get_text("dict")``, skipping blocks whose ``type`` is not 0 and
    spans whose text is empty after stripping. If ``get_text`` raises, the
    error is printed and an empty list is returned.

    Each returned dict has the keys ``text``, ``font``, ``size``, ``bold``,
    ``italic``, ``bbox``, ``flags`` and ``color`` (a ``#rrggbb`` string).
    """
    try:
        page_dict = page.get_text("dict")
    except Exception as e:
        print(f" [ingest] get_text(dict) failed on page {page.number + 1}: {e}")
        return []

    # Lazily flatten the nesting; only type-0 (text) blocks carry spans we
    # care about, type 1 is an image block.
    raw_spans = (
        raw
        for block in page_dict.get('blocks', [])
        if block.get('type') == 0
        for line in block.get('lines', [])
        for raw in line.get('spans', [])
    )

    collected: List[Dict] = []
    for raw in raw_spans:
        stripped = (raw.get('text') or '').strip()
        if not stripped:
            continue
        # Keep only the low 24 bits so the colour always formats as #rrggbb.
        rgb = (raw.get('color', 0) or 0) & 0xFFFFFF
        record = {
            'text': stripped,
            'font': raw.get('font'),
            'size': round(raw.get('size', 0), 2),
            'bold': _span_is_bold(raw),
            'italic': _span_is_italic(raw),
            'bbox': raw.get('bbox'),  # (x0, y0, x1, y1) in PDF points
            'flags': raw.get('flags', 0),
            'color': f'#{rgb:06x}',
        }
        collected.append(record)
    return collected
def _render_page(page: fitz.Page, output_path: str, zoom: float, max_dim: int) -> Optional[str]:
    """Rasterise one page and write it to ``output_path`` as a PNG.

    The page is rendered with a uniform ``zoom`` scale and no alpha channel,
    then shrunk in place so neither side exceeds ``max_dim`` pixels.

    Returns:
        ``output_path`` when the file was written, or None if any step
        raised (the error is printed, not propagated).
    """
    try:
        pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        dimensions = [pixmap.width, pixmap.height]
        image = Image.frombytes("RGB", dimensions, pixmap.samples)
        image.thumbnail((max_dim, max_dim), Image.LANCZOS)
        image.save(output_path, "PNG")
    except Exception as e:
        print(f" [ingest] render failed on page {page.number + 1}: {e}")
        return None
    else:
        return output_path
def get_page_count(pdf_path: str) -> int:
    """Return the number of pages in a PDF without rendering any of them.

    Args:
        pdf_path: path of the PDF to inspect.

    Returns:
        The document's page count, or 0 if opening or reading the document
        raises (the failure is printed rather than propagated).
    """
    try:
        # The context manager closes the document even when reading
        # page_count raises, so the handle is not leaked on the error path.
        with fitz.open(pdf_path) as doc:
            return doc.page_count
    except Exception as e:
        print(f" [ingest] page count failed for {pdf_path}: {e}")
        return 0
def ingest_pdf(
    pdf_path: str,
    output_dir: str,
    page_limit: int = DEFAULT_PAGE_LIMIT,
    progress_callback=None,
) -> Dict:
    """Render every page of a PDF and capture per-page structured text.

    Args:
        pdf_path: source PDF path.
        output_dir: directory to write page PNGs into. Created if missing.
        page_limit: hard cap on pages processed. Pages beyond the cap are
            skipped; a negative value is treated as 0.
        progress_callback: optional callable(page_num, total) for live
            progress. Exceptions it raises are printed and otherwise ignored.

    Returns:
        {
            'pdf_path': str,          # the pdf_path argument, echoed back
            'page_count': int,        # total pages in source PDF
            'pages_processed': int,   # pages we actually visited
            'truncated': bool,        # True if page_count > page_limit
            'pages': [
                {
                    'page_num': 1-indexed int,
                    'image_path': str, or None if rendering that page failed,
                    'raw_text': str,
                    'spans': [{ text, font, size, bold, italic, bbox, flags, color }, ...],
                    'fonts_used': sorted list of unique font names,
                    'page_type': whatever classify_page returns for the page,
                },
                ...
            ],
        }

    Raises:
        Any exception from opening the PDF, loading a page, extracting its
        raw text or classifying it propagates to the caller; the document is
        closed before it does.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Imported inside the function rather than at module top. Per the
    # original note, the classifier matters only to document-mode profiles
    # that need a strict-grade exemption for non-artwork pages (e.g. Boots
    # PPack). NOTE(review): it is still imported and called for every page
    # here regardless of profile. Importing before the PDF is opened means a
    # failed import leaves no open handle behind.
    from .page_classifier import classify_page

    pages: List[Dict] = []
    # The context manager guarantees the document is closed even if a page
    # fails part-way through the loop.
    with fitz.open(pdf_path) as doc:
        total_pages = doc.page_count
        # Clamp at 0 so a negative page_limit cannot yield a negative count.
        pages_to_process = max(0, min(total_pages, page_limit))
        truncated = total_pages > page_limit
        if truncated:
            print(f" [ingest] PDF has {total_pages} pages, processing first {pages_to_process} only")

        for i in range(pages_to_process):
            page_num = i + 1  # 1-indexed
            page = doc.load_page(i)

            image_filename = f"page_{page_num:04d}.png"
            image_path = os.path.join(output_dir, image_filename)
            # _render_page returns None on failure; the page is still recorded.
            rendered = _render_page(page, image_path, DEFAULT_RENDER_ZOOM, DEFAULT_MAX_DIMENSION)

            spans = _extract_page_spans(page)
            raw_text = page.get_text().strip()
            fonts_used = sorted({s['font'] for s in spans if s.get('font')})

            page_record = {
                'page_num': page_num,
                'image_path': rendered,
                'raw_text': raw_text,
                'spans': spans,
                'fonts_used': fonts_used,
            }
            # The classifier receives the record built so far.
            page_record['page_type'] = classify_page(page_record)
            pages.append(page_record)

            if progress_callback:
                # Progress reporting is best-effort: a broken callback must
                # not abort the ingest.
                try:
                    progress_callback(page_num, pages_to_process)
                except Exception as e:
                    print(f" [ingest] progress callback raised on page {page_num}: {e}")

    return {
        'pdf_path': pdf_path,
        'page_count': total_pages,
        'pages_processed': pages_to_process,
        'truncated': truncated,
        'pages': pages,
    }