New formatting_diff module compares span-level bold/italic/font/size/color attributes between aligned page-pairs. Pure-Python; reads PyMuPDF metadata already captured during ingest. Aggregates identical flips into single findings and flags page-wide style shifts. Powers the AXA document_diff fix for missed formatting changes that the vision-LLM does not reliably detect. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
181 lines
6.4 KiB
Python
181 lines
6.4 KiB
Python
"""PDF ingestion for document-mode QC.
|
|
|
|
Renders each page of a multi-page PDF to a PNG, extracts per-page text spans
|
|
with font name + weight + size, and returns a structured list the dispatcher
|
|
loops over. Phase-1 LLM checks consume only the rendered page image; the
|
|
text-span data is captured here so Phase-2 deterministic checks (font
|
|
compliance, bold-words) can plug in without a re-ingest pass.
|
|
"""
|
|
|
|
import os
|
|
from typing import Dict, List, Optional
|
|
|
|
import fitz # PyMuPDF
|
|
from PIL import Image
|
|
|
|
|
|
# Bit masks for the style bits PyMuPDF packs into span['flags'].
PYMUPDF_BOLD_FLAG = 1 << 4  # bit 4 of span['flags']
PYMUPDF_ITALIC_FLAG = 1 << 1  # bit 1 of span['flags']

# Page-rendering and safety defaults used by ingest.
DEFAULT_RENDER_ZOOM = 2.0  # ≈150 DPI — matches pdf_processor.extract_cover_image
DEFAULT_MAX_DIMENSION = 1600  # px — slightly larger than reference-asset thumbnails so per-page text stays legible to the LLM
DEFAULT_PAGE_LIMIT = 200  # safety cap; AXA policy docs are ~80 pages
|
|
|
|
|
|
def _span_is_bold(span: Dict) -> bool:
    """Return True when a PyMuPDF span should be treated as bold.

    A span counts as bold if the bold bit of ``span['flags']`` is set OR the
    font name contains one of the weight tokens 'bold', 'black' or 'heavy'
    (case-insensitive substring match).

    Args:
        span: span dict; only the 'flags' and 'font' keys are read. A missing
            or ``None`` value is treated as "no flags" / "no font name".

    Returns:
        True if either signal marks the span as bold, otherwise False.
    """
    # `or 0` mirrors the `or ''` guard on the font name below: a key that is
    # present but None must not break the bitwise test.
    flags = span.get('flags') or 0
    if flags & PYMUPDF_BOLD_FLAG:
        return True
    font_name = (span.get('font') or '').lower()
    return any(token in font_name for token in ('bold', 'black', 'heavy'))
|
|
|
|
|
|
def _span_is_italic(span: Dict) -> bool:
    """Return True when a PyMuPDF span should be treated as italic.

    A span counts as italic if the italic bit of ``span['flags']`` is set OR
    the font name contains 'italic' or 'oblique' (case-insensitive substring
    match).

    Args:
        span: span dict; only the 'flags' and 'font' keys are read. A missing
            or ``None`` value is treated as "no flags" / "no font name".

    Returns:
        True if either signal marks the span as italic, otherwise False.
    """
    # `or 0` mirrors the `or ''` guard on the font name below: a key that is
    # present but None must not break the bitwise test.
    flags = span.get('flags') or 0
    if flags & PYMUPDF_ITALIC_FLAG:
        return True
    font_name = (span.get('font') or '').lower()
    return 'italic' in font_name or 'oblique' in font_name
|
|
|
|
|
|
def _extract_page_spans(page: fitz.Page) -> List[Dict]:
    """Flatten PyMuPDF's blocks→lines→spans into a list of QC-relevant span dicts.

    Only text blocks (type 0) are walked; spans whose text is empty after
    stripping whitespace are dropped.

    Args:
        page: the page to read; ``page.get_text("dict")`` supplies the data.

    Returns:
        One dict per kept span, in the order the spans are encountered, with
        the keys 'text' (stripped), 'font', 'size' (rounded to 2 decimals),
        'bold', 'italic', 'bbox', 'flags' and 'color' ('#rrggbb' built from
        the low 24 bits of the span's integer colour). If text extraction
        raises, the failure is printed and an empty list is returned.
    """
    spans: List[Dict] = []
    try:
        text_dict = page.get_text("dict")
    except Exception as e:
        # Best-effort: a page whose text cannot be read yields no spans
        # rather than aborting the caller.
        print(f" [ingest] get_text(dict) failed on page {page.number + 1}: {e}")
        return spans

    for block in text_dict.get('blocks', []):
        if block.get('type') != 0:  # 0 = text block, 1 = image
            continue
        for line in block.get('lines', []):
            for span in line.get('spans', []):
                text = (span.get('text') or '').strip()
                if not text:
                    continue
                # `or 0` on size/flags/color: treat a present-but-None value
                # the same as a missing key instead of raising a TypeError.
                color_int = span.get('color', 0) or 0
                spans.append({
                    'text': text,
                    'font': span.get('font'),
                    'size': round(span.get('size') or 0, 2),
                    'bold': _span_is_bold(span),
                    'italic': _span_is_italic(span),
                    'bbox': span.get('bbox'),  # (x0, y0, x1, y1) in PDF points
                    'flags': span.get('flags') or 0,
                    'color': f'#{color_int & 0xFFFFFF:06x}',
                })
    return spans
|
|
|
|
|
|
def _render_page(page: fitz.Page, output_path: str, zoom: float, max_dim: int) -> Optional[str]:
    """Rasterise one page and write it to ``output_path`` as a PNG.

    The page is rendered without an alpha channel using a uniform ``zoom``
    scale, then passed through ``Image.thumbnail`` with a
    ``(max_dim, max_dim)`` bounding box before saving.

    Returns:
        ``output_path`` when the file was written; ``None`` if any step
        raised (the failure is printed, not propagated).
    """
    try:
        pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
        image.thumbnail((max_dim, max_dim), Image.LANCZOS)
        image.save(output_path, "PNG")
    except Exception as err:
        print(f" [ingest] render failed on page {page.number + 1}: {err}")
        return None
    return output_path
|
|
|
|
|
|
def get_page_count(pdf_path: str) -> int:
    """Return the number of pages in a PDF without rendering anything.

    Args:
        pdf_path: path of the PDF to inspect.

    Returns:
        The document's page count, or 0 if the file cannot be opened or read
        (the failure is printed, not propagated).
    """
    try:
        # The context manager closes the document even if reading page_count
        # raises, so the handle is never leaked.
        with fitz.open(pdf_path) as doc:
            return doc.page_count
    except Exception as e:
        print(f" [ingest] page count failed for {pdf_path}: {e}")
        return 0
|
|
|
|
|
|
def ingest_pdf(
    pdf_path: str,
    output_dir: str,
    page_limit: int = DEFAULT_PAGE_LIMIT,
    progress_callback=None,
) -> Dict:
    """Render every page of a PDF and capture per-page structured text.

    Args:
        pdf_path: source PDF path.
        output_dir: directory to write page PNGs into. Created if missing.
        page_limit: hard cap on pages processed. Pages beyond the cap are
            skipped; a negative value is treated as 0.
        progress_callback: optional callable(page_num, total) for live
            progress. Exceptions it raises are printed and ignored.

    Returns:
        {
            'pdf_path': str,                # the pdf_path argument, echoed back
            'page_count': int,              # total pages in source PDF
            'pages_processed': int,         # pages we actually rendered
            'truncated': bool,              # True if page_count > page_limit
            'pages': [
                {
                    'page_num': 1-indexed int,
                    'image_path': str, or None if rendering that page failed,
                    'raw_text': str,
                    'spans': [{ text, font, size, bold, italic, color, bbox, flags }, ...],
                    'fonts_used': sorted list of unique font names,
                    'page_type': whatever classify_page returns for the page record,
                },
                ...
            ],
        }

    Raises:
        Whatever ``fitz.open`` or the per-page calls raise; only rendering,
        span extraction and the progress callback are best-effort.
    """
    os.makedirs(output_dir, exist_ok=True)

    # The context manager releases the document even when a page-level call,
    # the local import or classify_page raises part-way through.
    with fitz.open(pdf_path) as doc:
        total_pages = doc.page_count
        # Clamp at 0 so a negative page_limit cannot produce a negative
        # 'pages_processed' count.
        pages_to_process = max(0, min(total_pages, page_limit))
        truncated = total_pages > page_limit
        if truncated:
            print(f" [ingest] PDF has {total_pages} pages, processing first {pages_to_process} only")

        pages: List[Dict] = []

        # page_type is consumed by document-mode profiles that need a
        # strict-grade exemption for non-artwork pages (e.g. Boots PPack).
        # The import is function-local, so page_classifier is only loaded
        # when an ingest actually runs.
        # NOTE(review): classify_page is still called for every page here,
        # whatever the profile.
        from .page_classifier import classify_page

        for i in range(pages_to_process):
            page_num = i + 1  # 1-indexed
            page = doc.load_page(i)

            image_filename = f"page_{page_num:04d}.png"
            image_path = os.path.join(output_dir, image_filename)
            # None when rendering failed; the page is still recorded.
            rendered = _render_page(page, image_path, DEFAULT_RENDER_ZOOM, DEFAULT_MAX_DIMENSION)

            spans = _extract_page_spans(page)
            raw_text = page.get_text().strip()
            fonts_used = sorted({s['font'] for s in spans if s.get('font')})

            page_record = {
                'page_num': page_num,
                'image_path': rendered,
                'raw_text': raw_text,
                'spans': spans,
                'fonts_used': fonts_used,
            }
            # The classifier receives the record built so far (without
            # 'page_type' itself).
            page_record['page_type'] = classify_page(page_record)
            pages.append(page_record)

            if progress_callback:
                try:
                    progress_callback(page_num, pages_to_process)
                except Exception as e:
                    # A broken progress hook must not abort the ingest.
                    print(f" [ingest] progress callback raised on page {page_num}: {e}")

    return {
        'pdf_path': pdf_path,
        'page_count': total_pages,
        'pages_processed': pages_to_process,
        'truncated': truncated,
        'pages': pages,
    }
|