Multi-page PDF QC for AXA Ireland policy documents. Runs as a third mode alongside static + video, gated on profile.mode. New code isolated under backend/document_mode/ with new endpoints under /api/document/*. Phase 1 — Spine + 6 deterministic doc-scope checks ($0, runs in seconds): - Scope-aware dispatcher (document/targeted/page_sample/page_pair/page_each) - axa_font_inventory, axa_phone_inventory, axa_bold_words_definitions, axa_page_numbering, axa_print_code, axa_omg_versioning - Bootstrap bold-words dictionary extracted from Example 1 General Definitions Phase 3 — Old-vs-new diff (~$0.50/run, 3-5 min): - Page alignment via difflib SequenceMatcher (windowed fuzzy match) - Vision-LLM page-pair diff via Gemini 2.5 Pro (8 concurrent) - Two-slot upload UX, axa_policy_document_diff profile, mode=document_diff Phase 4 — PDF accessibility (PyMuPDF, $0): - 9 PDF/UA-1 aligned criteria (tagged structure, /MarkInfo, title, /Lang, encryption, font embedding, PDF version, XMP UA-conformance, alt-text) - _run_verapdf() stub for optional Java-based veraPDF integration later Phase 5 — Print preflight (PyMuPDF, $0): - 7 criteria (page geometry, bleed, image colour spaces, image DPI, transparency, PDF/X conformance, spot colours) Profile additions: - axa_policy_document — 8 deterministic checks, $0 cost - axa_policy_document_diff — 1 page-pair LLM check, ~$0.50/run API additions: - POST /api/document/start_analysis (single PDF) - POST /api/document/start_diff (old + new PDFs) Frontend additions: - Third profile.mode value (document_diff) in applyProfileMode() - Two-slot upload UX with PDF-only file pickers - checkFormValidity() branches by mode for the analyse-button gate Smoke-tested locally against Example 1 (Home Insurance V8, 86pp) and Example 2 (Landlord V1 vs V10, 68→74pp) with real findings caught including bold-words gaps, missing PDF/UA flag, transparency on press, V1→V10 bold-formatting fixes. Plan + integration map + gotchas in backend/AXA_DOCUMENT_MODE_PLAN.md. 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
172 lines
6 KiB
Python
172 lines
6 KiB
Python
"""PDF ingestion for document-mode QC.
|
|
|
|
Renders each page of a multi-page PDF to a PNG, extracts per-page text spans
|
|
with font name + weight + size, and returns a structured list the dispatcher
|
|
loops over. Phase-1 LLM checks consume only the rendered page image; the
|
|
text-span data is captured here so Phase-2 deterministic checks (font
|
|
compliance, bold-words) can plug in without a re-ingest pass.
|
|
"""
|
|
|
|
import os
|
|
from typing import Dict, List, Optional
|
|
|
|
import fitz # PyMuPDF
|
|
from PIL import Image
|
|
|
|
|
|
# Bit masks tested against the integer stored in span['flags'].
PYMUPDF_BOLD_FLAG = 1 << 4  # bit 4 of span['flags'] (== 16)
PYMUPDF_ITALIC_FLAG = 1 << 1  # bit 1 of span['flags'] (== 2)

# Page-rendering defaults used by ingest_pdf.
DEFAULT_RENDER_ZOOM = 2.0  # ≈150 DPI — matches pdf_processor.extract_cover_image
DEFAULT_MAX_DIMENSION = 1600  # px — slightly larger than reference-asset thumbnails so per-page text stays legible to the LLM
DEFAULT_PAGE_LIMIT = 200  # safety cap; AXA policy docs are ~80 pages
|
|
|
|
|
|
def _span_is_bold(span: Dict) -> bool:
    """Decide whether a PyMuPDF span should be treated as bold.

    Either signal is enough: the bold bit set in the span's ``flags``, or a
    font name that contains a heavy-weight keyword (compared case-insensitively).
    A missing ``flags`` key counts as 0; a missing or empty ``font`` counts as ''.
    """
    flag_bits = span.get('flags', 0)
    if not flag_bits & PYMUPDF_BOLD_FLAG:
        # No bold bit — fall back to sniffing the font name.
        font_name = (span.get('font') or '').lower()
        for keyword in ('bold', 'black', 'heavy'):
            if keyword in font_name:
                return True
        return False
    return True
|
|
|
|
|
|
def _span_is_italic(span: Dict) -> bool:
    """Decide whether a PyMuPDF span should be treated as italic.

    True when the italic bit is set in the span's ``flags``, or when the
    lower-cased font name contains 'italic' or 'oblique'.
    """
    if span.get('flags', 0) & PYMUPDF_ITALIC_FLAG:
        return True
    lowered_name = (span.get('font') or '').lower()
    return any(marker in lowered_name for marker in ('italic', 'oblique'))
|
|
|
|
|
|
def _extract_page_spans(page: fitz.Page) -> List[Dict]:
    """Collect every non-blank text span on a page as a flat list of dicts.

    Walks the blocks → lines → spans hierarchy returned by
    ``page.get_text("dict")``, skipping non-text blocks and spans whose text
    is empty after stripping whitespace.

    Returns:
        A list of dicts with keys ``text``, ``font``, ``size`` (rounded to two
        decimals), ``bold``, ``italic``, ``bbox`` and ``flags``. An empty list
        is returned (and a message printed) if text extraction raises.
    """
    try:
        layout = page.get_text("dict")
    except Exception as e:
        print(f" [ingest] get_text(dict) failed on page {page.number + 1}: {e}")
        return []

    text_blocks = (
        blk for blk in layout.get('blocks', [])
        if blk.get('type') == 0  # 0 = text block, 1 = image
    )
    raw_spans = (
        raw
        for blk in text_blocks
        for ln in blk.get('lines', [])
        for raw in ln.get('spans', [])
    )

    collected = []
    for raw in raw_spans:
        content = (raw.get('text') or '').strip()
        if content:
            collected.append({
                'text': content,
                'font': raw.get('font'),
                'size': round(raw.get('size', 0), 2),
                'bold': _span_is_bold(raw),
                'italic': _span_is_italic(raw),
                'bbox': raw.get('bbox'),  # (x0, y0, x1, y1) in PDF points
                'flags': raw.get('flags', 0),
            })
    return collected
|
|
|
|
|
|
def _render_page(page: fitz.Page, output_path: str, zoom: float, max_dim: int) -> Optional[str]:
    """Rasterise one page and write it to ``output_path`` as a PNG.

    The page is rendered without an alpha channel at ``zoom`` × ``zoom``, then
    passed through ``Image.thumbnail`` with a ``max_dim`` × ``max_dim`` bound
    before saving.

    Returns:
        ``output_path`` on success; ``None`` (after printing the error) if any
        step of rendering or saving raises.
    """
    try:
        pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        dimensions = (pixmap.width, pixmap.height)
        image = Image.frombytes("RGB", dimensions, pixmap.samples)
        image.thumbnail((max_dim, max_dim), Image.LANCZOS)
        image.save(output_path, "PNG")
    except Exception as e:
        print(f" [ingest] render failed on page {page.number + 1}: {e}")
        return None
    return output_path
|
|
|
|
|
|
def get_page_count(pdf_path: str) -> int:
    """Return the number of pages in a PDF without rendering anything.

    Args:
        pdf_path: path of the PDF to inspect.

    Returns:
        The document's page count, or 0 if the file cannot be opened or read
        (the error is printed, never raised).
    """
    try:
        # Context manager guarantees the document handle is released even if
        # reading page_count raises after a successful open.
        with fitz.open(pdf_path) as doc:
            return doc.page_count
    except Exception as e:
        print(f" [ingest] page count failed for {pdf_path}: {e}")
        return 0
|
|
|
|
|
|
def ingest_pdf(
    pdf_path: str,
    output_dir: str,
    page_limit: int = DEFAULT_PAGE_LIMIT,
    progress_callback=None,
) -> Dict:
    """Render every page of a PDF and capture per-page structured text.

    Args:
        pdf_path: source PDF path.
        output_dir: directory to write page PNGs into. Created if missing.
        page_limit: hard cap on pages processed. Pages beyond the cap are
            skipped; a negative value is treated as 0.
        progress_callback: optional callable(page_num, total) for live
            progress. Exceptions it raises are printed and ignored.

    Returns:
        {
            'pdf_path': str,              # the input path, echoed back
            'page_count': int,            # total pages in source PDF
            'pages_processed': int,       # pages we actually rendered
            'truncated': bool,            # True if page_count > page_limit
            'pages': [
                {
                    'page_num': 1-indexed int,
                    'image_path': str or None,  # None when rendering failed
                    'raw_text': str,
                    'spans': [{ text, font, size, bold, italic, bbox, flags }, ...],
                    'fonts_used': sorted list of unique font names,
                },
                ...
            ],
        }

    Raises:
        Whatever ``fitz.open`` or per-page loading/text extraction raises; the
        document handle is closed before the exception propagates.
    """
    os.makedirs(output_dir, exist_ok=True)

    pages: List[Dict] = []

    # The context manager closes the document even if a page fails mid-loop.
    with fitz.open(pdf_path) as doc:
        total_pages = doc.page_count
        # Clamp at 0 so a negative page_limit cannot yield a negative count.
        pages_to_process = max(0, min(total_pages, page_limit))
        truncated = total_pages > page_limit
        if truncated:
            print(f" [ingest] PDF has {total_pages} pages, processing first {pages_to_process} only")

        for i in range(pages_to_process):
            page_num = i + 1  # 1-indexed
            page = doc.load_page(i)

            image_filename = f"page_{page_num:04d}.png"
            image_path = os.path.join(output_dir, image_filename)
            # _render_page returns None on failure; that None is stored as-is
            # so downstream code can tell the page has no image.
            rendered = _render_page(page, image_path, DEFAULT_RENDER_ZOOM, DEFAULT_MAX_DIMENSION)

            spans = _extract_page_spans(page)
            raw_text = page.get_text().strip()
            fonts_used = sorted({s['font'] for s in spans if s.get('font')})

            pages.append({
                'page_num': page_num,
                'image_path': rendered,
                'raw_text': raw_text,
                'spans': spans,
                'fonts_used': fonts_used,
            })

            if progress_callback:
                # Progress reporting is best-effort: a broken callback must
                # not abort ingestion.
                try:
                    progress_callback(page_num, pages_to_process)
                except Exception as e:
                    print(f" [ingest] progress callback raised on page {page_num}: {e}")

    return {
        'pdf_path': pdf_path,
        'page_count': total_pages,
        'pages_processed': pages_to_process,
        'truncated': truncated,
        'pages': pages,
    }
|