Multi-page PDF QC for AXA Ireland policy documents. Runs as a third mode alongside static + video, gated on profile.mode. New code isolated under backend/document_mode/ with new endpoints under /api/document/*. Phase 1 — Spine + 6 deterministic doc-scope checks ($0, runs in seconds): - Scope-aware dispatcher (document/targeted/page_sample/page_pair/page_each) - axa_font_inventory, axa_phone_inventory, axa_bold_words_definitions, axa_page_numbering, axa_print_code, axa_omg_versioning - Bootstrap bold-words dictionary extracted from Example 1 General Definitions Phase 3 — Old-vs-new diff (~$0.50/run, 3-5 min): - Page alignment via difflib SequenceMatcher (windowed fuzzy match) - Vision-LLM page-pair diff via Gemini 2.5 Pro (8 concurrent) - Two-slot upload UX, axa_policy_document_diff profile, mode=document_diff Phase 4 — PDF accessibility (PyMuPDF, $0): - 9 PDF/UA-1 aligned criteria (tagged structure, /MarkInfo, title, /Lang, encryption, font embedding, PDF version, XMP UA-conformance, alt-text) - _run_verapdf() stub for optional Java-based veraPDF integration later Phase 5 — Print preflight (PyMuPDF, $0): - 7 criteria (page geometry, bleed, image colour spaces, image DPI, transparency, PDF/X conformance, spot colours) Profile additions: - axa_policy_document — 8 deterministic checks, $0 cost - axa_policy_document_diff — 1 page-pair LLM check, ~$0.50/run API additions: - POST /api/document/start_analysis (single PDF) - POST /api/document/start_diff (old + new PDFs) Frontend additions: - Third profile.mode value (document_diff) in applyProfileMode() - Two-slot upload UX with PDF-only file pickers - checkFormValidity() branches by mode for the analyse-button gate Smoke-tested locally against Example 1 (Home Insurance V8, 86pp) and Example 2 (Landlord V1 vs V10, 68→74pp) with real findings caught including bold-words gaps, missing PDF/UA flag, transparency on press, V1→V10 bold-formatting fixes. Plan + integration map + gotchas in backend/AXA_DOCUMENT_MODE_PLAN.md. 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
384 lines
15 KiB
Python
384 lines
15 KiB
Python
"""Print preflight checks — "is this PDF print-ready?".
|
||
|
||
Deterministic Python implementation using PyMuPDF. Covers the high-impact
|
||
preflight signals that catch the most common press surprises without
|
||
requiring Ghostscript or veraPDF (PDF/X) tooling.
|
||
|
||
Criteria checked:
|
||
• PP1 Page geometry consistency — every page has the same MediaBox size
|
||
• PP2 Bleed area defined — TrimBox/BleedBox differ from MediaBox
|
||
• PP3 Image colour spaces — flag RGB images (press wants CMYK/Gray)
|
||
• PP4 Image effective DPI — flag images rendering below 150 DPI
|
||
• PP5 Transparency / overprint — flag pages using transparency (smask, ExtGState)
|
||
• PP6 PDF/X conformance — XMP declares pdfxid:GTS_PDFXVersion or pdfx:GTS_*
|
||
• PP7 Spot colour usage — flag /Separation or /DeviceN colour spaces (Pantone)
|
||
|
||
Phase-5 scope is "is it print-ready?" — simple yes/no with drill-down.
|
||
Future expansion (Ghostscript-based total ink coverage, registration black,
|
||
crop-mark detection, full PDF/X conformance) goes here when scope grows.
|
||
|
||
Note: many AXA policy PDFs are digital-intent (no bleed, RGB OK). For those,
|
||
several of these criteria will fail — that's correct, not a bug. The check
|
||
surfaces the data; the reviewer judges whether print-readiness is required.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
from typing import Dict, List, Optional, Tuple
|
||
|
||
import fitz # PyMuPDF
|
||
|
||
|
||
# Effective-resolution thresholds, in DPI (common print-industry rules of thumb).
DPI_OFFSET_MIN = 300     # usual target for commercial offset printing
DPI_NEWSPRINT_MIN = 150  # usual floor for newspaper-grade printing
DPI_DANGER = 150         # images rendering below this are flagged as a definite risk
|
||
|
||
|
||
def _criterion(code: str, title: str, passed: bool, note: str = '', detail: Optional[Dict] = None) -> Dict:
|
||
return {
|
||
'code': code,
|
||
'title': title,
|
||
'passed': passed,
|
||
'note': note,
|
||
'detail': detail or {},
|
||
}
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Criterion implementations
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def _check_page_geometry(doc: fitz.Document) -> Dict:
    """PP1 — verify that every page shares one MediaBox size.

    Mixed page sizes are valid PDF but a press red flag — usually an
    authoring error. Dimensions are rounded to 0.1 pt and orientation is
    normalised, so a landscape and a portrait page of the same sheet size
    count as identical.

    Args:
        doc: Open PyMuPDF document.

    Returns:
        Criterion dict built by ``_criterion``. Passes when exactly one
        distinct size exists. Fails for mixed sizes (listed in points under
        ``detail['distinct_sizes_pts']``) and for a document without pages.
    """
    page_count = doc.page_count
    if page_count == 0:
        # Nothing to measure: say so explicitly instead of reporting
        # "0 different page sizes found across 0 pages".
        return _criterion(
            'PP1', 'Page geometry consistency', False,
            'Document contains no pages — page geometry cannot be assessed.',
        )

    sizes: List[Tuple[float, float]] = []
    for i in range(page_count):
        box = doc.load_page(i).mediabox
        # Normalise orientation — landscape vs portrait of same size = same
        sizes.append(tuple(sorted((round(box.width, 1), round(box.height, 1)))))

    distinct = sorted(set(sizes))
    if len(distinct) == 1:
        w, h = distinct[0]
        # Convert to mm for human readability (1 pt = 0.3528 mm)
        w_mm = round(w * 0.3528, 1)
        h_mm = round(h * 0.3528, 1)
        return _criterion(
            'PP1', 'Page geometry consistency', True,
            f'All {page_count} pages are {w_mm} × {h_mm} mm.',
        )
    return _criterion(
        'PP1', 'Page geometry consistency', False,
        f'{len(distinct)} different page sizes found across {page_count} pages.',
        {'distinct_sizes_pts': [list(s) for s in distinct]},
    )
|
||
|
||
|
||
def _check_bleed_defined(doc: fitz.Document) -> Dict:
    """PP2 — check whether any page has a trim/bleed box authored.

    For print, BleedBox should extend ~3mm beyond TrimBox, and TrimBox
    should be inset from MediaBox. If MediaBox == TrimBox == BleedBox, no
    bleed has been authored — page edge artwork will white out on press.

    Heuristic: a page counts as "bleed authored" when its page object
    explicitly declares /TrimBox or /BleedBox and that box's width or height
    (rounded to 0.01 pt) differs from the MediaBox. The explicit-declaration
    test matters because PyMuPDF's ``trimbox`` / ``bleedbox`` properties fall
    back to the CropBox when the key is absent, so a page that merely has a
    cropped CropBox would otherwise be miscounted as having bleed.

    Args:
        doc: Open PyMuPDF document.

    Returns:
        Criterion dict built by ``_criterion``. Passes when at least one page
        qualifies; ``detail['pages_with_bleed']`` holds the count.
    """

    def _declares(page, box_name: str) -> bool:
        """Return True if the page object itself carries the *box_name* key."""
        try:
            return doc.xref_get_key(page.xref, box_name)[0] != 'null'
        except Exception:
            # Best effort: if the low-level lookup is unavailable, degrade to
            # the plain size comparison instead of failing the whole check.
            return True

    def _size_differs(box, reference) -> bool:
        """Compare width/height only — even sub-mm differences count."""
        return (round(box.width, 2) != round(reference.width, 2)
                or round(box.height, 2) != round(reference.height, 2))

    pages_with_bleed = 0
    for i in range(doc.page_count):
        p = doc.load_page(i)
        media = p.mediabox
        trim_authored = _declares(p, 'TrimBox') and _size_differs(p.trimbox, media)
        bleed_authored = _declares(p, 'BleedBox') and _size_differs(p.bleedbox, media)
        if trim_authored or bleed_authored:
            pages_with_bleed += 1

    if pages_with_bleed == 0:
        return _criterion(
            'PP2', 'Bleed area defined', False,
            'No page has TrimBox/BleedBox different from MediaBox — bleed not authored.',
        )
    return _criterion(
        'PP2', 'Bleed area defined', True,
        f'{pages_with_bleed} of {doc.page_count} pages have bleed/trim authored.',
        {'pages_with_bleed': pages_with_bleed},
    )
|
||
|
||
|
||
def _check_image_colorspaces(doc: fitz.Document) -> Dict:
    """PP3 — tally page images by colour space and fail on any DeviceRGB.

    RGB images go through driver-side conversion on press, with risk of
    colour shift; CMYK / DeviceGray / Indexed (palette) are treated as
    press-safe. Element 5 of each ``get_page_images(..., full=True)`` entry
    is read as the colour-space name; an empty value is counted as
    'Unknown'. Images are counted once per page listing.

    Args:
        doc: Open PyMuPDF document.

    Returns:
        Criterion dict built by ``_criterion``. Passes when there are no
        images or none is DeviceRGB. The detail carries the per-colour-space
        counts, the image total and, on failure, the 1-based page numbers
        holding RGB images.
    """
    title = 'Image colour spaces'
    tally: Dict[str, int] = {}
    pages_with_rgb: List[int] = []
    image_total = 0

    for page_index in range(doc.page_count):
        page_no = page_index + 1
        for entry in doc.get_page_images(page_index, full=True):
            space = entry[5] or 'Unknown'
            tally[space] = tally.get(space, 0) + 1
            image_total += 1
            if space == 'DeviceRGB' and page_no not in pages_with_rgb:
                pages_with_rgb.append(page_no)

    if image_total == 0:
        return _criterion(
            'PP3', title, True,
            'No raster images — colour-space risk does not apply.',
        )

    n_rgb = tally.get('DeviceRGB', 0)
    n_cmyk = tally.get('DeviceCMYK', 0)
    n_gray = tally.get('DeviceGray', 0)

    if n_rgb <= 0:
        n_other = image_total - n_cmyk - n_gray
        return _criterion(
            'PP3', title, True,
            f'No RGB images. Breakdown: CMYK={n_cmyk}, Gray={n_gray}, '
            f'other={n_other}.',
            {'colorspace_counts': tally, 'total_images': image_total},
        )

    return _criterion(
        'PP3', title, False,
        f'{n_rgb} of {image_total} images are DeviceRGB — press will perform colour conversion.',
        {'colorspace_counts': tally, 'rgb_pages': pages_with_rgb, 'total_images': image_total},
    )
|
||
|
||
|
||
def _check_image_dpi(doc: fitz.Document) -> Dict:
    """PP4 — compute each placed image's effective DPI and flag low ones.

    Effective DPI is raw pixels divided by rendered inches (bbox extent in
    points / 72), taking the smaller of the horizontal and vertical values.
    Placements whose xref is not in the page's image listing, that have no
    bbox, or whose bbox has a non-positive extent are skipped.

    Args:
        doc: Open PyMuPDF document.

    Returns:
        Criterion dict built by ``_criterion``. Fails when any inspected
        image renders below ``DPI_DANGER``; the detail then lists each
        offending placement (page, xref, effective DPI, raw pixels, rendered
        inches).
    """
    title = 'Image effective DPI'
    flagged: List[Dict] = []
    inspected = 0

    for page_index in range(doc.page_count):
        page = doc.load_page(page_index)
        # xref → (pixel width, pixel height) for every image listed on the page
        pixel_sizes: Dict[int, Tuple[int, int]] = {
            entry[0]: (entry[2], entry[3])
            for entry in doc.get_page_images(page_index, full=True)
        }
        for placement in page.get_image_info(xrefs=True):
            xref = placement.get('xref')
            bbox = placement.get('bbox')
            if not bbox or xref not in pixel_sizes:
                continue
            inches_w = (bbox[2] - bbox[0]) / 72.0
            inches_h = (bbox[3] - bbox[1]) / 72.0
            if inches_w <= 0 or inches_h <= 0:
                continue
            px_w, px_h = pixel_sizes[xref]
            # The weaker axis decides the print quality
            effective = min(px_w / inches_w, px_h / inches_h)
            inspected += 1
            if effective < DPI_DANGER:
                flagged.append({
                    'page': page_index + 1,
                    'xref': xref,
                    'effective_dpi': round(effective, 0),
                    'raw_pixels': [px_w, px_h],
                    'rendered_inches': [round(inches_w, 2), round(inches_h, 2)],
                })

    if inspected == 0:
        return _criterion('PP4', title, True, 'No raster images to inspect.')

    if not flagged:
        return _criterion(
            'PP4', title, True,
            f'All {inspected} images render at ≥ {DPI_DANGER} DPI.',
            {'sampled': inspected, 'threshold': DPI_DANGER},
        )

    return _criterion(
        'PP4', title, False,
        f'{len(flagged)} of {inspected} images render below {DPI_DANGER} DPI.',
        {'low_dpi_images': flagged, 'sampled': inspected, 'threshold': DPI_DANGER},
    )
|
||
|
||
|
||
# Numeric stroke (/CA) or fill (/ca) alpha entry of a graphics-state dictionary.
_GS_ALPHA_RE = re.compile(r'/(?:CA|ca)\s+([0-9]*\.?[0-9]+)')
# /SMask whose value is a dictionary or an indirect reference, i.e. not /None.
_GS_SOFT_MASK_RE = re.compile(r'/SMask\s*(?:<<|\d+\s+\d+\s+R\b)')
# Any indirect reference "N G R"; group 1 is the object number.
_INDIRECT_REF_RE = re.compile(r'(\d+)\s+\d+\s+R\b')
# /ExtGState followed by either an indirect reference or an inline dictionary.
_EXTGSTATE_RE = re.compile(r'/ExtGState\s*(?:(\d+)\s+\d+\s+R\b|(<<))')
# /Resources given as an indirect reference.
_RESOURCES_REF_RE = re.compile(r'/Resources\s*(\d+)\s+\d+\s+R\b')


def _xref_source(doc: fitz.Document, xref: int) -> str:
    """Return the PDF source of object *xref*, or '' when it cannot be read."""
    try:
        return doc.xref_object(xref) or ''
    except Exception:
        # Best effort: an unreadable object simply contributes no evidence.
        return ''


def _balanced_dict(src: str, start: int) -> str:
    """Return the '<< ... >>' dictionary starting at src[start], honouring nesting."""
    depth = 0
    pos = start
    while pos < len(src):
        if src.startswith('<<', pos):
            depth += 1
            pos += 2
        elif src.startswith('>>', pos):
            depth -= 1
            pos += 2
            if depth == 0:
                return src[start:pos]
        else:
            pos += 1
    return src[start:]  # unbalanced source: inspect what is there


def _extgstate_sources(doc: fitz.Document, container_src: str) -> List[str]:
    """Collect the source text of every graphics state under /ExtGState in *container_src*."""
    sources: List[str] = []
    for match in _EXTGSTATE_RE.finditer(container_src):
        if match.group(1):
            table = _xref_source(doc, int(match.group(1)))
        else:
            table = _balanced_dict(container_src, match.start(2))
        # Inline graphics states live in the table text itself ...
        sources.append(table)
        # ... while referenced ones must be fetched individually.
        sources.extend(_xref_source(doc, int(ref)) for ref in _INDIRECT_REF_RE.findall(table))
    return sources


def _page_uses_transparency(doc: fitz.Document, page_index: int) -> bool:
    """Return True when the page shows a soft-masked image or a transparent graphics state."""
    # Element 1 of each image entry is the xref of its soft mask (0 = none).
    if any(entry[1] for entry in doc.get_page_images(page_index, full=True)):
        return True

    page_src = _xref_source(doc, doc.load_page(page_index).xref)
    containers = [page_src]  # covers inline /Resources
    resources_ref = _RESOURCES_REF_RE.search(page_src)
    if resources_ref:
        containers.append(_xref_source(doc, int(resources_ref.group(1))))

    for container in containers:
        for gs_src in _extgstate_sources(doc, container):
            if _GS_SOFT_MASK_RE.search(gs_src):
                return True
            if any(float(alpha) < 1.0 for alpha in _GS_ALPHA_RE.findall(gs_src)):
                return True
    return False


def _check_transparency(doc: fitz.Document) -> Dict:
    """PP5 — detect live transparency / soft-mask usage per page.

    Live transparency on press = unpredictable colour blending unless
    explicitly flattened. A page is counted when any of these holds:

    * an image on the page carries a soft mask;
    * a graphics state in the page's /ExtGState resources sets a stroke or
      fill alpha (/CA, /ca) below 1;
    * such a graphics state has an /SMask other than /None.

    The page's /Resources are followed whether inline or an indirect
    reference. Known limits of this heuristic: resources inherited from a
    parent /Pages node and resources private to form XObjects are not
    inspected, and blend modes are not evaluated.

    Args:
        doc: Open PyMuPDF document.

    Returns:
        Criterion dict built by ``_criterion``. Fails when at least one page
        uses transparency; the detail then holds the count under
        'transparent_pages_count' and the 1-based page numbers under
        'transparent_pages'.
    """
    transparent_pages = [
        i + 1 for i in range(doc.page_count) if _page_uses_transparency(doc, i)
    ]

    if not transparent_pages:
        return _criterion(
            'PP5', 'Transparency / overprint', True,
            'No transparency or soft-mask usage detected.',
        )
    return _criterion(
        'PP5', 'Transparency / overprint', False,
        f'{len(transparent_pages)} of {doc.page_count} pages use transparency / soft-masks.',
        {
            'transparent_pages_count': len(transparent_pages),
            'transparent_pages': transparent_pages,
        },
    )
|
||
|
||
|
||
def _check_pdfx_conformance(doc: fitz.Document) -> Dict:
    """PP6 — look for a PDF/X conformance declaration in the XMP metadata.

    PDF/X is the print-industry conformance standard (PDF/X-1a, 3, 4).
    The check passes when the XMP packet mentions
    ``pdfxid:GTS_PDFXVersion`` or ``pdfx:GTS_PDFXVersion``. The declared
    version is then read from either serialisation XMP allows:

    * element form:   ``<pdfxid:GTS_PDFXVersion>PDF/X-4</pdfxid:GTS_PDFXVersion>``
    * attribute form: ``pdfxid:GTS_PDFXVersion="PDF/X-4"``

    Only the XMP stream is consulted; this is a declaration check, not a
    validation of actual conformance.

    Args:
        doc: Open PyMuPDF document.

    Returns:
        Criterion dict built by ``_criterion``. Fails when there is no XMP
        stream or no PDF/X declaration in it.
    """
    try:
        xmp = doc.get_xml_metadata() or ''
    except Exception:
        # Unreadable metadata is treated the same as absent metadata.
        xmp = ''

    if not xmp:
        return _criterion(
            'PP6', 'PDF/X conformance', False,
            'No XMP metadata stream found.',
        )

    if not re.search(r'pdfxid:GTS_PDFXVersion|pdfx:GTS_PDFXVersion', xmp):
        return _criterion(
            'PP6', 'PDF/X conformance', False,
            'No PDF/X conformance flag in XMP metadata.',
        )

    version = ''
    # Attribute form first: in that form the element regex below would
    # otherwise pick up the whitespace that follows the tag.
    as_attribute = re.search(r'GTS_PDFXVersion\s*=\s*(["\'])(.*?)\1', xmp)
    if as_attribute:
        version = as_attribute.group(2).strip()
    else:
        # Opening tag only (not '</…>'), and not a self-closing '…/>' tag.
        as_element = re.search(
            r'<(?:[\w.-]+:)?GTS_PDFXVersion\b[^>]*(?<!/)>([^<]+)<', xmp)
        if as_element:
            version = as_element.group(1).strip()

    return _criterion(
        'PP6', 'PDF/X conformance', True,
        f'PDF/X conformance declared: {version or "(version not parsed)"}',
    )
|
||
|
||
|
||
def _check_spot_colors(doc: fitz.Document) -> Dict:
    """PP7 — flag /Separation and /DeviceN colour spaces.

    /Separation is a single spot colour (e.g. Pantone); /DeviceN is a
    multi-channel spot space. Spot colours are print-meaningful but require
    explicit handling on press; presence is flagged so the reviewer can
    confirm the spot list is intentional.

    The scan reads raw object source text for xrefs 1..999 only, to bound
    the cost on large files. When the document has more objects than that,
    the note and the detail say so, because a clean result then covers only
    part of the file.

    Args:
        doc: Open PyMuPDF document.

    Returns:
        Criterion dict built by ``_criterion``. Passes when nothing was
        found. On failure ``detail['spot_spaces']`` lists each distinct
        /Separation name in first-seen order, plus a single
        'DeviceN(multi-spot)' entry if any /DeviceN space was seen.
    """
    max_xref_exclusive = 1000
    xref_total = doc.xref_length()
    scan_limit = min(xref_total, max_xref_exclusive)
    truncated = xref_total > max_xref_exclusive

    device_n_label = 'DeviceN(multi-spot)'
    found_spaces: List[str] = []
    device_n_seen = False

    for xref in range(1, scan_limit):
        try:
            obj = doc.xref_object(xref)
        except Exception:
            # Unreadable object: skip it, keep scanning.
            continue
        if '/Separation' in obj:
            # One object (e.g. a /ColorSpace resource dict) may define
            # several separations, so collect every name, not just the first.
            for name in re.findall(r'/Separation\s*/([A-Za-z0-9_#=-]+)', obj):
                if name not in found_spaces:
                    found_spaces.append(name)
        if '/DeviceN' in obj and not device_n_seen:
            # Record the label once; a dedicated flag avoids comparing the
            # bare token against the decorated label.
            device_n_seen = True
            found_spaces.append(device_n_label)

    scan_detail: Dict = {
        'xrefs_scanned': max(scan_limit - 1, 0),
        'scan_truncated': truncated,
    }
    coverage_hint = (
        f' (only the first {max(scan_limit - 1, 0)} of {max(xref_total - 1, 0)} objects were scanned)'
        if truncated else ''
    )

    if not found_spaces:
        return _criterion(
            'PP7', 'Spot colour usage', True,
            'No spot colour spaces detected — pure CMYK/RGB/Gray.' + coverage_hint,
            scan_detail if truncated else None,
        )
    return _criterion(
        'PP7', 'Spot colour usage', False,
        f'{len(found_spaces)} spot colour spaces detected — confirm spot list is intentional.'
        + coverage_hint,
        {'spot_spaces': found_spaces, **scan_detail},
    )
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Top-level entry
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def axa_print_preflight(ingest_result: Dict, scope_args: Optional[Dict] = None) -> Dict:
    """Run all seven deterministic print-preflight criteria on the ingested PDF.

    Args:
        ingest_result: Ingest output; only its 'pdf_path' entry is read here.
        scope_args: Accepted for the dispatcher's call signature; not used
            by this check.

    Returns:
        Result dict with 'check_name', 'scope', 'score' (0–10, share of
        criteria passed), 'pass' (True only if every criterion passed),
        'summary', 'findings' and 'response' (summary plus one line per
        criterion). If 'pdf_path' is missing or the file cannot be opened,
        an error result with score 0.0 is returned instead. Exceptions
        raised by an individual criterion propagate after the document has
        been closed.
    """

    def _failure(summary_text: str, error_value: str) -> Dict:
        # Shared shape for the two "could not even start" outcomes.
        return {
            'check_name': 'axa_print_preflight',
            'scope': 'document',
            'score': 0.0,
            'pass': False,
            'summary': summary_text,
            'findings': {'error': error_value},
            'response': '',
        }

    pdf_path = ingest_result.get('pdf_path')
    if not pdf_path:
        return _failure(
            'Cannot run — pdf_path missing from ingest_result.',
            'pdf_path_missing',
        )

    try:
        doc = fitz.open(pdf_path)
    except Exception as exc:
        return _failure(f'Failed to open PDF: {exc}', str(exc))

    try:
        # Order defines the PP1..PP7 order of the report.
        checks = (
            _check_page_geometry,
            _check_bleed_defined,
            _check_image_colorspaces,
            _check_image_dpi,
            _check_transparency,
            _check_pdfx_conformance,
            _check_spot_colors,
        )
        criteria = [check(doc) for check in checks]
    finally:
        doc.close()

    total = len(criteria)
    n_passed = sum(1 for item in criteria if item['passed'])
    n_failed = total - n_passed
    score = round((n_passed / total) * 10, 2) if total else 0.0
    pass_flag = n_failed == 0

    if pass_flag:
        summary = f'All {total} print-preflight criteria passed — print-ready.'
    elif n_failed <= 2:
        summary = f'{n_failed} of {total} criteria failed — likely digital-intent or minor preflight gaps.'
    else:
        summary = f'{n_failed} of {total} criteria failed — not print-ready as-is.'

    report_lines = [summary, '']
    report_lines.extend(
        f" {'✓' if item['passed'] else '✗'} {item['code']} — {item['title']}: {item['note']}"
        for item in criteria
    )

    return {
        'check_name': 'axa_print_preflight',
        'scope': 'document',
        'score': score,
        'pass': pass_flag,
        'summary': summary,
        'findings': {
            'criteria': criteria,
            'criteria_total': total,
            'criteria_passed': n_passed,
            'criteria_failed': n_failed,
        },
        'response': '\n'.join(report_lines),
    }
|