"""Print preflight checks — "is this PDF print-ready?".

Deterministic Python implementation using PyMuPDF. Covers the high-impact
preflight signals that catch the most common press surprises without
requiring Ghostscript or veraPDF (PDF/X) tooling.

Criteria checked:
  • PP1  Page geometry consistency — every page has the same MediaBox size
  • PP2  Bleed area defined — TrimBox/BleedBox differ from MediaBox
  • PP3  Image colour spaces — flag RGB images (press wants CMYK/Gray)
  • PP4  Image effective DPI — flag images rendering below 150 DPI
  • PP5  Transparency / overprint — flag pages using transparency (smask, ExtGState)
  • PP6  PDF/X conformance — XMP declares pdfxid:GTS_PDFXVersion or pdfx:GTS_*
         (falls back to the Info-dictionary GTS_PDFXVersion key)
  • PP7  Spot colour usage — flag /Separation or /DeviceN colour spaces (Pantone)

Phase-5 scope is "is it print-ready?" — simple yes/no with drill-down. Future
expansion (Ghostscript-based total ink coverage, registration black,
crop-mark detection, full PDF/X conformance) goes here when scope grows.

Note: many AXA policy PDFs are digital-intent (no bleed, RGB OK). For those,
several of these criteria will fail — that's correct, not a bug. The check
surfaces the data; the reviewer judges whether print-readiness is required.
"""

from __future__ import annotations

import math
import re
from typing import Dict, List, Optional, Tuple

import fitz  # PyMuPDF

# DPI thresholds (industry conventions)
DPI_OFFSET_MIN = 300     # commercial offset / glossy stock
DPI_NEWSPRINT_MIN = 150  # newspaper / coated stock
DPI_DANGER = 150         # below this, we flag as definite risk

# Unit conversions: PDF user space is 72 points per inch; 1 inch = 25.4 mm.
_POINTS_PER_INCH = 72.0
_MM_PER_POINT = 25.4 / _POINTS_PER_INCH

# Upper bound on /Parent hops when looking for inherited /Resources; guards
# against cyclic page trees in damaged files.
_MAX_PARENT_DEPTH = 32

# Patterns applied to the textual object source returned by xref_object().
_INDIRECT_REF_RE = re.compile(r'(\d+)\s+\d+\s+R\b')
_PARENT_REF_RE = re.compile(r'/Parent\s+(\d+)\s+\d+\s+R\b')
_INFO_REF_RE = re.compile(r'/Info\s+(\d+)\s+\d+\s+R\b')
# /CA = stroking alpha, /ca = non-stroking alpha (case-sensitive PDF names).
_ALPHA_RE = re.compile(r'/(?:CA|ca)\s+([+-]?(?:\d+\.?\d*|\.\d+))')
# "/SMask /None" explicitly switches soft-masking off, so it is not a hit.
_SOFT_MASK_RE = re.compile(r'/SMask\b(?!\s*/None\b)')
# Normal / Compatible blend modes do not composite with the backdrop.
_BLEND_MODE_RE = re.compile(r'/BM\b(?!\s*/(?:Normal|Compatible)\b)')

_PDFX_XMP_FLAG_RE = re.compile(r'pdfxid:GTS_PDFXVersion|pdfx:GTS_PDFXVersion')
_PDFX_XMP_ATTR_RE = re.compile(r'GTS_PDFXVersion\s*=\s*(["\'])(.*?)\1')
_PDFX_XMP_ELEM_RE = re.compile(r'GTS_PDFXVersion[^>]*>([^<]+)<')
_PDFX_INFO_RE = re.compile(r'/GTS_PDFXVersion\s*\(([^)]*)\)')

# A PDF name runs until whitespace or a delimiter character.
_SEPARATION_NAME_RE = re.compile(r'/Separation\s*/([^\s/\[\]<>(){}%]+)')
_NAME_ESCAPE_RE = re.compile(r'#([0-9A-Fa-f]{2})')
# Separation colorants that are not spot inks: /All (every plate, used for
# registration marks), /None (no output) and the four process colorants.
_NON_SPOT_COLORANTS = frozenset(
    {'All', 'None', 'Cyan', 'Magenta', 'Yellow', 'Black'}
)


def _criterion(code: str, title: str, passed: bool, note: str = '',
               detail: Optional[Dict] = None) -> Dict:
    """Build the result record for one preflight criterion.

    Args:
        code: Short criterion identifier, e.g. 'PP1'.
        title: Human-readable criterion name.
        passed: Whether the criterion is satisfied.
        note: One-line explanation shown to the reviewer.
        detail: Optional drill-down data; an empty dict is stored when omitted.

    Returns:
        Dict with keys 'code', 'title', 'passed', 'note' and 'detail'.
    """
    return {
        'code': code,
        'title': title,
        'passed': passed,
        'note': note,
        'detail': detail or {},
    }


# ─────────────────────────────────────────────────────────────────────────────
# Low-level helpers (textual PDF object inspection)
# ─────────────────────────────────────────────────────────────────────────────

def _xref_text(doc: fitz.Document, xref: int) -> str:
    """Return the source text of object *xref*, or '' if it cannot be read.

    Best-effort by design: a damaged or out-of-range object must not abort
    the whole preflight run, so every failure maps to an empty string.
    """
    try:
        return doc.xref_object(xref) or ''
    except Exception:
        return ''


def _balanced_dict(text: str, start: int) -> str:
    """Return the ``<< ... >>`` dictionary whose opening ``<<`` is at *start*.

    Nested dictionaries are tracked by depth. If the text ends before the
    dictionary closes, everything from *start* onwards is returned.
    """
    depth = 0
    pos = start
    end = len(text)
    while pos < end:
        if text.startswith('<<', pos):
            depth += 1
            pos += 2
        elif text.startswith('>>', pos):
            depth -= 1
            pos += 2
            if depth <= 0:
                return text[start:pos]
        else:
            pos += 1
    return text[start:]


def _dict_entry_text(doc: fitz.Document, owner_text: str, key: str) -> str:
    """Return the dictionary stored under */key* in *owner_text*.

    Handles both forms the value can take: an indirect reference
    (``/Key 12 0 R`` — the referenced object is loaded) and an inline
    dictionary (``/Key << ... >>``). Returns '' when the key is absent or
    has any other value type.
    """
    entry = re.search(
        r'/' + re.escape(key) + r'\b\s*(?:(\d+)\s+\d+\s+R\b|(<<))',
        owner_text,
    )
    if entry is None:
        return ''
    if entry.group(1) is not None:
        return _xref_text(doc, int(entry.group(1)))
    return _balanced_dict(owner_text, entry.start(2))


def _page_resources_text(doc: fitz.Document, page_xref: int) -> str:
    """Return the /Resources dictionary text that applies to a page.

    /Resources is an inheritable page attribute, so when the page object
    itself has none the /Parent chain is followed (bounded by
    _MAX_PARENT_DEPTH).
    """
    node = _xref_text(doc, page_xref)
    for _ in range(_MAX_PARENT_DEPTH):
        if not node:
            break
        resources = _dict_entry_text(doc, node, 'Resources')
        if resources:
            return resources
        parent = _PARENT_REF_RE.search(node)
        if parent is None:
            break
        node = _xref_text(doc, int(parent.group(1)))
    return ''


def _page_gstate_text(doc: fitz.Document, page_xref: int) -> str:
    """Return the concatenated text of all graphics states a page declares.

    Covers the /ExtGState dictionary itself (inline states) plus every
    object it references (indirect states).
    """
    resources = _page_resources_text(doc, page_xref)
    gstates = _dict_entry_text(doc, resources, 'ExtGState') if resources else ''
    if not gstates:
        return ''
    parts = [gstates]
    parts.extend(
        _xref_text(doc, int(ref)) for ref in _INDIRECT_REF_RE.findall(gstates)
    )
    return '\n'.join(parts)


def _gstate_is_transparent(text: str) -> bool:
    """Return True if graphics-state text sets alpha < 1, a soft mask or a
    non-normal blend mode."""
    if any(float(m.group(1)) < 1.0 for m in _ALPHA_RE.finditer(text)):
        return True
    return bool(_SOFT_MASK_RE.search(text) or _BLEND_MODE_RE.search(text))


def _placed_size_pts(info: Dict) -> Optional[Tuple[float, float]]:
    """Return the (width, height) in points at which an image is placed.

    Prefers the placement matrix reported by PyMuPDF, which maps the unit
    square onto the page: the lengths of its two column vectors are the true
    rendered edge lengths even when the image is rotated. The axis-aligned
    bbox is only a fallback, because for rotated images it mixes up or
    inflates the edges and therefore understates the DPI.

    Returns None when no positive size can be derived.
    """
    transform = info.get('transform')
    if transform is not None:
        try:
            a, b, c, d = (float(v) for v in tuple(transform)[:4])
        except (TypeError, ValueError):
            pass
        else:
            width, height = math.hypot(a, b), math.hypot(c, d)
            if width > 0 and height > 0:
                return width, height

    bbox = info.get('bbox')
    if not bbox:
        return None
    width, height = bbox[2] - bbox[0], bbox[3] - bbox[1]
    if width <= 0 or height <= 0:
        return None
    return width, height


def _decode_pdf_name(raw: str) -> str:
    """Decode ``#xx`` hex escapes in a PDF name (e.g. ``#20`` → space)."""
    return _NAME_ESCAPE_RE.sub(lambda m: chr(int(m.group(1), 16)), raw)


def _info_dict_pdfx_version(doc: fitz.Document) -> str:
    """Return GTS_PDFXVersion from the document Info dictionary, or ''.

    PDF/X-1a and PDF/X-3 files declare conformance here rather than in XMP.
    Only literal-string values are recognised.
    """
    try:
        trailer = doc.pdf_trailer() or ''
    except Exception:
        # Best-effort: an unreadable trailer simply means "not declared".
        return ''
    info_ref = _INFO_REF_RE.search(trailer)
    if info_ref is None:
        return ''
    declared = _PDFX_INFO_RE.search(_xref_text(doc, int(info_ref.group(1))))
    return declared.group(1).strip() if declared else ''


# ─────────────────────────────────────────────────────────────────────────────
# Criterion implementations
# ─────────────────────────────────────────────────────────────────────────────

def _check_page_geometry(doc: fitz.Document) -> Dict:
    """PP1 — every page should have the same MediaBox dimensions.

    Mixed page sizes are valid PDF but a press red flag — usually an
    authoring error. Landscape and portrait pages of the same sheet size are
    treated as equal. A document without pages fails the criterion.
    """
    if doc.page_count == 0:
        return _criterion(
            'PP1', 'Page geometry consistency', False,
            'Document has no pages.',
        )

    sizes = set()
    for i in range(doc.page_count):
        box = doc.load_page(i).mediabox
        # Normalise orientation — landscape vs portrait of same size = same
        sizes.add(tuple(sorted((round(box.width, 1), round(box.height, 1)))))
    distinct = sorted(sizes)

    if len(distinct) == 1:
        w, h = distinct[0]
        # Convert to mm for human readability
        w_mm = round(w * _MM_PER_POINT, 1)
        h_mm = round(h * _MM_PER_POINT, 1)
        return _criterion(
            'PP1', 'Page geometry consistency', True,
            f'All {doc.page_count} pages are {w_mm} × {h_mm} mm.',
        )
    return _criterion(
        'PP1', 'Page geometry consistency', False,
        f'{len(distinct)} different page sizes found across {doc.page_count} pages.',
        {'distinct_sizes_pts': [list(s) for s in distinct]},
    )


def _box_size_differs(box, reference) -> bool:
    """Return True if *box* and *reference* differ in width or height
    (compared at 0.01 pt resolution)."""
    return (round(box.width, 2) != round(reference.width, 2)
            or round(box.height, 2) != round(reference.height, 2))


def _check_bleed_defined(doc: fitz.Document) -> Dict:
    """PP2 — at least one page must declare trim/bleed geometry.

    For print, BleedBox should extend ~3mm beyond TrimBox, and TrimBox
    should be inset from MediaBox. If MediaBox == TrimBox == BleedBox, no
    bleed has been authored — page edge artwork will white out on press.

    Heuristic: a page counts as "bleed defined" when its TrimBox or BleedBox
    differs in size from its MediaBox. The criterion fails only when no page
    does.
    """
    # NOTE(review): PyMuPDF appears to report the CropBox for TrimBox /
    # BleedBox when the page does not define them, so a page with only a
    # reduced CropBox would also count here — confirm against the PyMuPDF
    # Page documentation.
    pages_with_bleed = 0
    for i in range(doc.page_count):
        page = doc.load_page(i)
        media = page.mediabox
        # Compare box dimensions — even sub-mm differences count
        if (_box_size_differs(page.trimbox, media)
                or _box_size_differs(page.bleedbox, media)):
            pages_with_bleed += 1

    if pages_with_bleed == 0:
        return _criterion(
            'PP2', 'Bleed area defined', False,
            'No page has TrimBox/BleedBox different from MediaBox — bleed not authored.',
        )
    return _criterion(
        'PP2', 'Bleed area defined', True,
        f'{pages_with_bleed} of {doc.page_count} pages have bleed/trim authored.',
        {'pages_with_bleed': pages_with_bleed},
    )


def _check_image_colorspaces(doc: fitz.Document) -> Dict:
    """PP3 — count images by colour space and fail if any is DeviceRGB.

    RGB images go through driver-side conversion on press, with risk of
    colour shift. CMYK / DeviceGray / Indexed (palette) are press-safe.
    An image used on several pages is counted once per page.
    """
    # NOTE(review): only the literal 'DeviceRGB' name is flagged; ICC-profiled
    # RGB images are presumably reported under a different name and would
    # pass — confirm whether they should be flagged too.
    cs_counts: Dict[str, int] = {}
    rgb_pages: List[int] = []
    for i in range(doc.page_count):
        page_has_rgb = False
        for img in doc.get_page_images(i, full=True):
            cs = img[5] or 'Unknown'  # item 5 = colour space name
            cs_counts[cs] = cs_counts.get(cs, 0) + 1
            if cs == 'DeviceRGB':
                page_has_rgb = True
        if page_has_rgb:
            rgb_pages.append(i + 1)  # 1-based page numbers for reviewers
    total = sum(cs_counts.values())

    if total == 0:
        return _criterion(
            'PP3', 'Image colour spaces', True,
            'No raster images — colour-space risk does not apply.',
        )

    rgb_count = cs_counts.get('DeviceRGB', 0)
    cmyk_count = cs_counts.get('DeviceCMYK', 0)
    gray_count = cs_counts.get('DeviceGray', 0)

    if rgb_count > 0:
        return _criterion(
            'PP3', 'Image colour spaces', False,
            f'{rgb_count} of {total} images are DeviceRGB — press will perform colour conversion.',
            {'colorspace_counts': cs_counts, 'rgb_pages': rgb_pages, 'total_images': total},
        )
    return _criterion(
        'PP3', 'Image colour spaces', True,
        f'No RGB images. Breakdown: CMYK={cmyk_count}, Gray={gray_count}, '
        f'other={total - cmyk_count - gray_count}.',
        {'colorspace_counts': cs_counts, 'total_images': total},
    )


def _check_image_dpi(doc: fitz.Document) -> Dict:
    """PP4 — compute each placed image's effective DPI and flag low ones.

    Effective DPI = raw pixels / rendered inches, taken as the lower of the
    horizontal and vertical values. Any placement below DPI_DANGER (150 DPI)
    fails the criterion. Images without an xref (e.g. inline images) are not
    sampled.
    """
    low_dpi: List[Dict] = []
    sampled = 0
    for i in range(doc.page_count):
        page = doc.load_page(i)
        # Quick lookup: xref → raw pixel size (items 2/3 = width/height)
        raw_lookup: Dict[int, Tuple[int, int]] = {
            img[0]: (img[2], img[3])
            for img in doc.get_page_images(i, full=True)
        }
        for info in page.get_image_info(xrefs=True):
            xref = info.get('xref')
            if xref not in raw_lookup:
                continue
            size_pts = _placed_size_pts(info)
            if size_pts is None:
                continue
            raw_w, raw_h = raw_lookup[xref]
            width_in = size_pts[0] / _POINTS_PER_INCH
            height_in = size_pts[1] / _POINTS_PER_INCH
            effective = min(raw_w / width_in, raw_h / height_in)
            sampled += 1
            if effective < DPI_DANGER:
                low_dpi.append({
                    'page': i + 1,
                    'xref': xref,
                    'effective_dpi': round(effective, 0),
                    'raw_pixels': [raw_w, raw_h],
                    'rendered_inches': [round(width_in, 2), round(height_in, 2)],
                })

    if sampled == 0:
        return _criterion(
            'PP4', 'Image effective DPI', True,
            'No raster images to inspect.',
        )
    if low_dpi:
        return _criterion(
            'PP4', 'Image effective DPI', False,
            f'{len(low_dpi)} of {sampled} images render below {DPI_DANGER} DPI.',
            {'low_dpi_images': low_dpi, 'sampled': sampled, 'threshold': DPI_DANGER},
        )
    return _criterion(
        'PP4', 'Image effective DPI', True,
        f'All {sampled} images render at ≥ {DPI_DANGER} DPI.',
        {'sampled': sampled, 'threshold': DPI_DANGER},
    )


def _check_transparency(doc: fitz.Document) -> Dict:
    """PP5 — flag pages that use live transparency or soft masks.

    Live transparency on press = unpredictable colour blending unless
    explicitly flattened. A page is flagged when either:

    * one of its images carries a soft mask (/SMask), or
    * one of the graphics states in its /ExtGState resources sets a stroking
      or non-stroking alpha below 1, a soft mask, or a blend mode other than
      Normal/Compatible.

    Graphics states that only set non-transparency parameters (line width,
    etc.) do not count. Resources nested inside Form XObjects are not
    descended into, and overprint settings are not inspected.
    """
    transparent_pages: List[int] = []
    for i in range(doc.page_count):
        page = doc.load_page(i)
        # Item 1 of each image entry is the xref of its soft mask (0 = none).
        has_masked_image = any(
            img[1] for img in doc.get_page_images(i, full=True)
        )
        if has_masked_image or _gstate_is_transparent(
                _page_gstate_text(doc, page.xref)):
            transparent_pages.append(i + 1)

    if not transparent_pages:
        return _criterion(
            'PP5', 'Transparency / overprint', True,
            'No transparency or soft-mask usage detected.',
        )
    return _criterion(
        'PP5', 'Transparency / overprint', False,
        f'{len(transparent_pages)} of {doc.page_count} pages use transparency / soft-masks.',
        {'transparent_pages_count': len(transparent_pages),
         'transparent_pages': transparent_pages},
    )


def _check_pdfx_conformance(doc: fitz.Document) -> Dict:
    """PP6 — check whether the file declares PDF/X conformance.

    PDF/X is the print-industry conformance standard (PDF/X-1a, 3, 4).
    Looks for the XMP declaration pdfxid:GTS_PDFXVersion or
    pdfx:GTS_PDFXVersion (serialised as an element or as an attribute), and
    falls back to the GTS_PDFXVersion key of the Info dictionary. This only
    detects the declaration; it does not validate conformance.
    """
    try:
        xmp = doc.get_xml_metadata() or ''
    except Exception:
        # Best-effort: unreadable metadata is treated as absent.
        xmp = ''

    if xmp and _PDFX_XMP_FLAG_RE.search(xmp):
        # Attribute form first: for it, the element pattern would capture
        # whatever text follows the enclosing tag instead of the version.
        attr = _PDFX_XMP_ATTR_RE.search(xmp)
        elem = None if attr else _PDFX_XMP_ELEM_RE.search(xmp)
        version = (attr.group(2) if attr else elem.group(1) if elem else '').strip()
        return _criterion(
            'PP6', 'PDF/X conformance', True,
            f"PDF/X conformance declared: {version or '(version not parsed)'}",
            {'source': 'xmp'},
        )

    info_version = _info_dict_pdfx_version(doc)
    if info_version:
        return _criterion(
            'PP6', 'PDF/X conformance', True,
            f'PDF/X conformance declared: {info_version}',
            {'source': 'info'},
        )

    if not xmp:
        return _criterion(
            'PP6', 'PDF/X conformance', False,
            'No XMP metadata stream found.',
        )
    return _criterion(
        'PP6', 'PDF/X conformance', False,
        'No PDF/X conformance flag in XMP metadata.',
    )


def _check_spot_colors(doc: fitz.Document) -> Dict:
    """PP7 — report spot colour spaces used anywhere in the file.

    Looks for /Separation (single spot, e.g. Pantone) or /DeviceN (multi-
    channel spot) colour spaces in every object of the file. Spot colours
    are print-meaningful but require explicit handling on press; presence is
    flagged so the reviewer can confirm the spot list is intentional.

    Separation colorants /All, /None and the four process colorants are not
    spot inks and are ignored. DeviceN usage is reported once, as a single
    'DeviceN(multi-spot)' entry, without listing its colorants.
    """
    spot_names: Dict[str, None] = {}  # dict used as an insertion-ordered set
    has_device_n = False
    # Scan every object: stopping early would report "no spot colours" for
    # large files whose colour spaces sit late in the xref table.
    for xref in range(1, doc.xref_length()):
        obj = _xref_text(doc, xref)
        if not obj:
            continue
        if '/Separation' in obj:
            for raw_name in _SEPARATION_NAME_RE.findall(obj):
                name = _decode_pdf_name(raw_name)
                if name not in _NON_SPOT_COLORANTS:
                    spot_names.setdefault(name, None)
        if '/DeviceN' in obj:
            has_device_n = True

    found_spaces: List[str] = list(spot_names)
    if has_device_n:
        found_spaces.append('DeviceN(multi-spot)')

    if not found_spaces:
        return _criterion(
            'PP7', 'Spot colour usage', True,
            'No spot colour spaces detected — pure CMYK/RGB/Gray.',
        )
    return _criterion(
        'PP7', 'Spot colour usage', False,
        f'{len(found_spaces)} spot colour spaces detected — confirm spot list is intentional.',
        {'spot_spaces': found_spaces},
    )


# ─────────────────────────────────────────────────────────────────────────────
# Top-level entry
# ─────────────────────────────────────────────────────────────────────────────

def _error_result(summary: str, error: str) -> Dict:
    """Build the check result returned when the preflight cannot run."""
    return {
        'check_name': 'axa_print_preflight',
        'scope': 'document',
        'score': 0.0,
        'pass': False,
        'summary': summary,
        'findings': {'error': error},
        'response': '',
    }


def axa_print_preflight(ingest_result: Dict, scope_args: Optional[Dict] = None) -> Dict:
    """Run the full deterministic print-preflight check set on the ingested PDF.

    Args:
        ingest_result: Ingest output; only its 'pdf_path' entry is read.
        scope_args: Accepted for interface compatibility; not used.

    Returns:
        Dict with 'check_name', 'scope', 'score' (0–10, share of criteria
        passed), 'pass' (True only if every criterion passed), 'summary',
        'findings' (per-criterion records and counts, or an 'error' entry)
        and 'response' (plain-text report). When the file is missing, cannot
        be opened, is not a PDF or is password-protected, an error result
        with score 0.0 is returned instead of raising.
    """
    pdf_path = ingest_result.get('pdf_path')
    if not pdf_path:
        return _error_result(
            'Cannot run — pdf_path missing from ingest_result.',
            'pdf_path_missing',
        )

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        return _error_result(f'Failed to open PDF: {e}', str(e))

    try:
        # PyMuPDF also opens images, XPS, e-books, ...; the checks below read
        # PDF object structure and only make sense for accessible PDFs.
        if not doc.is_pdf:
            return _error_result('Cannot run — document is not a PDF.', 'not_a_pdf')
        if doc.needs_pass:
            return _error_result(
                'Cannot run — PDF is password-protected.', 'password_protected',
            )
        criteria = [
            _check_page_geometry(doc),
            _check_bleed_defined(doc),
            _check_image_colorspaces(doc),
            _check_image_dpi(doc),
            _check_transparency(doc),
            _check_pdfx_conformance(doc),
            _check_spot_colors(doc),
        ]
    finally:
        doc.close()

    passed = [c for c in criteria if c['passed']]
    failed = [c for c in criteria if not c['passed']]
    total = len(criteria)
    score = round((len(passed) / total) * 10, 2) if total else 0.0
    pass_flag = len(failed) == 0

    if pass_flag:
        summary = f'All {total} print-preflight criteria passed — print-ready.'
    elif len(failed) <= 2:
        summary = f'{len(failed)} of {total} criteria failed — likely digital-intent or minor preflight gaps.'
    else:
        summary = f'{len(failed)} of {total} criteria failed — not print-ready as-is.'

    response_lines = [summary, '']
    for c in criteria:
        marker = '✓' if c['passed'] else '✗'
        response_lines.append(f" {marker} {c['code']} — {c['title']}: {c['note']}")
    response = '\n'.join(response_lines)

    return {
        'check_name': 'axa_print_preflight',
        'scope': 'document',
        'score': score,
        'pass': pass_flag,
        'summary': summary,
        'findings': {
            'criteria': criteria,
            'criteria_total': total,
            'criteria_passed': len(passed),
            'criteria_failed': len(failed),
        },
        'response': response,
    }