# ai_qc/backend/document_mode/print_preflight_checks.py

"""Print preflight checks — "is this PDF print-ready?".

Deterministic Python implementation using PyMuPDF. Covers the high-impact
preflight signals that catch the most common press surprises without
requiring Ghostscript or veraPDF (PDF/X) tooling.

Criteria checked:
  • PP1  Page geometry consistency — every page has the same MediaBox size
  • PP2  Bleed area defined — TrimBox/BleedBox differ from MediaBox
  • PP3  Image colour spaces — flag RGB images (press wants CMYK/Gray)
  • PP4  Image effective DPI — flag images rendering below 150 DPI
  • PP5  Transparency / overprint — flag pages using transparency (smask, ExtGState)
  • PP6  PDF/X conformance — XMP declares pdfxid:GTS_PDFXVersion or pdfx:GTS_*
  • PP7  Spot colour usage — flag /Separation or /DeviceN colour spaces (Pantone)

Phase-5 scope is "is it print-ready?" — simple yes/no with drill-down.
Future expansion (Ghostscript-based total ink coverage, registration black,
crop-mark detection, full PDF/X conformance) goes here when scope grows.

Note: many AXA policy PDFs are digital-intent (no bleed, RGB OK). For those,
several of these criteria will fail — that's correct, not a bug. The check
surfaces the data; the reviewer judges whether print-readiness is required.
"""

from __future__ import annotations

import re
from typing import Dict, List, Optional, Tuple

import fitz  # PyMuPDF


# DPI thresholds (industry conventions), in pixels per inch of placed size.
# Of the three, only DPI_DANGER is read by the code visible in this file
# (_check_image_dpi); the other two are not referenced here.
# NOTE(review): "coated stock" on DPI_NEWSPRINT_MIN may have been meant as
# "uncoated stock" — confirm with the print team before relying on the label.
DPI_OFFSET_MIN = 300   # commercial offset / glossy stock
DPI_NEWSPRINT_MIN = 150  # newspaper / coated stock
DPI_DANGER = 150       # below this, we flag as definite risk


def _criterion(code: str, title: str, passed: bool, note: str = '', detail: Optional[Dict] = None) -> Dict:
    """Assemble the result record for a single preflight criterion.

    Args:
        code: Short criterion identifier, e.g. ``'PP1'``.
        title: Human-readable criterion name.
        passed: Whether the criterion was satisfied.
        note: One-line explanation shown next to the verdict.
        detail: Optional drill-down data; any falsy value is replaced by a
            fresh empty dict.

    Returns:
        A dict with the keys ``code``, ``title``, ``passed``, ``note`` and
        ``detail``, in that order.
    """
    record: Dict = dict(code=code, title=title, passed=passed, note=note)
    record['detail'] = detail if detail else {}
    return record


# ─────────────────────────────────────────────────────────────────────────────
# Criterion implementations
# ─────────────────────────────────────────────────────────────────────────────


def _check_page_geometry(doc: fitz.Document) -> Dict:
    """PP1 — verify that all pages share one MediaBox size.

    Each page's MediaBox width and height are rounded to 0.1 pt and stored
    as a (shorter, longer) pair, so a portrait page and a landscape page of
    the same sheet size count as the same size.

    Args:
        doc: The opened document to inspect.

    Returns:
        A PP1 criterion record. It passes only when exactly one distinct
        size is found; the failing record lists every distinct size (in
        points) under ``detail['distinct_sizes_pts']``.
    """
    seen_sizes = set()
    for page_index in range(doc.page_count):
        box = doc.load_page(page_index).mediabox
        sides = (round(box.width, 1), round(box.height, 1))
        # Sorting the two sides makes the comparison orientation-agnostic.
        seen_sizes.add(tuple(sorted(sides)))
    unique_sizes = sorted(seen_sizes)

    if len(unique_sizes) != 1:
        return _criterion(
            'PP1', 'Page geometry consistency', False,
            f'{len(unique_sizes)} different page sizes found across {doc.page_count} pages.',
            {'distinct_sizes_pts': [list(size) for size in unique_sizes]},
        )

    short_pt, long_pt = unique_sizes[0]
    # Points → millimetres for the human-readable note (1 pt = 0.3528 mm).
    short_mm = round(short_pt * 0.3528, 1)
    long_mm = round(long_pt * 0.3528, 1)
    return _criterion(
        'PP1', 'Page geometry consistency', True,
        f'All {doc.page_count} pages are {short_mm} × {long_mm} mm.',
    )


def _check_bleed_defined(doc: fitz.Document) -> Dict:
    """PP2 — report whether any page carries trim/bleed geometry.

    A page counts as "bleed authored" when the width or height of its
    TrimBox or BleedBox (each rounded to 0.01 pt) differs from the
    corresponding MediaBox dimension. Only sizes are compared, not box
    positions. The criterion passes as soon as at least one page qualifies.

    NOTE(review): PyMuPDF is believed to fall back to the CropBox when a
    page has no explicit TrimBox/BleedBox — confirm. If so, a page that is
    merely cropped would also be counted here.

    Args:
        doc: The opened document to inspect.

    Returns:
        A PP2 criterion record; the passing record stores the number of
        qualifying pages under ``detail['pages_with_bleed']``.
    """
    def _size(box) -> Tuple[float, float]:
        # Width/height pair rounded so that sub-0.01 pt noise is ignored.
        return round(box.width, 2), round(box.height, 2)

    authored_pages = 0
    for page_index in range(doc.page_count):
        page = doc.load_page(page_index)
        media_size = _size(page.mediabox)
        if _size(page.trimbox) != media_size or _size(page.bleedbox) != media_size:
            authored_pages += 1

    if authored_pages:
        return _criterion(
            'PP2', 'Bleed area defined', True,
            f'{authored_pages} of {doc.page_count} pages have bleed/trim authored.',
            {'pages_with_bleed': authored_pages},
        )
    return _criterion(
        'PP2', 'Bleed area defined', False,
        'No page has TrimBox/BleedBox different from MediaBox — bleed not authored.',
    )


def _check_image_colorspaces(doc: fitz.Document) -> Dict:
    """PP3 — tally image colour spaces and fail if any image is DeviceRGB.

    Every entry returned by ``doc.get_page_images(i, full=True)`` is counted
    under its colour-space name (element 5 of the entry; an empty name is
    counted as ``'Unknown'``). Only the exact name ``'DeviceRGB'`` triggers
    a failure.

    Args:
        doc: The opened document to inspect.

    Returns:
        A PP3 criterion record. When images exist, ``detail`` carries the
        per-colour-space counts and the image total; the failing record
        additionally lists the 1-based pages that hold a DeviceRGB image.
    """
    tally: Dict[str, int] = {}
    pages_with_rgb: List[int] = []
    for page_index in range(doc.page_count):
        page_has_rgb = False
        for entry in doc.get_page_images(page_index, full=True):
            space = entry[5] if entry[5] else 'Unknown'
            tally[space] = tally.get(space, 0) + 1
            if space == 'DeviceRGB':
                page_has_rgb = True
        # Pages are visited in order, so each page number is appended once.
        if page_has_rgb:
            pages_with_rgb.append(page_index + 1)

    image_total = sum(tally.values())
    if not image_total:
        return _criterion(
            'PP3', 'Image colour spaces', True,
            'No raster images — colour-space risk does not apply.',
        )

    rgb_total = tally.get('DeviceRGB', 0)
    if rgb_total > 0:
        return _criterion(
            'PP3', 'Image colour spaces', False,
            f'{rgb_total} of {image_total} images are DeviceRGB — press will perform colour conversion.',
            {'colorspace_counts': tally, 'rgb_pages': pages_with_rgb, 'total_images': image_total},
        )

    cmyk_total = tally.get('DeviceCMYK', 0)
    gray_total = tally.get('DeviceGray', 0)
    return _criterion(
        'PP3', 'Image colour spaces', True,
        f'No RGB images. Breakdown: CMYK={cmyk_total}, Gray={gray_total}, '
        f'other={image_total - cmyk_total - gray_total}.',
        {'colorspace_counts': tally, 'total_images': image_total},
    )


def _check_image_dpi(doc: fitz.Document) -> Dict:
    """PP4 — flag placed images whose effective resolution is below DPI_DANGER.

    For every placement reported by ``page.get_image_info(xrefs=True)`` the
    stored pixel size (looked up by xref from ``doc.get_page_images``) is
    divided by the placement's bounding-box size in inches (points / 72).
    The smaller of the horizontal and vertical results is the effective DPI.
    Placements without a matching xref, without a bbox, or with a
    non-positive bbox dimension are skipped and not counted.

    Args:
        doc: The opened document to inspect.

    Returns:
        A PP4 criterion record. The failing record lists each offending
        placement (page, xref, effective DPI, pixel size, placed size in
        inches) under ``detail['low_dpi_images']``.
    """
    points_per_inch = 72.0
    offenders: List[Dict] = []
    inspected = 0
    for page_index in range(doc.page_count):
        page = doc.load_page(page_index)
        # xref → (pixel width, pixel height) for the images on this page.
        pixel_sizes: Dict[int, Tuple[int, int]] = {
            entry[0]: (entry[2], entry[3])
            for entry in doc.get_page_images(page_index, full=True)
        }
        for placement in page.get_image_info(xrefs=True):
            xref = placement.get('xref')
            bbox = placement.get('bbox')
            if not bbox or xref not in pixel_sizes:
                continue
            inches_wide = (bbox[2] - bbox[0]) / points_per_inch
            inches_high = (bbox[3] - bbox[1]) / points_per_inch
            if inches_wide <= 0 or inches_high <= 0:
                continue
            pixels_wide, pixels_high = pixel_sizes[xref]
            # The weaker axis decides how the image will look in print.
            effective = min(pixels_wide / inches_wide, pixels_high / inches_high)
            inspected += 1
            if effective < DPI_DANGER:
                offenders.append({
                    'page': page_index + 1,
                    'xref': xref,
                    'effective_dpi': round(effective, 0),
                    'raw_pixels': [pixels_wide, pixels_high],
                    'rendered_inches': [round(inches_wide, 2), round(inches_high, 2)],
                })

    if inspected == 0:
        return _criterion(
            'PP4', 'Image effective DPI', True,
            'No raster images to inspect.',
        )
    if not offenders:
        return _criterion(
            'PP4', 'Image effective DPI', True,
            f'All {inspected} images render at ≥ {DPI_DANGER} DPI.',
            {'sampled': inspected, 'threshold': DPI_DANGER},
        )
    return _criterion(
        'PP4', 'Image effective DPI', False,
        f'{len(offenders)} of {inspected} images render below {DPI_DANGER} DPI.',
        {'low_dpi_images': offenders, 'sampled': inspected, 'threshold': DPI_DANGER},
    )


# Numeric operand of a stroking (/CA) or non-stroking (/ca) alpha entry.
_ALPHA_ENTRY_RE = re.compile(r'/(?:CA|ca)\s+([0-9.+-]+)')
# A /SMask entry whose value is anything other than the name /None.
_ACTIVE_SMASK_RE = re.compile(r'/SMask\b\s*(?!/None)\S')
# Object number of an indirect reference ("N G R").
_INDIRECT_REF_RE = re.compile(r'(\d+)\s+\d+\s+R\b')
# Upper bound on /Parent hops when looking for inherited /Resources.
_MAX_PARENT_HOPS = 16


def _xref_text(doc: fitz.Document, xref: int) -> str:
    """Return the source text of object *xref*, or '' if it cannot be read."""
    try:
        return doc.xref_object(xref)
    except Exception:  # best-effort scan: an unreadable object is skipped
        return ''


def _inline_dict(text: str, start: int) -> str:
    """Return the balanced ``<<…>>`` span of *text* that begins at *start*."""
    depth = 0
    pos = start
    while pos < len(text) - 1:
        token = text[pos:pos + 2]
        if token == '<<':
            depth += 1
            pos += 2
        elif token == '>>':
            depth -= 1
            pos += 2
            if depth == 0:
                return text[start:pos]
        else:
            pos += 1
    # Unbalanced input: fall back to everything after the opening marker.
    return text[start:]


def _dict_under(doc: fitz.Document, text: str, key: str) -> str:
    """Return the dictionary stored under *key* in *text* ('' if absent).

    Handles both an inline ``<<…>>`` value and a single indirect reference.
    """
    hit = re.search(re.escape(key) + r'\b\s*(?:(\d+)\s+\d+\s+R\b|<<)', text)
    if hit is None:
        return ''
    if hit.group(1):
        return _xref_text(doc, int(hit.group(1)))
    return _inline_dict(text, hit.end() - 2)


def _page_resources_text(doc: fitz.Document, page_text: str) -> str:
    """Return a page's /Resources dictionary, walking /Parent if inherited."""
    node = page_text
    for _ in range(_MAX_PARENT_HOPS):
        resources = _dict_under(doc, node, '/Resources')
        if resources:
            return resources
        parent = re.search(r'/Parent\s+(\d+)\s+\d+\s+R\b', node)
        if parent is None:
            return ''
        node = _xref_text(doc, int(parent.group(1)))
    return ''


def _gstate_is_transparent(text: str) -> bool:
    """True if *text* sets an alpha below 1.0 or an active soft mask."""
    if _ACTIVE_SMASK_RE.search(text):
        return True
    for raw in _ALPHA_ENTRY_RE.findall(text):
        try:
            if float(raw) < 1.0:
                return True
        except ValueError:
            # Not a well-formed number; ignore this entry.
            continue
    return False


def _page_uses_transparency(doc: fitz.Document, page_index: int) -> bool:
    """True if the page has a soft-masked image or a transparent ExtGState."""
    # Element 1 of each image entry is the soft-mask xref (0 when absent).
    if any(entry[1] for entry in doc.get_page_images(page_index, full=True)):
        return True
    page_text = _xref_text(doc, doc.load_page(page_index).xref)
    gstates = _dict_under(doc, _page_resources_text(doc, page_text), '/ExtGState')
    if not gstates:
        return False
    # Graphics states may be written inline or as indirect references.
    if _gstate_is_transparent(gstates):
        return True
    return any(
        _gstate_is_transparent(_xref_text(doc, int(ref)))
        for ref in set(_INDIRECT_REF_RE.findall(gstates))
    )


def _check_transparency(doc: fitz.Document) -> Dict:
    """PP5 — flag pages that use live transparency.

    A page is flagged when either
      • one of its images references a soft mask, or
      • a graphics state in its /ExtGState resources sets /CA or /ca below
        1.0, or sets /SMask to something other than /None.

    The page's /Resources are resolved whether they are inline, referenced
    indirectly, or inherited through /Parent. The mere presence of an
    /ExtGState dictionary no longer counts, because graphics states also
    carry settings unrelated to transparency.

    Known limits of this heuristic: blend modes (/BM) and resources nested
    inside Form XObjects are not inspected.

    Args:
        doc: The opened document to inspect.

    Returns:
        A PP5 criterion record. The failing record carries the number of
        flagged pages (``transparent_pages_count``) and their 1-based page
        numbers (``transparent_pages``) in ``detail``.
    """
    flagged_pages = [
        page_index + 1
        for page_index in range(doc.page_count)
        if _page_uses_transparency(doc, page_index)
    ]
    transparent_pages = len(flagged_pages)

    if transparent_pages == 0:
        return _criterion(
            'PP5', 'Transparency / overprint', True,
            'No transparency or soft-mask usage detected.',
        )
    return _criterion(
        'PP5', 'Transparency / overprint', False,
        f'{transparent_pages} of {doc.page_count} pages use transparency / soft-masks.',
        {'transparent_pages_count': transparent_pages, 'transparent_pages': flagged_pages},
    )


def _check_pdfx_conformance(doc: fitz.Document) -> Dict:
    """PP6 — look for a PDF/X version declaration in the XMP metadata.

    PDF/X is the print-industry conformance standard (PDF/X-1a, 3, 4). The
    check passes when the XMP packet contains ``pdfxid:GTS_PDFXVersion`` or
    ``pdfx:GTS_PDFXVersion``. The version is read from either XMP spelling:
    element form (``<pdfxid:GTS_PDFXVersion>PDF/X-4</…>``) or attribute
    form (``pdfxid:GTS_PDFXVersion="PDF/X-4"``).

    This only reports what the file declares; it does not validate the
    document against the standard.

    Args:
        doc: The opened document to inspect.

    Returns:
        A PP6 criterion record. It fails when there is no XMP packet or no
        PDF/X version property in it.
    """
    try:
        xmp = doc.get_xml_metadata() or ''
    except Exception:  # unreadable metadata is treated the same as none
        xmp = ''
    if not xmp:
        return _criterion(
            'PP6', 'PDF/X conformance', False,
            'No XMP metadata stream found.',
        )

    declaration = re.search(r'pdfx(?:id)?:GTS_PDFXVersion', xmp)
    if declaration is None:
        return _criterion(
            'PP6', 'PDF/X conformance', False,
            'No PDF/X conformance flag in XMP metadata.',
        )

    # Parse the value directly after the property name, so that attribute
    # form does not pick up unrelated text from the rest of the packet.
    tail = xmp[declaration.end():]
    as_attribute = re.match(r'\s*=\s*(["\'])(.*?)\1', tail)
    if as_attribute:
        version = as_attribute.group(2)
    else:
        as_element = re.match(r'[^<>]*>([^<]*)<', tail)
        version = as_element.group(1) if as_element else ''
    version = version.strip() or '(version not parsed)'
    return _criterion(
        'PP6', 'PDF/X conformance', True,
        f'PDF/X conformance declared: {version}',
    )


# Colourant name that follows a /Separation colour-space marker.
_SEPARATION_NAME_RE = re.compile(r'/Separation\s*/([A-Za-z0-9_#=-]+)')
# Single label used for any /DeviceN colour space.
_DEVICEN_LABEL = 'DeviceN(multi-spot)'


def _check_spot_colors(doc: fitz.Document, max_xref: Optional[int] = 1000) -> Dict:
    """PP7 — list spot colour spaces found in the document's objects.

    Scans the source text of objects ``1 … limit-1`` for ``/Separation``
    (single spot, e.g. Pantone) and ``/DeviceN`` (multi-channel) colour
    spaces. Spot colours are print-meaningful but need explicit handling on
    press, so their presence fails the criterion and the reviewer confirms
    the list is intentional.

    Every ``/Separation`` name in an object is recorded, not just the first.
    ``/DeviceN`` is reported once, however many objects use it.

    Args:
        doc: The opened document to inspect.
        max_xref: Exclusive upper bound on the object numbers scanned,
            keeping the scan cheap on very large files. ``None`` scans every
            object. When the bound cuts the scan short, the note says so.

    Returns:
        A PP7 criterion record; the failing record lists the distinct spot
        names under ``detail['spot_spaces']``.
    """
    total_entries = doc.xref_length()
    limit = total_entries if max_xref is None else min(total_entries, max_xref)

    found_spaces: List[str] = []
    for xref in range(1, limit):
        try:
            obj = doc.xref_object(xref)
        except Exception:  # best-effort scan: skip unreadable objects
            continue
        for match in _SEPARATION_NAME_RE.finditer(obj):
            name = match.group(1)
            if name not in found_spaces:
                found_spaces.append(name)
        if '/DeviceN' in obj and _DEVICEN_LABEL not in found_spaces:
            found_spaces.append(_DEVICEN_LABEL)

    # Make a truncated scan visible instead of implying full coverage.
    coverage = ''
    if limit < total_entries:
        coverage = f' (only objects 1–{limit - 1} of {total_entries - 1} were scanned)'

    if not found_spaces:
        return _criterion(
            'PP7', 'Spot colour usage', True,
            'No spot colour spaces detected — pure CMYK/RGB/Gray.' + coverage,
        )
    return _criterion(
        'PP7', 'Spot colour usage', False,
        f'{len(found_spaces)} spot colour spaces detected — confirm spot list is intentional.'
        + coverage,
        {'spot_spaces': found_spaces},
    )


# ─────────────────────────────────────────────────────────────────────────────
# Top-level entry
# ─────────────────────────────────────────────────────────────────────────────


def _preflight_failure(summary: str, error: str) -> Dict:
    """Build the result returned when the preflight run cannot start."""
    return {
        'check_name': 'axa_print_preflight',
        'scope': 'document',
        'score': 0.0,
        'pass': False,
        'summary': summary,
        'findings': {'error': error},
        'response': '',
    }


def axa_print_preflight(ingest_result: Dict, scope_args: Optional[Dict] = None) -> Dict:
    """Run every print-preflight criterion against the ingested PDF.

    Args:
        ingest_result: Mapping that must provide ``'pdf_path'``.
        scope_args: Accepted for interface compatibility; not read here.

    Returns:
        A result dict with ``check_name``, ``scope``, ``score`` (share of
        passed criteria scaled to 0–10), ``pass`` (True only when nothing
        failed), ``summary``, ``findings`` and ``response`` (the summary
        followed by one ✓/✗ line per criterion). If the path is missing or
        the file cannot be opened, a zero-score result describing the error
        is returned instead.
    """
    pdf_path = ingest_result.get('pdf_path')
    if not pdf_path:
        return _preflight_failure(
            'Cannot run — pdf_path missing from ingest_result.',
            'pdf_path_missing',
        )

    try:
        doc = fitz.open(pdf_path)
    except Exception as exc:
        return _preflight_failure(f'Failed to open PDF: {exc}', str(exc))

    # Checks run in criterion order (PP1 … PP7); the document is always
    # closed afterwards, even if a check raises.
    checks = (
        _check_page_geometry,
        _check_bleed_defined,
        _check_image_colorspaces,
        _check_image_dpi,
        _check_transparency,
        _check_pdfx_conformance,
        _check_spot_colors,
    )
    try:
        criteria = [check(doc) for check in checks]
    finally:
        doc.close()

    total = len(criteria)
    failed_count = sum(1 for item in criteria if not item['passed'])
    passed_count = total - failed_count
    score = round((passed_count / total) * 10, 2) if total else 0.0
    pass_flag = failed_count == 0

    if pass_flag:
        summary = f'All {total} print-preflight criteria passed — print-ready.'
    elif failed_count <= 2:
        summary = f'{failed_count} of {total} criteria failed — likely digital-intent or minor preflight gaps.'
    else:
        summary = f'{failed_count} of {total} criteria failed — not print-ready as-is.'

    verdict_lines = [
        f"  {'✓' if item['passed'] else '✗'} {item['code']} — {item['title']}: {item['note']}"
        for item in criteria
    ]
    response = '\n'.join([summary, ''] + verdict_lines)

    return {
        'check_name': 'axa_print_preflight',
        'scope': 'document',
        'score': score,
        'pass': pass_flag,
        'summary': summary,
        'findings': {
            'criteria': criteria,
            'criteria_total': total,
            'criteria_passed': passed_count,
            'criteria_failed': failed_count,
        },
        'response': response,
    }