# ai_qc/backend/document_mode/accessibility_checks.py

"""PDF accessibility checks aligned to PDF/UA-1 + WCAG-AAA-relevant subset.

Deterministic Python implementation using PyMuPDF — no Java/veraPDF needed
to ship Phase 4. Once veraPDF is installed on the host, _run_verapdf() can
be wired in as an additional validation layer (see __doc__ for that fn).

Criteria checked (subset of the 30+ rules in PDF/UA-1 §7):
  • C1  Tagged PDF — document has a /StructTreeRoot
  • C2  Marked — /MarkInfo /Marked is true
  • C3  Title — metadata /Title set and non-empty
  • C4  Language — document /Lang specified
  • C5  No password protection — /Encrypt absent or accessibility-friendly
  • C6  Fonts embedded — every font flagged as embedded
  • C7  PDF version — 1.5+ recommended (older versions can't carry full
        accessibility tagging features)
  • C8  XMP UA-conformance — XMP metadata declares pdfuaid:part
  • C9  Image alt text — when the first 30 pages contain images, at least
        one object must carry /Alt or /ActualText (heuristic: scans the
        first ~500 PDF objects for those keys; not a full structure-tree
        walk).

Each criterion gets a pass/fail and a short observation. The check's
overall score = (passing_criteria / total_criteria) * 10.
"""

from __future__ import annotations

import re
from typing import Dict, List, Optional

import fitz  # PyMuPDF


# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────


def _catalog_object(doc: fitz.Document) -> str:
    """Dump the document catalog (root dictionary) as text.

    PyMuPDF renders the PDF dictionary as a plain string, which the
    criterion checks search with substring and regex matching. Any failure
    while locating or dumping the catalog yields '' instead of raising.
    """
    try:
        root_xref = doc.pdf_catalog()
        dump = doc.xref_object(root_xref)
    except Exception:
        return ''
    return dump


def _xmp_metadata(doc: fitz.Document) -> str:
    """Fetch the document's XMP metadata packet as text.

    Yields '' when PyMuPDF reports no packet (a falsy value) or when
    reading the packet raises.
    """
    try:
        packet = doc.get_xml_metadata()
    except Exception:
        return ''
    if not packet:
        return ''
    return packet


def _criterion(code: str, title: str, passed: bool, note: str = '', detail: Optional[Dict] = None) -> Dict:
    """Assemble the result record for a single accessibility criterion.

    Args:
        code: Short criterion identifier such as 'C1'.
        title: Human-readable criterion name.
        passed: Whether the criterion was met.
        note: One-line observation shown next to the result.
        detail: Optional structured extras; a falsy value becomes ``{}``.

    Returns:
        Dict with keys 'code', 'title', 'passed', 'note' and 'detail'.
    """
    extras = detail if detail else {}
    return dict(code=code, title=title, passed=passed, note=note, detail=extras)


# ─────────────────────────────────────────────────────────────────────────────
# Criterion implementations
# ─────────────────────────────────────────────────────────────────────────────


def _check_tagged(doc: fitz.Document) -> Dict:
    """C1 — pass when the catalog dump mentions /StructTreeRoot."""
    tagged = '/StructTreeRoot' in _catalog_object(doc)
    if tagged:
        note = 'StructTreeRoot found in catalog.'
    else:
        note = 'PDF has no structure tree — screen readers will fall back to raw text. PDF/UA fail.'
    return _criterion('C1', 'Tagged PDF (StructTreeRoot present)', tagged, note)


def _check_marked(doc: fitz.Document) -> Dict:
    """C2 — pass when the catalog's /MarkInfo dictionary has /Marked true.

    The catalog dump is searched for the literal ``/Marked true``. When the
    catalog stores /MarkInfo as an indirect reference (``/MarkInfo 12 0 R``)
    the flag lives in the referenced object rather than in the catalog dump,
    so that object is dumped and searched as well.

    Returns:
        A criterion dict; the note distinguishes a missing /MarkInfo from a
        /MarkInfo whose /Marked flag is not true.
    """
    title = 'Marked content (/MarkInfo /Marked true)'
    catalog = _catalog_object(doc)
    if '/MarkInfo' not in catalog:
        return _criterion('C2', title, False, '/MarkInfo dictionary missing.')

    searchable = catalog
    indirect = re.search(r'/MarkInfo\s+(\d+)\s+\d+\s+R\b', catalog)
    if indirect:
        try:
            referenced = doc.xref_object(int(indirect.group(1)))
        except Exception:
            # Best effort, like _catalog_object: an unreadable reference
            # simply leaves only the catalog text to search.
            referenced = ''
        searchable = f'{catalog}\n{referenced}'

    if re.search(r'/Marked\s+true', searchable):
        return _criterion('C2', title, True, '/MarkInfo /Marked = true.')
    return _criterion('C2', title, False,
                      '/MarkInfo present but /Marked is not true.')


def _check_title(doc: fitz.Document) -> Dict:
    """C3 — pass when the document metadata carries a non-blank title.

    The note echoes at most the first 80 characters of the title.
    """
    metadata = doc.metadata or {}
    raw_title = metadata.get('title') or ''
    title = raw_title.strip()
    if not title:
        return _criterion('C3', 'Document title metadata', False,
                          'Title metadata missing or empty.')
    shown = title[:80]
    return _criterion('C3', 'Document title metadata', True,
                      f'Title: "{shown}"')


def _check_language(doc: fitz.Document) -> Dict:
    """C4 — pass when a document-level language is declared.

    ``doc.language`` is consulted first. If it is empty, the catalog dump is
    searched for a ``/Lang`` entry written either as a literal string
    ``(en-US)`` or as a hex string ``<656E2D5553>``. Hex strings are decoded
    to text so the note shows the language tag rather than hex digits, and a
    value that is blank after stripping does not count as a declaration.
    """
    title = 'Document language (/Lang)'
    lang = (doc.language or '').strip()
    if not lang:
        # Sometimes language is in the catalog but not exposed via doc.language
        catalog = _catalog_object(doc)
        literal = re.search(r'/Lang\s*\(([^)]+)\)', catalog)
        if literal:
            lang = literal.group(1).strip()
        else:
            hex_match = re.search(r'/Lang\s*<([^>]+)>', catalog)
            if hex_match:
                digits = re.sub(r'\s+', '', hex_match.group(1))
                try:
                    # An odd digit count implies a trailing 0 in PDF hex strings.
                    raw = bytes.fromhex(digits + '0' * (len(digits) % 2))
                except ValueError:
                    # Not valid hex: keep the raw text, as before.
                    lang = hex_match.group(1).strip()
                else:
                    has_bom = raw[:2] in (b'\xfe\xff', b'\xff\xfe')
                    codec = 'utf-16' if has_bom else 'latin-1'
                    lang = raw.decode(codec, errors='replace').strip()
    if lang:
        return _criterion('C4', title, True, f'Language: {lang}')
    return _criterion('C4', title, False,
                      '/Lang missing — assistive tech cannot pick a voice/locale.')


def _check_no_blocking_encryption(doc: fitz.Document) -> Dict:
    """C5 — fail only when the document is encrypted and needs a password."""
    title = 'No password protection blocking AT'
    locked = doc.is_encrypted and doc.needs_pass
    if not locked:
        return _criterion('C5', title, True,
                          'No password block; assistive tech can read.')
    return _criterion('C5', title, False,
                      'Document is password-protected — assistive tech cannot read.')


def _check_font_embedding(doc: fitz.Document) -> Dict:
    """C6 — pass when every font used on any page is embedded.

    Walks every page and groups fonts by base font name. A name counts as
    embedded only if every occurrence of it is embedded.

    Returns:
        A criterion dict. ``detail`` carries 'total_fonts' and
        'embedded_count', plus 'not_embedded' (base font names in
        first-seen order) when the criterion fails. A document with no
        fonts passes.
    """
    # basefont -> True while every occurrence seen so far is embedded.
    embedded_by_name: Dict[str, bool] = {}
    for page_index in range(doc.page_count):
        for font in doc.get_page_fonts(page_index):
            # PyMuPDF tuple: (xref, ext, type, basefont, name, encoding[, referencer])
            ext, font_type, basefont = font[1], font[2], font[3]
            # `ext` is the extension of the embedded font file; PyMuPDF
            # reports 'n/a' (not '') when there is no extractable font file.
            # Type3 fonts also report 'n/a', but their glyphs are defined
            # inside the PDF itself, so they count as embedded.
            embedded = font_type == 'Type3' or ext not in ('', 'n/a')
            embedded_by_name[basefont] = embedded_by_name.get(basefont, True) and embedded

    total = len(embedded_by_name)
    if total == 0:
        return _criterion('C6', 'Fonts embedded', True, 'No fonts present.')

    not_embedded: List[str] = [name for name, ok in embedded_by_name.items() if not ok]
    embedded_count = total - len(not_embedded)
    if not_embedded:
        return _criterion('C6', 'Fonts embedded', False,
                          f'{len(not_embedded)} of {total} fonts are not embedded.',
                          {'not_embedded': not_embedded, 'total_fonts': total,
                           'embedded_count': embedded_count})
    return _criterion('C6', 'Fonts embedded', True,
                      f'All {total} fonts embedded.',
                      {'total_fonts': total, 'embedded_count': embedded_count})


def _check_pdf_version(doc: fitz.Document) -> Dict:
    """C7 — pass when the PDF version read from the metadata is 1.5 or newer.

    The version is parsed from the metadata 'format' entry (e.g. "PDF 1.7")
    and compared as a (major, minor) pair of integers, so it is never
    subject to float parsing or float comparison.
    """
    md = doc.metadata or {}
    fmt = (md.get('format') or '').strip()
    m = re.search(r'PDF\s+(\d+)\.(\d+)', fmt)
    if not m:
        return _criterion('C7', 'PDF version', False, 'Could not determine PDF version.')
    version = f'{m.group(1)}.{m.group(2)}'
    major_minor = (int(m.group(1)), int(m.group(2)))
    # PDF 1.5+ supports compressed cross-reference streams + most accessibility features
    if major_minor >= (1, 5):
        return _criterion('C7', 'PDF version', True, f'PDF {version} — supports modern tagging features.')
    return _criterion('C7', 'PDF version', False,
                      f'PDF {version} is older than 1.5 — may not support full accessibility tagging.')


def _check_xmp_ua_conformance(doc: fitz.Document) -> Dict:
    """C8 — pass when the XMP packet declares pdfuaid:part = 1.

    The failure note distinguishes three cases: no XMP packet at all, a
    packet that mentions the pdfuaid namespace without declaring part 1,
    and a packet with no pdfuaid mention.
    """
    title = 'XMP UA conformance declaration'
    packet = _xmp_metadata(doc)
    if not packet:
        passed, note = False, 'No XMP metadata stream found.'
    # PDF/UA-1 conformance is declared via pdfuaid:part = 1 in XMP,
    # written either as an element (<pdfuaid:part>1) or an attribute.
    elif re.search(r'pdfuaid:part\s*[>=]\s*[\'"]?1', packet):
        passed, note = True, 'XMP declares PDF/UA-1 conformance.'
    elif 'pdfuaid' in packet:
        passed, note = False, 'XMP mentions pdfuaid namespace but does not declare PDF/UA-1.'
    else:
        passed, note = False, 'No PDF/UA conformance flag in XMP metadata.'
    return _criterion('C8', title, passed, note)


def _check_alt_text_sampling(doc: fitz.Document, max_pages: int = 30, max_xrefs: int = 500) -> Dict:
    """C9 — heuristic check that a document with images carries alt text.

    Counts images on the first ``max_pages`` pages. If there are any, the
    PDF objects with xref numbers below ``max_xrefs`` are dumped and
    searched for an /Alt or /ActualText key. This is not a full structure
    tree walk (S→Figure); it is an early signal — a document with images
    and no /Alt or /ActualText entries is almost certainly missing alt text.

    The key match requires the name to end right after ``Alt`` /
    ``ActualText``, so unrelated keys such as /Alternate (found in ICCBased
    colour-space dictionaries) are not counted as alt text.

    Args:
        doc: Open PyMuPDF document.
        max_pages: Number of leading pages whose images are counted.
        max_xrefs: Exclusive upper bound on the xref numbers scanned.

    Returns:
        A criterion dict; ``detail`` carries 'image_count' and
        'pages_with_images', plus 'alt_hits' on a pass with images.
    """
    image_count = 0
    pages_with_images = 0
    for page_index in range(min(doc.page_count, max_pages)):
        imgs = doc.get_page_images(page_index)
        if imgs:
            pages_with_images += 1
            image_count += len(imgs)

    if image_count == 0:
        return _criterion('C9', 'Alt text on images (sampling)', True,
                          f'No raster images detected in first {max_pages} pages — no alt-text needed.')

    # A PDF name ends at whitespace or a delimiter; the lookahead rejects any
    # further regular character, so /Alternate does not match /Alt.
    alt_key = re.compile(r'/(?:Alt|ActualText)(?![^\s()<>\[\]{}/%])')
    alt_hits = 0
    for xref in range(1, min(doc.xref_length(), max_xrefs)):
        try:
            obj = doc.xref_object(xref)
        except Exception:
            # Unreadable objects are skipped; this is a best-effort sample.
            continue
        if alt_key.search(obj):
            alt_hits += 1

    if alt_hits == 0:
        return _criterion('C9', 'Alt text on images (sampling)', False,
                          f'{image_count} images detected but no /Alt or /ActualText found in sampled '
                          f'structure objects.',
                          {'image_count': image_count, 'pages_with_images': pages_with_images})
    return _criterion('C9', 'Alt text on images (sampling)', True,
                      f'{image_count} images detected; {alt_hits} alt-text entries found in sampled objects.',
                      {'image_count': image_count, 'pages_with_images': pages_with_images,
                       'alt_hits': alt_hits})


# ─────────────────────────────────────────────────────────────────────────────
# Top-level entry point
# ─────────────────────────────────────────────────────────────────────────────


def axa_pdf_accessibility(ingest_result: Dict, scope_args: Optional[Dict] = None) -> Dict:
    """Run the full PDF/UA-aligned check set on the ingested PDF.

    Requires `pdf_path` on ingest_result (set by the dispatcher). Falls
    back to a structured-error result if the path is missing or the PDF
    can't be opened. A criterion whose check raises (for example on a
    password-locked document) is recorded as failed with the error in its
    note, so one broken check never aborts the whole run.

    Args:
        ingest_result: Ingest output; only 'pdf_path' is read here.
        scope_args: Accepted for interface compatibility; not used.

    Returns:
        Result dict with 'check_name', 'scope', 'score' (0–10), 'pass',
        'summary', 'findings' and a human-readable 'response'.
    """
    pdf_path = ingest_result.get('pdf_path')
    if not pdf_path:
        return _accessibility_error_result(
            'Cannot run — pdf_path missing from ingest_result.', 'pdf_path_missing')

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        return _accessibility_error_result(f'Failed to open PDF: {e}', str(e))

    # (code, title, check) — code and title are only used to report a check
    # that raised; on success each check supplies its own record.
    checks = (
        ('C1', 'Tagged PDF (StructTreeRoot present)', _check_tagged),
        ('C2', 'Marked content (/MarkInfo /Marked true)', _check_marked),
        ('C3', 'Document title metadata', _check_title),
        ('C4', 'Document language (/Lang)', _check_language),
        ('C5', 'No password protection blocking AT', _check_no_blocking_encryption),
        ('C6', 'Fonts embedded', _check_font_embedding),
        ('C7', 'PDF version', _check_pdf_version),
        ('C8', 'XMP UA conformance declaration', _check_xmp_ua_conformance),
        ('C9', 'Alt text on images (sampling)', _check_alt_text_sampling),
    )
    criteria: List[Dict] = []
    try:
        for code, title, check in checks:
            try:
                criteria.append(check(doc))
            except Exception as exc:
                criteria.append(_criterion(
                    code, title, False,
                    f'Check could not be completed: {exc}',
                    {'error': str(exc)},
                ))
    finally:
        doc.close()

    passed = [c for c in criteria if c['passed']]
    failed = [c for c in criteria if not c['passed']]
    total = len(criteria)
    score = round((len(passed) / total) * 10, 2) if total else 0.0
    pass_flag = not failed

    if pass_flag:
        summary = f'All {total} accessibility criteria passed.'
    else:
        summary = f'{len(failed)} of {total} accessibility criteria failed.'

    response_lines = [summary, '']
    for c in criteria:
        marker = '✓' if c['passed'] else '✗'
        response_lines.append(f"  {marker} {c['code']} — {c['title']}: {c['note']}")
    response = '\n'.join(response_lines)

    return {
        'check_name': 'axa_pdf_accessibility',
        'scope': 'document',
        'score': score,
        'pass': pass_flag,
        'summary': summary,
        'findings': {
            'criteria': criteria,
            'criteria_total': total,
            'criteria_passed': len(passed),
            'criteria_failed': len(failed),
            'verapdf_run': False,  # set to True when veraPDF subprocess is wired in
        },
        'response': response,
    }


def _accessibility_error_result(summary: str, error: str) -> Dict:
    """Build the zero-score result returned when the checks cannot run at all."""
    return {
        'check_name': 'axa_pdf_accessibility',
        'scope': 'document',
        'score': 0.0,
        'pass': False,
        'summary': summary,
        'findings': {'error': error},
        'response': '',
    }


# ─────────────────────────────────────────────────────────────────────────────
# veraPDF integration stub — wire when Java is on the host
# ─────────────────────────────────────────────────────────────────────────────


def _run_verapdf(pdf_path: str) -> Optional[Dict]:
    """Placeholder for validating ``pdf_path`` with a veraPDF subprocess.

    Always returns None today, which tells callers veraPDF was not invoked.

    Steps to turn this into a real integration:
        1. Install veraPDF on the host: https://verapdf.org/software/
           (requires JRE 8+; ~150MB total).
        2. Make the `verapdf` binary reachable, either on PATH or through a
           VERAPDF_BIN environment variable.
        3. Swap the placeholder body for subprocess.run([verapdf, '--format',
           'json', '--profile', 'ua1', pdf_path], capture_output=True), then
           parse the JSON output and merge it into the findings of
           axa_pdf_accessibility.
           NOTE(review): confirm the exact flag names (e.g. --profile vs
           --flavour) against `verapdf --help` before wiring this in.
        4. Set findings['verapdf_run'] = True so the report shows it ran.
    """
    result: Optional[Dict] = None
    return result