"""PDF accessibility checks aligned to PDF/UA-1 + WCAG-AAA-relevant subset. Deterministic Python implementation using PyMuPDF — no Java/veraPDF needed to ship Phase 4. Once veraPDF is installed on the host, _run_verapdf() can be wired in as an additional validation layer (see __doc__ for that fn). Criteria checked (subset of the 30+ rules in PDF/UA-1 §7): • C1 Tagged PDF — document has a /StructTreeRoot • C2 Marked — /MarkInfo /Marked is true • C3 Title — metadata /Title set and non-empty • C4 Language — document /Lang specified • C5 No password protection — /Encrypt absent or accessibility-friendly • C6 Fonts embedded — every font flagged as embedded • C7 PDF version — 1.5+ recommended (older versions can't carry full accessibility tagging features) • C8 XMP UA-conformance — XMP metadata declares pdfuaid:part • C9 Image alt text — sampled images have /Alt or /ActualText in the structure tree (heuristic: looks for /Alt anywhere in the catalog graph; not a full structure-tree walk). Each criterion gets a pass/fail and a short observation. The check's overall score = (passing_criteria / total_criteria) * 10. """ from __future__ import annotations import re from typing import Dict, List, Optional import fitz # PyMuPDF # ───────────────────────────────────────────────────────────────────────────── # Helpers # ───────────────────────────────────────────────────────────────────────────── def _catalog_object(doc: fitz.Document) -> str: """Return the catalog object dump as a string (PyMuPDF returns the PDF dictionary as a text representation we can grep).""" try: return doc.xref_object(doc.pdf_catalog()) except Exception: return '' def _xmp_metadata(doc: fitz.Document) -> str: """Return the XMP metadata stream as a string, or '' if absent.""" try: meta = doc.get_xml_metadata() return meta or '' except Exception: return '' def _criterion(code: str, title: str, passed: bool, note: str = '', detail: Optional[Dict] = None) -> Dict: return { 'code': code, 'title': title, 'passed': passed, 'note': note, 'detail': detail or {}, } # ───────────────────────────────────────────────────────────────────────────── # Criterion implementations # ───────────────────────────────────────────────────────────────────────────── def _check_tagged(doc: fitz.Document) -> Dict: catalog = _catalog_object(doc) has_struct = '/StructTreeRoot' in catalog return _criterion( 'C1', 'Tagged PDF (StructTreeRoot present)', has_struct, 'StructTreeRoot found in catalog.' if has_struct else 'PDF has no structure tree — screen readers will fall back to raw text. PDF/UA fail.', ) def _check_marked(doc: fitz.Document) -> Dict: catalog = _catalog_object(doc) has_markinfo = '/MarkInfo' in catalog # /Marked must be true within /MarkInfo. PyMuPDF dump returns it as a # nested dict; we look for the literal "Marked true" pattern. is_marked = bool(re.search(r'/Marked\s+true', catalog)) if has_markinfo and is_marked: return _criterion('C2', 'Marked content (/MarkInfo /Marked true)', True, '/MarkInfo /Marked = true.') if has_markinfo: return _criterion('C2', 'Marked content (/MarkInfo /Marked true)', False, '/MarkInfo present but /Marked is not true.') return _criterion('C2', 'Marked content (/MarkInfo /Marked true)', False, '/MarkInfo dictionary missing.') def _check_title(doc: fitz.Document) -> Dict: md = doc.metadata or {} title = (md.get('title') or '').strip() if title: return _criterion('C3', 'Document title metadata', True, f'Title: "{title[:80]}"') return _criterion('C3', 'Document title metadata', False, 'Title metadata missing or empty.') def _check_language(doc: fitz.Document) -> Dict: lang = (doc.language or '').strip() if not lang: # Sometimes language is in the catalog but not exposed via doc.language catalog = _catalog_object(doc) m = re.search(r'/Lang\s*\(([^)]+)\)', catalog) or re.search(r'/Lang\s*<([^>]+)>', catalog) if m: lang = m.group(1) if lang: return _criterion('C4', 'Document language (/Lang)', True, f'Language: {lang}') return _criterion('C4', 'Document language (/Lang)', False, '/Lang missing — assistive tech cannot pick a voice/locale.') def _check_no_blocking_encryption(doc: fitz.Document) -> Dict: if doc.is_encrypted and doc.needs_pass: return _criterion('C5', 'No password protection blocking AT', False, 'Document is password-protected — assistive tech cannot read.') return _criterion('C5', 'No password protection blocking AT', True, 'No password block; assistive tech can read.') def _check_font_embedding(doc: fitz.Document) -> Dict: """Walk every page, list every font, flag any not embedded.""" seen: Dict[str, bool] = {} not_embedded: List[str] = [] for i in range(doc.page_count): for f in doc.get_page_fonts(i): # PyMuPDF tuple: (xref, ext, type, basefont, name, encoding, embedded) basefont = f[3] ext = f[1] # '' if not embedded, file extension if embedded embedded = bool(ext) if basefont not in seen: seen[basefont] = embedded if not embedded: not_embedded.append(basefont) total = len(seen) embedded_count = sum(1 for v in seen.values() if v) if total == 0: return _criterion('C6', 'Fonts embedded', True, 'No fonts present.') if not_embedded: return _criterion('C6', 'Fonts embedded', False, f'{len(not_embedded)} of {total} fonts are not embedded.', {'not_embedded': not_embedded, 'total_fonts': total, 'embedded_count': embedded_count}) return _criterion('C6', 'Fonts embedded', True, f'All {total} fonts embedded.', {'total_fonts': total, 'embedded_count': embedded_count}) def _check_pdf_version(doc: fitz.Document) -> Dict: md = doc.metadata or {} fmt = (md.get('format') or '').strip() m = re.search(r'PDF\s+(\d+\.\d+)', fmt) version = m.group(1) if m else None if not version: return _criterion('C7', 'PDF version', False, 'Could not determine PDF version.') try: version_num = float(version) except ValueError: return _criterion('C7', 'PDF version', False, f'Could not parse version: {fmt}') # PDF 1.5+ supports compressed cross-reference streams + most accessibility features if version_num >= 1.5: return _criterion('C7', 'PDF version', True, f'PDF {version} — supports modern tagging features.') return _criterion('C7', 'PDF version', False, f'PDF {version} is older than 1.5 — may not support full accessibility tagging.') def _check_xmp_ua_conformance(doc: fitz.Document) -> Dict: xmp = _xmp_metadata(doc) if not xmp: return _criterion('C8', 'XMP UA conformance declaration', False, 'No XMP metadata stream found.') # PDF/UA-1 conformance is declared via pdfuaid:part = 1 in XMP if re.search(r'pdfuaid:part\s*[>=]\s*[\'"]?1', xmp): return _criterion('C8', 'XMP UA conformance declaration', True, 'XMP declares PDF/UA-1 conformance.') if 'pdfuaid' in xmp: return _criterion('C8', 'XMP UA conformance declaration', False, 'XMP mentions pdfuaid namespace but does not declare PDF/UA-1.') return _criterion('C8', 'XMP UA conformance declaration', False, 'No PDF/UA conformance flag in XMP metadata.') def _check_alt_text_sampling(doc: fitz.Document) -> Dict: """Sample-check the structure tree for /Alt entries when images are present. Heuristic: count images on the first 10 pages, and look for /Alt strings anywhere in the catalog graph. Not a full S→Figure walk, but a useful early signal — a doc with images and zero /Alt entries is almost certainly missing alt text. """ image_count = 0 pages_with_images = 0 for i in range(min(doc.page_count, 30)): imgs = doc.get_page_images(i) if imgs: pages_with_images += 1 image_count += len(imgs) if image_count == 0: return _criterion('C9', 'Alt text on images (sampling)', True, 'No raster images detected in first 30 pages — no alt-text needed.') # Search the catalog graph for /Alt(...) entries — coarse but effective alt_hits = 0 sample_xrefs = list(range(1, min(doc.xref_length(), 500))) for xref in sample_xrefs: try: obj = doc.xref_object(xref) except Exception: continue if '/Alt' in obj or '/ActualText' in obj: alt_hits += 1 if alt_hits == 0: return _criterion('C9', 'Alt text on images (sampling)', False, f'{image_count} images detected but no /Alt or /ActualText found in sampled ' f'structure objects.', {'image_count': image_count, 'pages_with_images': pages_with_images}) return _criterion('C9', 'Alt text on images (sampling)', True, f'{image_count} images detected; {alt_hits} alt-text entries found in sampled objects.', {'image_count': image_count, 'pages_with_images': pages_with_images, 'alt_hits': alt_hits}) # ───────────────────────────────────────────────────────────────────────────── # Top-level entry point # ───────────────────────────────────────────────────────────────────────────── def axa_pdf_accessibility(ingest_result: Dict, scope_args: Optional[Dict] = None) -> Dict: """Run the full PDF/UA-aligned check set on the ingested PDF. Requires `pdf_path` on ingest_result (set by the dispatcher). Falls back to a structured-error result if PDF can't be opened. """ pdf_path = ingest_result.get('pdf_path') if not pdf_path: return { 'check_name': 'axa_pdf_accessibility', 'scope': 'document', 'score': 0.0, 'pass': False, 'summary': 'Cannot run — pdf_path missing from ingest_result.', 'findings': {'error': 'pdf_path_missing'}, 'response': '', } try: doc = fitz.open(pdf_path) except Exception as e: return { 'check_name': 'axa_pdf_accessibility', 'scope': 'document', 'score': 0.0, 'pass': False, 'summary': f'Failed to open PDF: {e}', 'findings': {'error': str(e)}, 'response': '', } try: criteria = [ _check_tagged(doc), _check_marked(doc), _check_title(doc), _check_language(doc), _check_no_blocking_encryption(doc), _check_font_embedding(doc), _check_pdf_version(doc), _check_xmp_ua_conformance(doc), _check_alt_text_sampling(doc), ] finally: doc.close() passed = [c for c in criteria if c['passed']] failed = [c for c in criteria if not c['passed']] total = len(criteria) score = round((len(passed) / total) * 10, 2) if total else 0.0 pass_flag = len(failed) == 0 if pass_flag: summary = f'All {total} accessibility criteria passed.' else: summary = f'{len(failed)} of {total} accessibility criteria failed.' response_lines = [summary, ''] for c in criteria: marker = '✓' if c['passed'] else '✗' response_lines.append(f" {marker} {c['code']} — {c['title']}: {c['note']}") response = '\n'.join(response_lines) return { 'check_name': 'axa_pdf_accessibility', 'scope': 'document', 'score': score, 'pass': pass_flag, 'summary': summary, 'findings': { 'criteria': criteria, 'criteria_total': total, 'criteria_passed': len(passed), 'criteria_failed': len(failed), 'verapdf_run': False, # set to True when veraPDF subprocess is wired in }, 'response': response, } # ───────────────────────────────────────────────────────────────────────────── # veraPDF integration stub — wire when Java is on the host # ───────────────────────────────────────────────────────────────────────────── def _run_verapdf(pdf_path: str) -> Optional[Dict]: """Stub for veraPDF subprocess validation. To enable: 1. Install veraPDF on the host: https://verapdf.org/software/ (requires JRE 8+; ~150MB total). 2. Ensure `verapdf` binary is on PATH or set VERAPDF_BIN env var. 3. Replace this stub with subprocess.run([verapdf, '--format', 'json', '--profile', 'ua1', pdf_path], capture_output=True). Parse the JSON output and merge into axa_pdf_accessibility's findings. 4. Set findings['verapdf_run'] = True so the report shows it ran. Currently returns None so callers know veraPDF was not invoked. """ return None