Multi-page PDF QC for AXA Ireland policy documents. Runs as a third mode alongside static + video, gated on profile.mode. New code isolated under backend/document_mode/ with new endpoints under /api/document/*. Phase 1 — Spine + 6 deterministic doc-scope checks ($0, runs in seconds): - Scope-aware dispatcher (document/targeted/page_sample/page_pair/page_each) - axa_font_inventory, axa_phone_inventory, axa_bold_words_definitions, axa_page_numbering, axa_print_code, axa_omg_versioning - Bootstrap bold-words dictionary extracted from Example 1 General Definitions Phase 3 — Old-vs-new diff (~$0.50/run, 3-5 min): - Page alignment via difflib SequenceMatcher (windowed fuzzy match) - Vision-LLM page-pair diff via Gemini 2.5 Pro (8 concurrent) - Two-slot upload UX, axa_policy_document_diff profile, mode=document_diff Phase 4 — PDF accessibility (PyMuPDF, $0): - 9 PDF/UA-1 aligned criteria (tagged structure, /MarkInfo, title, /Lang, encryption, font embedding, PDF version, XMP UA-conformance, alt-text) - _run_verapdf() stub for optional Java-based veraPDF integration later Phase 5 — Print preflight (PyMuPDF, $0): - 7 criteria (page geometry, bleed, image colour spaces, image DPI, transparency, PDF/X conformance, spot colours) Profile additions: - axa_policy_document — 8 deterministic checks, $0 cost - axa_policy_document_diff — 1 page-pair LLM check, ~$0.50/run API additions: - POST /api/document/start_analysis (single PDF) - POST /api/document/start_diff (old + new PDFs) Frontend additions: - Third profile.mode value (document_diff) in applyProfileMode() - Two-slot upload UX with PDF-only file pickers - checkFormValidity() branches by mode for the analyse-button gate Smoke-tested locally against Example 1 (Home Insurance V8, 86pp) and Example 2 (Landlord V1 vs V10, 68→74pp) with real findings caught including bold-words gaps, missing PDF/UA flag, transparency on press, V1→V10 bold-formatting fixes. Plan + integration map + gotchas in backend/AXA_DOCUMENT_MODE_PLAN.md. 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
338 lines
14 KiB
Python
338 lines
14 KiB
Python
"""PDF accessibility checks aligned to PDF/UA-1 + WCAG-AAA-relevant subset.
|
|
|
|
Deterministic Python implementation using PyMuPDF — no Java/veraPDF needed
|
|
to ship Phase 4. Once veraPDF is installed on the host, _run_verapdf() can
|
|
be wired in as an additional validation layer (see the docstring of that function).
|
|
|
|
Criteria checked (subset of the 30+ rules in PDF/UA-1 §7):
|
|
• C1 Tagged PDF — document has a /StructTreeRoot
|
|
• C2 Marked — /MarkInfo /Marked is true
|
|
• C3 Title — metadata /Title set and non-empty
|
|
• C4 Language — document /Lang specified
|
|
• C5 No password protection — /Encrypt absent or accessibility-friendly
|
|
• C6 Fonts embedded — every font flagged as embedded
|
|
• C7 PDF version — 1.5+ recommended (older versions can't carry full
|
|
accessibility tagging features)
|
|
• C8 XMP UA-conformance — XMP metadata declares pdfuaid:part
|
|
• C9 Image alt text — sampled images have /Alt or /ActualText in the
|
|
structure tree (heuristic: looks for /Alt anywhere in the catalog
|
|
graph; not a full structure-tree walk).
|
|
|
|
Each criterion gets a pass/fail and a short observation. The check's
|
|
overall score = (passing_criteria / total_criteria) * 10.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Dict, List, Optional
|
|
|
|
import fitz # PyMuPDF
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Helpers
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _catalog_object(doc: fitz.Document) -> str:
|
|
"""Return the catalog object dump as a string (PyMuPDF returns the
|
|
PDF dictionary as a text representation we can grep)."""
|
|
try:
|
|
return doc.xref_object(doc.pdf_catalog())
|
|
except Exception:
|
|
return ''
|
|
|
|
|
|
def _xmp_metadata(doc: fitz.Document) -> str:
|
|
"""Return the XMP metadata stream as a string, or '' if absent."""
|
|
try:
|
|
meta = doc.get_xml_metadata()
|
|
return meta or ''
|
|
except Exception:
|
|
return ''
|
|
|
|
|
|
def _criterion(code: str, title: str, passed: bool, note: str = '', detail: Optional[Dict] = None) -> Dict:
|
|
return {
|
|
'code': code,
|
|
'title': title,
|
|
'passed': passed,
|
|
'note': note,
|
|
'detail': detail or {},
|
|
}
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Criterion implementations
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _check_tagged(doc: fitz.Document) -> Dict:
    """C1 — pass when the catalog dump mentions /StructTreeRoot."""
    tagged = '/StructTreeRoot' in _catalog_object(doc)
    if tagged:
        note = 'StructTreeRoot found in catalog.'
    else:
        note = 'PDF has no structure tree — screen readers will fall back to raw text. PDF/UA fail.'
    return _criterion('C1', 'Tagged PDF (StructTreeRoot present)', tagged, note)
|
|
|
|
|
|
def _check_marked(doc: fitz.Document) -> Dict:
    """C2 — require a /MarkInfo dictionary whose /Marked entry is true.

    The catalog is inspected via its text dump. When /MarkInfo is stored
    as an indirect reference (``/MarkInfo 12 0 R``) the /Marked flag lives
    in the referenced object rather than in the catalog text, so that
    object is fetched and searched instead.

    Returns:
        A criterion record; failed when /MarkInfo is missing or /Marked
        is not literally ``true``.
    """
    title = 'Marked content (/MarkInfo /Marked true)'
    catalog = _catalog_object(doc)
    if '/MarkInfo' not in catalog:
        return _criterion('C2', title, False, '/MarkInfo dictionary missing.')

    # Default: /MarkInfo is an inline dict, so its entries appear in the
    # catalog dump itself.
    markinfo_source = catalog
    indirect = re.search(r'/MarkInfo\s+(\d+)\s+\d+\s+R\b', catalog)
    if indirect:
        try:
            markinfo_source = doc.xref_object(int(indirect.group(1)))
        except Exception:
            # Best effort, matching the other helpers: an unreadable
            # referenced object falls back to the catalog text.
            markinfo_source = catalog

    if re.search(r'/Marked\s+true', markinfo_source):
        return _criterion('C2', title, True, '/MarkInfo /Marked = true.')
    return _criterion('C2', title, False,
                      '/MarkInfo present but /Marked is not true.')
|
|
|
|
|
|
def _check_title(doc: fitz.Document) -> Dict:
    """C3 — pass when the metadata 'title' entry is non-empty after stripping.

    The note on a pass quotes at most the first 80 characters of the title.
    """
    metadata = doc.metadata
    if not metadata:
        metadata = {}
    raw_title = metadata.get('title')
    title = raw_title.strip() if raw_title else ''
    if not title:
        return _criterion('C3', 'Document title metadata', False,
                          'Title metadata missing or empty.')
    return _criterion('C3', 'Document title metadata', True,
                      f'Title: "{title[:80]}"')
|
|
|
|
|
|
def _check_language(doc: fitz.Document) -> Dict:
    """C4 — pass when a document language can be found.

    ``doc.language`` is consulted first. If it is empty, the catalog dump
    is searched for a ``/Lang`` entry written either as a literal string
    ``(...)`` or in angle brackets ``<...>``.
    """
    declared = doc.language
    lang = declared.strip() if declared else ''
    if not lang:
        # The language is sometimes in the catalog without being exposed
        # through doc.language.
        catalog = _catalog_object(doc)
        for pattern in (r'/Lang\s*\(([^)]+)\)', r'/Lang\s*<([^>]+)>'):
            found = re.search(pattern, catalog)
            if found:
                lang = found.group(1)
                break
    if not lang:
        return _criterion('C4', 'Document language (/Lang)', False,
                          '/Lang missing — assistive tech cannot pick a voice/locale.')
    return _criterion('C4', 'Document language (/Lang)', True,
                      f'Language: {lang}')
|
|
|
|
|
|
def _check_no_blocking_encryption(doc: fitz.Document) -> Dict:
    """C5 — fail only when the document is encrypted and needs a password."""
    blocked = bool(doc.is_encrypted and doc.needs_pass)
    if blocked:
        note = 'Document is password-protected — assistive tech cannot read.'
    else:
        note = 'No password block; assistive tech can read.'
    return _criterion('C5', 'No password protection blocking AT', not blocked, note)
|
|
|
|
|
|
def _check_font_embedding(doc: fitz.Document) -> Dict:
    """C6 — walk every page, list every font, flag any that is not embedded.

    Fonts are grouped by their basefont name. A basefont counts as
    embedded only if every occurrence seen on any page is embedded.

    Returns:
        A criterion record. On failure, ``detail['not_embedded']`` lists
        the offending basefont names in first-seen order. A document with
        no fonts passes.
    """
    # basefont -> True while every occurrence seen so far is embedded.
    status: Dict[str, bool] = {}
    for page_index in range(doc.page_count):
        for font in doc.get_page_fonts(page_index):
            # PyMuPDF tuple: (xref, ext, type, basefont, name, encoding)
            ext, font_type, basefont = font[1], font[2], font[3]
            # PyMuPDF reports ext == 'n/a' (not an empty string) when there
            # is no extractable font file, so plain truthiness would mark
            # every font as embedded. Type 3 fonts also report 'n/a', but
            # their glyphs are defined inside the PDF itself, so they are
            # treated as embedded.
            has_font_file = bool(ext) and ext.lower() != 'n/a'
            embedded = has_font_file or font_type == 'Type3'
            status[basefont] = status.get(basefont, True) and embedded

    total = len(status)
    if total == 0:
        return _criterion('C6', 'Fonts embedded', True, 'No fonts present.')

    not_embedded: List[str] = [name for name, ok in status.items() if not ok]
    embedded_count = total - len(not_embedded)
    if not_embedded:
        return _criterion('C6', 'Fonts embedded', False,
                          f'{len(not_embedded)} of {total} fonts are not embedded.',
                          {'not_embedded': not_embedded, 'total_fonts': total,
                           'embedded_count': embedded_count})
    return _criterion('C6', 'Fonts embedded', True,
                      f'All {total} fonts embedded.',
                      {'total_fonts': total, 'embedded_count': embedded_count})
|
|
|
|
|
|
def _check_pdf_version(doc: fitz.Document) -> Dict:
    """C7 — pass when the metadata 'format' string reports PDF 1.5 or newer.

    The version is parsed from a ``PDF <major>.<minor>`` pattern in the
    metadata 'format' entry; an unparseable or missing value fails.
    """
    metadata = doc.metadata or {}
    fmt = (metadata.get('format') or '').strip()
    found = re.search(r'PDF\s+(\d+\.\d+)', fmt)
    if found is None:
        return _criterion('C7', 'PDF version', False, 'Could not determine PDF version.')

    version = found.group(1)
    try:
        version_num = float(version)
    except ValueError:
        return _criterion('C7', 'PDF version', False, f'Could not parse version: {fmt}')

    # PDF 1.5+ supports compressed cross-reference streams + most accessibility features
    modern = version_num >= 1.5
    if modern:
        note = f'PDF {version} — supports modern tagging features.'
    else:
        note = f'PDF {version} is older than 1.5 — may not support full accessibility tagging.'
    return _criterion('C7', 'PDF version', modern, note)
|
|
|
|
|
|
def _check_xmp_ua_conformance(doc: fitz.Document) -> Dict:
    """C8 — pass when the XMP packet declares ``pdfuaid:part`` = 1.

    Fails with distinct notes for: no XMP packet at all, a packet that
    mentions ``pdfuaid`` without the part-1 declaration, and a packet
    with no PDF/UA flag.
    """
    title = 'XMP UA conformance declaration'
    xmp = _xmp_metadata(doc)
    if not xmp:
        return _criterion('C8', title, False, 'No XMP metadata stream found.')

    # PDF/UA-1 conformance is declared via pdfuaid:part = 1 in XMP
    declares_ua1 = re.search(r'pdfuaid:part\s*[>=]\s*[\'"]?1', xmp) is not None
    if declares_ua1:
        passed = True
        note = 'XMP declares PDF/UA-1 conformance.'
    elif 'pdfuaid' in xmp:
        passed = False
        note = 'XMP mentions pdfuaid namespace but does not declare PDF/UA-1.'
    else:
        passed = False
        note = 'No PDF/UA conformance flag in XMP metadata.'
    return _criterion('C8', title, passed, note)
|
|
|
|
|
|
def _check_alt_text_sampling(doc: fitz.Document, max_pages: int = 30, max_xrefs: int = 500) -> Dict:
    """C9 — sample-check for /Alt or /ActualText entries when images exist.

    Heuristic: count images on the first ``max_pages`` pages, then scan
    the object dumps of xrefs ``1 .. max_xrefs - 1`` for an ``/Alt`` or
    ``/ActualText`` key. This is not a full structure-tree walk, but a
    useful early signal — a document with images and zero such entries is
    almost certainly missing alt text.

    Args:
        doc: The open PyMuPDF document.
        max_pages: Number of leading pages whose images are counted.
        max_xrefs: Exclusive upper bound on the xref numbers scanned.

    Returns:
        A criterion record. Passes when no images are found in the
        sampled pages, or when at least one scanned object carries an
        ``/Alt`` or ``/ActualText`` key.
    """
    image_count = 0
    pages_with_images = 0
    for page_index in range(min(doc.page_count, max_pages)):
        images = doc.get_page_images(page_index)
        if images:
            pages_with_images += 1
            image_count += len(images)

    if image_count == 0:
        return _criterion('C9', 'Alt text on images (sampling)', True,
                          f'No raster images detected in first {max_pages} pages — no alt-text needed.')

    # Match the exact PDF names only. A plain substring test for '/Alt'
    # also hits '/Alternate' and '/Alternates', which appear in
    # colour-space and image dictionaries and would produce false passes.
    alt_key = re.compile(r'/(?:Alt|ActualText)(?![A-Za-z0-9])')

    alt_hits = 0
    for xref in range(1, min(doc.xref_length(), max_xrefs)):
        try:
            obj = doc.xref_object(xref)
        except Exception:
            # Unreadable objects are skipped; this is a best-effort scan.
            continue
        if alt_key.search(obj):
            alt_hits += 1

    if alt_hits == 0:
        return _criterion('C9', 'Alt text on images (sampling)', False,
                          f'{image_count} images detected but no /Alt or /ActualText found in sampled '
                          f'structure objects.',
                          {'image_count': image_count, 'pages_with_images': pages_with_images})
    return _criterion('C9', 'Alt text on images (sampling)', True,
                      f'{image_count} images detected; {alt_hits} alt-text entries found in sampled objects.',
                      {'image_count': image_count, 'pages_with_images': pages_with_images,
                       'alt_hits': alt_hits})
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Top-level entry point
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _accessibility_error_result(summary: str, error: str) -> Dict:
    """Build the zero-score result returned when the check cannot run at all."""
    return {
        'check_name': 'axa_pdf_accessibility',
        'scope': 'document',
        'score': 0.0,
        'pass': False,
        'summary': summary,
        'findings': {'error': error},
        'response': '',
    }


def _run_criterion(code: str, check, doc: fitz.Document) -> Dict:
    """Run one criterion check, turning an unexpected exception into a failed criterion."""
    try:
        return check(doc)
    except Exception as exc:
        return _criterion(
            code,
            f'{check.__name__.lstrip("_")} (could not run)',
            False,
            f'Check raised {type(exc).__name__}: {exc}',
            {'error': str(exc)},
        )


def axa_pdf_accessibility(ingest_result: Dict, scope_args: Optional[Dict] = None) -> Dict:
    """Run the full PDF/UA-aligned check set on the ingested PDF.

    Args:
        ingest_result: Must carry ``pdf_path``, the path of the PDF to open.
        scope_args: Accepted for signature compatibility; not read here.

    Returns:
        A result dict with ``check_name``, ``scope``, ``score``
        (passing criteria / total criteria * 10, rounded to 2 places),
        ``pass`` (True only when no criterion failed), ``summary``,
        ``findings`` and a plain-text ``response``. If ``pdf_path`` is
        missing or the PDF cannot be opened, a zero-score result with
        ``findings['error']`` is returned instead.
    """
    pdf_path = ingest_result.get('pdf_path')
    if not pdf_path:
        return _accessibility_error_result(
            'Cannot run — pdf_path missing from ingest_result.',
            'pdf_path_missing',
        )

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        return _accessibility_error_result(f'Failed to open PDF: {e}', str(e))

    checks = (
        ('C1', _check_tagged),
        ('C2', _check_marked),
        ('C3', _check_title),
        ('C4', _check_language),
        ('C5', _check_no_blocking_encryption),
        ('C6', _check_font_embedding),
        ('C7', _check_pdf_version),
        ('C8', _check_xmp_ua_conformance),
        ('C9', _check_alt_text_sampling),
    )
    try:
        # Each check is isolated so that one criterion raising (for example
        # page/font access on a password-protected document) is reported
        # as a failed criterion instead of aborting the whole report.
        criteria = [_run_criterion(code, check, doc) for code, check in checks]
    finally:
        doc.close()

    total = len(criteria)
    passed_count = sum(1 for c in criteria if c['passed'])
    failed_count = total - passed_count
    score = round((passed_count / total) * 10, 2) if total else 0.0
    pass_flag = failed_count == 0

    if pass_flag:
        summary = f'All {total} accessibility criteria passed.'
    else:
        summary = f'{failed_count} of {total} accessibility criteria failed.'

    response_lines = [summary, '']
    for c in criteria:
        marker = '✓' if c['passed'] else '✗'
        response_lines.append(f" {marker} {c['code']} — {c['title']}: {c['note']}")
    response = '\n'.join(response_lines)

    return {
        'check_name': 'axa_pdf_accessibility',
        'scope': 'document',
        'score': score,
        'pass': pass_flag,
        'summary': summary,
        'findings': {
            'criteria': criteria,
            'criteria_total': total,
            'criteria_passed': passed_count,
            'criteria_failed': failed_count,
            'verapdf_run': False,  # set to True when veraPDF subprocess is wired in
        },
        'response': response,
    }
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# veraPDF integration stub — wire when Java is on the host
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _run_verapdf(pdf_path: str) -> Optional[Dict]:
|
|
"""Stub for veraPDF subprocess validation.
|
|
|
|
To enable:
|
|
1. Install veraPDF on the host: https://verapdf.org/software/
|
|
(requires JRE 8+; ~150MB total).
|
|
2. Ensure `verapdf` binary is on PATH or set VERAPDF_BIN env var.
|
|
3. Replace this stub with subprocess.run([verapdf, '--format', 'json',
|
|
'--profile', 'ua1', pdf_path], capture_output=True). Parse the
|
|
JSON output and merge into axa_pdf_accessibility's findings.
|
|
4. Set findings['verapdf_run'] = True so the report shows it ran.
|
|
|
|
Currently returns None so callers know veraPDF was not invoked.
|
|
"""
|
|
return None
|