diff --git a/CLAUDE_AXA.md b/CLAUDE_AXA.md index d243e74..1942324 100644 --- a/CLAUDE_AXA.md +++ b/CLAUDE_AXA.md @@ -20,7 +20,7 @@ Multi-page policy document QC. `mode: document`, scopes vary per check. | `axa_phone_inventory` | Extracts phone numbers across pages, validates format and approved-list membership | 1.0 | | `axa_bold_words_definitions` | Bold-word inventory + definition cross-check (seed list at `backend/document_mode/data/axa_bold_words_seed.json`) | 2.0 | | `axa_page_numbering` | Page numbering format and continuity | 1.0 | -| `axa_pdf_accessibility` | Tagged-PDF / accessibility checks | 2.0 | +| `axa_pdf_accessibility` | PDF/UA-1 validation via veraPDF (matches axes4 PAC), with deterministic PyMuPDF fallback if veraPDF is not installed | 2.0 | | `axa_print_preflight` | Print-preflight checks (color space, embedded fonts, image resolution) | 1.0 | | `axa_print_code` | Print code presence + format | 1.0 | | `axa_omg_versioning` | OMG version footer/header presence and consistency | 1.0 | @@ -50,6 +50,17 @@ Boots Production Pack reuses this entire spine — so any infra changes here aff - Phase 2 (any further check expansion) deferred until after show-and-tell - Canonical AXA font list / approved phone list / OMG version reference data may need expansion as test PDFs surface gaps +## veraPDF deployment + +`axa_pdf_accessibility` runs the **veraPDF** PDF/UA-1 validator as a subprocess when the binary is available. veraPDF implements the Matterhorn Protocol — the same rule set axes4 PAC uses — so its verdict is the closest open-source equivalent to PAC. + +Binary resolution order (in `accessibility_checks._resolve_verapdf_binary`): +1. `VERAPDF_BIN` env var +2. `verapdf` on PATH +3. `/opt/ai_qc/vendor/verapdf/verapdf` (project-local production install) + +If veraPDF isn't installed the check falls back to the 9-criterion deterministic PyMuPDF layer — no breakage, just less depth. 
**Production install pattern** is a project-local bundled-JRE tarball under `/opt/ai_qc/vendor/verapdf/` to avoid touching system Java or other projects on shared servers. + ## Key files - `backend/AXA_DOCUMENT_MODE_PLAN.md` — full design plan and phase breakdown diff --git a/backend/document_mode/accessibility_checks.py b/backend/document_mode/accessibility_checks.py index ca84919..b0f7602 100644 --- a/backend/document_mode/accessibility_checks.py +++ b/backend/document_mode/accessibility_checks.py @@ -1,35 +1,46 @@ -"""PDF accessibility checks aligned to PDF/UA-1 + WCAG-AAA-relevant subset. +"""PDF accessibility checks aligned to PDF/UA-1. -Deterministic Python implementation using PyMuPDF — no Java/veraPDF needed -to ship Phase 4. Once veraPDF is installed on the host, _run_verapdf() can -be wired in as an additional validation layer (see __doc__ for that fn). +Two layers, applied in order: + 1. veraPDF subprocess — full PDF/UA-1 (ISO 14289-1) validation via the + Matterhorn Protocol. This is the same protocol PAC uses, so its + verdict is the authoritative one when veraPDF is available on the + host. When it runs, its result drives the score and pass flag. + 2. Deterministic PyMuPDF criteria (C1-C9) — fast surface checks that + run regardless. They give the AXA team a quick visual sanity-pass + (tagged? language set? fonts embedded?) and are the sole source of + truth when veraPDF is not installed. 
-Criteria checked (subset of the 30+ rules in PDF/UA-1 §7): +Deterministic criteria: • C1 Tagged PDF — document has a /StructTreeRoot • C2 Marked — /MarkInfo /Marked is true • C3 Title — metadata /Title set and non-empty • C4 Language — document /Lang specified • C5 No password protection — /Encrypt absent or accessibility-friendly • C6 Fonts embedded — every font flagged as embedded - • C7 PDF version — 1.5+ recommended (older versions can't carry full - accessibility tagging features) + • C7 PDF version — 1.5+ recommended • C8 XMP UA-conformance — XMP metadata declares pdfuaid:part - • C9 Image alt text — sampled images have /Alt or /ActualText in the - structure tree (heuristic: looks for /Alt anywhere in the catalog - graph; not a full structure-tree walk). - -Each criterion gets a pass/fail and a short observation. The check's -overall score = (passing_criteria / total_criteria) * 10. + • C9 Image alt text — sampled images have /Alt or /ActualText """ from __future__ import annotations +import os import re +import shutil +import subprocess +import xml.etree.ElementTree as ET from typing import Dict, List, Optional import fitz # PyMuPDF +# Project-local install path for the production server (see vendor dir +# under /opt/ai_qc/vendor/verapdf/). Falls back to PATH lookup or +# VERAPDF_BIN env var. +_VERAPDF_VENDOR_PATH = '/opt/ai_qc/vendor/verapdf/verapdf' +_VERAPDF_TIMEOUT_SECONDS = 180 + + # ───────────────────────────────────────────────────────────────────────────── # Helpers # ───────────────────────────────────────────────────────────────────────────── @@ -237,10 +248,11 @@ def _check_alt_text_sampling(doc: fitz.Document) -> Dict: def axa_pdf_accessibility(ingest_result: Dict, scope_args: Optional[Dict] = None) -> Dict: - """Run the full PDF/UA-aligned check set on the ingested PDF. + """Run PDF/UA-1 accessibility validation on the ingested PDF. - Requires `pdf_path` on ingest_result (set by the dispatcher). 
Falls - back to a structured-error result if PDF can't be opened. + When veraPDF is installed on the host, its PDF/UA-1 verdict is the + authoritative score driver. The deterministic PyMuPDF criteria run + in either case as a quick sanity layer. """ pdf_path = ingest_result.get('pdf_path') if not pdf_path: @@ -282,22 +294,24 @@ def axa_pdf_accessibility(ingest_result: Dict, scope_args: Optional[Dict] = None finally: doc.close() - passed = [c for c in criteria if c['passed']] - failed = [c for c in criteria if not c['passed']] - total = len(criteria) - score = round((len(passed) / total) * 10, 2) if total else 0.0 - pass_flag = len(failed) == 0 + crit_passed = [c for c in criteria if c['passed']] + crit_failed = [c for c in criteria if not c['passed']] + crit_total = len(criteria) - if pass_flag: - summary = f'All {total} accessibility criteria passed.' + verapdf = _run_verapdf(pdf_path) + verapdf_ok = bool(verapdf and verapdf.get('available') and not verapdf.get('error')) + + if verapdf_ok: + score, pass_flag, summary = _score_from_verapdf(verapdf) else: - summary = f'{len(failed)} of {total} accessibility criteria failed.' + score = round((len(crit_passed) / crit_total) * 10, 2) if crit_total else 0.0 + pass_flag = len(crit_failed) == 0 + if pass_flag: + summary = f'All {crit_total} fast accessibility criteria passed (veraPDF unavailable — install for full PDF/UA-1 validation).' + else: + summary = f'{len(crit_failed)} of {crit_total} fast accessibility criteria failed (veraPDF unavailable).' 
def _score_from_verapdf(verapdf: Dict) -> tuple:
    """Map a veraPDF PDF/UA-1 verdict to ``(score, pass_flag, summary)``.

    Args:
        verapdf: Successful result dict from ``_run_verapdf`` (no 'error'
            key). Reads 'compliant', 'passed_rules', 'failed_rules' and
            'failed_checks'; missing counters default to 0.

    Returns:
        A 3-tuple ``(score, pass_flag, summary)``:
          * compliant            -> (10.0, True, ...)
          * 0 or 1 failed rules  -> (5.0, False, ...)
          * exactly 2            -> (3.0, False, ...)
          * 3 or more            -> (0.0, False, ...)

    Any rule failure means the document is not PDF/UA-1, so pass_flag is
    False whenever veraPDF marks the file non-compliant. The score only
    grades the depth of failure so partially-compliant documents still
    produce a meaningful number for trend tracking.
    """
    if verapdf.get('compliant'):
        n_rules = verapdf.get('passed_rules', 0)
        return 10.0, True, f'PDF/UA-1 compliant per veraPDF ({n_rules} rules passed).'

    n_failed = verapdf.get('failed_rules', 0)
    n_failed_checks = verapdf.get('failed_checks', 0)
    if n_failed <= 1:
        score = 5.0
    elif n_failed == 2:
        score = 3.0
    else:
        score = 0.0
    summary = (
        f'PDF/UA-1 non-compliant per veraPDF: {n_failed} rule(s) failed '
        f'across {n_failed_checks} individual check(s).'
    )
    return score, False, summary


def _build_response_text(summary: str, criteria: List[Dict], verapdf: Optional[Dict]) -> str:
    """Build the plain-text response shown in the QC report's response block.

    Args:
        summary: One-line verdict placed first, followed by a blank line.
        criteria: Deterministic criterion dicts; each must carry 'passed',
            'code', 'title' and 'note'.
        verapdf: Successful veraPDF result dict, or None/empty to omit the
            veraPDF section entirely.

    Returns:
        The newline-joined report text.
    """
    lines = [summary, '']

    if verapdf:
        lines.append('── veraPDF PDF/UA-1 ──')
        verdict = 'COMPLIANT' if verapdf.get('compliant') else 'NOT COMPLIANT'
        lines.append(f' Verdict: {verdict}')
        lines.append(
            f' Rules: {verapdf.get("passed_rules", 0)} passed / '
            f'{verapdf.get("failed_rules", 0)} failed'
        )
        lines.append(
            f' Checks: {verapdf.get("passed_checks", 0)} passed / '
            f'{verapdf.get("failed_checks", 0)} failed'
        )
        for r in verapdf.get('failed_rule_details', []):
            tag_str = ', '.join(r.get('tags') or []) or '—'
            lines.append('')
            lines.append(
                f' ✗ Clause {r["clause"]}-{r["test_number"]} '
                f'(×{r["failed_checks"]}, {tag_str})'
            )
            lines.append(f' {r["description"]}')
            # Only the first sample error is shown to keep the block short.
            for s in r.get('sample_errors', [])[:1]:
                lines.append(f' e.g. {s}')
        lines.append('')

    lines.append('── Fast deterministic criteria ──')
    for c in criteria:
        marker = '✓' if c['passed'] else '✗'
        lines.append(f" {marker} {c['code']} — {c['title']}: {c['note']}")

    return '\n'.join(lines)


# ─────────────────────────────────────────────────────────────────────────────
# veraPDF integration
# ─────────────────────────────────────────────────────────────────────────────

def _resolve_verapdf_binary() -> Optional[str]:
    """Locate the veraPDF executable.

    Resolution order: ``VERAPDF_BIN`` env var > ``verapdf`` on PATH >
    project-local vendor install (``_VERAPDF_VENDOR_PATH``). An env var
    that does not point at an executable file is ignored and the lookup
    continues with the next source.

    Returns:
        Path to the executable, or None if veraPDF is not installed; the
        check then falls back to deterministic-only mode.
    """
    env_path = os.environ.get('VERAPDF_BIN')
    if env_path and os.path.isfile(env_path) and os.access(env_path, os.X_OK):
        return env_path
    path_lookup = shutil.which('verapdf')
    if path_lookup:
        return path_lookup
    if os.path.isfile(_VERAPDF_VENDOR_PATH) and os.access(_VERAPDF_VENDOR_PATH, os.X_OK):
        return _VERAPDF_VENDOR_PATH
    return None


def _safe_int(value: Optional[str]) -> int:
    """Return *value* parsed as an int, or 0 when it is missing or malformed."""
    try:
        return int(value or 0)
    except (TypeError, ValueError):
        return 0


def _parse_verapdf_report(raw_xml: bytes, binary: str) -> Dict:
    """Turn veraPDF's XML report into the structured result dict.

    Args:
        raw_xml: Report as emitted on stdout. Bytes are preferred so the
            XML parser honours the encoding declared in the document.
        binary: Path of the executable that produced the report (echoed
            back in the result for diagnostics).

    Returns:
        On success a dict with 'compliant', 'profile', 'statement', the
        four pass/fail counters and 'failed_rule_details'. If the XML is
        unparseable or has no ``validationReport`` element, a dict with
        'error' populated instead. 'available' and 'binary' are always set.
    """
    try:
        root = ET.fromstring(raw_xml)
    except ET.ParseError as e:
        return {
            'available': True,
            'binary': binary,
            'error': f'Could not parse veraPDF XML: {e}',
        }

    vr = root.find('.//validationReport')
    if vr is None:
        return {
            'available': True,
            'binary': binary,
            'error': 'No validationReport in veraPDF output',
        }

    details = vr.find('details')
    rules: List[Dict] = []
    if details is not None:
        for rule in details.findall('rule'):
            tags = [t.strip() for t in (rule.get('tags') or '').split(',')]
            # Sample at most the first two checks; skip checks that carry
            # no message so renderers never print an empty "e.g." line.
            messages = [
                (c.findtext('errorMessage') or '').strip()
                for c in rule.findall('check')[:2]
            ]
            rules.append({
                'specification': rule.get('specification'),
                'clause': rule.get('clause'),
                'test_number': rule.get('testNumber'),
                'tags': [t for t in tags if t],
                # Guarded like the document-level counters: a malformed
                # attribute must not abort the whole accessibility check.
                'failed_checks': _safe_int(rule.get('failedChecks')),
                'description': (rule.findtext('description') or '').strip(),
                'sample_errors': [m for m in messages if m],
            })

    def _detail_int(name: str) -> int:
        if details is None:
            return 0
        return _safe_int(details.get(name))

    return {
        'available': True,
        'binary': binary,
        'compliant': vr.get('isCompliant') == 'true',
        'profile': vr.get('profileName', 'PDF/UA-1'),
        'statement': vr.get('statement', ''),
        'passed_rules': _detail_int('passedRules'),
        'failed_rules': _detail_int('failedRules'),
        'passed_checks': _detail_int('passedChecks'),
        'failed_checks': _detail_int('failedChecks'),
        'failed_rule_details': rules,
    }


def _run_verapdf(pdf_path: str) -> Optional[Dict]:
    """Run veraPDF PDF/UA-1 validation on *pdf_path*.

    Args:
        pdf_path: Path of the PDF handed to the veraPDF command line.

    Returns:
        None when veraPDF is not installed. Otherwise a dict with
        'available' and 'binary' set, plus either the parsed report
        fields (see ``_parse_verapdf_report``) or an 'error' message when
        the subprocess timed out, could not be started, produced no
        output, or produced output that could not be parsed. This
        function does not raise for subprocess failures.
    """
    binary = _resolve_verapdf_binary()
    if not binary:
        return None

    try:
        # Capture raw bytes (no text=True): decoding with the locale
        # encoding could garble the report; the XML parser reads the
        # encoding from the document itself.
        result = subprocess.run(
            [binary, '-f', 'ua1', '--format', 'xml', '--maxfailuresdisplayed', '3', pdf_path],
            capture_output=True,
            timeout=_VERAPDF_TIMEOUT_SECONDS,
        )
    except subprocess.TimeoutExpired:
        return {'available': True, 'binary': binary, 'error': f'veraPDF timed out after {_VERAPDF_TIMEOUT_SECONDS}s'}
    except Exception as e:
        # Deliberate best-effort boundary: any launch failure is reported
        # in the result so the caller can fall back to the fast criteria.
        return {'available': True, 'binary': binary, 'error': f'veraPDF subprocess failed: {e}'}

    stdout = result.stdout or b''
    if not stdout.strip():
        stderr_text = (result.stderr or b'').decode('utf-8', errors='replace')
        return {
            'available': True,
            'binary': binary,
            'error': 'veraPDF produced no output',
            'stderr': stderr_text[:500],
        }

    return _parse_verapdf_report(stdout, binary)
- {passed} / {total} PDF/UA-aligned criteria passed - · veraPDF: {'enabled' if verapdf_run else 'not run (Java not installed)'} + {passed} / {total} fast criteria passed + · veraPDF PDF/UA-1: {verapdf_label}
""" + verapdf_block = '' + if verapdf_run: + compliant = verapdf.get('compliant') + verdict_html = ( + "COMPLIANT" if compliant + else "NOT COMPLIANT" + ) + rule_rows = [] + for r in verapdf.get('failed_rule_details') or []: + tags = ', '.join(r.get('tags') or []) or '—' + samples = r.get('sample_errors') or [] + sample_html = '' + if samples: + sample_html = ( + "e.g. " + html.escape(samples[0]) + ""
+ )
+ rule_rows.append(f"""
+ {html.escape(str(r.get('clause', '')))}-{html.escape(str(r.get('test_number', '')))}{html.escape(tags)}veraPDF verdict: {verdict_html} · + {verapdf.get('passed_rules', 0)} rules passed / {verapdf.get('failed_rules', 0)} failed · + {verapdf.get('passed_checks', 0)} checks passed / {verapdf.get('failed_checks', 0)} failed
+ """ + if rule_rows: + verapdf_block += f""" +| Clause | Failures | Tags | Description |
|---|
| Code | Criterion | Observation |
|---|