From 112719b2c504d99d47006ecf94130a699beaa7f8 Mon Sep 17 00:00:00 2001 From: Vadym Samoilenko Date: Wed, 25 Feb 2026 18:12:44 +0000 Subject: [PATCH] Add Docker stack, frontend redesign, and visual page inspector fix - Redesigned frontend with Outfit/Figtree typography, coral accent palette, noise texture, glassmorphism header, and staggered animations - Split monolithic index.html into modular JS (app, api, upload, batch, results, page-viewer, utils) and extracted CSS - Fixed worker.py to generate page images for Visual Page Inspector - Added Docker Compose stack (web, worker, redis, postgres) - Added batch upload, HTML report export, rate limiting, and Redis queue - Extended test suite with checker, remediation, worker, and DB tests Co-Authored-By: Claude Opus 4.6 --- .dockerignore | 25 + .env.example | 22 + .gitignore | 15 + Dockerfile.web | 33 + Dockerfile.worker | 31 + Test_files/sample_poor_remediated.pdf | 122 ++ api.php | 531 ++++++-- auth.php | 19 +- create_test_pdf_with_images.py | 2 +- css/styles.css | 987 ++++++++++++++ db/init.sql | 36 + db_manager.py | 146 ++ deploy.sh | 217 +++ docker-compose.prod.yml | 66 + docker-compose.yml | 69 + docker-entrypoint-web.sh | 12 + enterprise_pdf_checker.py | 269 +++- index.html | 1770 ++----------------------- js/api.js | 86 ++ js/app.js | 123 ++ js/batch.js | 275 ++++ js/page-viewer.js | 180 +++ js/results.js | 225 ++++ js/upload.js | 193 +++ js/utils.js | 72 + nginx.conf | 38 + pdf_remediation.py | 2 +- redis_queue.py | 92 ++ report_generator.py | 254 ++++ requirements.txt | 4 + tests/conftest.py | 8 + tests/test_api.py | 14 +- tests/test_checker.py | 6 +- tests/test_checker_extended.py | 593 +++++++++ tests/test_db_manager.py | 312 +++++ tests/test_redis_queue.py | 204 +++ tests/test_remediation_extended.py | 196 +++ tests/test_retry_extended.py | 168 +++ tests/test_worker.py | 133 ++ worker.py | 163 +++ 40 files changed, 5915 insertions(+), 1798 deletions(-) create mode 100644 .dockerignore create mode 100644 
Dockerfile.web create mode 100644 Dockerfile.worker create mode 100644 Test_files/sample_poor_remediated.pdf create mode 100644 css/styles.css create mode 100644 db/init.sql create mode 100644 db_manager.py create mode 100755 deploy.sh create mode 100644 docker-compose.prod.yml create mode 100644 docker-compose.yml create mode 100644 docker-entrypoint-web.sh create mode 100644 js/api.js create mode 100644 js/app.js create mode 100644 js/batch.js create mode 100644 js/page-viewer.js create mode 100644 js/results.js create mode 100644 js/upload.js create mode 100644 js/utils.js create mode 100644 nginx.conf create mode 100644 redis_queue.py create mode 100644 report_generator.py create mode 100644 tests/test_checker_extended.py create mode 100644 tests/test_db_manager.py create mode 100644 tests/test_redis_queue.py create mode 100644 tests/test_remediation_extended.py create mode 100644 tests/test_retry_extended.py create mode 100644 tests/test_worker.py create mode 100644 worker.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..aeaf0c0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,25 @@ +.git +.gitignore +.env +.keys +.api_keys +.coverage +.cache +.pytest_cache +__pycache__ +venv/ +env/ +htmlcov/ +*.pyc +*.pyo +.DS_Store +Thumbs.db +.vscode/ +.idea/ +logs/ +results/ +uploads/ +*.md +docs_req/ +README's/ +ENTERPRISE_ROADMAP.md diff --git a/.env.example b/.env.example index 2fd2dc8..8d8a91b 100644 --- a/.env.example +++ b/.env.example @@ -16,3 +16,25 @@ GOOGLE_API_KEY=AIzaSyDWVxBWiDTeECqapiUpbXJadrxqcoA9tus # Note: You only need ONE of the Google options above, not both # The credentials file method is recommended for production use + +# Development mode - set to 'true' for localhost auth bypass +DEV_MODE=true + +# Database (PostgreSQL) - used in Docker setup +DB_HOST=postgres +DB_PORT=5432 +DB_NAME=pdf_checker +DB_USER=pdf_checker +DB_PASSWORD=change_me_in_production + +# Redis - used for job queue in Docker setup +REDIS_HOST=redis 
+REDIS_PORT=6379 + +# Worker configuration +WORKER_COUNT=2 + +# Azure AD / MSAL Authentication +AZURE_TENANT_ID=e519c2e6-bc6d-4fdf-8d9c-923c2f002385 +AZURE_CLIENT_ID=9079054c-9620-4757-a256-23413042f1ef +AZURE_REDIRECT_URI=https://ai-sandbox.oliver.solutions/pdf-accessibility diff --git a/.gitignore b/.gitignore index 232ee43..4d0bd68 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ # Environment variables (contains API keys) .env +.keys +.api_keys # Python __pycache__/ @@ -28,3 +30,16 @@ reports/ # OS .DS_Store Thumbs.db + +# Docker volumes (local data) +pg-data/ +redis-data/ + +# Coverage +.coverage +htmlcov/ + +# Uploads and results (runtime data) +uploads/ +results/ +logs/ diff --git a/Dockerfile.web b/Dockerfile.web new file mode 100644 index 0000000..a152a68 --- /dev/null +++ b/Dockerfile.web @@ -0,0 +1,33 @@ +FROM php:8.2-fpm-alpine + +# Install Nginx, Python (for report generation), PostgreSQL libs, and PHP extensions +RUN apk add --no-cache nginx python3 postgresql-dev && \ + docker-php-ext-install pdo pdo_pgsql + +# Install php-redis via PECL +RUN apk add --no-cache --virtual .build-deps $PHPIZE_DEPS && \ + pecl install redis && \ + docker-php-ext-enable redis && \ + apk del .build-deps + +# Copy Nginx config +COPY nginx.conf /etc/nginx/http.d/default.conf + +# Copy application files +WORKDIR /app +COPY api.php auth.php index.html ./ +COPY report_generator.py ./ +COPY css/ css/ +COPY js/ js/ + +# Create directories +RUN mkdir -p /app/uploads /app/results /app/logs && \ + chown -R www-data:www-data /app/uploads /app/results /app/logs + +# Start both Nginx and PHP-FPM +COPY docker-entrypoint-web.sh /docker-entrypoint-web.sh +RUN chmod +x /docker-entrypoint-web.sh + +EXPOSE 80 + +CMD ["/docker-entrypoint-web.sh"] diff --git a/Dockerfile.worker b/Dockerfile.worker new file mode 100644 index 0000000..e91be9c --- /dev/null +++ b/Dockerfile.worker @@ -0,0 +1,31 @@ +FROM python:3.11-slim + +# Install system dependencies for PDF processing +RUN apt-get 
update && apt-get install -y --no-install-recommends \ + tesseract-ocr \ + tesseract-ocr-eng \ + poppler-utils \ + ghostscript \ + libgl1 \ + libglib2.0-0 \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY enterprise_pdf_checker.py . +COPY pdf_remediation.py . +COPY logger_config.py . +COPY retry_helper.py . +COPY redis_queue.py . +COPY db_manager.py . +COPY worker.py . + +# Create directories +RUN mkdir -p /app/uploads /app/results /app/logs + +CMD ["python", "worker.py"] diff --git a/Test_files/sample_poor_remediated.pdf b/Test_files/sample_poor_remediated.pdf new file mode 100644 index 0000000..fa49fca --- /dev/null +++ b/Test_files/sample_poor_remediated.pdf @@ -0,0 +1,122 @@ +%PDF-1.3 +%βγΟΣ +1 0 obj +<< +/Producer (ReportLab PDF Library \055 www\056reportlab\056com) +/Author (anonymous) +/CreationDate (D\07220251020135612\05300\04700\047) +/Creator (ReportLab PDF Library \055 www\056reportlab\056com) +/Keywords () +/ModDate (D\07220251020135612\05300\04700\047) +/Subject (unspecified) +/Title (untitled) +/Trapped (\057False) +>> +endobj +2 0 obj +<< +/Type /Pages +/Count 2 +/Kids [ 4 0 R 9 0 R ] +>> +endobj +3 0 obj +<< +/Type /Catalog +/Pages 2 0 R +>> +endobj +4 0 obj +<< +/Contents 5 0 R +/MediaBox [ 0 0 612 792 ] +/Resources << +/Font 6 0 R +/ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> +/Rotate 0 +/Trans << +>> +/Type /Page +/Parent 2 0 R +>> +endobj +5 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] +/Length 242 +>> +stream +Gas3,9+&Ni'SYMVX#NH]e0\.o%RgOe`'H9mj)#`LXE\XqGAho&(/t>Q*:eSVM!Cc'[gU"$@'EI()CC/qq_?;%F47_h)EPV"3pA$\>s/K/72V$M0VCQZ>nuQG3.&cPA?L_M0RK2T9De]]6]3%TaZX,i>9LB`lPqYVXY7=lE'0E?Jc\`:qFf5DU)uu +endstream +endobj +6 0 obj +<< +/F1 7 0 R +/F2 8 0 R +>> +endobj +7 0 obj +<< +/BaseFont /Helvetica +/Encoding /WinAnsiEncoding +/Name /F1 +/Subtype /Type1 +/Type /Font +>> +endobj +8 0 obj +<< 
+/BaseFont /Helvetica-Bold +/Encoding /WinAnsiEncoding +/Name /F2 +/Subtype /Type1 +/Type /Font +>> +endobj +9 0 obj +<< +/Contents 10 0 R +/MediaBox [ 0 0 612 792 ] +/Resources << +/Font 6 0 R +/ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> +/Rotate 0 +/Trans << +>> +/Type /Page +/Parent 2 0 R +>> +endobj +10 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] +/Length 107 +>> +stream +GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_M(M8&8HllJUrE@,u?n1Jjr"7HE)RZ6?7N]8SVRgVF!h>6AQCJ]`JuM=h>P"~> +endstream +endobj +xref +0 11 +0000000000 65535 f +0000000015 00000 n +0000000355 00000 n +0000000420 00000 n +0000000469 00000 n +0000000658 00000 n +0000000991 00000 n +0000001032 00000 n +0000001139 00000 n +0000001251 00000 n +0000001441 00000 n +trailer +<< +/Size 11 +/Root 3 0 R +/Info 1 0 R +>> +startxref +1640 +%%EOF diff --git a/api.php b/api.php index fe78c90..4d5cbee 100644 --- a/api.php +++ b/api.php @@ -5,6 +5,23 @@ * Handles file uploads, job processing, and result retrieval */ +// Load .env file if getenv doesn't work (Apache doesn't set env vars by default) +$envFile = __DIR__ . '/.env'; +if (file_exists($envFile)) { + $lines = file($envFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); + foreach ($lines as $line) { + $line = trim($line); + if ($line === '' || $line[0] === '#') continue; + if (strpos($line, '=') === false) continue; + list($key, $val) = explode('=', $line, 2); + $key = trim($key); + $val = trim($val); + if (!getenv($key)) { + putenv("$key=$val"); + } + } +} + // Configuration define('UPLOAD_DIR', __DIR__ . '/uploads'); define('RESULTS_DIR', __DIR__ . '/results'); @@ -12,12 +29,73 @@ define('PYTHON_SCRIPT', __DIR__ . 
'/enterprise_pdf_checker.py'); define('MAX_FILE_SIZE', 50 * 1024 * 1024); // 50MB define('ALLOWED_EXTENSIONS', ['pdf']); +// Redis configuration +define('REDIS_HOST', getenv('REDIS_HOST') ?: 'localhost'); +define('REDIS_PORT', intval(getenv('REDIS_PORT') ?: 6379)); +define('REDIS_QUEUE', 'pdf:queue'); +define('REDIS_STATUS_PREFIX', 'pdf:status:'); +define('REDIS_RATE_PREFIX', 'pdf:rate:'); + // Create directories if they don't exist if (!is_dir(UPLOAD_DIR)) mkdir(UPLOAD_DIR, 0755, true); if (!is_dir(RESULTS_DIR)) mkdir(RESULTS_DIR, 0755, true); +/** + * Get Redis connection (lazy singleton) + */ +function getRedis() { + static $redis = null; + if ($redis === null) { + $redis = new Redis(); + $redis->connect(REDIS_HOST, REDIS_PORT); + } + return $redis; +} + +/** + * Check rate limit via Redis. Returns true if allowed. + */ +function checkRateLimit($action, $limit, $window) { + try { + $redis = getRedis(); + $ip = $_SERVER['REMOTE_ADDR'] ?? 'unknown'; + $key = REDIS_RATE_PREFIX . $ip . ':' . $action; + $current = $redis->incr($key); + if ($current === 1) { + $redis->expire($key, $window); + } + return $current <= $limit; + } catch (Exception $e) { + return true; // Allow if Redis is down + } +} + +/** + * Sanitize job ID to prevent path traversal attacks + */ +function sanitizeJobId($job_id) { + if (!preg_match('/^pdf_[a-f0-9]+$/', $job_id)) { + error('Invalid job ID format'); + } + return $job_id; +} + // CORS headers for API -header('Access-Control-Allow-Origin: *'); +$allowed_origins = [ + 'https://ai-sandbox.oliver.solutions', + 'http://localhost:8888', + 'http://127.0.0.1:8888', + 'http://localhost:8000', + 'http://127.0.0.1:8000', +]; +$origin = $_SERVER['HTTP_ORIGIN'] ?? ''; +if (in_array($origin, $allowed_origins) || (function_exists('isDevelopmentMode') && isDevelopmentMode())) { + header('Access-Control-Allow-Origin: ' . 
($origin ?: '*')); +} else if ($origin) { + header('Access-Control-Allow-Origin: null'); +} else { + header('Access-Control-Allow-Origin: ' . ($allowed_origins[0])); +} header('Access-Control-Allow-Methods: POST, GET, OPTIONS, DELETE'); header('Access-Control-Allow-Headers: Content-Type, X-API-Key, Authorization'); header('Content-Type: application/json'); @@ -65,6 +143,18 @@ switch ($action) { case 'download': handleDownload(); break; + case 'stats': + handleStats(); + break; + case 'batch_upload': + handleBatchUpload(); + break; + case 'batch_status': + handleBatchStatus(); + break; + case 'export': + handleExport(); + break; default: error('Invalid action'); } @@ -73,6 +163,13 @@ switch ($action) { * Handle file upload */ function handleUpload() { + // Rate limit: 10 uploads/hour per IP + if (!checkRateLimit('upload', 10, 3600)) { + http_response_code(429); + echo json_encode(['success' => false, 'error' => 'Upload rate limit exceeded. Try again later.']); + exit; + } + if (!isset($_FILES['pdf'])) { error('No file uploaded'); } @@ -92,9 +189,15 @@ function handleUpload() { if (!in_array($ext, ALLOWED_EXTENSIONS)) { error('Invalid file type. Only PDF files allowed.'); } - - // Generate unique ID - $job_id = uniqid('pdf_', true); + + // Validate PDF magic bytes + $header = file_get_contents($file['tmp_name'], false, null, 0, 5); + if ($header !== '%PDF-') { + error('File is not a valid PDF (invalid file header)'); + } + + // Generate cryptographically secure job ID + $job_id = 'pdf_' . bin2hex(random_bytes(16)); $filename = $job_id . '.pdf'; $filepath = UPLOAD_DIR . '/' . $filename; @@ -126,91 +229,105 @@ function handleUpload() { } /** - * Handle PDF accessibility check + * Handle PDF accessibility check β€” push job to Redis queue */ function handleCheck() { $job_id = $_POST['job_id'] ?? 
''; - + if (empty($job_id)) { error('Job ID required'); } - + $job_id = sanitizeJobId($job_id); + + // Rate limit: 30 checks/hour per IP + if (!checkRateLimit('check', 30, 3600)) { + http_response_code(429); + echo json_encode(['success' => false, 'error' => 'Rate limit exceeded. Try again later.']); + exit; + } + $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json'; - + if (!file_exists($meta_file)) { error('Job not found'); } - + $job_data = json_decode(file_get_contents($meta_file), true); - - // Build command - use venv Python with absolute path - $pdf_path = $job_data['filepath']; - $output_path = RESULTS_DIR . '/' . $job_id . '.result.json'; - // Use absolute venv path for MAMP - $venv_python = __DIR__ . '/venv/bin/python3'; - $python_bin = file_exists($venv_python) ? $venv_python : 'python3'; - - // Note: Python script will auto-generate page images when --output is specified - $cmd = escapeshellcmd($python_bin . ' ' . PYTHON_SCRIPT) . ' ' . - escapeshellarg($pdf_path) . ' ' . - '--output ' . escapeshellarg($output_path); - - // Handle quick mode $quick_mode = $_POST['quick_mode'] ?? false; - if ($quick_mode) { - $cmd .= ' --quick'; - } - // Handle API keys - accept both formats - $anthropic_key = $_POST['anthropic_key'] ?? getenv('ANTHROPIC_API_KEY'); - $google_key = $_POST['google_key'] ?? $_POST['google_credentials'] ?? getenv('GOOGLE_API_KEY') ?? getenv('GOOGLE_APPLICATION_CREDENTIALS'); - - if ($anthropic_key) { - $cmd .= ' --anthropic-key ' . escapeshellarg($anthropic_key); - } - - if ($google_key) { - // Check if it's a file path or an API key - if (file_exists($google_key)) { - // It's a JSON credentials file - $cmd .= ' --google-credentials ' . escapeshellarg($google_key); - } else { - // It's an API key string - $cmd .= ' --google-key ' . 
escapeshellarg($google_key); + // Push job to Redis queue for worker processing + try { + $redis = getRedis(); + $payload = json_encode([ + 'job_id' => $job_id, + 'pdf_path' => $job_data['filepath'], + 'original_filename' => $job_data['original_filename'] ?? '', + 'options' => [ + 'quick_mode' => (bool)$quick_mode, + ], + 'queued_at' => time() + ]); + $redis->lPush(REDIS_QUEUE, $payload); + + // Set initial status in Redis + $redis->setex(REDIS_STATUS_PREFIX . $job_id, 86400, json_encode([ + 'status' => 'queued', + 'progress' => 0, + 'message' => 'Waiting in queue', + 'updated_at' => time() + ])); + } catch (Exception $e) { + // Fallback to direct exec if Redis is unavailable (local dev without Docker) + $pdf_path = $job_data['filepath']; + $output_path = RESULTS_DIR . '/' . $job_id . '.result.json'; + $venv_python = __DIR__ . '/venv/bin/python3'; + $python_bin = file_exists($venv_python) ? $venv_python : 'python3'; + + $cmd = escapeshellcmd($python_bin . ' ' . PYTHON_SCRIPT) . ' ' . + escapeshellarg($pdf_path) . ' ' . + '--output ' . escapeshellarg($output_path); + + if ($quick_mode) { + $cmd .= ' --quick'; } + + $anthropic_key = $_POST['anthropic_key'] ?? getenv('ANTHROPIC_API_KEY'); + $google_key = $_POST['google_key'] ?? $_POST['google_credentials'] ?? getenv('GOOGLE_API_KEY'); + + if ($anthropic_key) { + $cmd .= ' --anthropic-key ' . escapeshellarg($anthropic_key); + } + if ($google_key) { + if (file_exists($google_key)) { + $cmd .= ' --google-credentials ' . escapeshellarg($google_key); + } else { + $cmd .= ' --google-key ' . escapeshellarg($google_key); + } + } + + $env_path = getenv('PATH'); + putenv("PATH=/opt/homebrew/bin:/usr/local/bin:{$env_path}"); + + $error_log = RESULTS_DIR . '/' . $job_id . '.error.log'; + $cmd .= ' > ' . escapeshellarg($error_log) . 
' 2>&1 &'; + exec($cmd, $output, $return_code); } - - // Update status - $job_data['status'] = 'processing'; + + // Update meta file + $job_data['status'] = 'queued'; $job_data['started_at'] = date('Y-m-d H:i:s'); - $job_data['command'] = $cmd; // Store for debugging file_put_contents($meta_file, json_encode($job_data, JSON_PRETTY_PRINT)); - // Set PATH to include Homebrew (for poppler) - $env_path = getenv('PATH'); - $poppler_paths = '/opt/homebrew/bin:/usr/local/bin'; - putenv("PATH={$poppler_paths}:{$env_path}"); - - // Log errors to a file for debugging - $error_log = RESULTS_DIR . '/' . $job_id . '.error.log'; - $cmd .= ' > ' . escapeshellarg($error_log) . ' 2>&1 &'; - - exec($cmd, $output, $return_code); - success([ 'job_id' => $job_id, - 'status' => 'processing', - 'message' => 'Check started', - 'debug' => [ - 'command' => $cmd, - 'return_code' => $return_code - ] + 'status' => 'queued', + 'message' => 'Check queued for processing' ]); } /** - * Check job status + * Check job status β€” reads from Redis (real-time) with file fallback */ function handleStatus() { $job_id = $_GET['job_id'] ?? ''; @@ -218,6 +335,7 @@ function handleStatus() { if (empty($job_id)) { error('Job ID required'); } + $job_id = sanitizeJobId($job_id); $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json'; $result_file = RESULTS_DIR . '/' . $job_id . '.result.json'; @@ -229,23 +347,33 @@ function handleStatus() { $job_data = json_decode(file_get_contents($meta_file), true); - // Check if result exists + // Try Redis first for real-time progress + try { + $redis = getRedis(); + $redis_status = $redis->get(REDIS_STATUS_PREFIX . $job_id); + if ($redis_status) { + $status_data = json_decode($redis_status, true); + $job_data['status'] = $status_data['status']; + $job_data['progress'] = $status_data['progress'] ?? 0; + $job_data['status_message'] = $status_data['message'] ?? 
''; + } + } catch (Exception $e) { + // Redis unavailable β€” fall through to file-based check + } + + // File-based fallback: check if result exists if (file_exists($result_file)) { $job_data['status'] = 'completed'; $job_data['completed_at'] = date('Y-m-d H:i:s', filemtime($result_file)); - - // Update meta file_put_contents($meta_file, json_encode($job_data, JSON_PRETTY_PRINT)); - } else if (file_exists($error_log)) { - // Check if there are errors + } else if (file_exists($error_log) && $job_data['status'] === 'processing') { $error_content = file_get_contents($error_log); - if (!empty($error_content) && $job_data['status'] == 'processing') { - // Check if it's been more than 5 minutes - $started = strtotime($job_data['started_at']); + if (!empty($error_content)) { + $started = strtotime($job_data['started_at'] ?? 'now'); if (time() - $started > 300) { $job_data['status'] = 'failed'; $job_data['error'] = 'Process timeout or error'; - $job_data['error_log'] = substr($error_content, -1000); // Last 1000 chars + $job_data['error_log'] = substr($error_content, -1000); } } } @@ -258,10 +386,11 @@ function handleStatus() { */ function handleResult() { $job_id = $_GET['job_id'] ?? ''; - + if (empty($job_id)) { error('Job ID required'); } + $job_id = sanitizeJobId($job_id); $result_file = RESULTS_DIR . '/' . $job_id . '.result.json'; @@ -307,10 +436,11 @@ function handleList() { */ function handleDelete() { $job_id = $_POST['job_id'] ?? $_GET['job_id'] ?? ''; - + if (empty($job_id)) { error('Job ID required'); } + $job_id = sanitizeJobId($job_id); $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json'; @@ -332,11 +462,18 @@ function handleDelete() { * Debug endpoint */ function handleDebug() { + // Debug endpoint only available in development mode + require_once __DIR__ . '/auth.php'; + if (!isDevelopmentMode()) { + error('Debug endpoint disabled in production'); + } + $job_id = $_GET['job_id'] ?? 
''; if (empty($job_id)) { error('Job ID required'); } + $job_id = sanitizeJobId($job_id); $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json'; $result_file = RESULTS_DIR . '/' . $job_id . '.result.json'; @@ -380,6 +517,7 @@ function handleImage() { if (empty($job_id) || empty($page_num)) { error('Job ID and page number required'); } + $job_id = sanitizeJobId($job_id); // Find the image file $images_dir = RESULTS_DIR . '/' . $job_id . '.result_images'; @@ -408,6 +546,7 @@ function handleRemediate() { if (empty($job_id)) { error('Job ID required'); } + $job_id = sanitizeJobId($job_id); $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json'; $result_file = RESULTS_DIR . '/' . $job_id . '.result.json'; @@ -480,6 +619,7 @@ function handleDownload() { if (empty($job_id)) { error('Job ID required'); } + $job_id = sanitizeJobId($job_id); $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json'; @@ -508,6 +648,247 @@ function handleDownload() { exit; } +/** + * Get aggregate job statistics + */ +function handleStats() { + $stats = [ + 'total_jobs' => 0, + 'completed' => 0, + 'failed' => 0, + 'processing' => 0, + 'queue_length' => 0 + ]; + + // Count jobs from meta files + $files = glob(RESULTS_DIR . '/*.meta.json'); + foreach ($files as $file) { + $job = json_decode(file_get_contents($file), true); + $stats['total_jobs']++; + $result_file = str_replace('.meta.json', '.result.json', $file); + if (file_exists($result_file)) { + $stats['completed']++; + } else if (($job['status'] ?? 
'') === 'failed') { + $stats['failed']++; + } else { + $stats['processing']++; + } + } + + // Get queue length from Redis + try { + $redis = getRedis(); + $stats['queue_length'] = $redis->lLen(REDIS_QUEUE); + } catch (Exception $e) { + // Redis unavailable + } + + success($stats); +} + +/** + * Handle batch file upload β€” accepts multiple PDFs + */ +function handleBatchUpload() { + if (!checkRateLimit('upload', 10, 3600)) { + http_response_code(429); + echo json_encode(['success' => false, 'error' => 'Upload rate limit exceeded.']); + exit; + } + + if (!isset($_FILES['pdfs']) || !is_array($_FILES['pdfs']['name'])) { + error('No files uploaded. Use "pdfs[]" as the file field name.'); + } + + $batch_id = 'batch_' . bin2hex(random_bytes(8)); + $file_count = count($_FILES['pdfs']['name']); + $uploaded = []; + $errors = []; + + for ($i = 0; $i < $file_count; $i++) { + $name = $_FILES['pdfs']['name'][$i]; + $tmp = $_FILES['pdfs']['tmp_name'][$i]; + $size = $_FILES['pdfs']['size'][$i]; + $err = $_FILES['pdfs']['error'][$i]; + + if ($err !== UPLOAD_ERR_OK) { + $errors[] = ['filename' => $name, 'error' => "Upload error code: $err"]; + continue; + } + if ($size > MAX_FILE_SIZE) { + $errors[] = ['filename' => $name, 'error' => 'File too large']; + continue; + } + $ext = strtolower(pathinfo($name, PATHINFO_EXTENSION)); + if (!in_array($ext, ALLOWED_EXTENSIONS)) { + $errors[] = ['filename' => $name, 'error' => 'Not a PDF file']; + continue; + } + $header = file_get_contents($tmp, false, null, 0, 5); + if ($header !== '%PDF-') { + $errors[] = ['filename' => $name, 'error' => 'Invalid PDF header']; + continue; + } + + $job_id = 'pdf_' . bin2hex(random_bytes(16)); + $filename = $job_id . '.pdf'; + $filepath = UPLOAD_DIR . '/' . 
$filename; + + if (!move_uploaded_file($tmp, $filepath)) { + $errors[] = ['filename' => $name, 'error' => 'Failed to save']; + continue; + } + + $job_data = [ + 'job_id' => $job_id, + 'batch_id' => $batch_id, + 'original_filename' => $name, + 'uploaded_at' => date('Y-m-d H:i:s'), + 'file_size' => $size, + 'status' => 'uploaded', + 'filepath' => $filepath + ]; + file_put_contents( + RESULTS_DIR . '/' . $job_id . '.meta.json', + json_encode($job_data, JSON_PRETTY_PRINT) + ); + + $uploaded[] = ['job_id' => $job_id, 'filename' => $name]; + } + + // Save batch manifest + $batch_data = [ + 'batch_id' => $batch_id, + 'created_at' => date('Y-m-d H:i:s'), + 'total_files' => $file_count, + 'jobs' => array_column($uploaded, 'job_id'), + ]; + file_put_contents( + RESULTS_DIR . '/' . $batch_id . '.batch.json', + json_encode($batch_data, JSON_PRETTY_PRINT) + ); + + success([ + 'batch_id' => $batch_id, + 'uploaded' => $uploaded, + 'errors' => $errors, + 'message' => count($uploaded) . ' of ' . $file_count . ' files uploaded' + ]); +} + +/** + * Get status of a batch job + */ +function handleBatchStatus() { + $batch_id = $_GET['batch_id'] ?? ''; + if (empty($batch_id) || !preg_match('/^batch_[a-f0-9]+$/', $batch_id)) { + error('Invalid batch ID'); + } + + $batch_file = RESULTS_DIR . '/' . $batch_id . '.batch.json'; + if (!file_exists($batch_file)) { + error('Batch not found'); + } + + $batch = json_decode(file_get_contents($batch_file), true); + $jobs = []; + $completed = 0; + $failed = 0; + + foreach ($batch['jobs'] as $job_id) { + $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json'; + $result_file = RESULTS_DIR . '/' . $job_id . '.result.json'; + + $status = 'unknown'; + $score = null; + $filename = ''; + + if (file_exists($meta_file)) { + $meta = json_decode(file_get_contents($meta_file), true); + $status = $meta['status'] ?? 'uploaded'; + $filename = $meta['original_filename'] ?? 
''; + } + if (file_exists($result_file)) { + $status = 'completed'; + $result = json_decode(file_get_contents($result_file), true); + $score = $result['accessibility_score'] ?? null; + $completed++; + } else if ($status === 'failed') { + $failed++; + } + + $jobs[] = [ + 'job_id' => $job_id, + 'filename' => $filename, + 'status' => $status, + 'score' => $score + ]; + } + + $total = count($batch['jobs']); + $overall_status = ($completed === $total) ? 'completed' : + (($completed + $failed === $total) ? 'finished' : 'processing'); + + success([ + 'batch_id' => $batch_id, + 'status' => $overall_status, + 'total' => $total, + 'completed' => $completed, + 'failed' => $failed, + 'jobs' => $jobs + ]); +} + +/** + * Export results as HTML or JSON + */ +function handleExport() { + $job_id = $_GET['job_id'] ?? ''; + $format = $_GET['format'] ?? 'json'; + + if (empty($job_id)) { + error('Job ID required'); + } + $job_id = sanitizeJobId($job_id); + + $result_file = RESULTS_DIR . '/' . $job_id . '.result.json'; + if (!file_exists($result_file)) { + error('Results not found'); + } + + $result = json_decode(file_get_contents($result_file), true); + + if ($format === 'html') { + // Generate HTML report via Python + $venv_python = __DIR__ . '/venv/bin/python3'; + $python_bin = file_exists($venv_python) ? $venv_python : 'python3'; + $report_script = __DIR__ . '/report_generator.py'; + + $html_file = RESULTS_DIR . '/' . $job_id . '.report.html'; + + $cmd = escapeshellcmd($python_bin . ' ' . $report_script) . + ' --input ' . escapeshellarg($result_file) . + ' --output ' . escapeshellarg($html_file); + + exec($cmd . ' 2>&1', $output, $return_code); + + if ($return_code !== 0 || !file_exists($html_file)) { + error('Report generation failed'); + } + + header('Content-Type: text/html; charset=utf-8'); + header('Content-Disposition: attachment; filename="accessibility_report_' . $job_id . 
'.html"'); + readfile($html_file); + exit; + } + + // Default: JSON download + header('Content-Type: application/json'); + header('Content-Disposition: attachment; filename="accessibility_report_' . $job_id . '.json"'); + echo json_encode($result, JSON_PRETTY_PRINT); + exit; +} + /** * Send success response */ diff --git a/auth.php b/auth.php index 9ac0ff1..d63db01 100644 --- a/auth.php +++ b/auth.php @@ -38,10 +38,15 @@ function authenticate() { * @return bool True if development mode */ function isDevelopmentMode() { + // Require DEV_MODE env var to be explicitly set for localhost bypass + $dev_mode = getenv('DEV_MODE'); + if ($dev_mode !== 'true' && $dev_mode !== '1') { + return false; + } $host = $_SERVER['HTTP_HOST'] ?? $_SERVER['SERVER_NAME'] ?? 'unknown'; - - // Allow localhost and 127.0.0.1 without auth - return in_array($host, ['localhost:8000', 'localhost', '127.0.0.1:8000', '127.0.0.1']); + // Match localhost or 127.0.0.1 on any port + $hostname = explode(':', $host)[0]; + return in_array($hostname, ['localhost', '127.0.0.1']); } /** @@ -67,8 +72,8 @@ function extractApiKey() { return trim($_SERVER['HTTP_X_API_KEY']); } - // Check query parameter (least secure) - if (isset($_GET['api_key'])) { + // Check query parameter (least secure - dev only) + if (isDevelopmentMode() && isset($_GET['api_key'])) { return trim($_GET['api_key']); } @@ -108,8 +113,8 @@ function getValidApiKeys() { } } - // Fallback to dev key if no keys configured (DEV MODE ONLY) - if (empty($keys)) { + // Fallback to dev key only in development mode + if (empty($keys) && isDevelopmentMode()) { error_log("WARNING: Using default dev API key. 
Configure proper API keys for production!"); $keys[] = 'dev_key_12345'; } diff --git a/create_test_pdf_with_images.py b/create_test_pdf_with_images.py index 7ca22a2..d0cf709 100644 --- a/create_test_pdf_with_images.py +++ b/create_test_pdf_with_images.py @@ -17,7 +17,7 @@ def create_image_with_text(text, width=300, height=100, bg_color='red', text_col # Try to use a decent font try: font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 24) - except: + except (OSError, IOError): font = ImageFont.load_default() # Draw text on image diff --git a/css/styles.css b/css/styles.css new file mode 100644 index 0000000..fbe25a0 --- /dev/null +++ b/css/styles.css @@ -0,0 +1,987 @@ +/* Enterprise PDF Accessibility Checker β€” Redesigned */ +/* Aesthetic: Precision Observatory β€” utilitarian elegance with warm accents */ + +@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700;800&family=Figtree:wght@400;500;600;700&display=swap'); + +*, *::before, *::after { + margin: 0; + padding: 0; + box-sizing: border-box; +} + +/* ── Design Tokens ── */ +:root { + /* Typography */ + --font-display: 'Outfit', sans-serif; + --font-body: 'Figtree', sans-serif; + + /* Core palette */ + --accent: #e8553d; + --accent-hover: #d44a33; + --accent-glow: rgba(232, 85, 61, 0.15); + --accent-subtle: rgba(232, 85, 61, 0.08); + + /* Semantic */ + --success: #059669; + --success-bg: rgba(5, 150, 105, 0.08); + --warning: #d97706; + --warning-bg: rgba(217, 119, 6, 0.08); + --error: #ef4444; + --error-bg: rgba(239, 68, 68, 0.08); + --critical: #dc2626; + --critical-bg: rgba(220, 38, 38, 0.08); + --info: #3b82f6; + --info-bg: rgba(37, 99, 235, 0.08); + + /* Surfaces β€” Light */ + --bg: #f5f3f0; + --bg-subtle: #eae7e2; + --surface: #ffffff; + --surface-raised: #ffffff; + --surface-alt: #f9f8f6; + --text: #1a1a2e; + --text-light: #555566; + --text-secondary: #555566; + --text-muted: #8888a0; + --border: #e0ddd8; + --border-subtle: #eae8e4; + --divider: #d4d0ca; + 
--log-bg: #faf9f7; + --primary: #e8553d; + --primary-dark: #d44a33; + --black: #1a1a2e; + + /* Shadows */ + --shadow-sm: 0 1px 2px rgba(26, 26, 46, 0.04); + --shadow-md: 0 4px 12px rgba(26, 26, 46, 0.06), 0 1px 3px rgba(26, 26, 46, 0.04); + --shadow-lg: 0 8px 32px rgba(26, 26, 46, 0.08), 0 2px 8px rgba(26, 26, 46, 0.04); + --shadow-glow: 0 0 0 1px var(--accent), 0 0 20px var(--accent-glow); + + /* Geometry */ + --radius-sm: 6px; + --radius-md: 10px; + --radius-lg: 16px; + --radius-xl: 24px; + + /* Transitions */ + --ease-out: cubic-bezier(0.16, 1, 0.3, 1); + --ease-spring: cubic-bezier(0.34, 1.56, 0.64, 1); +} + +/* ── Dark Mode ── */ +:root[data-theme="dark"] { + --bg: #0c0e16; + --bg-subtle: #131520; + --surface: #181b28; + --surface-raised: #1e2235; + --surface-alt: #141724; + --text: #e4e2dd; + --text-light: #9d9bb0; + --text-secondary: #9d9bb0; + --text-muted: #6b697f; + --border: #2a2d40; + --border-subtle: #222538; + --divider: #252840; + --log-bg: #0f1119; + --primary: #ff6b4a; + --primary-dark: #ff8066; + --black: #e4e2dd; + --accent: #ff6b4a; + --accent-hover: #ff8066; + --accent-glow: rgba(255, 107, 74, 0.2); + --accent-subtle: rgba(255, 107, 74, 0.1); + --shadow-sm: 0 1px 2px rgba(0, 0, 0, 0.2); + --shadow-md: 0 4px 12px rgba(0, 0, 0, 0.3); + --shadow-lg: 0 8px 32px rgba(0, 0, 0, 0.4); +} + +/* ── Dev Banner ── */ +.dev-banner { + background: #dc2626; + color: #ffffff; + text-align: center; + padding: 6px 16px; + font-family: var(--font-display); + font-size: 12px; + font-weight: 700; + letter-spacing: 0.12em; + text-transform: uppercase; + position: sticky; + top: 0; + z-index: 200; +} + +/* ── Base ── */ +body { + font-family: var(--font-body); + background: var(--bg); + color: var(--text); + line-height: 1.6; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; + overflow-x: hidden; +} + +/* Subtle noise texture */ +body::before { + content: ''; + position: fixed; + inset: 0; + background-image: url("data:image/svg+xml,%3Csvg 
viewBox='0 0 256 256' xmlns='http://www.w3.org/2000/svg'%3E%3Cfilter id='n'%3E%3CfeTurbulence type='fractalNoise' baseFrequency='0.9' numOctaves='4' stitchTiles='stitch'/%3E%3C/filter%3E%3Crect width='100%25' height='100%25' filter='url(%23n)' opacity='0.03'/%3E%3C/svg%3E"); + pointer-events: none; + z-index: 0; +} + +.container { + max-width: 1200px; + margin: 0 auto; + padding: 24px; + position: relative; + z-index: 1; +} + +/* ── Header ── */ +header { + border-bottom: 1px solid var(--border); + padding: 0; + margin-bottom: 32px; + position: sticky; + top: 30px; + z-index: 100; + backdrop-filter: blur(16px); + -webkit-backdrop-filter: blur(16px); + background: rgba(245, 243, 240, 0.8); + box-shadow: var(--shadow-sm); + animation: slideDown 0.5s var(--ease-out); +} + +:root[data-theme="dark"] header { + background: rgba(12, 14, 22, 0.8); +} + +@keyframes slideDown { + from { opacity: 0; transform: translateY(-10px); } + to { opacity: 1; transform: translateY(0); } +} + +.header-inner { + display: flex; + justify-content: space-between; + align-items: center; + min-height: 64px; +} + +h1 { + font-family: var(--font-display); + font-size: 22px; + font-weight: 700; + color: var(--text); + letter-spacing: -0.03em; + margin-bottom: 0; +} + +h1::before { + content: ''; + display: inline-block; + width: 4px; + height: 20px; + background: var(--accent); + border-radius: 2px; + margin-right: 12px; + vertical-align: middle; +} + +.subtitle { + font-family: var(--font-body); + font-size: 13px; + color: var(--text-muted); + font-weight: 400; + margin-top: 2px; + letter-spacing: 0.01em; +} + +.header-actions { + display: flex; + gap: 8px; + align-items: center; +} + +.header-actions button { + font-family: var(--font-body); + background: var(--surface-alt); + border: 1px solid var(--border); + color: var(--text-secondary); + padding: 7px 14px; + border-radius: var(--radius-sm); + cursor: pointer; + font-size: 13px; + font-weight: 500; + transition: all 0.2s var(--ease-out); 
+} + +.header-actions button:hover { + border-color: var(--accent); + color: var(--accent); + background: var(--accent-subtle); +} + +.user-info { + color: var(--text-muted); + font-size: 13px; + font-weight: 500; +} + +/* ── Cards ── */ +.card { + background: var(--surface); + border-radius: var(--radius-lg); + padding: 28px; + margin-bottom: 20px; + border: 1px solid var(--border-subtle); + box-shadow: var(--shadow-sm); + animation: fadeUp 0.5s var(--ease-out) backwards; +} + +.card:nth-child(1) { animation-delay: 0.05s; } +.card:nth-child(2) { animation-delay: 0.1s; } +.card:nth-child(3) { animation-delay: 0.15s; } +.card:nth-child(4) { animation-delay: 0.2s; } + +@keyframes fadeUp { + from { opacity: 0; transform: translateY(16px); } + to { opacity: 1; transform: translateY(0); } +} + +.card h2 { + font-family: var(--font-display); + font-size: 18px; + font-weight: 600; + margin-bottom: 20px; + color: var(--text); + letter-spacing: -0.02em; +} + +/* ── Upload Area ── */ +.upload-area { + border: 2px dashed var(--border); + border-radius: var(--radius-lg); + padding: 64px 40px; + text-align: center; + transition: all 0.3s var(--ease-out); + cursor: pointer; + position: relative; + overflow: hidden; + background: var(--surface-alt); +} + +.upload-area::after { + content: ''; + position: absolute; + inset: 0; + background: radial-gradient(circle at center, var(--accent-glow) 0%, transparent 70%); + opacity: 0; + transition: opacity 0.4s; +} + +.upload-area:hover { + border-color: var(--accent); + box-shadow: var(--shadow-glow); +} + +.upload-area:hover::after { + opacity: 1; +} + +.upload-area.dragover { + border-color: var(--accent); + background: var(--accent-subtle); + box-shadow: var(--shadow-glow); + transform: scale(1.01); +} + +.upload-area.dragover::after { + opacity: 1; +} + +.upload-area input[type="file"] { + display: none; +} + +.upload-icon { + font-size: 48px; + margin-bottom: 16px; + position: relative; + z-index: 1; + filter: grayscale(0.2); + 
color: var(--text); +} + +:root[data-theme="dark"] .upload-icon { + color: var(--accent); +} + +.upload-text { + font-family: var(--font-display); + font-size: 16px; + font-weight: 500; + margin-bottom: 8px; + color: var(--text); + position: relative; + z-index: 1; +} + +.upload-hint { + font-size: 13px; + color: var(--text-muted); + position: relative; + z-index: 1; +} + +/* ── Buttons ── */ +.btn { + font-family: var(--font-display); + display: inline-flex; + align-items: center; + gap: 8px; + padding: 10px 20px; + border: none; + border-radius: var(--radius-sm); + font-size: 14px; + font-weight: 600; + cursor: pointer; + transition: all 0.2s var(--ease-out); + text-decoration: none; + letter-spacing: -0.01em; +} + +.btn-primary { + background: var(--accent); + color: #ffffff; + border: none; +} + +.btn-primary:hover { + background: var(--accent-hover); + box-shadow: 0 4px 16px var(--accent-glow); + transform: translateY(-1px); +} + +.btn-secondary { + background: var(--surface-alt); + color: var(--text); + border: 1px solid var(--border); +} + +.btn-secondary:hover { + border-color: var(--accent); + color: var(--accent); + background: var(--accent-subtle); +} + +.btn:disabled { + opacity: 0.4; + cursor: not-allowed; + transform: none !important; + box-shadow: none !important; +} + +/* ── Progress ── */ +.progress-container { + display: none; + padding: 24px; + background: var(--surface-alt); + border-radius: var(--radius-md); + margin-top: 24px; + border: 1px solid var(--border-subtle); + animation: fadeUp 0.4s var(--ease-out); +} + +.progress-header { + display: flex; + justify-content: space-between; + align-items: baseline; + margin-bottom: 12px; +} + +.progress-text { + font-family: var(--font-display); + font-size: 14px; + font-weight: 600; + color: var(--text); +} + +.progress-percent { + font-family: var(--font-display); + font-size: 24px; + font-weight: 700; + color: var(--accent); + letter-spacing: -0.03em; +} + +.progress-bar { + height: 6px; + 
background: var(--bg-subtle); + border-radius: 3px; + overflow: hidden; + margin-bottom: 20px; + position: relative; +} + +.progress-fill { + height: 100%; + background: linear-gradient(90deg, var(--accent) 0%, #ff8f66 100%); + transition: width 0.4s var(--ease-out); + border-radius: 3px; + position: relative; +} + +.progress-fill::after { + content: ''; + position: absolute; + right: 0; + top: -2px; + width: 10px; + height: 10px; + border-radius: 50%; + background: var(--accent); + box-shadow: 0 0 12px var(--accent-glow); + animation: pulse-dot 1.5s ease-in-out infinite; +} + +@keyframes pulse-dot { + 0%, 100% { transform: scale(1); opacity: 1; } + 50% { transform: scale(1.4); opacity: 0.6; } +} + +/* Processing log */ +.progress-log { + background: var(--log-bg); + border: 1px solid var(--border); + border-radius: var(--radius-md); + overflow: hidden; +} + +.log-header { + background: var(--text); + color: var(--bg); + padding: 10px 16px; + font-family: var(--font-display); + font-weight: 600; + font-size: 11px; + text-transform: uppercase; + letter-spacing: 0.1em; +} + +:root[data-theme="dark"] .log-header { + background: #252840; + color: var(--text); +} + +.log-content { + padding: 12px; + max-height: 240px; + overflow-y: auto; + font-size: 12px; + line-height: 1.6; +} + +.log-content::-webkit-scrollbar { + width: 4px; +} + +.log-content::-webkit-scrollbar-thumb { + background: var(--border); + border-radius: 2px; +} + +.log-entry { + padding: 6px 10px; + margin-bottom: 4px; + border-radius: var(--radius-sm); + background: var(--surface-alt); + border-left: 3px solid var(--border); + font-family: var(--font-body); + animation: logSlide 0.3s var(--ease-out); +} + +.log-entry.success { background: var(--success-bg); border-left-color: var(--success); color: #065f46; } +.log-entry.warning { background: var(--warning-bg); border-left-color: var(--warning); color: #92400e; } +.log-entry.error { background: var(--error-bg); border-left-color: var(--error); color: 
#991b1b; } +.log-entry.info { background: var(--info-bg); border-left-color: var(--info); color: #1e40af; } + +:root[data-theme="dark"] .log-entry.success { color: #6ee7b7; } +:root[data-theme="dark"] .log-entry.warning { color: #fcd34d; } +:root[data-theme="dark"] .log-entry.error { color: #fca5a5; } +:root[data-theme="dark"] .log-entry.info { color: #93c5fd; } + +@keyframes logSlide { + from { opacity: 0; transform: translateX(-8px); } + to { opacity: 1; transform: translateX(0); } +} + +/* ── Results ── */ +.results { display: none; } + +.score-display { + display: inline-flex; + align-items: center; + gap: 20px; + padding: 20px 32px; + background: var(--text); + border-radius: var(--radius-md); + color: #ffffff; + margin-bottom: 24px; + position: relative; + overflow: hidden; + animation: scoreReveal 0.6s var(--ease-out) backwards; + animation-delay: 0.2s; + border: none; +} + +.score-display::before { + content: ''; + position: absolute; + left: 0; + top: 0; + bottom: 0; + width: 4px; + background: var(--accent); +} + +.score-display::after { + content: ''; + position: absolute; + top: 0; + right: 0; + width: 120px; + height: 100%; + background: linear-gradient(90deg, transparent, var(--accent-glow)); + opacity: 0.5; +} + +@keyframes scoreReveal { + from { opacity: 0; transform: scale(0.95); } + to { opacity: 1; transform: scale(1); } +} + +.score-number { + font-family: var(--font-display); + font-size: 48px; + font-weight: 800; + line-height: 1; + letter-spacing: -0.04em; + position: relative; + z-index: 1; +} + +.score-label { + font-family: var(--font-display); + font-size: 12px; + font-weight: 500; + opacity: 0.7; + text-align: left; + text-transform: uppercase; + letter-spacing: 0.06em; + position: relative; + z-index: 1; +} + +/* Stats grid */ +.stats-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(120px, 1fr)); + gap: 10px; + margin-bottom: 16px; +} + +.stat-card { + padding: 16px; + border-radius: var(--radius-md); + 
text-align: center; + transition: transform 0.2s var(--ease-out), box-shadow 0.2s; + animation: fadeUp 0.4s var(--ease-out) backwards; +} + +.stat-card:nth-child(1) { animation-delay: 0.3s; } +.stat-card:nth-child(2) { animation-delay: 0.35s; } +.stat-card:nth-child(3) { animation-delay: 0.4s; } +.stat-card:nth-child(4) { animation-delay: 0.45s; } +.stat-card:nth-child(5) { animation-delay: 0.5s; } + +.stat-card:hover { + transform: translateY(-2px); + box-shadow: var(--shadow-md); +} + +.stat-card.critical { background: var(--critical-bg); border: 1px solid rgba(220, 38, 38, 0.15); } +.stat-card.error { background: var(--error-bg); border: 1px solid rgba(239, 68, 68, 0.15); } +.stat-card.warning { background: var(--warning-bg); border: 1px solid rgba(217, 119, 6, 0.15); } +.stat-card.info { background: var(--info-bg); border: 1px solid rgba(37, 99, 235, 0.15); } +.stat-card.success { background: var(--success-bg); border: 1px solid rgba(5, 150, 105, 0.15); } + +.stat-number { + font-family: var(--font-display); + font-size: 32px; + font-weight: 700; + margin-bottom: 4px; + letter-spacing: -0.03em; +} + +.stat-label { + font-family: var(--font-display); + font-size: 11px; + text-transform: uppercase; + letter-spacing: 0.08em; + font-weight: 600; + color: var(--text-secondary); +} + +/* ── Issues ── */ +.issues-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(340px, 1fr)); + gap: 10px; +} + +.issue { + padding: 14px 16px; + margin-bottom: 0; + border-radius: var(--radius-md); + border-left: 3px solid; + transition: transform 0.15s var(--ease-out), box-shadow 0.15s; +} + +.issue:hover { + transform: translateX(2px); + box-shadow: var(--shadow-sm); +} + +.issue.CRITICAL { background: var(--critical-bg); border-left-color: var(--critical); } +.issue.ERROR { background: var(--error-bg); border-left-color: var(--error); } +.issue.WARNING { background: var(--warning-bg); border-left-color: var(--warning); } +.issue.INFO { background: 
var(--info-bg); border-left-color: var(--info); } +.issue.SUCCESS { background: var(--success-bg); border-left-color: var(--success); } + +.issue-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 8px; +} + +.issue-category { + display: flex; + align-items: center; + gap: 6px; + font-family: var(--font-display); + font-size: 13px; + font-weight: 600; + color: var(--text); +} + +.issue-badge { + display: inline-flex; + align-items: center; + gap: 4px; + padding: 3px 8px; + border-radius: 4px; + font-family: var(--font-display); + font-size: 10px; + font-weight: 700; + text-transform: uppercase; + letter-spacing: 0.04em; +} + +.issue-badge.CRITICAL { background: var(--critical); color: white; } +.issue-badge.ERROR { background: var(--error); color: white; } +.issue-badge.WARNING { background: var(--warning); color: white; } +.issue-badge.INFO { background: var(--info); color: white; } +.issue-badge.SUCCESS { background: var(--success); color: white; } + +.issue-description { + color: var(--text); + margin-bottom: 6px; + line-height: 1.5; + font-size: 13px; +} + +.issue-meta { + display: flex; + gap: 12px; + font-size: 12px; + color: var(--text-muted); + margin-bottom: 6px; + font-weight: 500; +} + +.issue-recommendation { + background: var(--success-bg); + padding: 10px 12px; + border-radius: var(--radius-sm); + border-left: 2px solid var(--success); + font-size: 12px; + color: var(--text); + margin-top: 8px; + line-height: 1.5; +} + +.issue-recommendation strong { + color: var(--success); + font-weight: 600; +} + +/* ── Filters ── */ +.filters { + display: flex; + gap: 6px; + margin-bottom: 20px; + flex-wrap: wrap; +} + +.filter-btn { + font-family: var(--font-display); + padding: 7px 16px; + border: 1px solid var(--border); + border-radius: var(--radius-sm); + background: var(--surface); + cursor: pointer; + font-size: 13px; + font-weight: 600; + transition: all 0.2s var(--ease-out); + color: var(--text-secondary); 
+} + +.filter-btn.active { + background: var(--accent); + color: #ffffff; + border-color: var(--accent); +} + +.filter-btn:hover:not(.active) { + border-color: var(--accent); + color: var(--accent); +} + +/* ── Loading Spinner ── */ +.loading { + display: inline-block; + width: 18px; + height: 18px; + border: 2px solid rgba(255, 255, 255, 0.3); + border-radius: 50%; + border-top-color: white; + animation: spin 0.8s linear infinite; +} + +@keyframes spin { to { transform: rotate(360deg); } } + +/* ── Config / Form ── */ +.api-config { + margin-top: 24px; + padding: 20px; + background: var(--surface-alt); + border-radius: var(--radius-md); + border: 1px solid var(--border-subtle); +} + +.form-group { margin-bottom: 16px; } + +.form-group label { + display: block; + margin-bottom: 6px; + font-family: var(--font-display); + font-weight: 600; + font-size: 13px; + color: var(--text); +} + +.form-group input { + width: 100%; + padding: 10px 14px; + border: 1px solid var(--border); + border-radius: var(--radius-sm); + font-family: var(--font-body); + font-size: 14px; + background: var(--surface); + color: var(--text); + transition: border-color 0.2s, box-shadow 0.2s; +} + +.form-group input:focus { + outline: none; + border-color: var(--accent); + box-shadow: 0 0 0 3px var(--accent-glow); +} + +.help-text { + font-size: 12px; + color: var(--text-muted); + margin-top: 6px; + line-height: 1.5; +} + +/* ── Auth Overlay ── */ +.auth-overlay { + display: none; + position: fixed; + inset: 0; + background: rgba(12, 14, 22, 0.75); + backdrop-filter: blur(8px); + -webkit-backdrop-filter: blur(8px); + z-index: 1000; + justify-content: center; + align-items: center; +} + +.auth-overlay.active { + display: flex; +} + +.auth-card { + background: var(--surface); + border-radius: var(--radius-xl); + padding: 48px; + text-align: center; + max-width: 420px; + width: 90%; + box-shadow: var(--shadow-lg); + border: 1px solid var(--border-subtle); + animation: scaleIn 0.4s var(--ease-spring); 
+} + +@keyframes scaleIn { + from { opacity: 0; transform: scale(0.92); } + to { opacity: 1; transform: scale(1); } +} + +.auth-card h2 { + font-family: var(--font-display); + color: var(--text); + margin-bottom: 8px; + font-size: 22px; +} + +.auth-card p { + color: var(--text-muted); + margin-bottom: 28px; + font-size: 14px; +} + +.btn-microsoft { + background: var(--text); + color: var(--bg); + border: none; + padding: 14px 28px; + border-radius: var(--radius-sm); + font-family: var(--font-display); + font-size: 15px; + font-weight: 600; + cursor: pointer; + display: inline-flex; + align-items: center; + gap: 12px; + transition: all 0.2s var(--ease-out); +} + +.btn-microsoft:hover { + transform: translateY(-1px); + box-shadow: var(--shadow-md); +} + +:root[data-theme="dark"] .btn-microsoft { + background: #ffffff; + color: #1a1a2e; +} + +/* ── Upload Mode Tabs ── */ +.upload-mode-tabs { + display: flex; + gap: 0; + margin-bottom: 24px; + border-bottom: 1px solid var(--border); +} + +.upload-tab { + font-family: var(--font-display); + padding: 10px 20px; + border: none; + background: none; + font-size: 13px; + font-weight: 600; + color: var(--text-muted); + cursor: pointer; + border-bottom: 2px solid transparent; + margin-bottom: -1px; + transition: color 0.2s, border-color 0.2s; + letter-spacing: -0.01em; +} + +.upload-tab:hover { + color: var(--text); +} + +.upload-tab.active { + color: var(--accent); + border-bottom-color: var(--accent); +} + +/* ── Responsive ── */ +@media (max-width: 768px) { + .container { padding: 12px; } + h1 { font-size: 18px; } + h1::before { height: 16px; margin-right: 8px; } + .card { padding: 20px; border-radius: var(--radius-md); } + .stats-grid { grid-template-columns: 1fr 1fr; } + .issues-grid { grid-template-columns: 1fr; } + .header-inner { flex-direction: column; gap: 10px; align-items: flex-start; } + .upload-area { padding: 40px 20px; } + .score-display { padding: 16px 20px; gap: 14px; } + .score-number { font-size: 36px; } + 
+ .page-viewer-layout { + flex-direction: column !important; + } + + .page-selector-wrap { + flex-shrink: unset !important; + min-width: unset !important; + } + + #pageSelector { + flex-direction: row !important; + overflow-x: auto; + } +} + +/* ── Utility ── */ +.hidden { display: none !important; } + +/* ── Selection & Focus ── */ +::selection { + background: var(--accent); + color: white; +} + +:focus-visible { + outline: 2px solid var(--accent); + outline-offset: 2px; +} + +/* ── Custom scrollbar ── */ +::-webkit-scrollbar { + width: 6px; + height: 6px; +} + +::-webkit-scrollbar-track { + background: transparent; +} + +::-webkit-scrollbar-thumb { + background: var(--border); + border-radius: 3px; +} + +::-webkit-scrollbar-thumb:hover { + background: var(--text-muted); +} diff --git a/db/init.sql b/db/init.sql new file mode 100644 index 0000000..e87d104 --- /dev/null +++ b/db/init.sql @@ -0,0 +1,36 @@ +-- PDF Accessibility Checker - PostgreSQL Schema +-- Run automatically on first Docker Compose startup + +CREATE TABLE IF NOT EXISTS jobs ( + id SERIAL PRIMARY KEY, + job_id VARCHAR(64) UNIQUE NOT NULL, + filename VARCHAR(255), + status VARCHAR(20) DEFAULT 'queued', + score INTEGER, + grade CHAR(1), + total_issues INTEGER, + critical_count INTEGER, + error_count INTEGER, + warning_count INTEGER, + result_json JSONB, + created_at TIMESTAMP DEFAULT NOW(), + completed_at TIMESTAMP, + processing_time FLOAT, + api_key_hash VARCHAR(64), + ip_address INET +); + +CREATE TABLE IF NOT EXISTS audit_log ( + id SERIAL PRIMARY KEY, + job_id VARCHAR(64), + action VARCHAR(50), + details JSONB, + created_at TIMESTAMP DEFAULT NOW(), + ip_address INET +); + +CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status); +CREATE INDEX IF NOT EXISTS idx_jobs_created ON jobs(created_at); +CREATE INDEX IF NOT EXISTS idx_jobs_job_id ON jobs(job_id); +CREATE INDEX IF NOT EXISTS idx_audit_job ON audit_log(job_id); +CREATE INDEX IF NOT EXISTS idx_audit_created ON audit_log(created_at); diff 
--git a/db_manager.py b/db_manager.py new file mode 100644 index 0000000..788c3fd --- /dev/null +++ b/db_manager.py @@ -0,0 +1,146 @@ +""" +PostgreSQL Database Manager — CRUD for jobs and audit logging +""" + +import json +import os +import hashlib +import time +import psycopg2 +from psycopg2.extras import RealDictCursor +from contextlib import contextmanager + +DB_HOST = os.getenv('DB_HOST', 'localhost') +DB_PORT = int(os.getenv('DB_PORT', 5432)) +DB_NAME = os.getenv('DB_NAME', 'pdf_checker') +DB_USER = os.getenv('DB_USER', 'pdf_checker') +DB_PASSWORD = os.getenv('DB_PASSWORD', 'dev_password') + + +@contextmanager +def get_conn(): + """Get a database connection (context manager).""" + conn = psycopg2.connect( + host=DB_HOST, + port=DB_PORT, + dbname=DB_NAME, + user=DB_USER, + password=DB_PASSWORD + ) + try: + yield conn + conn.commit() + except Exception: + conn.rollback() + raise + finally: + conn.close() + + +def create_job(job_id: str, filename: str, ip: str = None, api_key: str = None): + """Create a new job record.""" + key_hash = hashlib.sha256(api_key.encode()).hexdigest()[:16] if api_key else None + with get_conn() as conn: + with conn.cursor() as cur: + cur.execute( + """INSERT INTO jobs (job_id, filename, status, api_key_hash, ip_address) + VALUES (%s, %s, 'queued', %s, %s)""", + (job_id, filename, key_hash, ip) + ) + + +def update_job_status(job_id: str, status: str, result_json: dict = None, + score: int = None, grade: str = None, + total_issues: int = None, critical_count: int = None, + error_count: int = None, warning_count: int = None, + processing_time: float = None): + """Update job status and optionally store results.""" + with get_conn() as conn: + with conn.cursor() as cur: + fields = ["status = %s"] + values = [status] + + if result_json is not None: + fields.append("result_json = %s") + values.append(json.dumps(result_json)) + if score is not None: + fields.append("score = %s") + values.append(score) + if grade is not None: +
fields.append("grade = %s") + values.append(grade) + if total_issues is not None: + fields.append("total_issues = %s") + values.append(total_issues) + if critical_count is not None: + fields.append("critical_count = %s") + values.append(critical_count) + if error_count is not None: + fields.append("error_count = %s") + values.append(error_count) + if warning_count is not None: + fields.append("warning_count = %s") + values.append(warning_count) + if processing_time is not None: + fields.append("processing_time = %s") + values.append(processing_time) + if status == 'completed': + fields.append("completed_at = NOW()") + + values.append(job_id) + cur.execute( + f"UPDATE jobs SET {', '.join(fields)} WHERE job_id = %s", + values + ) + + +def get_job(job_id: str) -> dict: + """Get a job by ID.""" + with get_conn() as conn: + with conn.cursor(cursor_factory=RealDictCursor) as cur: + cur.execute("SELECT * FROM jobs WHERE job_id = %s", (job_id,)) + row = cur.fetchone() + return dict(row) if row else None + + +def list_jobs(limit: int = 50, offset: int = 0, status_filter: str = None) -> list: + """List jobs with optional filtering.""" + with get_conn() as conn: + with conn.cursor(cursor_factory=RealDictCursor) as cur: + query = "SELECT job_id, filename, status, score, grade, total_issues, created_at, completed_at, processing_time FROM jobs" + values = [] + if status_filter: + query += " WHERE status = %s" + values.append(status_filter) + query += " ORDER BY created_at DESC LIMIT %s OFFSET %s" + values.extend([limit, offset]) + cur.execute(query, values) + return [dict(row) for row in cur.fetchall()] + + +def log_audit(job_id: str, action: str, details: dict = None, ip: str = None): + """Log an audit event.""" + with get_conn() as conn: + with conn.cursor() as cur: + cur.execute( + """INSERT INTO audit_log (job_id, action, details, ip_address) + VALUES (%s, %s, %s, %s)""", + (job_id, action, json.dumps(details or {}), ip) + ) + + +def get_stats() -> dict: + """Get aggregate 
statistics.""" + with get_conn() as conn: + with conn.cursor(cursor_factory=RealDictCursor) as cur: + cur.execute(""" + SELECT + COUNT(*) as total_jobs, + COUNT(*) FILTER (WHERE status = 'completed') as completed_jobs, + COUNT(*) FILTER (WHERE status = 'failed') as failed_jobs, + COUNT(*) FILTER (WHERE status = 'processing') as active_jobs, + ROUND(AVG(score) FILTER (WHERE score IS NOT NULL)) as avg_score, + ROUND(AVG(processing_time) FILTER (WHERE processing_time IS NOT NULL)::numeric, 2) as avg_processing_time + FROM jobs + """) + return dict(cur.fetchone()) diff --git a/deploy.sh b/deploy.sh new file mode 100755 index 0000000..8ffc38e --- /dev/null +++ b/deploy.sh @@ -0,0 +1,217 @@ +#!/usr/bin/env bash +# +# deploy.sh — Idempotent deployment script for PDF Accessibility Checker +# +# Usage: +# cd /opt/pdf-accessibility && ./deploy.sh +# +# Architecture: +# - Apache (host) serves frontend + api.php from /var/www/html/pdf-accessibility +# - Docker Compose runs: worker (Python), Redis, PostgreSQL +# - Redis/PostgreSQL exposed on localhost for api.php access +# +set -euo pipefail + +# ── Configuration ───────────────────────────────────────────────── + +REPO_DIR="$(cd "$(dirname "$0")" && pwd)" +WEB_DIR="/var/www/html/pdf-accessibility" +COMPOSE_FILE="docker-compose.prod.yml" +ENV_FILE="${REPO_DIR}/.env" +MIN_PHP_VERSION="8.0" + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +log() { echo -e "${GREEN}[DEPLOY]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +err() { echo -e "${RED}[ERROR]${NC} $*"; } + +# ── Preflight Checks ───────────────────────────────────────────── + +log "Starting deployment from ${REPO_DIR}" + +# Check Docker +if ! command -v docker &>/dev/null; then + err "Docker is not installed. Install it first:" + err " curl -fsSL https://get.docker.com | sh" + err " sudo usermod -aG docker \$USER" + exit 1 +fi + +# Check Docker Compose (v2 plugin) +if !
docker compose version &>/dev/null; then + err "Docker Compose v2 is not available. Install it:" + err " sudo apt-get install docker-compose-plugin" + exit 1 +fi + +# Check PHP +if ! command -v php &>/dev/null; then + warn "PHP is not installed. api.php requires PHP ${MIN_PHP_VERSION}+ with extensions:" + warn " sudo apt-get install php8.2 php8.2-redis php8.2-pgsql php8.2-curl php8.2-mbstring" +else + PHP_VER=$(php -r 'echo PHP_MAJOR_VERSION . "." . PHP_MINOR_VERSION;') + log "PHP version: ${PHP_VER}" + + # Check required extensions + MISSING_EXT="" + php -m | grep -qi redis || MISSING_EXT="${MISSING_EXT} php-redis" + php -m | grep -qi pgsql || MISSING_EXT="${MISSING_EXT} php-pgsql" + php -m | grep -qi curl || MISSING_EXT="${MISSING_EXT} php-curl" + + if [ -n "${MISSING_EXT}" ]; then + warn "Missing PHP extensions:${MISSING_EXT}" + warn "Install with: sudo apt-get install${MISSING_EXT}" + fi +fi + +# ── Pull Latest Code ───────────────────────────────────────────── + +log "Pulling latest code..." +cd "${REPO_DIR}" + +if [ -d .git ]; then + git fetch --all + git reset --hard origin/$(git rev-parse --abbrev-ref HEAD) + log "Code updated to $(git log --oneline -1)" +else + warn "Not a git repo β€” using existing files" +fi + +# ── Environment File ───────────────────────────────────────────── + +if [ ! -f "${ENV_FILE}" ]; then + log "Creating .env from .env.example (first run)..." 
+ cp "${REPO_DIR}/.env.example" "${ENV_FILE}" + + # Override Docker hostnames with localhost for host-side PHP + # (Worker uses Docker internal names via docker-compose.prod.yml) + sed -i 's/^DB_HOST=postgres/DB_HOST=127.0.0.1/' "${ENV_FILE}" + sed -i 's/^REDIS_HOST=redis/REDIS_HOST=127.0.0.1/' "${ENV_FILE}" + sed -i 's/^DEV_MODE=true/DEV_MODE=false/' "${ENV_FILE}" + + warn "Review and update ${ENV_FILE} with production values:" + warn " - DB_PASSWORD (change from default!)" + warn " - ANTHROPIC_API_KEY" + warn " - GOOGLE_API_KEY" + warn " - AZURE_* settings" +else + log "Using existing .env file" +fi + +# ── Build Docker Containers ────────────────────────────────────── + +log "Building Docker containers (using cache)..." +docker compose -f "${COMPOSE_FILE}" build + +log "Starting/restarting Docker services..." +docker compose -f "${COMPOSE_FILE}" up -d --remove-orphans + +# Wait for PostgreSQL to be ready +log "Waiting for PostgreSQL to be healthy..." +RETRIES=30 +until docker compose -f "${COMPOSE_FILE}" exec -T postgres pg_isready -U pdf_checker &>/dev/null || [ $RETRIES -eq 0 ]; do + sleep 1 + RETRIES=$((RETRIES - 1)) +done + +if [ $RETRIES -eq 0 ]; then + err "PostgreSQL failed to start. Check logs:" + err " docker compose -f ${COMPOSE_FILE} logs postgres" + exit 1 +fi + +log "PostgreSQL is ready" + +# Database init.sql runs automatically on first compose up via +# /docker-entrypoint-initdb.d/init.sql — no migration tool needed.
+# For future migrations, add numbered SQL files to db/ and apply: +if [ -d "${REPO_DIR}/db/migrations" ]; then + for migration in "${REPO_DIR}"/db/migrations/*.sql; do + [ -f "$migration" ] || continue + MIGRATION_NAME=$(basename "$migration") + log "Applying migration: ${MIGRATION_NAME}" + docker compose -f "${COMPOSE_FILE}" exec -T postgres \ + psql -U pdf_checker -d pdf_checker -f "/dev/stdin" < "$migration" 2>/dev/null || \ + warn "Migration ${MIGRATION_NAME} may have already been applied" + done +fi + +# ── Deploy Frontend Files ───────────────────────────────────────── + +log "Deploying frontend to ${WEB_DIR}..." + +# Create web directory if it doesn't exist +sudo mkdir -p "${WEB_DIR}" + +# Clean old frontend files (but preserve uploads, results, .env, logs) +log "Cleaning old frontend files..." +sudo rm -f "${WEB_DIR}/index.html" +sudo rm -rf "${WEB_DIR}/css" "${WEB_DIR}/js" +sudo rm -f "${WEB_DIR}/api.php" "${WEB_DIR}/auth.php" + +# Copy frontend files +sudo cp "${REPO_DIR}/index.html" "${WEB_DIR}/" +sudo cp -r "${REPO_DIR}/css" "${WEB_DIR}/" +sudo cp -r "${REPO_DIR}/js" "${WEB_DIR}/" + +# Copy PHP backend files +sudo cp "${REPO_DIR}/api.php" "${WEB_DIR}/" +sudo cp "${REPO_DIR}/auth.php" "${WEB_DIR}/" + +# Copy Python scripts (needed if api.php fallback exec() is used) +sudo cp "${REPO_DIR}/enterprise_pdf_checker.py" "${WEB_DIR}/" +sudo cp "${REPO_DIR}/pdf_remediation.py" "${WEB_DIR}/" +sudo cp "${REPO_DIR}/logger_config.py" "${WEB_DIR}/" +sudo cp "${REPO_DIR}/retry_helper.py" "${WEB_DIR}/" + +# Copy .env for PHP (if not already there) +if [ ! 
-f "${WEB_DIR}/.env" ]; then + sudo cp "${ENV_FILE}" "${WEB_DIR}/.env" + log "Copied .env to web directory" +else + # Update .env in web dir from repo .env + sudo cp "${ENV_FILE}" "${WEB_DIR}/.env" +fi + +# Create runtime directories +sudo mkdir -p "${WEB_DIR}/uploads" "${WEB_DIR}/results" "${WEB_DIR}/logs" + +# Set ownership for Apache +sudo chown -R www-data:www-data "${WEB_DIR}" +sudo chmod -R 755 "${WEB_DIR}" +sudo chmod -R 775 "${WEB_DIR}/uploads" "${WEB_DIR}/results" "${WEB_DIR}/logs" + +# ── Verify ──────────────────────────────────────────────────────── + +log "" +log "=============================================" +log " Deployment complete!" +log "=============================================" +log "" +log "Services status:" +docker compose -f "${COMPOSE_FILE}" ps --format "table {{.Name}}\t{{.Status}}\t{{.Ports}}" +log "" +log "Frontend: ${WEB_DIR}" +log "Docker: worker + Redis (127.0.0.1:6379) + PostgreSQL (127.0.0.1:5432)" +log "" + +# Quick health check +if curl -sf http://127.0.0.1:6379 &>/dev/null || redis-cli -h 127.0.0.1 ping &>/dev/null 2>&1; then + log "Redis: OK" +fi + +if docker compose -f "${COMPOSE_FILE}" exec -T postgres pg_isready -U pdf_checker &>/dev/null; then + log "PostgreSQL: OK" +fi + +log "" +log "Next steps:" +log " 1. Configure Apache vhost for https://ai-sandbox.oliver.solutions/pdf-accessibility" +log " 2. Review ${WEB_DIR}/.env (especially DB_PASSWORD and API keys)" +log " 3. Restart Apache: sudo systemctl reload apache2" +log "" diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml new file mode 100644 index 0000000..2faed34 --- /dev/null +++ b/docker-compose.prod.yml @@ -0,0 +1,66 @@ +# Production Docker Compose — worker + Redis + PostgreSQL only +# Apache on host serves PHP + frontend files natively +# Redis/PostgreSQL ports exposed to localhost for api.php access + +services: + worker: + build: + context: .
+ dockerfile: Dockerfile.worker + volumes: + - ${WEB_DIR:-/var/www/html/pdf-accessibility}/uploads:/app/uploads + - ${WEB_DIR:-/var/www/html/pdf-accessibility}/results:/app/results + - ./logs:/app/logs + depends_on: + redis: + condition: service_healthy + postgres: + condition: service_healthy + environment: + - REDIS_HOST=redis + - REDIS_PORT=6379 + - DB_HOST=postgres + - DB_PORT=5432 + - DB_NAME=${DB_NAME:-pdf_checker} + - DB_USER=${DB_USER:-pdf_checker} + - DB_PASSWORD=${DB_PASSWORD:-dev_password} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} + - GOOGLE_API_KEY=${GOOGLE_API_KEY:-} + deploy: + replicas: ${WORKER_COUNT:-2} + restart: unless-stopped + + redis: + image: redis:7-alpine + ports: + - "127.0.0.1:6379:6379" + volumes: + - redis-data:/data + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 3s + retries: 3 + restart: unless-stopped + + postgres: + image: postgres:16-alpine + ports: + - "127.0.0.1:5432:5432" + volumes: + - pg-data:/var/lib/postgresql/data + - ./db/init.sql:/docker-entrypoint-initdb.d/init.sql + environment: + POSTGRES_DB: ${DB_NAME:-pdf_checker} + POSTGRES_USER: ${DB_USER:-pdf_checker} + POSTGRES_PASSWORD: ${DB_PASSWORD:-dev_password} + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${DB_USER:-pdf_checker}"] + interval: 10s + timeout: 3s + retries: 3 + restart: unless-stopped + +volumes: + redis-data: + pg-data: diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..5700171 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,69 @@ +services: + web: + build: + context: . + dockerfile: Dockerfile.web + ports: + - "8000:80" + volumes: + - pdf-uploads:/app/uploads + - pdf-results:/app/results + depends_on: + redis: + condition: service_healthy + postgres: + condition: service_healthy + env_file: .env + restart: unless-stopped + + worker: + build: + context: . 
+ dockerfile: Dockerfile.worker + volumes: + - pdf-uploads:/app/uploads + - pdf-results:/app/results + - pdf-logs:/app/logs + depends_on: + redis: + condition: service_healthy + postgres: + condition: service_healthy + env_file: .env + deploy: + replicas: ${WORKER_COUNT:-2} + restart: unless-stopped + + redis: + image: redis:7-alpine + volumes: + - redis-data:/data + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 3s + retries: 3 + restart: unless-stopped + + postgres: + image: postgres:16-alpine + volumes: + - pg-data:/var/lib/postgresql/data + - ./db/init.sql:/docker-entrypoint-initdb.d/init.sql + environment: + POSTGRES_DB: ${DB_NAME:-pdf_checker} + POSTGRES_USER: ${DB_USER:-pdf_checker} + POSTGRES_PASSWORD: ${DB_PASSWORD:-dev_password} + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${DB_USER:-pdf_checker}"] + interval: 10s + timeout: 3s + retries: 3 + restart: unless-stopped + +volumes: + pdf-uploads: + pdf-results: + pdf-logs: + redis-data: + pg-data: diff --git a/docker-entrypoint-web.sh b/docker-entrypoint-web.sh new file mode 100644 index 0000000..f776ac2 --- /dev/null +++ b/docker-entrypoint-web.sh @@ -0,0 +1,12 @@ +#!/bin/sh +set -e + +# Allow PHP-FPM to inherit environment variables (needed for getenv() in PHP) +# By default PHP-FPM clears the environment; this disables that behavior +echo 'clear_env = no' >> /usr/local/etc/php-fpm.d/www.conf + +# Start PHP-FPM in background +php-fpm -D + +# Start Nginx in foreground +nginx -g 'daemon off;' diff --git a/enterprise_pdf_checker.py b/enterprise_pdf_checker.py index de7992b..1b3e86d 100644 --- a/enterprise_pdf_checker.py +++ b/enterprise_pdf_checker.py @@ -162,7 +162,7 @@ class CacheManager: try: with open(cache_file, 'r') as f: return json.load(f) - except: + except (json.JSONDecodeError, IOError, OSError): return None return None @@ -209,25 +209,26 @@ class ColorContrastChecker: """Sample image for contrast issues""" if image.mode != 'RGB': image = image.convert('RGB') - + 
width, height = image.size samples = [] - + rng = np.random.default_rng(seed=42) + for _ in range(min(sample_size, width * height // 100)): - x = np.random.randint(0, max(1, width - 2)) - y = np.random.randint(0, max(1, height - 1)) - + x = rng.integers(0, max(1, width - 2)) + y = rng.integers(0, max(1, height - 1)) + try: color1 = image.getpixel((x, y)) color2 = image.getpixel((min(x + 1, width - 1), y)) - + ratio = ColorContrastChecker.calculate_contrast_ratio(color1, color2) samples.append({ 'ratio': ratio, 'colors': (color1, color2), 'position': (x, y) }) - except: + except (IndexError, TypeError, ValueError): continue if not samples: @@ -324,9 +325,9 @@ class ReadabilityAnalyzer: class EnterprisePDFChecker: """Enterprise-grade PDF accessibility checker""" - def __init__(self, pdf_path: str, config: Dict[str, Any], quick_mode: bool = False, generate_images: bool = True): + def __init__(self, pdf_path: str, config: Dict[str, Any] = None, quick_mode: bool = False, generate_images: bool = True): self.pdf_path = Path(pdf_path) - self.config = config + self.config = config or {} self.quick_mode = quick_mode self.generate_images = generate_images self.issues: List[AccessibilityIssue] = [] @@ -344,6 +345,7 @@ class EnterprisePDFChecker: self.api_timeout = 10.0 # 10 second timeout for API calls # Initialize API clients + config = self.config google_creds_path = config.get('google_credentials_path') if google_creds_path and os.path.isfile(google_creds_path): # Valid credentials file exists @@ -351,27 +353,27 @@ class EnterprisePDFChecker: if vision: try: self.vision_client = vision.ImageAnnotatorClient() - print(f" βœ… Google Cloud Vision initialized with credentials file") + logger.info("Google Cloud Vision initialized with credentials file") except Exception as e: - print(f" ⚠️ Google Vision initialization failed: {str(e)}") + logger.warning(f"Google Vision initialization failed: {str(e)}") elif config.get('google_api_key'): # Use API key directly if vision: # Note: 
Vision API with API key requires different initialization # For now, store key for use in requests self.google_api_key = config['google_api_key'] - print(f" ℹ️ Using Google API key: {self.google_api_key[:20]}...") + logger.info(f"Using Google API key: {self.google_api_key[:20]}...") elif google_creds_path: # Path provided but file doesn't exist - print(f" ⚠️ Google credentials file not found: {google_creds_path}") - print(f" ⚠️ Skipping Google Cloud Vision (advanced OCR disabled)") + logger.warning(f"Google credentials file not found: {google_creds_path}") + logger.warning("Skipping Google Cloud Vision (advanced OCR disabled)") if config.get('anthropic_api_key') and anthropic: try: self.anthropic_client = anthropic.Anthropic(api_key=config['anthropic_api_key']) - print(f" βœ… Anthropic Claude initialized") + logger.info("Anthropic Claude initialized") except Exception as e: - print(f" ⚠️ Anthropic initialization failed: {str(e)}") + logger.warning(f"Anthropic initialization failed: {str(e)}") # Stats self.stats = { @@ -420,9 +422,9 @@ class EnterprisePDFChecker: def check_all(self) -> Dict[str, Any]: """Run all accessibility checks""" - print(f"πŸ” Enterprise PDF Accessibility Check") - print(f"πŸ“„ File: {self.pdf_path.name}") - print(f"{'='*60}\n") + logger.info("Enterprise PDF Accessibility Check") + logger.info(f"File: {self.pdf_path.name}") + logger.info("=" * 60) try: self.pdf_reader = PdfReader(str(self.pdf_path)) @@ -440,6 +442,8 @@ class EnterprisePDFChecker: (self._check_readability, "Content Readability"), (self._check_links, "Link Quality"), (self._check_headings, "Heading Structure"), + (self._check_tab_order, "Tab Order"), + (self._check_role_mapping, "Role Mapping"), (self._check_forms, "Form Accessibility"), (self._check_tables, "Table Structure"), (self._check_reading_order, "Reading Order"), @@ -450,10 +454,10 @@ class EnterprisePDFChecker: ] for check_func, check_name in checks: - print(f"⏳ Running: {check_name}...", end=' ') + 
logger.info(f"Running: {check_name}...") result = self.run_check(check_func, check_name) - status = "βœ…" if result.passed else "❌" - print(f"{status} ({result.duration:.2f}s)") + status = "PASS" if result.passed else "FAIL" + logger.info(f"{status} ({result.duration:.2f}s)") # Analyze remediation options self._analyze_remediation_options() @@ -618,10 +622,10 @@ class EnterprisePDFChecker: return if self.quick_mode: - print(" ⏩ Skipping OCR analysis (quick mode)") + logger.info("Skipping OCR analysis (quick mode)") return - print(" πŸ” Running OCR analysis...") + logger.info("Running OCR analysis...") try: # Reduced DPI from 300 to 150 for faster processing @@ -646,11 +650,11 @@ class EnterprisePDFChecker: details={'confidence': avg_confidence} ) except Exception as e: - print(f" ⚠️ OCR check skipped: {str(e)}") + logger.warning(f"OCR check skipped: {str(e)}") def _check_images_comprehensive(self): """Comprehensive image accessibility check with AI""" - print(" πŸ–ΌοΈ Analyzing images with AI...") + logger.info("Analyzing images with AI...") total_images = 0 analyzed_images = 0 @@ -674,7 +678,7 @@ class EnterprisePDFChecker: } image_tasks.append((image_data, page_num + 1, img_idx + 1, coords)) except Exception as e: - print(f" ⚠️ Failed to extract image on page {page_num + 1}: {str(e)}") + logger.warning(f"Failed to extract image on page {page_num + 1}: {str(e)}") if total_images == 0: self.add_issue( @@ -685,11 +689,11 @@ class EnterprisePDFChecker: ) return - print(f" πŸ“Š Found {total_images} images to analyze...") + logger.info(f"Found {total_images} images to analyze...") # Skip AI analysis in quick mode if self.quick_mode: - print(" ⏩ Skipping AI image analysis (quick mode)") + logger.info("Skipping AI image analysis (quick mode)") self.add_issue( Severity.INFO, "Images", @@ -743,7 +747,7 @@ class EnterprisePDFChecker: result = future.result() analyzed_images += 1 cache_status = " (cached)" if result.get('cached') else "" - print(f" πŸ“· Analyzed image 
{analyzed_images}/{total_images} (Page {result['page']}){cache_status}") + logger.info(f"Analyzed image {analyzed_images}/{total_images} (Page {result['page']}){cache_status}") if result.get('analyzed'): self._process_image_analysis(result['analysis'], result['page'], result['img'], result.get('coords')) @@ -757,12 +761,12 @@ class EnterprisePDFChecker: self._process_google_vision_results(result['vision_analysis'], result['page'], result['img'], result.get('coords')) if result.get('error'): - print(f" ⚠️ Error analyzing image on page {result['page']}: {result['error']}") + logger.warning(f"Error analyzing image on page {result['page']}: {result['error']}") except Exception as e: - print(f" ⚠️ Image analysis error: {str(e)}") + logger.warning(f"Image analysis error: {str(e)}") - print(f" βœ… Completed analysis of {analyzed_images}/{total_images} images") + logger.info(f"Completed analysis of {analyzed_images}/{total_images} images") @retry_with_backoff(max_retries=3, initial_delay=1.0) def _analyze_image_with_claude(self, image_bytes: bytes) -> Optional[Dict]: @@ -943,10 +947,10 @@ Respond in JSON format: def _check_color_contrast(self): """Check color contrast using image analysis""" - print(" 🎨 Checking color contrast...") + logger.info("Checking color contrast...") if self.quick_mode: - print(" ⏩ Skipping detailed contrast analysis (quick mode)") + logger.info("Skipping detailed contrast analysis (quick mode)") return try: @@ -982,7 +986,7 @@ Respond in JSON format: ) except Exception as e: - print(f" ⚠️ Contrast check skipped: {str(e)}") + logger.warning(f"Contrast check skipped: {str(e)}") def _check_readability(self): """Check content readability""" @@ -1067,28 +1071,153 @@ Respond in JSON format: break def _check_headings(self): - """Check heading structure""" + """Check heading structure and hierarchy""" catalog = self.pdf_reader.trailer.get("/Root", {}) - + if "/StructTreeRoot" not in catalog: self.add_issue( - Severity.ERROR, - "Headings", + 
Severity.ERROR, "Headings", "No structure tree - cannot verify heading hierarchy", wcag_criterion="1.3.1", - recommendation="Tag document with proper heading structure" - ) + recommendation="Tag document with proper heading structure") return - - # Try to parse heading structure - # This is complex and PDF-specific - self.add_issue( - Severity.INFO, - "Headings", - "Structure tree present - manual verification of heading hierarchy recommended", - wcag_criterion="1.3.1", - recommendation="Use Adobe Acrobat to verify H1-H6 hierarchy" + + struct_tree = catalog["/StructTreeRoot"] + headings = [] + + def walk_tree(element): + try: + if hasattr(element, 'get_object'): + element = element.get_object() + if isinstance(element, dict): + tag = str(element.get("/S", "")) + if tag in ["/H1", "/H2", "/H3", "/H4", "/H5", "/H6"]: + headings.append(int(tag[2])) + kids = element.get("/K", []) + if isinstance(kids, list): + for kid in kids: + walk_tree(kid) + elif kids: + walk_tree(kids) + except (AttributeError, TypeError, KeyError): + pass + + try: + walk_tree(struct_tree) + except Exception as e: + logger.warning(f"Could not fully parse structure tree: {e}") + + if not headings: + self.add_issue( + Severity.WARNING, "Headings", + "No heading tags (H1-H6) found in structure tree", + wcag_criterion="1.3.1", + recommendation="Add heading tags to establish document hierarchy") + return + + if headings[0] != 1: + self.add_issue( + Severity.ERROR, "Headings", + f"Document does not start with H1 (starts with H{headings[0]})", + wcag_criterion="1.3.1", + recommendation="First heading should be H1") + + for i in range(1, len(headings)): + if headings[i] > headings[i - 1] + 1: + self.add_issue( + Severity.WARNING, "Headings", + f"Heading level skipped: H{headings[i - 1]} to H{headings[i]}", + wcag_criterion="1.3.1", + recommendation="Do not skip heading levels") + + heading_str = ", ".join(f"H{h}" for h in headings[:10]) + if len(headings) > 10: + heading_str += "..." 
+ has_issues = any( + i.severity in [Severity.ERROR, Severity.WARNING] + for i in self.issues if i.category == "Headings" ) + self.add_issue( + Severity.INFO if has_issues else Severity.SUCCESS, "Headings", + f"Found {len(headings)} headings: {heading_str}", + wcag_criterion="1.3.1") + + def _check_tab_order(self): + """Check tab order is set for pages""" + pages_without_tabs = [] + for i, page in enumerate(self.pdf_reader.pages): + if "/Tabs" not in page: + pages_without_tabs.append(i + 1) + + if pages_without_tabs: + if len(pages_without_tabs) == len(self.pdf_reader.pages): + self.add_issue( + Severity.ERROR, "Tab Order", + "No pages have tab order defined", + wcag_criterion="2.4.3", + recommendation="Set /Tabs to /S (structure order) for all pages") + else: + self.add_issue( + Severity.WARNING, "Tab Order", + f"{len(pages_without_tabs)} page(s) missing tab order", + wcag_criterion="2.4.3", + recommendation="Set /Tabs entry on all pages") + else: + tab_types = set() + for page in self.pdf_reader.pages: + tab_types.add(str(page.get("/Tabs", ""))) + self.add_issue( + Severity.SUCCESS, "Tab Order", + f"Tab order set on all pages (types: {', '.join(tab_types)})", + wcag_criterion="2.4.3") + + def _check_role_mapping(self): + """Check role mapping for custom tags""" + catalog = self.pdf_reader.trailer.get("/Root", {}) + + if "/StructTreeRoot" not in catalog: + return # Already flagged by heading/structure checks + + struct_tree = catalog["/StructTreeRoot"] + if hasattr(struct_tree, 'get_object'): + struct_tree = struct_tree.get_object() + + if "/RoleMap" in struct_tree: + role_map = struct_tree["/RoleMap"] + if hasattr(role_map, 'get_object'): + role_map = role_map.get_object() + + standard_roles = { + "/P", "/H1", "/H2", "/H3", "/H4", "/H5", "/H6", + "/Table", "/TR", "/TD", "/TH", "/L", "/LI", "/Lbl", + "/LBody", "/Span", "/Link", "/Figure", "/Form", + "/Sect", "/Art", "/Div", "/BlockQuote", "/TOC", "/TOCI" + } + + mapped = {} + try: + for key, value in 
role_map.items(): + mapped[key] = str(value) + except (AttributeError, TypeError): + pass + + unmapped = {k: v for k, v in mapped.items() if v not in standard_roles} + if unmapped: + self.add_issue( + Severity.WARNING, "Role Mapping", + f"{len(unmapped)} custom role(s) map to non-standard tags", + wcag_criterion="1.3.1", + recommendation="Ensure all custom roles map to standard PDF tags") + else: + self.add_issue( + Severity.SUCCESS, "Role Mapping", + f"All {len(mapped)} custom roles correctly mapped", + wcag_criterion="1.3.1") + else: + self.add_issue( + Severity.INFO, "Role Mapping", + "No custom role mapping (document uses standard tags only)", + wcag_criterion="1.3.1") def _check_forms(self): """Check form field accessibility""" @@ -1246,17 +1375,17 @@ Respond in JSON format: def _check_verapdf_validation(self): """Run veraPDF PDF/UA validation""" if not VeraPDFValidator: - print(" ⚠️ veraPDF not available - skipping") + logger.warning("veraPDF not available - skipping") return - print("\n πŸ“‹ Running veraPDF PDF/UA validation...") + logger.info("Running veraPDF PDF/UA validation...") try: validator = VeraPDFValidator() results = validator.validate(str(self.pdf_path)) if 'error' in results: - print(f" ⚠️ veraPDF validation error: {results['error']}") + logger.warning(f"veraPDF validation error: {results['error']}") return self.verapdf_results = results @@ -1289,17 +1418,17 @@ Respond in JSON format: recommendation="Consult veraPDF documentation for this clause" ) - print(f" βœ… veraPDF: {results['passed_rules']} passed, {results['failed_rules']} failed") + logger.info(f"veraPDF: {results['passed_rules']} passed, {results['failed_rules']} failed") except Exception as e: - print(f" ⚠️ veraPDF check error: {str(e)}") + logger.warning(f"veraPDF check error: {str(e)}") def _analyze_remediation_options(self): """Analyze what can be auto-fixed""" if not PDFRemediator: return - print("\nπŸ”§ Analyzing auto-remediation options...") + logger.info("Analyzing 
auto-remediation options...") try: remediator = PDFRemediator(str(self.pdf_path)) @@ -1314,12 +1443,12 @@ Respond in JSON format: ) if total_fixable > 0: - print(f" βœ… {total_fixable} issues can be auto-fixed") + logger.info(f"{total_fixable} issues can be auto-fixed") else: - print(f" ℹ️ No auto-fixable issues found") + logger.info("No auto-fixable issues found") except Exception as e: - print(f" ⚠️ Remediation analysis error: {str(e)}") + logger.warning(f"Remediation analysis error: {str(e)}") # ==================== HELPER METHODS ==================== @@ -1348,12 +1477,12 @@ Respond in JSON format: if not self.generate_images: return - print(f"\nπŸ“Έ Generating page images for visual display...") + logger.info("Generating page images for visual display...") try: from pdf2image import convert_from_path except ImportError: - print(f" ⚠️ pdf2image not available - skipping page image generation") + logger.warning("pdf2image not available - skipping page image generation") return try: @@ -1374,12 +1503,12 @@ Respond in JSON format: image_path = output_dir / image_filename image.save(image_path, 'PNG') self.page_images[page_num] = image_filename - print(f" βœ… Page {page_num}/{len(images)}") + logger.info(f"Page {page_num}/{len(images)}") - print(f" βœ… Generated {len(images)} page images at {dpi} DPI") + logger.info(f"Generated {len(images)} page images at {dpi} DPI") except Exception as e: - print(f" ⚠️ Could not generate page images: {str(e)}") + logger.warning(f"Could not generate page images: {str(e)}") # ==================== REPORTING ==================== @@ -1445,6 +1574,14 @@ Respond in JSON format: summary = self._generate_summary() return json.dumps(summary, indent=2) + def run_full_check(self) -> Dict[str, Any]: + """Alias for check_all - maintains backward compatibility""" + return self.check_all() + + def to_dict(self) -> Dict[str, Any]: + """Convert results to dictionary""" + return self._generate_summary() + def main(): """Main entry point""" diff --git 
a/index.html b/index.html index da15854..c84c0b8 100644 --- a/index.html +++ b/index.html @@ -6,562 +6,43 @@ Enterprise PDF Accessibility Checker - - + + +
NOT FOR PRODUCTION USE
+ + + +
-

🔍 Enterprise PDF Accessibility Checker

-

Comprehensive WCAG 2.1 compliance validation with AI-powered analysis

+
+
+

Enterprise PDF Accessibility Checker

+

Comprehensive WCAG 2.1 compliance validation with AI-powered analysis

+
+
+ + + +
+
@@ -569,41 +50,62 @@

Upload PDF Document

- -
-
📄
-
Drop your PDF here or click to browse
-
Maximum file size: 50MB
- + +
+ +
- + +
+
+
📄
+
Drop your PDF here or click to browse
+
Maximum file size: 50MB
+ +
+
+ + +
-

Check Options

-
- -
- -
+ +
Uploading...
0%
-
+
- - +
-
🔍 Processing Details
-
-
⏳ Initializing...
+
Processing Details
+
+
Initializing...
@@ -612,1115 +114,101 @@
-
+

Accessibility Report

- +
+ + + +
-
--
+
--
Accessibility Score
-
- -
+
-