diff --git a/.env.example b/.env.example index 8d8a91b..5c46c1e 100644 --- a/.env.example +++ b/.env.example @@ -27,12 +27,13 @@ DB_NAME=pdf_checker DB_USER=pdf_checker DB_PASSWORD=change_me_in_production -# Redis - used for job queue in Docker setup -REDIS_HOST=redis -REDIS_PORT=6379 - -# Worker configuration -WORKER_COUNT=2 +# Cloud Run - PDF processing service +# Set this to your deployed Cloud Run URL (leave empty for local Python fallback) +CLOUD_RUN_URL=https://pdf-checker-bcb6ipdqka-uc.a.run.app +# Path to GCP service account key for authenticating to Cloud Run +GCP_SA_KEY_PATH=./pdf-api-invoker-key.json +# GCS bucket for page images +GCS_BUCKET_NAME=optical-pdf-images # Azure AD / MSAL Authentication AZURE_TENANT_ID=e519c2e6-bc6d-4fdf-8d9c-923c2f002385 diff --git a/.gitignore b/.gitignore index 4d0bd68..b3bd7ae 100644 --- a/.gitignore +++ b/.gitignore @@ -33,7 +33,13 @@ Thumbs.db # Docker volumes (local data) pg-data/ -redis-data/ + +# GCP service account keys +*-key.json +*-credentials.json + +# Rate limit data +rate_limits/ # Coverage .coverage diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..4e3b1f0 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,100 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +AI-powered PDF accessibility checker that validates documents against WCAG 2.1 Level A & AA standards. Combines traditional PDF analysis (pypdf, pdfplumber) with AI models (Anthropic Claude, Google Cloud Vision) for ~95% automated WCAG coverage. Branded for "Oliver" (Montserrat font, black/#FFC407 palette). + +## Commands + +### Testing +```bash +source venv/bin/activate +pytest tests/ -v # Run all tests (31 tests) +pytest tests/ --cov=. 
--cov-report=html # With coverage report +pytest tests/test_checker.py -v # Single test file +pytest tests/ -m "not integration" # Skip integration tests +``` + +### Running Locally +```bash +source venv/bin/activate +php -S localhost:8000 # Start PHP dev server +``` + +### Docker +```bash +docker-compose up # Development stack +docker-compose -f docker-compose.prod.yml up -d # Production stack +docker-compose exec worker pytest tests/ -v # Tests in container +``` + +### CLI Usage +```bash +python enterprise_pdf_checker.py document.pdf --output report.json # Full check +python enterprise_pdf_checker.py document.pdf --quick # Skip AI checks +python pdf_remediation.py document.pdf --output fixed.pdf --all # Auto-remediate +``` + +## Architecture + +### Three Interfaces +- **Web UI** (`index.html` + `js/` + `css/`) — vanilla JS, drag-drop upload, visual inspector +- **REST API** (`api.php`) — PHP endpoints: upload, check, status, result, remediate, download +- **CLI** (`enterprise_pdf_checker.py`) — direct Python execution + +### Request Flow (Docker/Production) +1. `api.php` receives upload, validates via `auth.php`, saves to `uploads/` +2. `api.php` POSTs the PDF synchronously to the Cloud Run service (`CLOUD_RUN_URL/check`), authenticated with a service-account identity token +3. `cloudrun_service.py` runs `EnterprisePDFChecker.check_all()` and uploads page images to GCS +4. Results written to `results/{job_id}.result.json`, DB updated +5. 
Client gets `completed` in the check response (or polls `api.php?action=status` in local fallback mode), then fetches results + +### Key Source Files +| File | Purpose | +|------|---------| +| `enterprise_pdf_checker.py` | Core engine — 30+ WCAG checks, AI image analysis, scoring | +| `api.php` | REST API — file handling, Cloud Run dispatch, file-based rate limiting, CORS | +| `auth.php` | Authentication — Bearer/X-API-Key, dev mode localhost bypass | +| `worker.py` | Background daemon — Redis queue consumer, graceful shutdown (legacy — not used by the Cloud Run deployment) | +| `db_manager.py` | PostgreSQL ORM — jobs CRUD, audit logging | +| `redis_queue.py` | Redis operations — job queue, status tracking, rate limiting (legacy — not used by the Cloud Run deployment) | +| `pdf_remediation.py` | Auto-fix — metadata, tagging, language tags | +| `retry_helper.py` | Exponential backoff for external API calls | +| `report_generator.py` | Result formatting and report generation | +| `logger_config.py` | Structured logging with rotation (10MB max) | +| `cleanup.py` | File retention cleanup (24h for uploads, results, and rate-limit files) | + +### Data Layer +- **PostgreSQL** — `jobs` table (status, score, grade, result JSON), `audit_log` table. Schema in `db/init.sql` +- **Filesystem / GCS** — job metadata and results (`results/*.meta.json`, `results/*.result.json`), rate limiting (`rate_limits/*.json`), page images in a GCS bucket + +### External APIs +- **Anthropic Claude 3.5 Sonnet** — alt text validation, image classification, text-in-images +- **Google Cloud Vision** — OCR, text detection +- **veraPDF** (optional) — PDF/UA-1 compliance validation + +### Frontend Structure +`js/app.js` (controller), `js/upload.js` (drag-drop), `js/api.js` (HTTP client), `js/results.js` (display), `js/page-viewer.js` (PDF inspector), `js/batch.js` (batch processing), `js/utils.js` (helpers) + +## Tech Stack +- **Backend**: Python 3.11 (processing), PHP 8.2 (API) +- **Frontend**: Vanilla HTML/CSS/JS +- **Database**: PostgreSQL 16 +- **Infrastructure**: Docker, Nginx/Apache, PHP-FPM, Google Cloud Run, Google Cloud Storage +- **System deps**: Tesseract OCR, Poppler, Ghostscript + +## Configuration +Environment variables via `.env` (see `.env.example`). 
Key settings: +- `ANTHROPIC_API_KEY` / `GOOGLE_API_KEY` — AI API credentials +- `DEV_MODE=true` — bypasses auth for localhost requests +- `DB_HOST`, `DB_PORT` — PostgreSQL endpoint; `CLOUD_RUN_URL`, `GCP_SA_KEY_PATH`, `GCS_BUCKET_NAME` — Cloud Run processing service, its service-account key, and the page-image bucket +- Production maps PostgreSQL to port 1221 to avoid host conflicts + +## Testing +- pytest with markers: `integration`, `slow`, `api` +- Config in `pytest.ini` +- Fixtures in `tests/conftest.py` +- Sample PDFs in `Test_files/` +- No linter currently configured diff --git a/Dockerfile.cloudrun b/Dockerfile.cloudrun new file mode 100644 index 0000000..f4d8e9b --- /dev/null +++ b/Dockerfile.cloudrun @@ -0,0 +1,29 @@ +FROM python:3.11-slim + +# Install system dependencies for PDF processing +RUN apt-get update && apt-get install -y --no-install-recommends \ + tesseract-ocr \ + tesseract-ocr-eng \ + poppler-utils \ + ghostscript \ + libgl1 \ + libglib2.0-0 \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Install Python dependencies +COPY requirements-cloudrun.txt . +RUN pip install --no-cache-dir -r requirements-cloudrun.txt + +# Copy application code (no worker, redis_queue, or db_manager) +COPY cloudrun_service.py . +COPY enterprise_pdf_checker.py . +COPY pdf_remediation.py . +COPY logger_config.py . +COPY retry_helper.py . 
+ +# Cloud Run sets $PORT; gunicorn binds to it +# --workers 1 --threads 1: Cloud Run concurrency=1, one request at a time +# --timeout 900: allow up to 15 minutes for large PDFs +CMD exec gunicorn --bind :$PORT --workers 1 --threads 1 --timeout 900 cloudrun_service:app diff --git a/Dockerfile.web b/Dockerfile.web index a152a68..aaaf196 100644 --- a/Dockerfile.web +++ b/Dockerfile.web @@ -4,12 +4,6 @@ FROM php:8.2-fpm-alpine RUN apk add --no-cache nginx python3 postgresql-dev && \ docker-php-ext-install pdo pdo_pgsql -# Install php-redis via PECL -RUN apk add --no-cache --virtual .build-deps $PHPIZE_DEPS && \ - pecl install redis && \ - docker-php-ext-enable redis && \ - apk del .build-deps - # Copy Nginx config COPY nginx.conf /etc/nginx/http.d/default.conf diff --git a/api.php b/api.php index 4d5cbee..b5df954 100644 --- a/api.php +++ b/api.php @@ -1,8 +1,10 @@ connect(REDIS_HOST, REDIS_PORT); - } - return $redis; -} - -/** - * Check rate limit via Redis. Returns true if allowed. + * Check rate limit via filesystem. Returns true if allowed. + * Stores timestamps in JSON files per IP+action. */ function checkRateLimit($action, $limit, $window) { - try { - $redis = getRedis(); - $ip = $_SERVER['REMOTE_ADDR'] ?? 'unknown'; - $key = REDIS_RATE_PREFIX . $ip . ':' . $action; - $current = $redis->incr($key); - if ($current === 1) { - $redis->expire($key, $window); + $ip = $_SERVER['REMOTE_ADDR'] ?? 'unknown'; + $key = preg_replace('/[^a-zA-Z0-9_-]/', '_', $ip . '_' . $action); + $file = RATE_LIMIT_DIR . '/' . $key . 
'.json'; + + $now = time(); + $timestamps = []; + + if (file_exists($file)) { + $data = json_decode(file_get_contents($file), true); + if (is_array($data)) { + // Filter to only timestamps within the window + $timestamps = array_filter($data, function($ts) use ($now, $window) { + return ($now - $ts) < $window; + }); } - return $current <= $limit; - } catch (Exception $e) { - return true; // Allow if Redis is down } + + if (count($timestamps) >= $limit) { + return false; + } + + $timestamps[] = $now; + file_put_contents($file, json_encode(array_values($timestamps))); + return true; } /** @@ -80,6 +90,171 @@ function sanitizeJobId($job_id) { return $job_id; } +/** + * Get an OIDC identity token for authenticating to Cloud Run. + * Uses a GCP service account key to create a self-signed JWT, + * then exchanges it for an identity token via Google's OAuth endpoint. + */ +function getCloudRunToken() { + static $cachedToken = null; + static $cachedExpiry = 0; + + // Return cached token if still valid (with 5-min buffer) + if ($cachedToken && time() < ($cachedExpiry - 300)) { + return $cachedToken; + } + + $keyPath = GCP_SA_KEY_PATH; + if (!file_exists($keyPath)) { + throw new Exception("GCP service account key not found: $keyPath"); + } + + $sa = json_decode(file_get_contents($keyPath), true); + if (!$sa || !isset($sa['client_email']) || !isset($sa['private_key'])) { + throw new Exception("Invalid service account key file"); + } + + $now = time(); + $expiry = $now + 3600; + + // Build JWT header and claims + $header = base64url_encode(json_encode(['alg' => 'RS256', 'typ' => 'JWT'])); + $claims = base64url_encode(json_encode([ + 'iss' => $sa['client_email'], + 'sub' => $sa['client_email'], + 'aud' => 'https://oauth2.googleapis.com/token', + 'iat' => $now, + 'exp' => $expiry, + 'target_audience' => CLOUD_RUN_URL, + ])); + + // Sign with RSA-SHA256 + $signingInput = "$header.$claims"; + $signature = ''; + $privateKey = openssl_pkey_get_private($sa['private_key']); + if 
(!$privateKey) { + throw new Exception("Failed to parse service account private key"); + } + openssl_sign($signingInput, $signature, $privateKey, OPENSSL_ALGO_SHA256); + $jwt = $signingInput . '.' . base64url_encode($signature); + + // Exchange JWT for identity token + $ch = curl_init('https://oauth2.googleapis.com/token'); + curl_setopt_array($ch, [ + CURLOPT_POST => true, + CURLOPT_POSTFIELDS => http_build_query([ + 'grant_type' => 'urn:ietf:params:oauth:grant-type:jwt-bearer', + 'assertion' => $jwt, + ]), + CURLOPT_RETURNTRANSFER => true, + CURLOPT_TIMEOUT => 10, + ]); + $response = curl_exec($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); + + if ($httpCode !== 200) { + throw new Exception("Failed to get identity token: HTTP $httpCode - $response"); + } + + $tokenData = json_decode($response, true); + if (!isset($tokenData['id_token'])) { + throw new Exception("No id_token in response: $response"); + } + + $cachedToken = $tokenData['id_token']; + $cachedExpiry = $expiry; + + return $cachedToken; +} + +/** + * Base64url encode (no padding, URL-safe) + */ +function base64url_encode($data) { + return rtrim(strtr(base64_encode($data), '+/', '-_'), '='); +} + +/** + * Get PostgreSQL PDO connection (lazy singleton) + */ +function getDB() { + static $pdo = null; + if ($pdo === null) { + $dsn = sprintf('pgsql:host=%s;port=%d;dbname=%s', DB_HOST, DB_PORT, DB_NAME); + $pdo = new PDO($dsn, DB_USER, DB_PASSWORD, [ + PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION, + ]); + } + return $pdo; +} + +/** + * Insert or update a job record in PostgreSQL + */ +function updateJobInDatabase($job_id, $filename, $status, $results = null) { + try { + $pdo = getDB(); + + $score = null; + $grade = null; + $total_issues = null; + $critical_count = null; + $error_count = null; + $warning_count = null; + $result_json = null; + $processing_time = null; + + if ($results) { + $score = $results['accessibility_score'] ?? null; + $grade = $results['grade'] ?? 
null; + $issues = $results['issues'] ?? []; + $total_issues = count($issues); + $critical_count = count(array_filter($issues, fn($i) => ($i['severity'] ?? '') === 'CRITICAL')); + $error_count = count(array_filter($issues, fn($i) => ($i['severity'] ?? '') === 'ERROR')); + $warning_count = count(array_filter($issues, fn($i) => ($i['severity'] ?? '') === 'WARNING')); + $result_json = json_encode($results); + $processing_time = $results['stats']['processing_time'] ?? null; + } + + $sql = "INSERT INTO jobs (job_id, filename, status, score, grade, total_issues, + critical_count, error_count, warning_count, result_json, processing_time, + completed_at) + VALUES (:job_id, :filename, :status, :score, :grade, :total_issues, + :critical_count, :error_count, :warning_count, :result_json::jsonb, :processing_time, + CASE WHEN :status2 = 'completed' THEN NOW() ELSE NULL END) + ON CONFLICT (job_id) DO UPDATE SET + status = EXCLUDED.status, + score = COALESCE(EXCLUDED.score, jobs.score), + grade = COALESCE(EXCLUDED.grade, jobs.grade), + total_issues = COALESCE(EXCLUDED.total_issues, jobs.total_issues), + critical_count = COALESCE(EXCLUDED.critical_count, jobs.critical_count), + error_count = COALESCE(EXCLUDED.error_count, jobs.error_count), + warning_count = COALESCE(EXCLUDED.warning_count, jobs.warning_count), + result_json = COALESCE(EXCLUDED.result_json, jobs.result_json), + processing_time = COALESCE(EXCLUDED.processing_time, jobs.processing_time), + completed_at = CASE WHEN EXCLUDED.status = 'completed' THEN NOW() ELSE jobs.completed_at END"; + + $stmt = $pdo->prepare($sql); + $stmt->execute([ + ':job_id' => $job_id, + ':filename' => $filename, + ':status' => $status, + ':score' => $score, + ':grade' => $grade, + ':total_issues' => $total_issues, + ':critical_count' => $critical_count, + ':error_count' => $error_count, + ':warning_count' => $warning_count, + ':result_json' => $result_json, + ':processing_time' => $processing_time, + ':status2' => $status, + ]); + } catch 
(Exception $e) { + error_log("DB update failed for $job_id: " . $e->getMessage()); + } +} + // CORS headers for API $allowed_origins = [ 'https://ai-sandbox.oliver.solutions', @@ -173,18 +348,18 @@ function handleUpload() { if (!isset($_FILES['pdf'])) { error('No file uploaded'); } - + $file = $_FILES['pdf']; - + // Validate file if ($file['error'] !== UPLOAD_ERR_OK) { error('Upload error: ' . $file['error']); } - + if ($file['size'] > MAX_FILE_SIZE) { error('File too large. Max size: ' . (MAX_FILE_SIZE / 1024 / 1024) . 'MB'); } - + $ext = strtolower(pathinfo($file['name'], PATHINFO_EXTENSION)); if (!in_array($ext, ALLOWED_EXTENSIONS)) { error('Invalid file type. Only PDF files allowed.'); @@ -200,12 +375,12 @@ function handleUpload() { $job_id = 'pdf_' . bin2hex(random_bytes(16)); $filename = $job_id . '.pdf'; $filepath = UPLOAD_DIR . '/' . $filename; - + // Move file if (!move_uploaded_file($file['tmp_name'], $filepath)) { error('Failed to save file'); } - + // Create job metadata $job_data = [ 'job_id' => $job_id, @@ -215,12 +390,12 @@ function handleUpload() { 'status' => 'uploaded', 'filepath' => $filepath ]; - + file_put_contents( RESULTS_DIR . '/' . $job_id . '.meta.json', json_encode($job_data, JSON_PRETTY_PRINT) ); - + success([ 'job_id' => $job_id, 'filename' => $file['name'], @@ -229,9 +404,11 @@ function handleUpload() { } /** - * Handle PDF accessibility check — push job to Redis queue + * Handle PDF accessibility check — send PDF to Cloud Run synchronously */ function handleCheck() { + set_time_limit(900); // Allow up to 15 minutes + $job_id = $_POST['job_id'] ?? ''; if (empty($job_id)) { @@ -253,32 +430,98 @@ function handleCheck() { } $job_data = json_decode(file_get_contents($meta_file), true); - $quick_mode = $_POST['quick_mode'] ?? 
false; - // Push job to Redis queue for worker processing - try { - $redis = getRedis(); - $payload = json_encode([ - 'job_id' => $job_id, - 'pdf_path' => $job_data['filepath'], - 'original_filename' => $job_data['original_filename'] ?? '', - 'options' => [ - 'quick_mode' => (bool)$quick_mode, - ], - 'queued_at' => time() - ]); - $redis->lPush(REDIS_QUEUE, $payload); + // Update meta to processing + $job_data['status'] = 'processing'; + $job_data['started_at'] = date('Y-m-d H:i:s'); + file_put_contents($meta_file, json_encode($job_data, JSON_PRETTY_PRINT)); - // Set initial status in Redis - $redis->setex(REDIS_STATUS_PREFIX . $job_id, 86400, json_encode([ - 'status' => 'queued', - 'progress' => 0, - 'message' => 'Waiting in queue', - 'updated_at' => time() - ])); - } catch (Exception $e) { - // Fallback to direct exec if Redis is unavailable (local dev without Docker) + // If Cloud Run URL is configured, send to Cloud Run + if (!empty(CLOUD_RUN_URL)) { + try { + $token = getCloudRunToken(); + $pdf_path = $job_data['filepath']; + + if (!file_exists($pdf_path)) { + error('PDF file not found on server'); + } + + // Build multipart POST to Cloud Run + $ch = curl_init(CLOUD_RUN_URL . '/check'); + $postFields = [ + 'pdf' => new CURLFile($pdf_path, 'application/pdf', basename($pdf_path)), + 'job_id' => $job_id, + 'quick_mode' => $quick_mode ? 'true' : 'false', + 'original_filename' => $job_data['original_filename'] ?? '', + ]; + + curl_setopt_array($ch, [ + CURLOPT_POST => true, + CURLOPT_POSTFIELDS => $postFields, + CURLOPT_RETURNTRANSFER => true, + CURLOPT_TIMEOUT => CLOUD_RUN_TIMEOUT, + CURLOPT_HTTPHEADER => [ + 'Authorization: Bearer ' . 
$token, + ], + ]); + + $response = curl_exec($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + $curlError = curl_error($ch); + curl_close($ch); + + if ($curlError) { + throw new Exception("Cloud Run request failed: $curlError"); + } + + if ($httpCode !== 200) { + $errorBody = json_decode($response, true); + $errorMsg = $errorBody['error'] ?? "HTTP $httpCode"; + throw new Exception("Cloud Run returned error: $errorMsg"); + } + + $result = json_decode($response, true); + if (!$result || !isset($result['success'])) { + throw new Exception("Invalid response from Cloud Run"); + } + + if (!$result['success']) { + throw new Exception($result['error'] ?? 'Unknown Cloud Run error'); + } + + $checkResult = $result['data']; + + // Write result JSON to disk + $result_file = RESULTS_DIR . '/' . $job_id . '.result.json'; + file_put_contents($result_file, json_encode($checkResult, JSON_PRETTY_PRINT)); + + // Update meta + $job_data['status'] = 'completed'; + $job_data['completed_at'] = date('Y-m-d H:i:s'); + file_put_contents($meta_file, json_encode($job_data, JSON_PRETTY_PRINT)); + + // Update PostgreSQL + updateJobInDatabase($job_id, $job_data['original_filename'] ?? '', 'completed', $checkResult); + + success([ + 'job_id' => $job_id, + 'status' => 'completed', + 'message' => 'Check completed' + ]); + + } catch (Exception $e) { + // Mark as failed + $job_data['status'] = 'failed'; + $job_data['error'] = $e->getMessage(); + file_put_contents($meta_file, json_encode($job_data, JSON_PRETTY_PRINT)); + + updateJobInDatabase($job_id, $job_data['original_filename'] ?? '', 'failed'); + + error('Processing failed: ' . $e->getMessage()); + } + } else { + // Fallback to local exec (development without Cloud Run) $pdf_path = $job_data['filepath']; $output_path = RESULTS_DIR . '/' . $job_id . '.result.json'; $venv_python = __DIR__ . '/venv/bin/python3'; @@ -312,22 +555,17 @@ function handleCheck() { $error_log = RESULTS_DIR . '/' . $job_id . '.error.log'; $cmd .= ' > ' . 
escapeshellarg($error_log) . ' 2>&1 &'; exec($cmd, $output, $return_code); + + success([ + 'job_id' => $job_id, + 'status' => 'processing', + 'message' => 'Check started (local mode)' + ]); } - - // Update meta file - $job_data['status'] = 'queued'; - $job_data['started_at'] = date('Y-m-d H:i:s'); - file_put_contents($meta_file, json_encode($job_data, JSON_PRETTY_PRINT)); - - success([ - 'job_id' => $job_id, - 'status' => 'queued', - 'message' => 'Check queued for processing' - ]); } /** - * Check job status — reads from Redis (real-time) with file fallback + * Check job status — pure file-based */ function handleStatus() { $job_id = $_GET['job_id'] ?? ''; @@ -347,30 +585,15 @@ function handleStatus() { $job_data = json_decode(file_get_contents($meta_file), true); - // Try Redis first for real-time progress - try { - $redis = getRedis(); - $redis_status = $redis->get(REDIS_STATUS_PREFIX . $job_id); - if ($redis_status) { - $status_data = json_decode($redis_status, true); - $job_data['status'] = $status_data['status']; - $job_data['progress'] = $status_data['progress'] ?? 0; - $job_data['status_message'] = $status_data['message'] ?? ''; - } - } catch (Exception $e) { - // Redis unavailable — fall through to file-based check - } - - // File-based fallback: check if result exists + // Check if result file exists (definitive completion signal) if (file_exists($result_file)) { $job_data['status'] = 'completed'; - $job_data['completed_at'] = date('Y-m-d H:i:s', filemtime($result_file)); - file_put_contents($meta_file, json_encode($job_data, JSON_PRETTY_PRINT)); - } else if (file_exists($error_log) && $job_data['status'] === 'processing') { + $job_data['completed_at'] = $job_data['completed_at'] ?? date('Y-m-d H:i:s', filemtime($result_file)); + } else if (file_exists($error_log) && in_array($job_data['status'], ['processing', 'queued'])) { $error_content = file_get_contents($error_log); if (!empty($error_content)) { $started = strtotime($job_data['started_at'] ?? 
'now'); - if (time() - $started > 300) { + if (time() - $started > 900) { $job_data['status'] = 'failed'; $job_data['error'] = 'Process timeout or error'; $job_data['error_log'] = substr($error_content, -1000); @@ -391,15 +614,15 @@ function handleResult() { error('Job ID required'); } $job_id = sanitizeJobId($job_id); - + $result_file = RESULTS_DIR . '/' . $job_id . '.result.json'; - + if (!file_exists($result_file)) { error('Results not found. Check may still be processing.'); } - + $result = json_decode(file_get_contents($result_file), true); - + success($result); } @@ -408,26 +631,26 @@ function handleResult() { */ function handleList() { $jobs = []; - + $files = glob(RESULTS_DIR . '/*.meta.json'); - + foreach ($files as $file) { $job_data = json_decode(file_get_contents($file), true); - + // Check if completed $result_file = str_replace('.meta.json', '.result.json', $file); if (file_exists($result_file)) { $job_data['status'] = 'completed'; } - + $jobs[] = $job_data; } - + // Sort by upload time (newest first) usort($jobs, function($a, $b) { return strtotime($b['uploaded_at']) - strtotime($a['uploaded_at']); }); - + success(['jobs' => $jobs]); } @@ -441,20 +664,20 @@ function handleDelete() { error('Job ID required'); } $job_id = sanitizeJobId($job_id); - + $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json'; - + if (!file_exists($meta_file)) { error('Job not found'); } - + $job_data = json_decode(file_get_contents($meta_file), true); - + // Delete files @unlink($job_data['filepath']); @unlink($meta_file); @unlink(RESULTS_DIR . '/' . $job_id . 
'.result.json'); - + success(['message' => 'Job deleted']); } @@ -484,6 +707,7 @@ function handleDebug() { 'meta_exists' => file_exists($meta_file), 'result_exists' => file_exists($result_file), 'error_log_exists' => file_exists($error_log), + 'cloud_run_url' => CLOUD_RUN_URL ?: '(not configured — local mode)', 'files' => [] ]; @@ -508,7 +732,7 @@ function handleDebug() { } /** - * Serve page images + * Serve page images — redirect to GCS URL or serve local file */ function handleImage() { $job_id = $_GET['job_id'] ?? ''; @@ -518,10 +742,28 @@ function handleImage() { error('Job ID and page number required'); } $job_id = sanitizeJobId($job_id); + $page_num = intval($page_num); - // Find the image file + // Check result JSON for GCS URLs + $result_file = RESULTS_DIR . '/' . $job_id . '.result.json'; + if (file_exists($result_file)) { + $result = json_decode(file_get_contents($result_file), true); + $page_images = $result['page_images'] ?? []; + + // Check if the page image value is a URL (GCS) + $image_value = $page_images[$page_num] ?? $page_images[strval($page_num)] ?? null; + if ($image_value && (strpos($image_value, 'http://') === 0 || strpos($image_value, 'https://') === 0)) { + // Redirect to GCS URL + header('HTTP/1.1 302 Found'); + header('Location: ' . $image_value); + header('Cache-Control: public, max-age=86400'); + exit; + } + } + + // Fallback: serve local image file $images_dir = RESULTS_DIR . '/' . $job_id . '.result_images'; - $image_file = $images_dir . '/page_' . intval($page_num) . '.png'; + $image_file = $images_dir . '/page_' . $page_num . 
'.png'; if (!file_exists($image_file)) { http_response_code(404); @@ -657,7 +899,6 @@ function handleStats() { 'completed' => 0, 'failed' => 0, 'processing' => 0, - 'queue_length' => 0 ]; // Count jobs from meta files @@ -675,14 +916,6 @@ function handleStats() { } } - // Get queue length from Redis - try { - $redis = getRedis(); - $stats['queue_length'] = $redis->lLen(REDIS_QUEUE); - } catch (Exception $e) { - // Redis unavailable - } - success($stats); } diff --git a/cleanup.py b/cleanup.py index 2a574bf..33a9037 100644 --- a/cleanup.py +++ b/cleanup.py @@ -2,8 +2,9 @@ """ PDF Accessibility Checker — File Cleanup -Deletes uploaded PDFs, result JSON files, result images, and error logs -older than RETENTION_HOURS (default 24h). +Deletes uploaded PDFs, result JSON files, error logs, and rate limit files +older than RETENTION_HOURS (default 24h). Page images are on GCS with +a 7-day lifecycle policy. Usage: python cleanup.py # dry-run (show what would be deleted) @@ -28,6 +29,7 @@ logger = logging.getLogger('cleanup') UPLOADS_DIR = Path(os.getenv('UPLOADS_DIR', '/opt/pdf-accessibility/uploads')) RESULTS_DIR = Path(os.getenv('RESULTS_DIR', '/opt/pdf-accessibility/results')) +RATE_LIMIT_DIR = Path(os.getenv('RATE_LIMIT_DIR', '/opt/pdf-accessibility/rate_limits')) RETENTION_HOURS = int(os.getenv('RETENTION_HOURS', '24')) @@ -109,8 +111,13 @@ def main(): total_deleted += d total_freed += f - # Clean results (JSON, error logs, image directories) - d, f = cleanup_directory(RESULTS_DIR, ['*.result.json', '*.error.log', '*.result_images'], dry_run) + # Clean results (JSON, error logs — page images are on GCS with 7-day lifecycle) + d, f = cleanup_directory(RESULTS_DIR, ['*.result.json', '*.error.log', '*.meta.json'], dry_run) + total_deleted += d + total_freed += f + + # Clean rate limit files + d, f = cleanup_directory(RATE_LIMIT_DIR, ['*.json'], dry_run) total_deleted += d total_freed += f diff --git a/cloudbuild.yaml b/cloudbuild.yaml new file mode 100644 index 
0000000..69a60ff --- /dev/null +++ b/cloudbuild.yaml @@ -0,0 +1,14 @@ +steps: + - name: 'gcr.io/cloud-builders/docker' + args: + - 'build' + - '-t' + - 'us-central1-docker.pkg.dev/optical-414516/pdf-accessibility/checker:latest' + - '-f' + - 'Dockerfile.cloudrun' + - '.' + +images: + - 'us-central1-docker.pkg.dev/optical-414516/pdf-accessibility/checker:latest' + +timeout: '600s' diff --git a/cloudrun_service.py b/cloudrun_service.py new file mode 100644 index 0000000..5b4f6f5 --- /dev/null +++ b/cloudrun_service.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +""" +PDF Accessibility Checker — Cloud Run HTTP Service + +Flask app wrapping EnterprisePDFChecker for serverless execution. +Receives PDF via multipart POST, runs checks, uploads page images to GCS, +returns full result JSON. +""" + +import os +import json +import tempfile +import logging +from pathlib import Path + +from flask import Flask, request, jsonify +from google.cloud import storage + +from enterprise_pdf_checker import EnterprisePDFChecker + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s [cloudrun] %(levelname)s: %(message)s' +) +logger = logging.getLogger('cloudrun') + +app = Flask(__name__) + +GCS_BUCKET_NAME = os.getenv('GCS_BUCKET_NAME', 'optical-pdf-images') + + +def upload_images_to_gcs(images_dir: Path, job_id: str) -> dict: + """Upload page images to GCS and return {page_num: public_url} mapping.""" + client = storage.Client() + bucket = client.bucket(GCS_BUCKET_NAME) + page_images = {} + + for image_file in sorted(images_dir.glob('page_*.png')): + # Extract page number from filename (page_1.png -> 1) + page_num = int(image_file.stem.split('_')[1]) + blob_name = f"{job_id}/{image_file.name}" + blob = bucket.blob(blob_name) + blob.upload_from_filename(str(image_file), content_type='image/png') + # Bucket has uniform bucket-level access with allUsers objectViewer, + # so objects are public by default — no need for blob.make_public() + public_url = 
f"https://storage.googleapis.com/{GCS_BUCKET_NAME}/{blob_name}" + page_images[page_num] = public_url + logger.info("Uploaded %s -> %s", image_file.name, public_url) + + return page_images + + +@app.route('/check', methods=['POST']) +def check_pdf(): + """Accept multipart PDF upload, run accessibility checks, return results.""" + pdf_file = request.files.get('pdf') + if not pdf_file: + return jsonify({'success': False, 'error': 'No PDF file provided'}), 400 + + job_id = request.form.get('job_id', 'unknown') + quick_mode = request.form.get('quick_mode', 'false').lower() in ('true', '1', 'yes') + original_filename = request.form.get('original_filename', pdf_file.filename or 'document.pdf') + + logger.info("Received job %s: %s (quick=%s)", job_id, original_filename, quick_mode) + + tmp_pdf = None + images_dir = None + + try: + # Save uploaded PDF to temp file + tmp_pdf = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) + pdf_file.save(tmp_pdf) + tmp_pdf.close() + + # Run accessibility checks + config = { + 'anthropic_api_key': os.getenv('ANTHROPIC_API_KEY'), + 'google_api_key': os.getenv('GOOGLE_API_KEY'), + } + + checker = EnterprisePDFChecker(tmp_pdf.name, config, quick_mode=quick_mode) + checker.check_all() + + # Generate page images to a temp directory + images_dir = tempfile.mkdtemp(prefix='pdf_images_') + images_path = Path(images_dir) + checker._generate_page_images(images_path) + + # Get results before uploading images (page_images has local filenames) + results = checker.to_dict() + + # Upload images to GCS and replace local filenames with public URLs + if checker.page_images: + gcs_urls = upload_images_to_gcs(images_path, job_id) + results['page_images'] = gcs_urls + + # Add grade based on score + score = results.get('accessibility_score', 0) + if score >= 90: + results['grade'] = 'A' + elif score >= 80: + results['grade'] = 'B' + elif score >= 70: + results['grade'] = 'C' + elif score >= 60: + results['grade'] = 'D' + else: + results['grade'] = 'F' + 
+ logger.info("Job %s completed: score=%s grade=%s issues=%d", + job_id, results['accessibility_score'], + results['grade'], results['total_issues']) + + return jsonify({'success': True, 'data': results}) + + except Exception as e: + logger.error("Job %s failed: %s", job_id, str(e), exc_info=True) + return jsonify({'success': False, 'error': str(e)}), 500 + + finally: + # Clean up temp files + if tmp_pdf and os.path.exists(tmp_pdf.name): + os.unlink(tmp_pdf.name) + if images_dir and os.path.exists(images_dir): + import shutil + shutil.rmtree(images_dir, ignore_errors=True) + + +@app.route('/health', methods=['GET']) +def health(): + return jsonify({'status': 'ok'}) + + +if __name__ == '__main__': + port = int(os.getenv('PORT', 8080)) + app.run(host='0.0.0.0', port=port, debug=False) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 3b6533c..72a8019 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -1,50 +1,9 @@ -# Production Docker Compose — worker + Redis + PostgreSQL +# Production Docker Compose — PostgreSQL only # Apache/Nginx on host serves PHP + frontend files natively -# Redis on 1220, PostgreSQL on 1221 to avoid host conflicts +# PDF processing handled by Cloud Run (no local worker) +# PostgreSQL on 1221 to avoid host conflicts services: - worker: - build: - context: . 
- dockerfile: Dockerfile.worker - volumes: - - ${WEB_DIR:-/opt/pdf-accessibility}/uploads:${WEB_DIR:-/opt/pdf-accessibility}/uploads - - ${WEB_DIR:-/opt/pdf-accessibility}/results:${WEB_DIR:-/opt/pdf-accessibility}/results - - ./logs:/app/logs - depends_on: - redis: - condition: service_healthy - postgres: - condition: service_healthy - environment: - - REDIS_HOST=redis - - REDIS_PORT=6379 - - DB_HOST=postgres - - DB_PORT=5432 - - DB_NAME=${DB_NAME:-pdf_checker} - - DB_USER=${DB_USER:-pdf_checker} - - DB_PASSWORD=${DB_PASSWORD:-dev_password} - - RESULTS_DIR=${WEB_DIR:-/opt/pdf-accessibility}/results - - UPLOADS_DIR=${WEB_DIR:-/opt/pdf-accessibility}/uploads - - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} - - GOOGLE_API_KEY=${GOOGLE_API_KEY:-} - deploy: - replicas: ${WORKER_COUNT:-2} - restart: unless-stopped - - redis: - image: redis:7-alpine - ports: - - "127.0.0.1:1220:6379" - volumes: - - redis-data:/data - healthcheck: - test: ["CMD", "redis-cli", "ping"] - interval: 10s - timeout: 3s - retries: 3 - restart: unless-stopped - postgres: image: postgres:16-alpine ports: @@ -64,5 +23,4 @@ services: restart: unless-stopped volumes: - redis-data: pg-data: diff --git a/docker-compose.yml b/docker-compose.yml index 5700171..35a6a50 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,42 +9,11 @@ services: - pdf-uploads:/app/uploads - pdf-results:/app/results depends_on: - redis: - condition: service_healthy postgres: condition: service_healthy env_file: .env restart: unless-stopped - worker: - build: - context: . 
- dockerfile: Dockerfile.worker - volumes: - - pdf-uploads:/app/uploads - - pdf-results:/app/results - - pdf-logs:/app/logs - depends_on: - redis: - condition: service_healthy - postgres: - condition: service_healthy - env_file: .env - deploy: - replicas: ${WORKER_COUNT:-2} - restart: unless-stopped - - redis: - image: redis:7-alpine - volumes: - - redis-data:/data - healthcheck: - test: ["CMD", "redis-cli", "ping"] - interval: 10s - timeout: 3s - retries: 3 - restart: unless-stopped - postgres: image: postgres:16-alpine volumes: @@ -64,6 +33,4 @@ services: volumes: pdf-uploads: pdf-results: - pdf-logs: - redis-data: pg-data: diff --git a/docker-entrypoint-web.sh b/docker-entrypoint-web.sh index f776ac2..20506a7 100644 --- a/docker-entrypoint-web.sh +++ b/docker-entrypoint-web.sh @@ -5,6 +5,9 @@ set -e # By default PHP-FPM clears the environment; this disables that behavior echo 'clear_env = no' >> /usr/local/etc/php-fpm.d/www.conf +# 15-minute timeout for Cloud Run PDF processing +echo 'request_terminate_timeout = 900' >> /usr/local/etc/php-fpm.d/www.conf + # Start PHP-FPM in background php-fpm -D diff --git a/js/page-viewer.js b/js/page-viewer.js index dba1092..7399367 100644 --- a/js/page-viewer.js +++ b/js/page-viewer.js @@ -47,7 +47,13 @@ function loadVisualPage(pageNum) { const img = document.getElementById('pageImage'); img.onload = () => drawMarkers(pageNum); - img.src = `api.php?action=image&job_id=${currentJobId}&page=${pageNum}`; + // Use GCS URL directly if available, otherwise fall back to api.php + const imageUrl = currentPageData.page_images[pageNum]; + if (imageUrl && (imageUrl.startsWith('http://') || imageUrl.startsWith('https://'))) { + img.src = imageUrl; + } else { + img.src = `api.php?action=image&job_id=${currentJobId}&page=${pageNum}`; + } } function drawMarkers(pageNum) { diff --git a/js/upload.js b/js/upload.js index 6e40b8c..c5c337a 100644 --- a/js/upload.js +++ b/js/upload.js @@ -78,13 +78,21 @@ async function beginCheck() { if 
(quickMode) addLog('Quick mode enabled — skipping expensive checks', 'info'); try { - updateProgress(30, 'Starting analysis...'); + updateProgress(30, 'Analyzing PDF (this may take a few minutes)...'); const result = await startCheck(currentJobId, quickMode); if (result.success) { - updateProgress(35, 'Analysis queued'); - addLog('Job queued for processing', 'success'); - pollJobStatus(); + if (result.data && result.data.status === 'completed') { + // Synchronous Cloud Run response — results are ready + updateProgress(98, 'Loading results...'); + addLog('Analysis complete!', 'success'); + loadResults(); + } else { + // Async/local mode fallback — poll for status + updateProgress(35, 'Analysis started'); + addLog('Job processing...', 'success'); + pollJobStatus(); + } } else { addLog('Check failed: ' + result.error, 'error'); alert('Check failed: ' + result.error); @@ -142,9 +150,9 @@ async function pollJobStatus() { if (data.error_log) addLog('Error: ' + data.error_log.substring(0, 500), 'error'); document.getElementById('progressContainer').style.display = 'none'; alert('Analysis failed. 
Check the error log for details.'); - } else if (pollCount > 150) { + } else if (pollCount > 450) { clearInterval(pollInterval); - addLog('Analysis timed out after 5 minutes', 'error'); + addLog('Analysis timed out after 15 minutes', 'error'); addLog('Try using Quick Mode for faster results', 'info'); document.getElementById('progressContainer').style.display = 'none'; } diff --git a/nginx.conf b/nginx.conf index 80aa4a0..2275361 100644 --- a/nginx.conf +++ b/nginx.conf @@ -17,6 +17,10 @@ server { fastcgi_index index.php; fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name; include fastcgi_params; + + # 15-minute timeout for Cloud Run PDF processing + fastcgi_read_timeout 900s; + fastcgi_send_timeout 900s; } # Serve page images from results diff --git a/requirements-cloudrun.txt b/requirements-cloudrun.txt new file mode 100644 index 0000000..3edea11 --- /dev/null +++ b/requirements-cloudrun.txt @@ -0,0 +1,33 @@ +# Cloud Run PDF Accessibility Checker - Python Dependencies + +# Core PDF processing +pypdf>=4.0.0 +pdfplumber>=0.11.0 + +# Image processing +Pillow>=10.0.0 +pdf2image>=1.16.0 + +# OCR +pytesseract>=0.3.10 + +# Scientific computing +numpy>=1.24.0 + +# NLP and readability +textblob>=0.17.1 + +# Google Cloud APIs +google-cloud-vision>=3.4.0 +google-cloud-documentai>=2.20.0 + +# Anthropic Claude API +anthropic>=0.18.0 + +# Additional utilities +python-dotenv>=1.0.0 + +# Cloud Run specific +flask>=3.0.0 +gunicorn>=21.2.0 +google-cloud-storage>=2.14.0