Replace the Redis queue + Python worker daemon with a synchronous HTTP call to a Cloud Run service, eliminating Redis and simplifying the infrastructure from 4 containers (web, worker, redis, postgres) to just web + postgres (with Cloud Run handling processing). - Add cloudrun_service.py: Flask app wrapping EnterprisePDFChecker with POST /check and GET /health endpoints, GCS image upload - Add Dockerfile.cloudrun + requirements-cloudrun.txt for Cloud Run image - Add cloudbuild.yaml for Cloud Build with custom Dockerfile - Rewrite api.php: remove all Redis code, add Cloud Run OIDC auth (getCloudRunToken), synchronous processing in handleCheck(), file-based rate limiting, GCS redirect in handleImage(), DB helper updateJobInDatabase() - Update js/upload.js: handle synchronous completed response from Cloud Run, increase poll timeout to 15 minutes - Update js/page-viewer.js: use GCS URLs directly for page images - Simplify docker-compose.yml and docker-compose.prod.yml: remove worker and redis services - Remove PHP Redis extension from Dockerfile.web - Set 900s timeouts across nginx, PHP-FPM, gunicorn, curl, and Cloud Run - Update cleanup.py: remove result_images pattern (now on GCS), add rate_limits cleanup - Update .env.example: replace Redis vars with Cloud Run/GCS config Cloud Run service deployed to: https://pdf-checker-bcb6ipdqka-uc.a.run.app GCS bucket: gs://optical-pdf-images (7-day lifecycle, public read) GCP project: optical-414516 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1146 lines
35 KiB
PHP
1146 lines
35 KiB
PHP
<?php
|
|
/**
|
|
* Enterprise PDF Accessibility Checker - API Backend
|
|
*
|
|
* Handles file uploads, sends PDFs to Cloud Run for processing,
|
|
* and serves results. No Redis dependency — uses Cloud Run for
|
|
* processing and file-based rate limiting.
|
|
*/
|
|
|
|
// Load KEY=VALUE pairs from a local .env file into the process environment.
// Needed when getenv() cannot see the values otherwise (Apache does not set
// env vars by default). A variable that already has a non-empty value in the
// real environment is never overridden - including the value "0", which a
// plain truthiness check would wrongly treat as "not set".
$envFile = __DIR__ . '/.env';
if (file_exists($envFile)) {
    $lines = file($envFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);

    // file() returns false when the file cannot be read; treat as no entries.
    foreach (($lines === false ? [] : $lines) as $line) {
        $line = trim($line);
        if ($line === '' || $line[0] === '#') continue;   // blank line or comment
        if (strpos($line, '=') === false) continue;       // not an assignment

        list($key, $val) = explode('=', $line, 2);
        $key = trim($key);
        $val = trim($val);
        if ($key === '') continue;                         // "=value" has no name

        // Only fill in variables that are missing or empty in the environment.
        $current = getenv($key);
        if ($current === false || $current === '') {
            putenv("$key=$val");
        }
    }
}
|
|
|
|
// Configuration: local storage locations and upload constraints
define('UPLOAD_DIR', __DIR__ . '/uploads');            // uploaded PDFs
define('RESULTS_DIR', __DIR__ . '/results');           // job metadata, results and logs
define('PYTHON_SCRIPT', __DIR__ . '/enterprise_pdf_checker.py'); // used by the local (non Cloud Run) fallback
define('MAX_FILE_SIZE', 50 * 1024 * 1024); // 50MB
define('ALLOWED_EXTENSIONS', ['pdf']);

// Cloud Run configuration
define('CLOUD_RUN_URL', getenv('CLOUD_RUN_URL') ?: ''); // empty string selects local mode
define('CLOUD_RUN_TIMEOUT', 900); // 15 minutes
define('GCP_SA_KEY_PATH', getenv('GCP_SA_KEY_PATH') ?: __DIR__ . '/pdf-api-invoker-key.json');
define('RATE_LIMIT_DIR', __DIR__ . '/rate_limits');     // one JSON file per IP+action

// Database configuration (each value falls back to a development default)
define('DB_HOST', getenv('DB_HOST') ?: 'localhost');
define('DB_PORT', intval(getenv('DB_PORT') ?: 5432));
define('DB_NAME', getenv('DB_NAME') ?: 'pdf_checker');
define('DB_USER', getenv('DB_USER') ?: 'pdf_checker');
define('DB_PASSWORD', getenv('DB_PASSWORD') ?: 'dev_password');

// Create working directories if they don't exist.
// Two concurrent requests can both see the directory as missing; the loser's
// mkdir() then fails even though the directory now exists. The warning is
// suppressed and the outcome is re-checked with is_dir() so that only a real
// failure is reported.
foreach ([UPLOAD_DIR, RESULTS_DIR, RATE_LIMIT_DIR] as $requiredDir) {
    if (is_dir($requiredDir)) {
        continue;
    }
    if (!@mkdir($requiredDir, 0755, true) && !is_dir($requiredDir)) {
        error_log("Failed to create directory: $requiredDir");
    }
}
unset($requiredDir);
|
|
|
|
/**
 * Check a sliding-window rate limit stored on the filesystem.
 *
 * One JSON file per client IP + action holds the timestamps of accepted
 * calls. A call is accepted when fewer than $limit timestamps fall within the
 * last $window seconds; accepted calls append the current time.
 *
 * The file is read and rewritten under an exclusive lock so that concurrent
 * requests from the same client cannot each read the old counter and slip
 * past the limit together.
 *
 * @param string $action Logical action name (part of the file name).
 * @param int    $limit  Maximum accepted calls per window.
 * @param int    $window Window length in seconds.
 * @return bool True if the call is allowed, false if the limit is reached.
 *              Fails open (returns true) when the counter file cannot be
 *              opened or locked, matching the previous best-effort behavior.
 */
function checkRateLimit($action, $limit, $window) {
    $ip = $_SERVER['REMOTE_ADDR'] ?? 'unknown';
    // Reduce the key to filename-safe characters (IPv6 colons, dots, etc.)
    $key = preg_replace('/[^a-zA-Z0-9_-]/', '_', $ip . '_' . $action);
    $file = RATE_LIMIT_DIR . '/' . $key . '.json';

    // 'c+' opens for read/write, creating the file without truncating it.
    $handle = fopen($file, 'c+');
    if ($handle === false) {
        error_log("Rate limit file could not be opened: $file");
        return true;
    }

    try {
        if (!flock($handle, LOCK_EX)) {
            error_log("Rate limit file could not be locked: $file");
            return true;
        }

        $now = time();
        $raw = stream_get_contents($handle);
        $data = json_decode($raw === false ? '' : $raw, true);

        // Keep only well-formed timestamps that are still inside the window.
        $timestamps = [];
        if (is_array($data)) {
            foreach ($data as $ts) {
                if (is_numeric($ts) && ($now - $ts) < $window) {
                    $timestamps[] = (int) $ts;
                }
            }
        }

        if (count($timestamps) >= $limit) {
            return false;
        }

        $timestamps[] = $now;

        // Rewrite the file in place while still holding the lock.
        ftruncate($handle, 0);
        rewind($handle);
        fwrite($handle, json_encode($timestamps));
        fflush($handle);

        return true;
    } finally {
        flock($handle, LOCK_UN);
        fclose($handle);
    }
}
|
|
|
|
/**
 * Validate a job ID before it is used to build file paths.
 *
 * Accepts only "pdf_" followed by one or more lowercase hex characters.
 * Anything else (including non-string input such as "job_id[]=x") is
 * reported through error().
 *
 * @param mixed $job_id Raw job ID from the request.
 * @return string The validated job ID.
 */
function sanitizeJobId($job_id) {
    // \z anchors at the real end of the input. The "$" anchor would also
    // match just before a trailing newline, letting "pdf_abc\n" through.
    if (!is_string($job_id) || !preg_match('/^pdf_[a-f0-9]+\z/', $job_id)) {
        // error() is defined outside this view; presumably it ends the request.
        error('Invalid job ID format');
    }
    return $job_id;
}
|
|
|
|
/**
 * Get an OIDC identity token for authenticating to Cloud Run.
 *
 * Reads the service account key at GCP_SA_KEY_PATH, builds a JWT whose
 * target_audience is CLOUD_RUN_URL, signs it with RSA-SHA256 and exchanges it
 * at Google's OAuth token endpoint for an id_token.
 *
 * The token is kept in static locals, so it is only reused by later calls
 * within the same PHP execution.
 *
 * @return string Identity token to send as "Authorization: Bearer ...".
 * @throws Exception If the key file is missing or invalid, signing fails,
 *                   or the token exchange does not return an id_token.
 */
function getCloudRunToken() {
    static $cachedToken = null;
    static $cachedExpiry = 0;

    // Return cached token if still valid (with 5-min buffer)
    if ($cachedToken && time() < ($cachedExpiry - 300)) {
        return $cachedToken;
    }

    $keyPath = GCP_SA_KEY_PATH;
    if (!file_exists($keyPath)) {
        throw new Exception("GCP service account key not found: $keyPath");
    }

    $sa = json_decode(file_get_contents($keyPath), true);
    if (!$sa || !isset($sa['client_email']) || !isset($sa['private_key'])) {
        throw new Exception("Invalid service account key file");
    }

    $now = time();
    $expiry = $now + 3600;

    // Build JWT header and claims
    $header = base64url_encode(json_encode(['alg' => 'RS256', 'typ' => 'JWT']));
    $claims = base64url_encode(json_encode([
        'iss' => $sa['client_email'],
        'sub' => $sa['client_email'],
        'aud' => 'https://oauth2.googleapis.com/token',
        'iat' => $now,
        'exp' => $expiry,
        'target_audience' => CLOUD_RUN_URL,
    ]));

    // Sign with RSA-SHA256
    $signingInput = "$header.$claims";
    $signature = '';
    $privateKey = openssl_pkey_get_private($sa['private_key']);
    if (!$privateKey) {
        throw new Exception("Failed to parse service account private key");
    }
    // openssl_sign() returns false on failure; without this check an empty
    // signature would be sent and only rejected remotely.
    if (!openssl_sign($signingInput, $signature, $privateKey, OPENSSL_ALGO_SHA256)) {
        throw new Exception("Failed to sign JWT with service account private key");
    }
    $jwt = $signingInput . '.' . base64url_encode($signature);

    // Exchange JWT for identity token
    $ch = curl_init('https://oauth2.googleapis.com/token');
    curl_setopt_array($ch, [
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => http_build_query([
            'grant_type' => 'urn:ietf:params:oauth:grant-type:jwt-bearer',
            'assertion' => $jwt,
        ]),
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => 10,
    ]);
    $response = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $curlError = curl_error($ch);
    curl_close($ch);

    // A transport failure (DNS, TLS, timeout) has no HTTP status; report the
    // curl error instead of a misleading "HTTP 0".
    if ($response === false) {
        throw new Exception("Failed to get identity token: $curlError");
    }

    if ($httpCode !== 200) {
        throw new Exception("Failed to get identity token: HTTP $httpCode - $response");
    }

    $tokenData = json_decode($response, true);
    if (!is_array($tokenData) || !isset($tokenData['id_token'])) {
        throw new Exception("No id_token in response: $response");
    }

    $cachedToken = $tokenData['id_token'];
    $cachedExpiry = $expiry;

    return $cachedToken;
}
|
|
|
|
/**
 * Encode binary data as base64url: the URL-safe alphabet ("-" and "_" in
 * place of "+" and "/") with the "=" padding removed.
 *
 * @param string $data Raw bytes to encode.
 * @return string The unpadded base64url text.
 */
function base64url_encode($data) {
    $standard = base64_encode($data);

    // "=" only ever appears as trailing padding in base64 output, so removing
    // every "=" is the same as trimming the padding.
    return str_replace(['+', '/', '='], ['-', '_', ''], $standard);
}
|
|
|
|
/**
 * Return the shared PostgreSQL PDO connection, opening it on first use.
 *
 * The connection is built from the DB_* constants and configured to throw
 * exceptions on errors.
 *
 * @return PDO
 */
function getDB() {
    static $connection = null;

    if ($connection !== null) {
        return $connection;
    }

    $dsn = 'pgsql:host=' . DB_HOST . ';port=' . (int) DB_PORT . ';dbname=' . DB_NAME;
    $options = [PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION];

    $connection = new PDO($dsn, DB_USER, DB_PASSWORD, $options);

    return $connection;
}
|
|
|
|
/**
 * Insert or update a job record in PostgreSQL.
 *
 * When $results is given, summary columns (score, grade, issue counts,
 * processing time) and the full result JSON are stored. On conflict the
 * existing row keeps any summary value the new call does not supply.
 * completed_at is set to NOW() whenever the status is 'completed'.
 *
 * Database failures are logged with error_log() and never propagate.
 *
 * @param string     $job_id   Job identifier (conflict key).
 * @param string     $filename Original filename to store.
 * @param string     $status   Job status, e.g. 'completed' or 'failed'.
 * @param array|null $results  Checker output, or null when none is available.
 * @return void
 */
function updateJobInDatabase($job_id, $filename, $status, $results = null) {
    try {
        $pdo = getDB();

        $score = null;
        $grade = null;
        $total_issues = null;
        $critical_count = null;
        $error_count = null;
        $warning_count = null;
        $result_json = null;
        $processing_time = null;

        if ($results) {
            $score = $results['accessibility_score'] ?? null;
            $grade = $results['grade'] ?? null;

            // A malformed payload could carry a non-array here; count() and
            // iteration would then raise a TypeError, which is not an
            // Exception and would escape the catch below.
            $issues = $results['issues'] ?? [];
            if (!is_array($issues)) {
                $issues = [];
            }
            $total_issues = count($issues);

            // Tally the three tracked severities in a single pass.
            $severity_counts = ['CRITICAL' => 0, 'ERROR' => 0, 'WARNING' => 0];
            foreach ($issues as $issue) {
                $severity = $issue['severity'] ?? '';
                if (is_string($severity) && isset($severity_counts[$severity])) {
                    $severity_counts[$severity]++;
                }
            }
            $critical_count = $severity_counts['CRITICAL'];
            $error_count = $severity_counts['ERROR'];
            $warning_count = $severity_counts['WARNING'];

            // json_encode() returns false on failure (e.g. invalid UTF-8).
            // Store NULL in that case so the summary columns are still saved
            // instead of the whole statement failing on the jsonb cast.
            $encoded = json_encode($results);
            if ($encoded === false) {
                error_log("Result JSON could not be encoded for $job_id: " . json_last_error_msg());
            } else {
                $result_json = $encoded;
            }

            $processing_time = $results['stats']['processing_time'] ?? null;
        }

        $sql = "INSERT INTO jobs (job_id, filename, status, score, grade, total_issues,
                    critical_count, error_count, warning_count, result_json, processing_time,
                    completed_at)
                VALUES (:job_id, :filename, :status, :score, :grade, :total_issues,
                    :critical_count, :error_count, :warning_count, :result_json::jsonb, :processing_time,
                    CASE WHEN :status2 = 'completed' THEN NOW() ELSE NULL END)
                ON CONFLICT (job_id) DO UPDATE SET
                    status = EXCLUDED.status,
                    score = COALESCE(EXCLUDED.score, jobs.score),
                    grade = COALESCE(EXCLUDED.grade, jobs.grade),
                    total_issues = COALESCE(EXCLUDED.total_issues, jobs.total_issues),
                    critical_count = COALESCE(EXCLUDED.critical_count, jobs.critical_count),
                    error_count = COALESCE(EXCLUDED.error_count, jobs.error_count),
                    warning_count = COALESCE(EXCLUDED.warning_count, jobs.warning_count),
                    result_json = COALESCE(EXCLUDED.result_json, jobs.result_json),
                    processing_time = COALESCE(EXCLUDED.processing_time, jobs.processing_time),
                    completed_at = CASE WHEN EXCLUDED.status = 'completed' THEN NOW() ELSE jobs.completed_at END";

        $stmt = $pdo->prepare($sql);
        $stmt->execute([
            ':job_id' => $job_id,
            ':filename' => $filename,
            ':status' => $status,
            ':score' => $score,
            ':grade' => $grade,
            ':total_issues' => $total_issues,
            ':critical_count' => $critical_count,
            ':error_count' => $error_count,
            ':warning_count' => $warning_count,
            ':result_json' => $result_json,
            ':processing_time' => $processing_time,
            // The status is bound a second time under a distinct name for the
            // CASE expression in the VALUES list.
            ':status2' => $status,
        ]);
    } catch (Exception $e) {
        error_log("DB update failed for $job_id: " . $e->getMessage());
    }
}
|
|
|
|
// CORS headers for API
$allowed_origins = [
    'https://ai-sandbox.oliver.solutions',
    'http://localhost:8888',
    'http://127.0.0.1:8888',
    'http://localhost:8000',
    'http://127.0.0.1:8000',
];
$origin = $_SERVER['HTTP_ORIGIN'] ?? '';

// The Allow-Origin value depends on the request's Origin header, so caches
// must not reuse one origin's response for another.
header('Vary: Origin');

// NOTE(review): auth.php is required further down in this file, so unless it
// was loaded earlier isDevelopmentMode() will not exist yet at this point and
// the development branch is skipped - confirm the intended load order.
if (in_array($origin, $allowed_origins, true) || (function_exists('isDevelopmentMode') && isDevelopmentMode())) {
    header('Access-Control-Allow-Origin: ' . ($origin ?: '*'));
} else if (!$origin) {
    // No Origin header: advertise the primary allowed origin.
    header('Access-Control-Allow-Origin: ' . ($allowed_origins[0]));
}
// A disallowed origin gets no Access-Control-Allow-Origin header at all.
// Answering with the literal "null" would match documents whose origin
// serializes as "null" (sandboxed iframes, file: URLs) and so grant them
// access.

header('Access-Control-Allow-Methods: POST, GET, OPTIONS, DELETE');
header('Access-Control-Allow-Headers: Content-Type, X-API-Key, Authorization');
header('Content-Type: application/json');

// Handle preflight: the headers above are the whole response.
if ($_SERVER['REQUEST_METHOD'] === 'OPTIONS') {
    exit(0);
}
|
|
|
|
// Every API request must pass authentication before any action runs.
// requireAuth() comes from auth.php (not shown here).
require_once __DIR__ . '/auth.php';
requireAuth();

// The action name may arrive in the query string or in the POST body;
// the query string wins when both are present.
$action = $_GET['action'] ?? $_POST['action'] ?? '';

// Action name => handler function.
$actionHandlers = [
    'upload'       => 'handleUpload',
    'check'        => 'handleCheck',
    'status'       => 'handleStatus',
    'result'       => 'handleResult',
    'list'         => 'handleList',
    'delete'       => 'handleDelete',
    'debug'        => 'handleDebug',
    'image'        => 'handleImage',
    'remediate'    => 'handleRemediate',
    'download'     => 'handleDownload',
    'stats'        => 'handleStats',
    'batch_upload' => 'handleBatchUpload',
    'batch_status' => 'handleBatchStatus',
    'export'       => 'handleExport',
];

// Non-string input (e.g. "action[]=x") never matches a known action.
if (is_string($action) && isset($actionHandlers[$action])) {
    $actionHandlers[$action]();
} else {
    error('Invalid action');
}
|
|
|
|
/**
 * Handle a single PDF upload (form field "pdf").
 *
 * Enforces 10 uploads per hour per IP, validates size, extension and the
 * "%PDF-" magic bytes, stores the file under a random job ID and writes the
 * job's metadata file. Responds with the job ID on success.
 *
 * Validation failures are reported through error(), which is defined outside
 * this view and is presumably terminal - the code after each call relies on
 * that, as the rest of this file does.
 *
 * @return void
 */
function handleUpload() {
    // Rate limit: 10 uploads/hour per IP
    if (!checkRateLimit('upload', 10, 3600)) {
        http_response_code(429);
        echo json_encode(['success' => false, 'error' => 'Upload rate limit exceeded. Try again later.']);
        exit;
    }

    if (!isset($_FILES['pdf'])) {
        error('No file uploaded');
    }

    $file = $_FILES['pdf'];

    // A field submitted as "pdf[]" makes every entry an array; the checks
    // below expect exactly one file.
    if (!is_array($file) || !isset($file['error']) || is_array($file['error'])) {
        error('Exactly one file must be uploaded in the "pdf" field');
    }

    // Validate file
    if ($file['error'] !== UPLOAD_ERR_OK) {
        error('Upload error: ' . $file['error']);
    }

    if ($file['size'] > MAX_FILE_SIZE) {
        error('File too large. Max size: ' . (MAX_FILE_SIZE / 1024 / 1024) . 'MB');
    }

    $ext = strtolower(pathinfo($file['name'], PATHINFO_EXTENSION));
    if (!in_array($ext, ALLOWED_EXTENSIONS, true)) {
        error('Invalid file type. Only PDF files allowed.');
    }

    // Validate PDF magic bytes
    $header = file_get_contents($file['tmp_name'], false, null, 0, 5);
    if ($header !== '%PDF-') {
        error('File is not a valid PDF (invalid file header)');
    }

    // Generate cryptographically secure job ID
    $job_id = 'pdf_' . bin2hex(random_bytes(16));
    $filename = $job_id . '.pdf';
    $filepath = UPLOAD_DIR . '/' . $filename;

    // Move file
    if (!move_uploaded_file($file['tmp_name'], $filepath)) {
        error('Failed to save file');
    }

    // Create job metadata
    $job_data = [
        'job_id' => $job_id,
        'original_filename' => $file['name'],
        'uploaded_at' => date('Y-m-d H:i:s'),
        'file_size' => $file['size'],
        'status' => 'uploaded',
        'filepath' => $filepath
    ];

    $meta_written = file_put_contents(
        RESULTS_DIR . '/' . $job_id . '.meta.json',
        json_encode($job_data, JSON_PRETTY_PRINT)
    );

    // Without its metadata file the job cannot be found by any other action,
    // so do not hand out the ID and do not leave the PDF orphaned.
    if ($meta_written === false) {
        if (is_file($filepath)) {
            unlink($filepath);
        }
        error('Failed to save job metadata');
    }

    success([
        'job_id' => $job_id,
        'filename' => $file['name'],
        'message' => 'File uploaded successfully'
    ]);
}
|
|
|
|
/**
 * Handle a PDF accessibility check for an uploaded job.
 *
 * With CLOUD_RUN_URL configured, the PDF is posted to "<CLOUD_RUN_URL>/check"
 * and this request blocks until the service answers; the result is written to
 * "<job>.result.json", the metadata file is updated and the job is recorded
 * in PostgreSQL. Any failure marks the job 'failed' in both places.
 *
 * Without CLOUD_RUN_URL, the local Python checker is started in the
 * background and the response reports 'processing'.
 *
 * Request fields (POST): job_id (required), quick_mode (optional boolean);
 * local mode additionally reads anthropic_key, google_key and
 * google_credentials.
 *
 * error() and success() are defined outside this view; error() is presumably
 * terminal, as the rest of this file assumes.
 *
 * @return void
 */
function handleCheck() {
    set_time_limit(900); // Allow up to 15 minutes

    $job_id = $_POST['job_id'] ?? '';

    if (empty($job_id)) {
        error('Job ID required');
    }
    $job_id = sanitizeJobId($job_id);

    // Rate limit: 30 checks/hour per IP
    if (!checkRateLimit('check', 30, 3600)) {
        http_response_code(429);
        echo json_encode(['success' => false, 'error' => 'Rate limit exceeded. Try again later.']);
        exit;
    }

    $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json';

    if (!file_exists($meta_file)) {
        error('Job not found');
    }

    $job_data = json_decode(file_get_contents($meta_file), true);
    if (!is_array($job_data)) {
        error('Job metadata is unreadable');
    }

    // Form values are strings: "false" and "0" must mean off. A plain
    // truthiness test would treat the string "false" as true.
    $quick_mode = filter_var($_POST['quick_mode'] ?? false, FILTER_VALIDATE_BOOLEAN);

    // Update meta to processing
    $job_data['status'] = 'processing';
    $job_data['started_at'] = date('Y-m-d H:i:s');
    file_put_contents($meta_file, json_encode($job_data, JSON_PRETTY_PRINT));

    $pdf_path = $job_data['filepath'] ?? '';

    // If Cloud Run URL is configured, send to Cloud Run
    if (!empty(CLOUD_RUN_URL)) {
        try {
            // Thrown rather than reported directly so the catch block marks
            // the job 'failed' instead of leaving it in 'processing'.
            if ($pdf_path === '' || !file_exists($pdf_path)) {
                throw new Exception('PDF file not found on server');
            }

            $token = getCloudRunToken();

            // Build multipart POST to Cloud Run
            $ch = curl_init(CLOUD_RUN_URL . '/check');
            $postFields = [
                'pdf' => new CURLFile($pdf_path, 'application/pdf', basename($pdf_path)),
                'job_id' => $job_id,
                'quick_mode' => $quick_mode ? 'true' : 'false',
                'original_filename' => $job_data['original_filename'] ?? '',
            ];

            curl_setopt_array($ch, [
                CURLOPT_POST => true,
                CURLOPT_POSTFIELDS => $postFields,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_TIMEOUT => CLOUD_RUN_TIMEOUT,
                CURLOPT_HTTPHEADER => [
                    'Authorization: Bearer ' . $token,
                ],
            ]);

            $response = curl_exec($ch);
            $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $curlError = curl_error($ch);
            curl_close($ch);

            if ($response === false || $curlError) {
                throw new Exception("Cloud Run request failed: $curlError");
            }

            if ($httpCode !== 200) {
                $errorBody = json_decode($response, true);
                $errorMsg = (is_array($errorBody) && isset($errorBody['error']) && is_string($errorBody['error']))
                    ? $errorBody['error']
                    : "HTTP $httpCode";
                throw new Exception("Cloud Run returned error: $errorMsg");
            }

            $result = json_decode($response, true);
            if (!is_array($result) || !isset($result['success'])) {
                throw new Exception("Invalid response from Cloud Run");
            }

            if (!$result['success']) {
                throw new Exception($result['error'] ?? 'Unknown Cloud Run error');
            }

            // A success response without a result payload must not be stored:
            // the mere existence of the result file marks a job as completed.
            if (!isset($result['data']) || !is_array($result['data'])) {
                throw new Exception('Cloud Run response contained no result data');
            }
            $checkResult = $result['data'];

            // Write result JSON to disk
            $result_file = RESULTS_DIR . '/' . $job_id . '.result.json';
            if (file_put_contents($result_file, json_encode($checkResult, JSON_PRETTY_PRINT)) === false) {
                throw new Exception('Failed to write result file');
            }

            // Update meta
            $job_data['status'] = 'completed';
            $job_data['completed_at'] = date('Y-m-d H:i:s');
            file_put_contents($meta_file, json_encode($job_data, JSON_PRETTY_PRINT));

            // Update PostgreSQL
            updateJobInDatabase($job_id, $job_data['original_filename'] ?? '', 'completed', $checkResult);

            success([
                'job_id' => $job_id,
                'status' => 'completed',
                'message' => 'Check completed'
            ]);

        } catch (Exception $e) {
            // Mark as failed
            $job_data['status'] = 'failed';
            $job_data['error'] = $e->getMessage();
            file_put_contents($meta_file, json_encode($job_data, JSON_PRETTY_PRINT));

            updateJobInDatabase($job_id, $job_data['original_filename'] ?? '', 'failed');

            error('Processing failed: ' . $e->getMessage());
        }
    } else {
        // Fallback to local exec (development without Cloud Run)
        $output_path = RESULTS_DIR . '/' . $job_id . '.result.json';
        $venv_python = __DIR__ . '/venv/bin/python3';
        $python_bin = file_exists($venv_python) ? $venv_python : 'python3';

        // Quote every part separately. escapeshellcmd() does not quote
        // spaces, so an install path containing one would split into
        // several arguments.
        $cmd = escapeshellarg($python_bin) . ' ' .
               escapeshellarg(PYTHON_SCRIPT) . ' ' .
               escapeshellarg($pdf_path) . ' ' .
               '--output ' . escapeshellarg($output_path);

        if ($quick_mode) {
            $cmd .= ' --quick';
        }

        $anthropic_key = $_POST['anthropic_key'] ?? getenv('ANTHROPIC_API_KEY');
        $google_key = $_POST['google_key'] ?? $_POST['google_credentials'] ?? getenv('GOOGLE_API_KEY');

        // NOTE(security): keys passed as command-line arguments are visible
        // in the process list of the host.
        if (is_string($anthropic_key) && $anthropic_key) {
            $cmd .= ' --anthropic-key ' . escapeshellarg($anthropic_key);
        }
        if (is_string($google_key) && $google_key) {
            // NOTE(security): this value can come from the request, so a
            // caller can probe for, and point the checker at, any file path
            // on the server. Acceptable only for the local development
            // fallback - review before enabling elsewhere.
            if (file_exists($google_key)) {
                $cmd .= ' --google-credentials ' . escapeshellarg($google_key);
            } else {
                $cmd .= ' --google-key ' . escapeshellarg($google_key);
            }
        }

        // Prepend common Homebrew / local bin directories to PATH for the child.
        $env_path = getenv('PATH');
        putenv("PATH=/opt/homebrew/bin:/usr/local/bin:{$env_path}");

        // Output goes to the per-job log; the trailing "&" runs the checker
        // in the background so this request returns immediately.
        $error_log = RESULTS_DIR . '/' . $job_id . '.error.log';
        $cmd .= ' > ' . escapeshellarg($error_log) . ' 2>&1 &';
        $output = [];
        exec($cmd, $output, $return_code);

        success([
            'job_id' => $job_id,
            'status' => 'processing',
            'message' => 'Check started (local mode)'
        ]);
    }
}
|
|
|
|
/**
 * Report the status of a job using only the files in RESULTS_DIR.
 *
 * Responds with the job's metadata, adjusted as follows:
 *  - if the result file exists, the status is 'completed' (completed_at
 *    defaults to the result file's modification time);
 *  - otherwise, if a non-empty error log exists, the job is still marked
 *    'processing' or 'queued' and it started more than 900 seconds ago, the
 *    status becomes 'failed' and the last 1000 bytes of the log are attached.
 *
 * @return void
 */
function handleStatus() {
    $job_id = $_GET['job_id'] ?? '';
    if (empty($job_id)) {
        error('Job ID required');
    }
    $job_id = sanitizeJobId($job_id);

    $path_prefix = RESULTS_DIR . '/' . $job_id;
    $meta_path = $path_prefix . '.meta.json';
    $result_path = $path_prefix . '.result.json';
    $log_path = $path_prefix . '.error.log';

    if (!file_exists($meta_path)) {
        error('Job not found');
    }

    $job = json_decode(file_get_contents($meta_path), true);

    if (file_exists($result_path)) {
        // The result file is the definitive completion signal.
        $job['status'] = 'completed';
        if (!isset($job['completed_at'])) {
            $job['completed_at'] = date('Y-m-d H:i:s', filemtime($result_path));
        }
    } elseif (file_exists($log_path) && in_array($job['status'], ['processing', 'queued'])) {
        $log_text = file_get_contents($log_path);

        // Log output alone does not mean failure; only a run that has also
        // exceeded the 900 second budget is declared failed.
        $overdue = !empty($log_text)
            && (time() - strtotime($job['started_at'] ?? 'now')) > 900;

        if ($overdue) {
            $job['status'] = 'failed';
            $job['error'] = 'Process timeout or error';
            $job['error_log'] = substr($log_text, -1000);
        }
    }

    success($job);
}
|
|
|
|
/**
 * Return the stored check result for a job.
 *
 * Reads "<job>.result.json" from RESULTS_DIR and sends its decoded content;
 * reports an error when the file does not exist yet.
 *
 * @return void
 */
function handleResult() {
    $job_id = $_GET['job_id'] ?? '';
    if (empty($job_id)) {
        error('Job ID required');
    }
    $job_id = sanitizeJobId($job_id);

    $result_path = sprintf('%s/%s.result.json', RESULTS_DIR, $job_id);
    if (!file_exists($result_path)) {
        error('Results not found. Check may still be processing.');
    }

    $decoded = json_decode(file_get_contents($result_path), true);
    success($decoded);
}
|
|
|
|
/**
 * List all jobs, newest upload first.
 *
 * Reads every "*.meta.json" file in RESULTS_DIR. A job whose result file
 * exists is reported as 'completed' regardless of the status stored in its
 * metadata. Metadata files that cannot be decoded are skipped.
 *
 * @return void
 */
function handleList() {
    $jobs = [];

    // glob() returns false on error; treat that as an empty listing.
    $files = glob(RESULTS_DIR . '/*.meta.json') ?: [];

    foreach ($files as $file) {
        $job_data = json_decode(file_get_contents($file), true);

        // A truncated or corrupt metadata file must not break the listing.
        if (!is_array($job_data)) {
            continue;
        }

        // Check if completed. Only the trailing suffix is swapped, so a
        // ".meta.json" elsewhere in the path is left alone.
        $result_file = substr($file, 0, -strlen('.meta.json')) . '.result.json';
        if (file_exists($result_file)) {
            $job_data['status'] = 'completed';
        }

        $jobs[] = $job_data;
    }

    // Sort by upload time (newest first). Entries with a missing or
    // unparseable upload time sort as the oldest.
    usort($jobs, function ($a, $b) {
        $time_a = strtotime($a['uploaded_at'] ?? '') ?: 0;
        $time_b = strtotime($b['uploaded_at'] ?? '') ?: 0;
        return $time_b <=> $time_a;
    });

    success(['jobs' => $jobs]);
}
|
|
|
|
/**
 * Delete a job and the files this API creates for it.
 *
 * Removes the uploaded PDF, the remediated PDF, the metadata and result
 * files, the checker and remediation logs, and the local page-image
 * directory. Deletion is best effort: a file that cannot be removed is
 * logged and the request still succeeds.
 *
 * @return void
 */
function handleDelete() {
    $job_id = $_POST['job_id'] ?? $_GET['job_id'] ?? '';

    if (empty($job_id)) {
        error('Job ID required');
    }
    $job_id = sanitizeJobId($job_id);

    $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json';

    if (!file_exists($meta_file)) {
        error('Job not found');
    }

    $job_data = json_decode(file_get_contents($meta_file), true);
    if (!is_array($job_data)) {
        $job_data = [];
    }

    // Every per-job file written by the handlers in this file.
    $paths = [
        $job_data['filepath'] ?? null,
        $job_data['remediated_pdf'] ?? null,
        UPLOAD_DIR . '/' . $job_id . '_remediated.pdf',
        RESULTS_DIR . '/' . $job_id . '.result.json',
        RESULTS_DIR . '/' . $job_id . '.error.log',
        RESULTS_DIR . '/' . $job_id . '.remediation.log',
        $meta_file,
    ];

    foreach ($paths as $path) {
        if (!is_string($path) || $path === '' || !is_file($path)) {
            continue;
        }
        if (!@unlink($path)) {
            error_log("Could not delete $path for job $job_id");
        }
    }

    // Locally rendered page images (used when images are not served from GCS).
    $images_dir = RESULTS_DIR . '/' . $job_id . '.result_images';
    if (is_dir($images_dir)) {
        foreach (glob($images_dir . '/*') ?: [] as $image) {
            if (is_file($image)) {
                @unlink($image);
            }
        }
        @rmdir($images_dir);
    }

    success(['message' => 'Job deleted']);
}
|
|
|
|
/**
 * Debug endpoint: report which files exist for a job, plus environment info.
 *
 * Only available when isDevelopmentMode() (from auth.php) returns true.
 * The response includes the job's metadata, the full error log, the result
 * file size, the configured Cloud Run URL and the version string printed by
 * the virtualenv Python interpreter.
 *
 * @return void
 */
function handleDebug() {
    // Debug endpoint only available in development mode
    require_once __DIR__ . '/auth.php';
    if (!isDevelopmentMode()) {
        error('Debug endpoint disabled in production');
    }

    $job_id = $_GET['job_id'] ?? '';

    if (empty($job_id)) {
        error('Job ID required');
    }
    $job_id = sanitizeJobId($job_id);

    $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json';
    $result_file = RESULTS_DIR . '/' . $job_id . '.result.json';
    $error_log = RESULTS_DIR . '/' . $job_id . '.error.log';

    $debug_info = [
        'job_id' => $job_id,
        'meta_exists' => file_exists($meta_file),
        'result_exists' => file_exists($result_file),
        'error_log_exists' => file_exists($error_log),
        'cloud_run_url' => CLOUD_RUN_URL ?: '(not configured — local mode)',
        'files' => []
    ];

    if (file_exists($meta_file)) {
        $debug_info['meta'] = json_decode(file_get_contents($meta_file), true);
    }

    if (file_exists($error_log)) {
        $debug_info['error_log'] = file_get_contents($error_log);
    }

    if (file_exists($result_file)) {
        $debug_info['result_size'] = filesize($result_file);
    }

    // Test Python. The interpreter path is quoted so an install directory
    // containing spaces or shell metacharacters is passed as one word.
    $venv_python = __DIR__ . '/venv/bin/python3';
    $python_version = [];
    exec(escapeshellarg($venv_python) . ' --version 2>&1', $python_version);
    $debug_info['python_version'] = implode("\n", $python_version);

    success($debug_info);
}
|
|
|
|
/**
 * Serve the rendered image of one page of a job.
 *
 * If the job's result JSON lists an http(s) URL for the page under
 * "page_images", the client is redirected there. Otherwise the PNG is read
 * from the job's local "<job>.result_images" directory, or a 404 JSON error
 * is returned when no such file exists.
 *
 * Query parameters: job_id, page.
 *
 * @return void
 */
function handleImage() {
    $job_id = $_GET['job_id'] ?? '';
    $page_num = $_GET['page'] ?? '';

    if (empty($job_id) || empty($page_num)) {
        error('Job ID and page number required');
    }
    $job_id = sanitizeJobId($job_id);
    $page_num = intval($page_num);

    // Look for a remote (GCS) URL recorded in the result JSON.
    $result_path = RESULTS_DIR . '/' . $job_id . '.result.json';
    if (file_exists($result_path)) {
        $decoded = json_decode(file_get_contents($result_path), true);
        $images = $decoded['page_images'] ?? [];

        // PHP stores integer-like string keys as integers, so a single
        // integer lookup also finds pages keyed "3" in the JSON.
        $remote = $images[$page_num] ?? null;

        if ($remote && preg_match('#^https?://#', $remote) === 1) {
            header('HTTP/1.1 302 Found');
            header('Location: ' . $remote);
            header('Cache-Control: public, max-age=86400');
            exit;
        }
    }

    // No remote URL: fall back to the locally stored PNG.
    $local_png = RESULTS_DIR . '/' . $job_id . '.result_images' . '/page_' . $page_num . '.png';

    if (file_exists($local_png)) {
        header('Content-Type: image/png');
        header('Cache-Control: public, max-age=86400'); // one day
        readfile($local_png);
        exit;
    }

    http_response_code(404);
    header('Content-Type: application/json');
    echo json_encode(['success' => false, 'error' => 'Image not found']);
    exit;
}
|
|
|
|
/**
 * Auto-remediate the accessibility issues of a checked PDF.
 *
 * Requires a job that has both a metadata and a result file, and whose result
 * reports a non-zero "auto_fixable_count". Runs pdf_remediation.py with
 * "--all" on the original PDF, writing "<job>_remediated.pdf" into
 * UPLOAD_DIR, and records the remediated file in the job's metadata.
 * The script runs synchronously; its output goes to "<job>.remediation.log".
 *
 * @return void
 */
function handleRemediate() {
    $job_id = $_POST['job_id'] ?? '';

    if (empty($job_id)) {
        error('Job ID required');
    }
    $job_id = sanitizeJobId($job_id);

    $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json';
    $result_file = RESULTS_DIR . '/' . $job_id . '.result.json';

    if (!file_exists($meta_file) || !file_exists($result_file)) {
        error('Job not found');
    }

    $job_data = json_decode(file_get_contents($meta_file), true);
    $result_data = json_decode(file_get_contents($result_file), true);
    if (!is_array($job_data) || !is_array($result_data)) {
        error('Job data is unreadable');
    }

    // Check if there are fixable issues
    if (!isset($result_data['auto_fixable_count']) || $result_data['auto_fixable_count'] == 0) {
        error('No auto-fixable issues found');
    }

    $original_pdf = $job_data['filepath'] ?? '';
    if ($original_pdf === '' || !file_exists($original_pdf)) {
        error('PDF file not found on server');
    }
    $remediated_pdf = UPLOAD_DIR . '/' . $job_id . '_remediated.pdf';

    // Use absolute venv path
    $venv_python = __DIR__ . '/venv/bin/python3';
    $python_bin = file_exists($venv_python) ? $venv_python : 'python3';
    $remediation_script = __DIR__ . '/pdf_remediation.py';

    // Build command - apply all safe fixes. Each part is quoted on its own:
    // escapeshellcmd() does not quote spaces, so an install path containing
    // one would split into several arguments.
    $cmd = escapeshellarg($python_bin) . ' ' .
           escapeshellarg($remediation_script) . ' ' .
           escapeshellarg($original_pdf) . ' ' .
           '--output ' . escapeshellarg($remediated_pdf) . ' ' .
           '--all';

    // Set PATH for poppler
    $env_path = getenv('PATH');
    $poppler_paths = '/opt/homebrew/bin:/usr/local/bin';
    putenv("PATH={$poppler_paths}:{$env_path}");

    // Run remediation
    $error_log = RESULTS_DIR . '/' . $job_id . '.remediation.log';
    $cmd .= ' > ' . escapeshellarg($error_log) . ' 2>&1';

    $output = [];
    exec($cmd, $output, $return_code);

    // Check if remediation succeeded
    if ($return_code !== 0 || !file_exists($remediated_pdf)) {
        $log_content = file_exists($error_log) ? file_get_contents($error_log) : 'Unknown error';
        error('Remediation failed: ' . substr($log_content, -500));
    }

    // Store remediated file info
    $job_data['remediated_pdf'] = $remediated_pdf;
    $job_data['remediated_at'] = date('Y-m-d H:i:s');
    file_put_contents($meta_file, json_encode($job_data, JSON_PRETTY_PRINT));

    success([
        'job_id' => $job_id,
        'remediated_pdf' => basename($remediated_pdf),
        'original_filename' => $job_data['original_filename'] ?? '',
        'fixes_applied' => $result_data['auto_fixable_count'],
        'download_url' => 'api.php?action=download&job_id=' . $job_id . '&type=remediated',
        'message' => 'PDF remediated successfully'
    ]);
}
|
|
|
|
/**
 * Download the original or the remediated PDF of a job.
 *
 * Query parameters: job_id, type ('original' by default, or 'remediated').
 * The remediated file is offered as "<original name>_fixed.pdf".
 *
 * The download name comes from the client-supplied upload filename, so it is
 * escaped before being placed in the Content-Disposition header.
 *
 * @return void
 */
function handleDownload() {
    $job_id = $_GET['job_id'] ?? '';
    $type = $_GET['type'] ?? 'original'; // 'original' or 'remediated'

    if (empty($job_id)) {
        error('Job ID required');
    }
    $job_id = sanitizeJobId($job_id);

    $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json';

    if (!file_exists($meta_file)) {
        error('Job not found');
    }

    $job_data = json_decode(file_get_contents($meta_file), true);
    if (!is_array($job_data)) {
        error('Job metadata is unreadable');
    }

    $original_name = $job_data['original_filename'] ?? '';
    if (!is_string($original_name)) {
        $original_name = '';
    }

    if ($type === 'remediated') {
        if (!isset($job_data['remediated_pdf']) || !file_exists($job_data['remediated_pdf'])) {
            error('Remediated PDF not found');
        }
        $file_path = $job_data['remediated_pdf'];
        $filename = pathinfo($original_name, PATHINFO_FILENAME) . '_fixed.pdf';
    } else {
        $file_path = $job_data['filepath'] ?? '';
        $filename = $original_name;

        // filesize() and readfile() below would otherwise emit warnings into
        // a response already labelled as a PDF.
        if (!is_string($file_path) || $file_path === '' || !is_file($file_path)) {
            error('PDF file not found on server');
        }
    }

    // Build a quoted-string-safe ASCII name: anything outside printable
    // ASCII, plus double quote, backslash and slash, becomes "_". A raw
    // double quote would end the quoted value early and corrupt the header.
    $ascii_name = preg_replace('#[^\x20-\x7E]|["\\\\/]#', '_', $filename);
    if ($ascii_name === null || $ascii_name === '') {
        $ascii_name = $job_id . '.pdf';
    }
    $disposition = 'attachment; filename="' . $ascii_name . '"';

    // When characters had to be replaced and the name is valid UTF-8, also
    // send the exact name in the extended "filename*" form (RFC 6266).
    if ($ascii_name !== $filename && preg_match('//u', $filename) === 1) {
        $disposition .= "; filename*=UTF-8''" . rawurlencode($filename);
    }

    // Serve the file
    header('Content-Type: application/pdf');
    header('Content-Disposition: ' . $disposition);
    header('Content-Length: ' . filesize($file_path));
    readfile($file_path);
    exit;
}
|
|
|
|
/**
 * Report aggregate job counts derived from the metadata files.
 *
 * Each "*.meta.json" in RESULTS_DIR counts as one job and falls into exactly
 * one bucket: 'completed' when its result file exists, 'failed' when its
 * stored status is 'failed', otherwise 'processing'.
 *
 * @return void
 */
function handleStats() {
    $stats = array_fill_keys(['total_jobs', 'completed', 'failed', 'processing'], 0);

    foreach (glob(RESULTS_DIR . '/*.meta.json') as $meta_path) {
        $meta = json_decode(file_get_contents($meta_path), true);
        $stats['total_jobs']++;

        // The presence of a result file outranks whatever status is stored.
        $result_path = str_replace('.meta.json', '.result.json', $meta_path);
        if (file_exists($result_path)) {
            $bucket = 'completed';
        } elseif (($meta['status'] ?? '') === 'failed') {
            $bucket = 'failed';
        } else {
            $bucket = 'processing';
        }

        $stats[$bucket]++;
    }

    success($stats);
}
|
|
|
|
/**
 * Handle a batch upload of several PDFs (form field "pdfs[]").
 *
 * The whole batch counts as one call against the 'upload' rate limit
 * (10 per hour per IP). Each file is validated on its own; a rejected file
 * is reported in "errors" and the rest are still processed. Every accepted
 * file becomes its own job tagged with the batch ID, and a
 * "<batch>.batch.json" manifest listing the accepted job IDs is written.
 *
 * @return void
 */
function handleBatchUpload() {
    if (!checkRateLimit('upload', 10, 3600)) {
        http_response_code(429);
        echo json_encode(['success' => false, 'error' => 'Upload rate limit exceeded.']);
        exit;
    }

    if (!isset($_FILES['pdfs']) || !is_array($_FILES['pdfs']['name'])) {
        error('No files uploaded. Use "pdfs[]" as the file field name.');
    }

    // Returns the rejection reason for one file, or null when it is acceptable.
    // Checks run in a fixed order and stop at the first failure.
    $findProblem = function ($name, $tmp, $size, $err) {
        if ($err !== UPLOAD_ERR_OK) {
            return "Upload error code: $err";
        }
        if ($size > MAX_FILE_SIZE) {
            return 'File too large';
        }
        if (!in_array(strtolower(pathinfo($name, PATHINFO_EXTENSION)), ALLOWED_EXTENSIONS)) {
            return 'Not a PDF file';
        }
        if (file_get_contents($tmp, false, null, 0, 5) !== '%PDF-') {
            return 'Invalid PDF header';
        }
        return null;
    };

    $batch_id = 'batch_' . bin2hex(random_bytes(8));
    $upload = $_FILES['pdfs'];
    $file_count = count($upload['name']);
    $uploaded = [];
    $errors = [];

    for ($i = 0; $i < $file_count; $i++) {
        $name = $upload['name'][$i];
        $tmp = $upload['tmp_name'][$i];
        $size = $upload['size'][$i];
        $err = $upload['error'][$i];

        $problem = $findProblem($name, $tmp, $size, $err);

        if ($problem === null) {
            // Valid file: store it under a fresh random job ID.
            $job_id = 'pdf_' . bin2hex(random_bytes(16));
            $filepath = UPLOAD_DIR . '/' . $job_id . '.pdf';

            if (!move_uploaded_file($tmp, $filepath)) {
                $problem = 'Failed to save';
            }
        }

        if ($problem !== null) {
            $errors[] = ['filename' => $name, 'error' => $problem];
            continue;
        }

        $meta = [
            'job_id' => $job_id,
            'batch_id' => $batch_id,
            'original_filename' => $name,
            'uploaded_at' => date('Y-m-d H:i:s'),
            'file_size' => $size,
            'status' => 'uploaded',
            'filepath' => $filepath
        ];
        file_put_contents(
            RESULTS_DIR . '/' . $job_id . '.meta.json',
            json_encode($meta, JSON_PRETTY_PRINT)
        );

        $uploaded[] = ['job_id' => $job_id, 'filename' => $name];
    }

    // Save batch manifest (written even when no file was accepted).
    $manifest = [
        'batch_id' => $batch_id,
        'created_at' => date('Y-m-d H:i:s'),
        'total_files' => $file_count,
        'jobs' => array_column($uploaded, 'job_id'),
    ];
    file_put_contents(
        RESULTS_DIR . '/' . $batch_id . '.batch.json',
        json_encode($manifest, JSON_PRETTY_PRINT)
    );

    success([
        'batch_id' => $batch_id,
        'uploaded' => $uploaded,
        'errors' => $errors,
        'message' => count($uploaded) . ' of ' . $file_count . ' files uploaded'
    ]);
}
|
|
|
|
/**
 * Report the status of a batch and of every job recorded in its manifest.
 *
 * Reads `batch_id` from the query string, loads the manifest
 * `<batch_id>.batch.json` from RESULTS_DIR and, for each job ID listed
 * under its `jobs` key, inspects the job's `.meta.json` and `.result.json`
 * files in the same directory.
 *
 * Per-job status:
 *   - "completed" as soon as the result file exists (this overrides
 *     whatever the meta file says);
 *   - otherwise the `status` stored in the meta file, defaulting to
 *     "uploaded" when the meta file has no status;
 *   - "unknown" when neither file exists.
 *
 * Overall status:
 *   - "completed"  when every job has a result file;
 *   - "finished"   when every job is either completed or failed;
 *   - "processing" otherwise.
 *
 * The response is sent through success() / error(), both of which end
 * the request.
 */
function handleBatchStatus() {
    $batch_id = $_GET['batch_id'] ?? '';

    // A query such as ?batch_id[]=x produces an array, which preg_match()
    // rejects with a TypeError on PHP 8, so the type is checked first.
    // `\z` anchors at the true end of the string; `$` would also accept
    // an ID followed by a trailing newline.
    if (!is_string($batch_id) || !preg_match('/^batch_[a-f0-9]+\z/', $batch_id)) {
        error('Invalid batch ID');
    }

    $batch_file = RESULTS_DIR . '/' . $batch_id . '.batch.json';
    if (!file_exists($batch_file)) {
        error('Batch not found');
    }

    // Guard against an unreadable or corrupt manifest: json_decode()
    // yields null in that case and the loop below needs a job list.
    $raw_manifest = file_get_contents($batch_file);
    $batch = ($raw_manifest === false) ? null : json_decode($raw_manifest, true);
    if (!is_array($batch) || !isset($batch['jobs']) || !is_array($batch['jobs'])) {
        error('Batch manifest is unreadable');
    }

    $jobs = [];
    $completed = 0;
    $failed = 0;

    foreach ($batch['jobs'] as $job_id) {
        $meta_file = RESULTS_DIR . '/' . $job_id . '.meta.json';
        $result_file = RESULTS_DIR . '/' . $job_id . '.result.json';

        $status = 'unknown';
        $score = null;
        $filename = '';

        if (file_exists($meta_file)) {
            $meta = json_decode(file_get_contents($meta_file), true);
            $status = $meta['status'] ?? 'uploaded';
            $filename = $meta['original_filename'] ?? '';
        }

        if (file_exists($result_file)) {
            // The presence of a result file is what marks a job as done.
            $status = 'completed';
            $result = json_decode(file_get_contents($result_file), true);
            $score = $result['accessibility_score'] ?? null;
            $completed++;
        } else if ($status === 'failed') {
            $failed++;
        }

        $jobs[] = [
            'job_id' => $job_id,
            'filename' => $filename,
            'status' => $status,
            'score' => $score
        ];
    }

    $total = count($batch['jobs']);
    if ($completed === $total) {
        $overall_status = 'completed';
    } else if ($completed + $failed === $total) {
        $overall_status = 'finished';
    } else {
        $overall_status = 'processing';
    }

    success([
        'batch_id' => $batch_id,
        'status' => $overall_status,
        'total' => $total,
        'completed' => $completed,
        'failed' => $failed,
        'jobs' => $jobs
    ]);
}
|
|
|
|
/**
 * Send a job's results to the client as a downloadable attachment.
 *
 * Query parameters:
 *   - job_id  required; passed through sanitizeJobId() before it is used
 *             in any path or header.
 *   - format  "html" produces a rendered report; any other value
 *             (default "json") produces the pretty-printed result JSON.
 *
 * The HTML report is generated by running report_generator.py (located
 * next to this file) with `--input <result file> --output <html file>`.
 * The interpreter is venv/bin/python3 next to this file when it exists,
 * otherwise `python3` as resolved by the shell.
 *
 * Failures are reported through error(), which ends the request; the
 * success paths write the attachment and exit.
 */
function handleExport() {
    $job_id = $_GET['job_id'] ?? '';
    $format = $_GET['format'] ?? 'json';

    // Reject array-valued input (?job_id[]=x) as well as a missing value,
    // so that only a string ever reaches sanitizeJobId().
    if (!is_string($job_id) || empty($job_id)) {
        error('Job ID required');
    }
    $job_id = sanitizeJobId($job_id);

    $result_file = RESULTS_DIR . '/' . $job_id . '.result.json';
    if (!file_exists($result_file)) {
        error('Results not found');
    }

    if ($format === 'html') {
        // Generate HTML report via Python
        $venv_python = __DIR__ . '/venv/bin/python3';
        $python_bin = file_exists($venv_python) ? $venv_python : 'python3';
        $report_script = __DIR__ . '/report_generator.py';

        $html_file = RESULTS_DIR . '/' . $job_id . '.report.html';

        // Quote every component separately: escapeshellcmd() on the joined
        // string does not protect spaces, so an install path containing a
        // space would be split into several arguments.
        $cmd = escapeshellarg($python_bin) . ' ' . escapeshellarg($report_script) .
            ' --input ' . escapeshellarg($result_file) .
            ' --output ' . escapeshellarg($html_file);

        exec($cmd . ' 2>&1', $output, $return_code);

        if ($return_code !== 0 || !file_exists($html_file)) {
            error('Report generation failed');
        }

        header('Content-Type: text/html; charset=utf-8');
        header('Content-Disposition: attachment; filename="accessibility_report_' . $job_id . '.html"');
        readfile($html_file);
        exit;
    }

    // Default: JSON download. The file is decoded and re-encoded so the
    // download is pretty-printed; a file that cannot be read or decoded is
    // reported instead of being exported as the literal "null".
    $raw_result = file_get_contents($result_file);
    $result = ($raw_result === false) ? null : json_decode($raw_result, true);
    if ($result === null) {
        error('Results could not be read');
    }

    header('Content-Type: application/json');
    header('Content-Disposition: attachment; filename="accessibility_report_' . $job_id . '.json"');
    echo json_encode($result, JSON_PRETTY_PRINT);
    exit;
}
|
|
|
|
/**
 * Send a JSON success envelope and end the request.
 *
 * Writes `{"success": true, "data": <data>}` to the response body and
 * exits. If the payload cannot be encoded, a 500 response carrying a JSON
 * error envelope is sent instead of an empty body.
 *
 * @param mixed $data Value placed under the "data" key of the response.
 */
function success($data) {
    $body = json_encode([
        'success' => true,
        'data' => $data
    ]);

    if ($body === false) {
        // json_encode() returns false when the data cannot be represented
        // (for example a string that is not valid UTF-8). Echoing that
        // would give the client an empty 200 response, so report the
        // failure explicitly.
        http_response_code(500);
        $body = json_encode([
            'success' => false,
            'error' => 'Failed to encode response'
        ]);
    }

    echo $body;
    exit;
}
|
|
|
|
/**
 * Emit a JSON error envelope with HTTP status 400 and stop the script.
 *
 * The response body is `{"success": false, "error": <message>}`.
 *
 * @param mixed $message Value placed under the "error" key of the response.
 */
function error($message) {
    $payload = ['success' => false, 'error' => $message];

    http_response_code(400);
    echo json_encode($payload);
    exit;
}
|