pdf-accessibility/cloudrun_service.py
michael 4080638856 Migrate PDF processing from Redis worker to Google Cloud Run
Replace the Redis queue + Python worker daemon with a synchronous HTTP
call to a Cloud Run service, eliminating Redis and simplifying the
infrastructure from 4 containers (web, worker, redis, postgres) to just
web + postgres (with Cloud Run handling processing).

- Add cloudrun_service.py: Flask app wrapping EnterprisePDFChecker with
  POST /check and GET /health endpoints, GCS image upload
- Add Dockerfile.cloudrun + requirements-cloudrun.txt for Cloud Run image
- Add cloudbuild.yaml for Cloud Build with custom Dockerfile
- Rewrite api.php: remove all Redis code, add Cloud Run OIDC auth
  (getCloudRunToken), synchronous processing in handleCheck(), file-based
  rate limiting, GCS redirect in handleImage(), DB helper updateJobInDatabase()
- Update js/upload.js: handle synchronous completed response from Cloud Run,
  increase poll timeout to 15 minutes
- Update js/page-viewer.js: use GCS URLs directly for page images
- Simplify docker-compose.yml and docker-compose.prod.yml: remove worker
  and redis services
- Remove PHP Redis extension from Dockerfile.web
- Set 900s timeouts across nginx, PHP-FPM, gunicorn, curl, and Cloud Run
- Update cleanup.py: remove result_images pattern (now on GCS), add
  rate_limits cleanup
- Update .env.example: replace Redis vars with Cloud Run/GCS config

Cloud Run service deployed to:
  https://pdf-checker-bcb6ipdqka-uc.a.run.app
GCS bucket: gs://optical-pdf-images (7-day lifecycle, public read)
GCP project: optical-414516

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 14:50:38 -06:00

136 lines
4.5 KiB
Python

#!/usr/bin/env python3
"""
PDF Accessibility Checker — Cloud Run HTTP Service
Flask app wrapping EnterprisePDFChecker for serverless execution.
Receives PDF via multipart POST, runs checks, uploads page images to GCS,
returns full result JSON.
"""
import os
import json
import tempfile
import logging
from pathlib import Path
from flask import Flask, request, jsonify
from google.cloud import storage
from enterprise_pdf_checker import EnterprisePDFChecker
# Process-wide logging: INFO and above, each record tagged "[cloudrun]".
logging.basicConfig(
    format='%(asctime)s [cloudrun] %(levelname)s: %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('cloudrun')

# The Flask application object that the route decorators below attach to.
app = Flask(__name__)

# Destination bucket for page images; overridable via the environment.
GCS_BUCKET_NAME = os.environ.get('GCS_BUCKET_NAME', 'optical-pdf-images')
def upload_images_to_gcs(images_dir: Path, job_id: str) -> dict:
    """Upload page images to GCS and return a {page_num: public_url} mapping.

    Scans ``images_dir`` for files matching ``page_*.png``, uploads each one
    to the bucket named by ``GCS_BUCKET_NAME`` as ``<job_id>/<filename>``,
    and records its ``https://storage.googleapis.com/...`` URL.

    Args:
        images_dir: Directory that holds the ``page_<N>.png`` files.
        job_id: Prefix under which this job's objects are stored.
            NOTE(review): the value is used verbatim in the object name and
            URL; callers are presumably expected to pass a safe identifier.

    Returns:
        A dict keyed by integer page number with the public URL as value.
        Empty when the directory contains no usable page image, in which
        case no GCS client is created at all.

    Raises:
        Exceptions raised by the google-cloud-storage client are not caught
        here and propagate to the caller.
    """
    # Resolve page numbers first so that a stray file which matches the glob
    # but carries no numeric index (e.g. ``page_cover.png``) is skipped
    # instead of aborting the whole job with ValueError.
    numbered_files = []
    for image_file in images_dir.glob('page_*.png'):
        try:
            # page_1.png -> 1
            page_num = int(image_file.stem.split('_')[1])
        except ValueError:
            logger.warning(
                "Skipping image with non-numeric page index: %s",
                image_file.name,
            )
            continue
        numbered_files.append((page_num, image_file))

    # Nothing to upload: avoid building a client (and needing credentials).
    if not numbered_files:
        return {}

    # Upload in numeric page order (page_2 before page_10); ties fall back
    # to the filename so the result stays deterministic.
    numbered_files.sort(key=lambda item: (item[0], item[1].name))

    client = storage.Client()
    bucket = client.bucket(GCS_BUCKET_NAME)

    page_images = {}
    for page_num, image_file in numbered_files:
        blob_name = f"{job_id}/{image_file.name}"
        blob = bucket.blob(blob_name)
        blob.upload_from_filename(str(image_file), content_type='image/png')
        # Bucket has uniform bucket-level access with allUsers objectViewer,
        # so objects are public by default — no need for blob.make_public()
        public_url = f"https://storage.googleapis.com/{GCS_BUCKET_NAME}/{blob_name}"
        page_images[page_num] = public_url
        logger.info("Uploaded %s -> %s", image_file.name, public_url)
    return page_images
def _grade_for_score(score):
    """Map a numeric accessibility score to a letter grade ('A'..'F')."""
    for threshold, grade in ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D')):
        if score >= threshold:
            return grade
    return 'F'


@app.route('/check', methods=['POST'])
def check_pdf():
    """Accept a multipart PDF upload, run accessibility checks, return results.

    Form fields read from the request:
        pdf: The uploaded PDF file (required).
        job_id: Identifier used in logs and as the GCS object prefix;
            defaults to ``'unknown'``.
        quick_mode: ``'true'``, ``'1'`` or ``'yes'`` (case-insensitive)
            enables quick mode; anything else disables it.
        original_filename: Name used for logging; falls back to the
            uploaded file's name, then ``'document.pdf'``.

    Returns:
        * 400 with ``{'success': False, 'error': ...}`` when no PDF is sent.
        * 200 with ``{'success': True, 'data': results}`` on success, where
          ``results`` is the checker's ``to_dict()`` output plus ``grade``
          and, when page images exist, ``page_images`` replaced by GCS URLs.
        * 500 with ``{'success': False, 'error': str(exc)}`` on any failure.
    """
    # Local import: shutil is only needed for temp-directory cleanup.
    import shutil

    pdf_file = request.files.get('pdf')
    if not pdf_file:
        return jsonify({'success': False, 'error': 'No PDF file provided'}), 400

    job_id = request.form.get('job_id', 'unknown')
    quick_mode = request.form.get('quick_mode', 'false').lower() in ('true', '1', 'yes')
    original_filename = request.form.get('original_filename', pdf_file.filename or 'document.pdf')
    logger.info("Received job %s: %s (quick=%s)", job_id, original_filename, quick_mode)

    tmp_pdf_path = None
    images_dir = None
    try:
        # Save the upload to a temp file. delete=False keeps it on disk after
        # the handle closes so the checker can open it by path; the ``with``
        # guarantees the handle is closed even if save() fails.
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_pdf:
            tmp_pdf_path = tmp_pdf.name
            pdf_file.save(tmp_pdf)

        # Run accessibility checks
        config = {
            'anthropic_api_key': os.getenv('ANTHROPIC_API_KEY'),
            'google_api_key': os.getenv('GOOGLE_API_KEY'),
        }
        checker = EnterprisePDFChecker(tmp_pdf_path, config, quick_mode=quick_mode)
        checker.check_all()

        # Generate page images to a temp directory
        images_dir = tempfile.mkdtemp(prefix='pdf_images_')
        images_path = Path(images_dir)
        checker._generate_page_images(images_path)

        # Get results before uploading images (page_images has local filenames)
        results = checker.to_dict()

        # Upload images to GCS and replace local filenames with public URLs
        if checker.page_images:
            results['page_images'] = upload_images_to_gcs(images_path, job_id)

        # Add grade based on score
        score = results.get('accessibility_score', 0)
        results['grade'] = _grade_for_score(score)

        # Read optional keys with .get so a missing field in the log line
        # cannot turn a completed check into a 500.
        logger.info("Job %s completed: score=%s grade=%s issues=%s",
                    job_id, score, results['grade'], results.get('total_issues'))
        return jsonify({'success': True, 'data': results})
    except Exception as e:
        # Top-level boundary: log with traceback and report the failure.
        logger.error("Job %s failed: %s", job_id, str(e), exc_info=True)
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        # Best-effort cleanup: a failure here must not replace the response
        # that was already computed above.
        if tmp_pdf_path:
            try:
                os.unlink(tmp_pdf_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_err:
                logger.warning("Could not remove temp PDF %s: %s",
                               tmp_pdf_path, cleanup_err)
        if images_dir:
            shutil.rmtree(images_dir, ignore_errors=True)
@app.route('/health', methods=['GET'])
def health():
    """Health endpoint: respond with a fixed JSON ``{'status': 'ok'}`` body."""
    payload = {'status': 'ok'}
    return jsonify(payload)
if __name__ == '__main__':
    # Direct execution: serve on all interfaces, on $PORT (8080 if unset),
    # with Flask's debug mode off.
    port = int(os.environ.get('PORT', 8080))
    app.run(debug=False, port=port, host='0.0.0.0')