PDF-accessibility-saas/cloudrun_service.py

136 lines
4.5 KiB
Python

#!/usr/bin/env python3
"""
PDF Accessibility Checker — Cloud Run HTTP Service
Flask app wrapping EnterprisePDFChecker for serverless execution.
Receives PDF via multipart POST, runs checks, uploads page images to GCS,
returns full result JSON.
"""
import os
import json
import tempfile
import logging
from pathlib import Path
from flask import Flask, request, jsonify
from google.cloud import storage
from enterprise_pdf_checker import EnterprisePDFChecker
# Process-wide logging: INFO and above, every record tagged "[cloudrun]".
logging.basicConfig(
    format='%(asctime)s [cloudrun] %(levelname)s: %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('cloudrun')

# Flask application object that the route decorators in this module attach to.
app = Flask(__name__)

# Bucket that receives rendered page images; overridable via the environment.
GCS_BUCKET_NAME = os.environ.get('GCS_BUCKET_NAME', 'optical-pdf-images')
def upload_images_to_gcs(images_dir: Path, job_id: str) -> dict:
    """Upload page images to GCS and return a {page_num: public_url} mapping.

    Args:
        images_dir: Directory searched (non-recursively) for ``page_*.png``.
        job_id: Prefix for the object names; every image is stored as
            ``<job_id>/<filename>`` in ``GCS_BUCKET_NAME``.

    Returns:
        Dict keyed by the integer page number parsed from the filename, with
        the object's public URL as value. Files that match ``page_*.png`` but
        carry no integer page number are skipped with a warning.
    """
    # Local import: urllib is not among this module's top-level imports.
    from urllib.parse import quote

    client = storage.Client()
    bucket = client.bucket(GCS_BUCKET_NAME)
    page_images = {}
    for image_file in sorted(images_dir.glob('page_*.png')):
        # Extract page number from filename (page_1.png -> 1). The glob
        # guarantees the "page_" prefix, so index 1 always exists; only the
        # integer conversion can fail.
        try:
            page_num = int(image_file.stem.split('_')[1])
        except ValueError:
            # One stray file must not abort the upload of the real pages.
            logger.warning(
                "Skipping %s: no page number in filename", image_file.name
            )
            continue

        blob_name = f"{job_id}/{image_file.name}"
        blob = bucket.blob(blob_name)
        blob.upload_from_filename(str(image_file), content_type='image/png')
        # Bucket has uniform bucket-level access with allUsers objectViewer,
        # so objects are public by default — no need for blob.make_public()
        # job_id is caller-supplied, so percent-encode the object name to keep
        # the URL valid ('/' stays literal as the path separator).
        public_url = (
            f"https://storage.googleapis.com/{GCS_BUCKET_NAME}/{quote(blob_name)}"
        )
        page_images[page_num] = public_url
        logger.info("Uploaded %s -> %s", image_file.name, public_url)
    return page_images
@app.route('/check', methods=['POST'])
def check_pdf():
    """Accept a multipart PDF upload, run accessibility checks, return results.

    Form fields read from the request:
        pdf: The uploaded PDF file (required).
        job_id: Identifier used in log lines and as the GCS object prefix
            (defaults to 'unknown').
        quick_mode: 'true', '1' or 'yes' (case-insensitive) enables quick
            mode; any other value disables it.
        original_filename: Name used in the log line; falls back to the
            upload's own filename, then to 'document.pdf'.

    Returns:
        A JSON response. 400 when no PDF was supplied; 500 with the exception
        message when processing raises; otherwise
        ``{'success': True, 'data': results}`` where ``results`` is
        ``checker.to_dict()`` plus a ``'grade'`` key and, when the checker
        reports page images, ``'page_images'`` replaced by the GCS URLs.
    """
    pdf_file = request.files.get('pdf')
    if not pdf_file:
        return jsonify({'success': False, 'error': 'No PDF file provided'}), 400

    job_id = request.form.get('job_id', 'unknown')
    quick_mode = request.form.get('quick_mode', 'false').lower() in ('true', '1', 'yes')
    original_filename = request.form.get('original_filename', pdf_file.filename or 'document.pdf')
    logger.info("Received job %s: %s (quick=%s)", job_id, original_filename, quick_mode)

    pdf_path = None
    images_dir = None
    try:
        # Save uploaded PDF to temp file. delete=False because the checker is
        # handed the path and must be able to open it after this handle is
        # closed; the `with` closes the handle even if save() raises.
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_pdf:
            pdf_path = tmp_pdf.name
            pdf_file.save(tmp_pdf)

        # Run accessibility checks
        config = {
            'anthropic_api_key': os.getenv('ANTHROPIC_API_KEY'),
            'google_api_key': os.getenv('GOOGLE_API_KEY'),
        }
        checker = EnterprisePDFChecker(pdf_path, config, quick_mode=quick_mode)
        checker.check_all()

        # Generate page images to a temp directory
        images_dir = tempfile.mkdtemp(prefix='pdf_images_')
        images_path = Path(images_dir)
        checker._generate_page_images(images_path)

        # Get results before uploading images (page_images has local filenames)
        results = checker.to_dict()

        # Upload images to GCS and replace local filenames with public URLs
        if checker.page_images:
            results['page_images'] = upload_images_to_gcs(images_path, job_id)

        # Add grade based on score (a missing score is graded as 0)
        score = results.get('accessibility_score', 0)
        results['grade'] = _grade_for_score(score)

        # Use .get()/%s here: a missing key in a log statement must not turn
        # an otherwise successful job into a 500.
        logger.info("Job %s completed: score=%s grade=%s issues=%s",
                    job_id, score, results['grade'], results.get('total_issues'))
        return jsonify({'success': True, 'data': results})
    except Exception as e:
        # Top-level request boundary: log the traceback, report the failure.
        # NOTE(review): str(e) is returned to the client verbatim — confirm
        # that exposing internal error text is acceptable for this service.
        logger.exception("Job %s failed: %s", job_id, e)
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        # Clean up temp files. Best-effort: an error raised here would replace
        # the response that was already built above.
        if pdf_path:
            try:
                os.unlink(pdf_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_err:
                logger.warning("Job %s: could not remove %s: %s",
                               job_id, pdf_path, cleanup_err)
        if images_dir:
            import shutil
            shutil.rmtree(images_dir, ignore_errors=True)


def _grade_for_score(score):
    """Map a numeric score to a letter: >=90 A, >=80 B, >=70 C, >=60 D, else F."""
    for threshold, letter in ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D')):
        if score >= threshold:
            return letter
    return 'F'
@app.route('/health', methods=['GET'])
def health():
    """Health endpoint: always answers with the JSON body {'status': 'ok'}."""
    payload = {'status': 'ok'}
    return jsonify(payload)
if __name__ == '__main__':
    # Direct execution: listen on all interfaces, port taken from $PORT
    # (8080 when the variable is unset), Flask debug mode off.
    port = int(os.environ.get('PORT', 8080))
    app.run(debug=False, port=port, host='0.0.0.0')