PDF-accessibility-saas/cloudrun_service.py
Vadym Samoilenko 5a00ec88d7 Phase 1: De-Oliver rebrand — remove Azure AD, GCP, Oliver branding
- Delete PHP API layer (api.php, auth.php) — replaced by FastAPI in Phase 2
- Delete MSAL/Azure AD JS files (app.js, app-history.js, api.js)
- Delete GCP Cloud Build/Deploy infra (cloudbuild.yaml, deploy.sh, Dockerfiles)
- Delete Oliver-specific docs (OLIVER_CUSTOMIZATION.md, DAVE_QUICK_SETUP.md, etc.)
- Replace Oliver yellow #FFC407 with Aimpress indigo #6366F1 across CSS + reports
- Replace Oliver Solutions footer in report_generator.py with Aimpress
- Switch font from Montserrat to Inter in CSS
- Replace GCS optical-pdf-images bucket with STORAGE_BUCKET env var
- Rewrite README.md for Aimpress SaaS product

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 14:41:27 +01:00

136 lines
4.5 KiB
Python

#!/usr/bin/env python3
"""
PDF Accessibility Checker — Cloud Run HTTP Service
Flask app wrapping EnterprisePDFChecker for serverless execution.
Receives PDF via multipart POST, runs checks, uploads page images to GCS,
returns full result JSON.
"""
import json
import logging
import os
import shutil
import tempfile
from pathlib import Path
from urllib.parse import quote

from flask import Flask, request, jsonify
from google.cloud import storage

from enterprise_pdf_checker import EnterprisePDFChecker
# Process-wide logging: INFO and above, each record tagged with "[cloudrun]".
logging.basicConfig(
    format='%(asctime)s [cloudrun] %(levelname)s: %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('cloudrun')

# The Flask application object that the route decorators below attach to.
app = Flask(__name__)

# Destination bucket for page images; overridable through STORAGE_BUCKET.
GCS_BUCKET_NAME = os.environ.get('STORAGE_BUCKET', 'pdf-pages')
def upload_images_to_gcs(images_dir: Path, job_id: str) -> dict:
    """Upload page images to GCS and return a {page_num: public_url} mapping.

    Every ``page_<N>.png`` file found directly inside *images_dir* is uploaded
    to the bucket named by ``GCS_BUCKET_NAME`` under the object name
    ``<job_id>/<filename>``.

    Args:
        images_dir: Directory holding the ``page_<N>.png`` files.
        job_id: Identifier used as the object-name prefix for this job.

    Returns:
        Dict mapping each integer page number to the URL of its uploaded
        image. Empty when the directory contains no usable page images.

    Files whose name has no integer page number after the first underscore
    are skipped with a warning rather than aborting the whole upload.
    Exceptions raised by the storage client propagate to the caller.
    """
    # Resolve page numbers first so malformed names are filtered out once and
    # the uploads can run in numeric order (a plain name sort would put
    # page_10 before page_2).
    numbered_images = []
    for image_file in images_dir.glob('page_*.png'):
        try:
            # Extract page number from filename (page_1.png -> 1)
            page_num = int(image_file.stem.split('_')[1])
        except ValueError:
            logger.warning("Skipping %s: no numeric page number in filename",
                           image_file.name)
            continue
        numbered_images.append((page_num, image_file))

    if not numbered_images:
        # Nothing to upload, so do not create a storage client at all.
        return {}

    # Ties on page number fall back to the file name, which keeps the
    # "last name wins" outcome of a plain lexicographic sort.
    numbered_images.sort(key=lambda pair: (pair[0], pair[1].name))

    client = storage.Client()
    bucket = client.bucket(GCS_BUCKET_NAME)

    page_images = {}
    for page_num, image_file in numbered_images:
        blob_name = f"{job_id}/{image_file.name}"
        blob = bucket.blob(blob_name)
        blob.upload_from_filename(str(image_file), content_type='image/png')
        # Bucket has uniform bucket-level access with allUsers objectViewer,
        # so objects are public by default — no need for blob.make_public().
        # The object name is percent-encoded ('/' kept) so that a job_id with
        # spaces or reserved characters still yields a valid URL.
        public_url = (
            f"https://storage.googleapis.com/{GCS_BUCKET_NAME}/{quote(blob_name)}"
        )
        page_images[page_num] = public_url
        logger.info("Uploaded %s -> %s", image_file.name, public_url)
    return page_images
def _grade_for_score(score):
    """Return the letter grade (A, B, C, D or F) for a numeric score."""
    for threshold, grade in ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D')):
        if score >= threshold:
            return grade
    return 'F'


@app.route('/check', methods=['POST'])
def check_pdf():
    """Accept multipart PDF upload, run accessibility checks, return results.

    Form fields:
        pdf: The uploaded PDF file (required).
        job_id: Job identifier; defaults to ``'unknown'``.
        quick_mode: ``'true'``, ``'1'`` or ``'yes'`` (case-insensitive)
            enables quick mode; anything else disables it.
        original_filename: Name used in logs; defaults to the uploaded
            file's name, or ``'document.pdf'`` when that is empty.

    Returns:
        ``{'success': True, 'data': <results>}`` on success, where the
        results dict additionally carries ``grade`` and, when page images
        exist, ``page_images`` mapped to their uploaded URLs.
        ``{'success': False, 'error': <message>}`` with status 400 when no
        PDF was supplied, or status 500 when processing raised.
    """
    pdf_file = request.files.get('pdf')
    if not pdf_file:
        return jsonify({'success': False, 'error': 'No PDF file provided'}), 400

    # NOTE(review): job_id is client-supplied and becomes the object-name
    # prefix of the uploaded images — confirm callers are trusted or
    # validate it upstream.
    job_id = request.form.get('job_id', 'unknown')
    quick_mode = request.form.get('quick_mode', 'false').lower() in ('true', '1', 'yes')
    original_filename = request.form.get('original_filename', pdf_file.filename or 'document.pdf')
    logger.info("Received job %s: %s (quick=%s)", job_id, original_filename, quick_mode)

    tmp_pdf_path = None
    images_dir = None
    try:
        # Save uploaded PDF to a temp file. The context manager closes the
        # handle even if saving fails; delete=False keeps the file on disk so
        # the checker can open it by path afterwards.
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_pdf:
            tmp_pdf_path = tmp_pdf.name
            pdf_file.save(tmp_pdf)

        # Run accessibility checks
        config = {
            'anthropic_api_key': os.getenv('ANTHROPIC_API_KEY'),
            'google_api_key': os.getenv('GOOGLE_API_KEY'),
        }
        checker = EnterprisePDFChecker(tmp_pdf_path, config, quick_mode=quick_mode)
        checker.check_all()

        # Generate page images to a temp directory
        images_dir = tempfile.mkdtemp(prefix='pdf_images_')
        images_path = Path(images_dir)
        checker._generate_page_images(images_path)

        # Get results before uploading images (page_images has local filenames)
        results = checker.to_dict()

        # Upload images to GCS and replace local filenames with public URLs
        if checker.page_images:
            results['page_images'] = upload_images_to_gcs(images_path, job_id)

        # Add grade based on score (a missing score is treated as 0)
        score = results.get('accessibility_score', 0)
        results['grade'] = _grade_for_score(score)

        # Log with tolerant lookups: a missing key in the results must not
        # turn an otherwise successful job into a 500.
        logger.info("Job %s completed: score=%s grade=%s issues=%s",
                    job_id, score, results['grade'], results.get('total_issues'))
        return jsonify({'success': True, 'data': results})
    except Exception as e:
        # Top-level boundary: log with traceback and report the failure.
        logger.exception("Job %s failed: %s", job_id, e)
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        # Clean up temp files. Best-effort: a cleanup failure must never
        # replace the response computed above.
        if tmp_pdf_path is not None:
            try:
                os.unlink(tmp_pdf_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                logger.warning("Could not remove temp file %s: %s",
                               tmp_pdf_path, cleanup_error)
        if images_dir:
            shutil.rmtree(images_dir, ignore_errors=True)
@app.route('/health', methods=['GET'])
def health():
    """Health-check endpoint: always responds with a JSON "ok" status."""
    payload = {'status': 'ok'}
    return jsonify(payload)
if __name__ == '__main__':
    # Direct execution: serve on all interfaces, on the port named by the
    # PORT environment variable (8080 when unset), with debug mode off.
    listen_port = int(os.environ.get('PORT', 8080))
    app.run(debug=False, port=listen_port, host='0.0.0.0')