Replace the Redis queue + Python worker daemon with a synchronous HTTP call to a Cloud Run service, eliminating Redis and simplifying the infrastructure from 4 containers (web, worker, redis, postgres) to just web + postgres (with Cloud Run handling processing).

- Add cloudrun_service.py: Flask app wrapping EnterprisePDFChecker with POST /check and GET /health endpoints, GCS image upload
- Add Dockerfile.cloudrun + requirements-cloudrun.txt for Cloud Run image
- Add cloudbuild.yaml for Cloud Build with custom Dockerfile
- Rewrite api.php: remove all Redis code, add Cloud Run OIDC auth (getCloudRunToken), synchronous processing in handleCheck(), file-based rate limiting, GCS redirect in handleImage(), DB helper updateJobInDatabase()
- Update js/upload.js: handle synchronous completed response from Cloud Run, increase poll timeout to 15 minutes
- Update js/page-viewer.js: use GCS URLs directly for page images
- Simplify docker-compose.yml and docker-compose.prod.yml: remove worker and redis services
- Remove PHP Redis extension from Dockerfile.web
- Set 900s timeouts across nginx, PHP-FPM, gunicorn, curl, and Cloud Run
- Update cleanup.py: remove result_images pattern (now on GCS), add rate_limits cleanup
- Update .env.example: replace Redis vars with Cloud Run/GCS config

Cloud Run service deployed to: https://pdf-checker-bcb6ipdqka-uc.a.run.app
GCS bucket: gs://optical-pdf-images (7-day lifecycle, public read)
GCP project: optical-414516

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
131 lines
4.2 KiB
Python
131 lines
4.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
PDF Accessibility Checker — File Cleanup

Deletes uploaded PDFs, result JSON files, error logs, and rate limit files
older than RETENTION_HOURS (default 24h). Page images are on GCS with
a 7-day lifecycle policy.

Usage:
    python cleanup.py              # dry-run (show what would be deleted)
    python cleanup.py --execute    # actually delete

Designed to run via cron, e.g.:
    0 * * * * cd /var/www/html/pdf-accessibility && python3 cleanup.py --execute >> logs/cleanup.log 2>&1
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import shutil
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
# Configure root logging once at import time: INFO and above, with a
# timestamp and a fixed "[cleanup]" tag so lines are easy to grep in a
# shared cron log.
logging.basicConfig(
    format='%(asctime)s [cleanup] %(levelname)s: %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('cleanup')

# Directories swept by this script. Each one can be overridden through an
# environment variable of the same name; the defaults live under
# /opt/pdf-accessibility.
UPLOADS_DIR = Path(os.environ.get('UPLOADS_DIR', '/opt/pdf-accessibility/uploads'))
RESULTS_DIR = Path(os.environ.get('RESULTS_DIR', '/opt/pdf-accessibility/results'))
RATE_LIMIT_DIR = Path(os.environ.get('RATE_LIMIT_DIR', '/opt/pdf-accessibility/rate_limits'))

# Minimum age, in whole hours, an item must reach before it is eligible for
# deletion. Read from the RETENTION_HOURS environment variable (default 24).
RETENTION_HOURS = int(os.environ.get('RETENTION_HOURS', '24'))
|
|
|
|
|
|
def get_age_hours(path: Path) -> float:
    """Return how many hours ago *path* was last modified.

    The age is the difference between the current wall-clock time and the
    ``st_mtime`` reported by ``path.stat()``, so it works for files and
    directories alike. Any ``OSError`` raised by ``stat()`` (for example
    when the path does not exist) propagates to the caller.
    """
    seconds_per_hour = 3600
    now = time.time()
    last_modified = path.stat().st_mtime
    elapsed_seconds = now - last_modified
    return elapsed_seconds / seconds_per_hour
|
|
|
|
|
|
def cleanup_directory(directory: Path, patterns: list[str], dry_run: bool) -> tuple[int, int]:
    """Delete entries in *directory* that match *patterns* and are too old.

    An entry is eligible once its age (see ``get_age_hours``) is at least
    ``RETENTION_HOURS``. Matching directories are removed recursively with
    ``shutil.rmtree``; their reported size is the sum of the regular files
    beneath them. Everything else is removed with ``Path.unlink``.

    Args:
        directory: Directory whose direct glob matches are inspected.
        patterns: Glob patterns, evaluated relative to *directory*.
        dry_run: When true, nothing is removed; eligible entries are only
            logged and counted.

    Returns:
        ``(files_deleted, bytes_freed)``. In dry-run mode these are the
        numbers that *would* result from a real run.

    An ``OSError`` while inspecting or removing an entry is logged and that
    entry is skipped; it never aborts the sweep. A missing *directory* is
    logged as a warning and yields ``(0, 0)``.
    """
    if not directory.exists():
        logger.warning("Directory does not exist: %s", directory)
        return 0, 0

    deleted = 0
    freed = 0
    # Paths already handled by an earlier pattern. Without this, an entry
    # matched by two overlapping patterns (e.g. '*.json' and '*.meta.json')
    # would be counted twice in dry-run mode, making the preview disagree
    # with what an --execute run actually removes.
    seen: set[Path] = set()

    for pattern in patterns:
        for path in directory.glob(pattern):
            if path in seen:
                continue
            seen.add(path)

            try:
                age = get_age_hours(path)
                if age < RETENTION_HOURS:
                    continue

                if path.is_dir():
                    # Measure before removal; afterwards the files are gone.
                    size = sum(f.stat().st_size for f in path.rglob('*') if f.is_file())
                    if dry_run:
                        logger.info("[DRY-RUN] Would delete dir: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                    else:
                        shutil.rmtree(path)
                        logger.info("Deleted dir: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                else:
                    size = path.stat().st_size
                    if dry_run:
                        logger.info("[DRY-RUN] Would delete: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                    else:
                        path.unlink()
                        logger.info("Deleted: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))

                # Only reached when the entry was eligible and nothing raised.
                deleted += 1
                freed += size

            except OSError as e:
                logger.error("Failed to delete %s: %s", path, e)

    return deleted, freed
|
|
|
|
|
|
def format_size(size_bytes: int) -> str:
    """Format a byte count as a human-readable string with one decimal.

    The value is scaled by 1024 through the units B, KB, MB, GB and finally
    TB, e.g. ``1536 -> "1.5 KB"``. Values beyond the TB range are still
    reported in TB.

    The unit is chosen on the value *as it will be displayed* (rounded to
    one decimal place), so a size just under a boundary moves up to the next
    unit instead of printing as ``"1024.0 KB"``. The magnitude is used for
    the comparison so negative values are scaled the same way as positive
    ones.
    """
    value = size_bytes
    for unit in ('B', 'KB', 'MB', 'GB'):
        # Compare the rounded magnitude: 1023.999 would display as "1024.0",
        # which belongs in the next unit up.
        if abs(round(value, 1)) < 1024:
            return f"{value:.1f} {unit}"
        value /= 1024
    return f"{value:.1f} TB"
|
|
|
|
|
|
def main():
    """Run one cleanup sweep over the uploads, results and rate-limit dirs.

    The sweep is a dry run unless ``--execute`` appears in ``sys.argv``.
    Each directory is passed to ``cleanup_directory`` with its own glob
    patterns, and a one-line summary of the totals is logged at the end.
    """
    execute = '--execute' in sys.argv
    dry_run = not execute

    if dry_run:
        logger.info("=== DRY RUN (pass --execute to delete) ===")

    logger.info("Retention: %dh | Uploads: %s | Results: %s",
                RETENTION_HOURS, UPLOADS_DIR, RESULTS_DIR)

    # (directory, glob patterns) pairs, swept in this order.
    targets = (
        # Uploaded PDF files.
        (UPLOADS_DIR, ['*.pdf']),
        # Result JSON and error logs — page images are on GCS with a
        # 7-day lifecycle, so they are not handled here.
        (RESULTS_DIR, ['*.result.json', '*.error.log', '*.meta.json']),
        # Rate limit files.
        (RATE_LIMIT_DIR, ['*.json']),
    )

    total_deleted = 0
    total_freed = 0
    for directory, patterns in targets:
        count, freed_bytes = cleanup_directory(directory, patterns, dry_run)
        total_deleted += count
        total_freed += freed_bytes

    outcome = 'would be deleted' if dry_run else 'deleted'
    logger.info("Summary: %d items %s, %s freed",
                total_deleted,
                outcome,
                format_size(total_freed))


if __name__ == '__main__':
    main()
|