#!/usr/bin/env python3
"""
PDF Accessibility Checker — File Cleanup

Deletes uploaded PDFs, error logs, and rate limit files older than
RETENTION_HOURS (default 24h), and result JSON files older than
RESULTS_RETENTION_HOURS (default 720h, i.e. 30 days).
Page images are on GCS with a 7-day lifecycle policy.

Usage:
    python cleanup.py              # dry-run (show what would be deleted)
    python cleanup.py --execute    # actually delete

Designed to run via cron, e.g.:
    0 * * * * cd /var/www/html/pdf-accessibility && python3 cleanup.py --execute >> logs/cleanup.log 2>&1
"""

import os
import sys
import time
import shutil
import logging
from pathlib import Path
from typing import Optional

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [cleanup] %(levelname)s: %(message)s'
)
logger = logging.getLogger('cleanup')

# Locations and retention windows are overridable through the environment.
UPLOADS_DIR = Path(os.getenv('UPLOADS_DIR', '/opt/pdf-accessibility/uploads'))
RESULTS_DIR = Path(os.getenv('RESULTS_DIR', '/opt/pdf-accessibility/results'))
RATE_LIMIT_DIR = Path(os.getenv('RATE_LIMIT_DIR', '/opt/pdf-accessibility/rate_limits'))
RETENTION_HOURS = int(os.getenv('RETENTION_HOURS', '24'))
RESULTS_RETENTION_HOURS = int(os.getenv('RESULTS_RETENTION_HOURS', '720'))  # 30 days


def get_age_hours(path: Path) -> float:
    """Return file/dir age in hours based on modification time.

    Raises:
        OSError: if ``path`` cannot be stat'ed (e.g. it no longer exists).
    """
    return (time.time() - path.stat().st_mtime) / 3600


def cleanup_directory(directory: Path, patterns: list[str], dry_run: bool,
                      retention_hours: Optional[int] = None) -> tuple[int, int]:
    """Delete entries matching ``patterns`` that are older than ``retention_hours``.

    Args:
        directory: Directory whose entries are globbed.
        patterns: Glob patterns evaluated relative to ``directory``.
        dry_run: When true, only log what would be deleted.
        retention_hours: Minimum age in hours for an entry to be deleted;
            falls back to ``RETENTION_HOURS`` when ``None``.

    Returns:
        ``(items_deleted, bytes_freed)``. In dry-run mode these are the
        numbers that a real run would report.

    A missing directory is logged as a warning and yields ``(0, 0)``.
    ``OSError`` raised while inspecting or deleting an entry is logged and
    that entry is skipped; it is never propagated to the caller.
    """
    if retention_hours is None:
        retention_hours = RETENTION_HOURS

    if not directory.exists():
        logger.warning("Directory does not exist: %s", directory)
        return 0, 0

    deleted = 0
    freed = 0
    # Paths already handled. Without this, a path matching several patterns
    # would be counted once per pattern in dry-run mode (nothing is removed
    # between globs), inflating the reported totals relative to a real run.
    seen: set[Path] = set()

    for pattern in patterns:
        for path in directory.glob(pattern):
            if path in seen:
                continue
            seen.add(path)
            try:
                age = get_age_hours(path)
                if age < retention_hours:
                    continue

                # shutil.rmtree refuses symlinks to directories, so only real
                # directories take the rmtree branch; a symlink is unlinked
                # like a file (removing the link, not its target).
                if path.is_dir() and not path.is_symlink():
                    size = sum(f.stat().st_size for f in path.rglob('*') if f.is_file())
                    if dry_run:
                        logger.info("[DRY-RUN] Would delete dir: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                    else:
                        shutil.rmtree(path)
                        logger.info("Deleted dir: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                else:
                    size = path.stat().st_size
                    if dry_run:
                        logger.info("[DRY-RUN] Would delete: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                    else:
                        path.unlink()
                        logger.info("Deleted: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))

                # Only reached when the entry was (or would be) removed.
                deleted += 1
                freed += size
            except OSError as e:
                logger.error("Failed to delete %s: %s", path, e)

    return deleted, freed


def format_size(size_bytes: int) -> str:
    """Format bytes as human-readable string.

    Uses 1024-based steps with one decimal place, from ``B`` up to ``TB``
    (e.g. ``2048`` -> ``"2.0 KB"``).
    """
    for unit in ('B', 'KB', 'MB', 'GB'):
        if size_bytes < 1024:
            return f"{size_bytes:.1f} {unit}"
        size_bytes /= 1024
    return f"{size_bytes:.1f} TB"


def main():
    """Run every cleanup pass and log a summary.

    Runs as a dry-run unless ``--execute`` is present in ``sys.argv``.
    """
    dry_run = '--execute' not in sys.argv

    if dry_run:
        logger.info("=== DRY RUN (pass --execute to delete) ===")

    logger.info("Retention: uploads=%dh, results=%dh | Uploads: %s | Results: %s",
                RETENTION_HOURS, RESULTS_RETENTION_HOURS, UPLOADS_DIR, RESULTS_DIR)

    total_deleted = 0
    total_freed = 0

    # Clean uploads (PDF files) — short retention (default 24h)
    d, f = cleanup_directory(UPLOADS_DIR, ['*.pdf'], dry_run, RETENTION_HOURS)
    total_deleted += d
    total_freed += f

    # Clean error logs — short retention
    d, f = cleanup_directory(RESULTS_DIR, ['*.error.log'], dry_run, RETENTION_HOURS)
    total_deleted += d
    total_freed += f

    # Clean result/meta/dismissed/overrides/adjusted JSONs — long retention (default 30 days)
    d, f = cleanup_directory(
        RESULTS_DIR,
        ['*.result.json', '*.meta.json', '*.dismissed.json', '*.overrides.json', '*.adjusted.json'],
        dry_run,
        RESULTS_RETENTION_HOURS,
    )
    total_deleted += d
    total_freed += f

    # Clean rate limit files (uses the default RETENTION_HOURS window)
    d, f = cleanup_directory(RATE_LIMIT_DIR, ['*.json'], dry_run)
    total_deleted += d
    total_freed += f

    logger.info("Summary: %d items %s, %s freed",
                total_deleted,
                'would be deleted' if dry_run else 'deleted',
                format_size(total_freed))


if __name__ == '__main__':
    main()