PDF-accessibility-saas/cleanup.py

146 lines
4.8 KiB
Python

#!/usr/bin/env python3
"""
PDF Accessibility Checker — File Cleanup
Deletes uploaded PDFs, error logs, and rate limit files older than
RETENTION_HOURS (default 24h), and result JSON files older than
RESULTS_RETENTION_HOURS (default 720h, i.e. 30 days). Page images are on GCS
with a 7-day lifecycle policy.
Usage:
python cleanup.py # dry-run (show what would be deleted)
python cleanup.py --execute # actually delete
Designed to run via cron, e.g.:
0 * * * * cd /var/www/html/pdf-accessibility && python3 cleanup.py --execute >> logs/cleanup.log 2>&1
"""
import os
import sys
import time
import shutil
import logging
from pathlib import Path
# Configure logging at INFO level; every record carries a "[cleanup]" tag so
# the lines are easy to pick out of a shared log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [cleanup] %(levelname)s: %(message)s'
)
logger = logging.getLogger('cleanup')

# Directories to clean. Each one can be overridden through the environment
# variable of the same name; the second argument to os.getenv is the fallback.
UPLOADS_DIR = Path(os.getenv('UPLOADS_DIR', '/opt/pdf-accessibility/uploads'))
RESULTS_DIR = Path(os.getenv('RESULTS_DIR', '/opt/pdf-accessibility/results'))
RATE_LIMIT_DIR = Path(os.getenv('RATE_LIMIT_DIR', '/opt/pdf-accessibility/rate_limits'))

# Retention windows in hours, also overridable through the environment.
# int() raises ValueError at import time if a variable holds a non-integer.
RETENTION_HOURS = int(os.getenv('RETENTION_HOURS', '24'))
RESULTS_RETENTION_HOURS = int(os.getenv('RESULTS_RETENTION_HOURS', '720')) # 30 days
def get_age_hours(path: Path) -> float:
    """Compute how many hours have passed since *path* was last modified.

    Works for files and directories alike: the age is derived from the
    modification timestamp reported by ``path.stat()``. Any ``OSError``
    raised by ``stat()`` (for example when the path is missing) propagates
    to the caller.
    """
    seconds_per_hour = 3600
    now = time.time()
    modified_at = path.stat().st_mtime
    elapsed_seconds = now - modified_at
    return elapsed_seconds / seconds_per_hour
def cleanup_directory(directory: Path, patterns: list[str], dry_run: bool,
                      retention_hours: "int | None" = None) -> tuple[int, int]:
    """Delete entries of *directory* that match *patterns* and are too old.

    Args:
        directory: Directory to scan. Each pattern is handed to
            ``directory.glob``, so matching is relative to this directory.
        patterns: Glob patterns selecting candidate files or directories.
        dry_run: When true, nothing is removed; the function only logs what
            would be deleted.
        retention_hours: Minimum age in hours (by modification time) before
            an entry is removed. ``None`` falls back to the module-level
            ``RETENTION_HOURS``.

    Returns:
        ``(items_deleted, bytes_freed)``. A matched directory counts as one
        item, and its size is the sum of the regular files below it. In
        dry-run mode the totals describe what *would* be deleted.

    A missing *directory* is logged as a warning and yields ``(0, 0)``.
    ``OSError`` raised while inspecting or removing an entry is logged and
    that entry is skipped; the remaining entries are still processed.
    """
    if retention_hours is None:
        retention_hours = RETENTION_HOURS
    if not directory.exists():
        logger.warning("Directory does not exist: %s", directory)
        return 0, 0

    deleted = 0
    freed = 0
    # A path may match several patterns. In dry-run mode nothing is removed
    # between patterns, so without this set such a path would be logged and
    # counted once per matching pattern and the preview would overstate what
    # --execute actually deletes.
    seen: set[Path] = set()
    for pattern in patterns:
        for path in directory.glob(pattern):
            if path in seen:
                continue
            seen.add(path)
            try:
                age = get_age_hours(path)
                if age < retention_hours:
                    continue
                if path.is_dir():
                    # Measure before removal; afterwards the files are gone.
                    size = sum(f.stat().st_size for f in path.rglob('*') if f.is_file())
                    if dry_run:
                        logger.info("[DRY-RUN] Would delete dir: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                    else:
                        shutil.rmtree(path)
                        logger.info("Deleted dir: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                else:
                    size = path.stat().st_size
                    if dry_run:
                        logger.info("[DRY-RUN] Would delete: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                    else:
                        path.unlink()
                        logger.info("Deleted: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                # Only reached when the entry was (or would be) removed.
                deleted += 1
                freed += size
            except OSError as e:
                # Best effort: report the failure and continue with the rest.
                logger.error("Failed to delete %s: %s", path, e)
    return deleted, freed
def format_size(size_bytes: int) -> str:
    """Render a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, then shown with one decimal place, e.g.
    ``1536`` -> ``"1.5 KB"``.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    last = len(units) - 1
    value = size_bytes
    step = 0
    # Stop scaling at TB so very large values are still expressed in TB.
    while step < last and not value < 1024:
        value = value / 1024
        step += 1
    return f"{value:.1f} {units[step]}"
def main():
    """Run one cleanup pass over uploads, results and rate-limit files.

    The pass is a dry run unless ``--execute`` appears on the command line.
    Each target is handled by ``cleanup_directory``; the per-target counts
    are added up and reported in a final summary log line.
    """
    dry_run = '--execute' not in sys.argv
    if dry_run:
        logger.info("=== DRY RUN (pass --execute to delete) ===")
    logger.info("Retention: uploads=%dh, results=%dh | Uploads: %s | Results: %s",
                RETENTION_HOURS, RESULTS_RETENTION_HOURS, UPLOADS_DIR, RESULTS_DIR)

    # (directory, glob patterns, retention in hours), processed in order.
    jobs = (
        # Uploaded PDFs: short retention (default 24h).
        (UPLOADS_DIR, ['*.pdf'], RETENTION_HOURS),
        # Error logs: short retention.
        (RESULTS_DIR, ['*.error.log'], RETENTION_HOURS),
        # Result/meta/dismissed/overrides/adjusted JSONs: long retention
        # (default 30 days).
        (
            RESULTS_DIR,
            ['*.result.json', '*.meta.json', '*.dismissed.json',
             '*.overrides.json', '*.adjusted.json'],
            RESULTS_RETENTION_HOURS,
        ),
        # Rate limit files: None selects cleanup_directory's default retention.
        (RATE_LIMIT_DIR, ['*.json'], None),
    )

    total_deleted = 0
    total_freed = 0
    for target_dir, globs, max_age_hours in jobs:
        count, num_bytes = cleanup_directory(target_dir, globs, dry_run, max_age_hours)
        total_deleted += count
        total_freed += num_bytes

    outcome = 'would be deleted' if dry_run else 'deleted'
    logger.info("Summary: %d items %s, %s freed",
                total_deleted, outcome, format_size(total_freed))
# Run the cleanup only when executed as a script (e.g. from cron), not when
# the module is imported.
if __name__ == '__main__':
    main()