#!/usr/bin/env python3
|
|
"""
|
|
PDF Accessibility Checker — File Cleanup
|
|
|
|
Deletes uploaded PDFs, error logs, and rate limit files older than
RETENTION_HOURS (default 24h), and result JSON files older than
RESULTS_RETENTION_HOURS (default 720h, i.e. 30 days). Page images are on
GCS with a 7-day lifecycle policy.
|
|
|
|
Usage:
    python cleanup.py              # dry-run (show what would be deleted)
    python cleanup.py --execute    # actually delete

Designed to run via cron, e.g.:
    0 * * * * cd /var/www/html/pdf-accessibility && python3 cleanup.py --execute >> logs/cleanup.log 2>&1
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import shutil
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
# Root logging configuration: INFO and above, with a timestamp and a fixed
# "[cleanup]" tag on every record.
logging.basicConfig(
    format='%(asctime)s [cleanup] %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Logger used by every function in this script.
logger = logging.getLogger('cleanup')
|
|
|
|
# Directories that get cleaned. Each one is read from the environment
# variable of the same name, falling back to the path shown here.
UPLOADS_DIR = Path(os.environ.get('UPLOADS_DIR', '/opt/pdf-accessibility/uploads'))
RESULTS_DIR = Path(os.environ.get('RESULTS_DIR', '/opt/pdf-accessibility/results'))
RATE_LIMIT_DIR = Path(os.environ.get('RATE_LIMIT_DIR', '/opt/pdf-accessibility/rate_limits'))

# Retention windows, in hours, also overridable through the environment.
RETENTION_HOURS = int(os.environ.get('RETENTION_HOURS', '24'))
RESULTS_RETENTION_HOURS = int(os.environ.get('RESULTS_RETENTION_HOURS', '720'))  # 30 days
|
|
|
|
|
|
def get_age_hours(path: Path) -> float:
    """Return how many hours have passed since *path* was last modified.

    The age is the difference between the current time and the entry's
    ``st_mtime``, converted from seconds to hours.
    """
    now = time.time()
    modified_at = path.stat().st_mtime
    elapsed_seconds = now - modified_at
    return elapsed_seconds / 3600
|
|
|
|
|
|
def cleanup_directory(directory: Path, patterns: list[str], dry_run: bool,
                      retention_hours: int = None) -> tuple[int, int]:
    """Delete entries under *directory* that match *patterns* and are too old.

    Each pattern is passed to ``directory.glob``. A matched entry is removed
    when its age, as reported by ``get_age_hours``, is at least
    ``retention_hours``. Matched directories are removed recursively with
    ``shutil.rmtree`` and their size is the sum of the regular files beneath
    them; anything else is removed with ``Path.unlink``.

    A path that matches several patterns is handled only once, so a dry run
    reports the same totals a real run would produce.

    Args:
        directory: Directory to scan. If it does not exist, a warning is
            logged and ``(0, 0)`` is returned.
        patterns: Glob patterns evaluated relative to ``directory``.
        dry_run: When true, only log what would be removed.
        retention_hours: Minimum age in hours before an entry is removed.
            ``None`` falls back to the module-level ``RETENTION_HOURS``.

    Returns:
        ``(items, bytes)``: the number of entries removed (or that would be
        removed in a dry run) and their combined size in bytes.

    An ``OSError`` raised while inspecting or deleting an entry is logged and
    that entry is skipped; it is not propagated to the caller.
    """
    if retention_hours is None:
        retention_hours = RETENTION_HOURS

    if not directory.exists():
        logger.warning("Directory does not exist: %s", directory)
        return 0, 0

    deleted = 0
    freed = 0
    seen: set[Path] = set()

    for pattern in patterns:
        for path in directory.glob(pattern):
            # Nothing is removed between pattern passes in dry-run mode, so
            # overlapping patterns would otherwise count the same path twice.
            if path in seen:
                continue
            seen.add(path)

            try:
                age = get_age_hours(path)
                if age < retention_hours:
                    continue

                if path.is_dir():
                    # Size of a directory = total size of the files inside it.
                    size = sum(f.stat().st_size for f in path.rglob('*') if f.is_file())
                    if dry_run:
                        logger.info("[DRY-RUN] Would delete dir: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                    else:
                        shutil.rmtree(path)
                        logger.info("Deleted dir: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                else:
                    size = path.stat().st_size
                    if dry_run:
                        logger.info("[DRY-RUN] Would delete: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))
                    else:
                        path.unlink()
                        logger.info("Deleted: %s (%.1fh old, %s)",
                                    path.name, age, format_size(size))

                # Only reached when the entry was old enough and handling it
                # did not raise.
                deleted += 1
                freed += size

            except OSError as e:
                logger.error("Failed to delete %s: %s", path, e)

    return deleted, freed
|
|
|
|
|
|
def format_size(size_bytes: int) -> str:
    """Render a byte count as a short human-readable string.

    The value is divided by 1024 until it is below 1024 or the units
    B, KB, MB and GB are used up; anything still larger is reported in TB.
    The number is always shown with one decimal place.
    """
    units = ('B', 'KB', 'MB', 'GB')
    value = size_bytes
    index = 0
    # Step up one unit at a time while the value is not yet below 1024.
    while index < len(units) and not value < 1024:
        value = value / 1024
        index += 1
    label = units[index] if index < len(units) else 'TB'
    return f"{value:.1f} {label}"
|
|
|
|
|
|
def main():
    """Run every cleanup pass and log a summary of what was removed.

    Nothing is deleted unless ``--execute`` appears in ``sys.argv``; without
    it every pass runs in dry-run mode.
    """
    execute = '--execute' in sys.argv
    dry_run = not execute

    if dry_run:
        logger.info("=== DRY RUN (pass --execute to delete) ===")

    logger.info("Retention: uploads=%dh, results=%dh | Uploads: %s | Results: %s",
                RETENTION_HOURS, RESULTS_RETENTION_HOURS, UPLOADS_DIR, RESULTS_DIR)

    # (directory, glob patterns, retention in hours), processed in order.
    passes = (
        # Uploads (PDF files): short retention (default 24h)
        (UPLOADS_DIR, ['*.pdf'], RETENTION_HOURS),
        # Error logs: short retention
        (RESULTS_DIR, ['*.error.log'], RETENTION_HOURS),
        # Result/meta/dismissed/overrides/adjusted JSONs: long retention
        # (default 30 days)
        (
            RESULTS_DIR,
            ['*.result.json', '*.meta.json', '*.dismissed.json',
             '*.overrides.json', '*.adjusted.json'],
            RESULTS_RETENTION_HOURS,
        ),
        # Rate limit files: None selects cleanup_directory's default retention
        (RATE_LIMIT_DIR, ['*.json'], None),
    )

    total_deleted = 0
    total_freed = 0
    for target, globs, retention in passes:
        count, size = cleanup_directory(target, globs, dry_run, retention)
        total_deleted += count
        total_freed += size

    outcome = 'would be deleted' if dry_run else 'deleted'
    logger.info("Summary: %d items %s, %s freed",
                total_deleted, outcome, format_size(total_freed))


if __name__ == '__main__':
    main()
|