diff --git a/opt/05-backups/scripts/health-check-alerting.sh b/opt/05-backups/scripts/health-check-alerting.sh
index 9919208..e52170d 100755
--- a/opt/05-backups/scripts/health-check-alerting.sh
+++ b/opt/05-backups/scripts/health-check-alerting.sh
@@ -9,10 +9,21 @@ set -euo pipefail
 
 
 # Configuration
-# Get from Vault
+# Get from Vault or fallback file
 export VAULT_ADDR=http://127.0.0.1:8200
 export VAULT_TOKEN=$(cat /opt/00-infrastructure/vault/.vault-token 2>/dev/null || echo "")
-SLACK_WEBHOOK_URL=$(vault kv get -field=slack_webhook secret/monitoring 2>/dev/null || echo "") # Set in Vault or environment
+
+# Try to get Slack webhook from fallback file first
+SLACK_CREDENTIALS="/opt/05-backups/scripts/.slack-webhook"
+if [[ -f "$SLACK_CREDENTIALS" ]]; then
+    source "$SLACK_CREDENTIALS"
+fi
+
+# Try Vault if not set (":-" default: under set -u a bare reference aborts when unset)
+if [[ -z "${SLACK_WEBHOOK_URL:-}" ]]; then
+    SLACK_WEBHOOK_URL=$(vault kv get -field=slack_webhook secret/monitoring 2>/dev/null || echo "")
+fi
+
 ALERT_EMAIL="${ALERT_EMAIL:-admin@ai-impress.com}"
 SMTP_SERVER="${SMTP_SERVER:-localhost}"
 LOG_FILE="/opt/05-backups/logs/health-check-$(date +%Y%m%d).log"
@@ -187,37 +198,81 @@ check_r2_usage() {
 
+# Check backup freshness: Restic snapshots (primary) and local PostgreSQL dumps
+# (secondary). Findings are reported through add_problem / add_warning / log.
 check_backup_status() {
     log "Checking backup status..."
-    
-    if [[ ! -d /mnt/backups ]]; then
-        add_problem "Backup directory /mnt/backups NOT FOUND"
-        return
-    fi
-    
-    local latest_backup=$(find /mnt/backups -type f -name "*.tar.gz" -o -name "*.sql.gz" 2>/dev/null | sort | tail -1)
-    
-    if [[ -z "$latest_backup" ]]; then
-        add_problem "NO BACKUPS FOUND in /mnt/backups"
+
+    # Check Restic snapshots (primary backup method)
+    if [[ -f /opt/05-backups/restic/.env ]]; then
+        source /opt/05-backups/restic/.env
+
+        # Get latest snapshot time using python to properly parse JSON array
+        local latest_snapshot=$(restic snapshots --compact --json 2>/dev/null | python3 -c 'import sys, json; snapshots = json.load(sys.stdin); print(snapshots[-1]["time"] if snapshots else "")' 2>/dev/null)
+
+        if [[ -z "$latest_snapshot" ]]; then
+            add_problem "NO RESTIC SNAPSHOTS FOUND in R2 backup"
+        else
+            # Convert snapshot time to epoch (GNU date first, BSD date as fallback)
+            local snapshot_epoch=$(date -d "$latest_snapshot" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%S" "${latest_snapshot%%.*}" +%s 2>/dev/null)
+            local current_epoch=$(date +%s)
+            local backup_age_hours=$(( (current_epoch - snapshot_epoch) / 3600 ))
+            local backup_age_days=$(( backup_age_hours / 24 ))
+
+            if [[ -z "$snapshot_epoch" ]]; then
+                # Neither date variant parsed the timestamp. An empty epoch counts
+                # as 0 in the arithmetic above, which would report the backup as
+                # decades old, so flag the parse failure instead.
+                add_warning "Cannot parse Restic snapshot time: $latest_snapshot"
+            elif [[ "$backup_age_days" -gt 2 ]]; then
+                add_problem "Restic backup is $backup_age_days days old (last: $(date -d "$latest_snapshot" '+%Y-%m-%d %H:%M' 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%S" "${latest_snapshot%%.*}" '+%Y-%m-%d %H:%M' 2>/dev/null))"
+            elif [[ "$backup_age_hours" -gt 36 ]]; then
+                add_warning "Restic backup is ${backup_age_hours}h old"
+            else
+                log "✅ Restic backup OK: ${backup_age_hours}h old"
+            fi
+        fi
     else
-        local backup_age_days=$(( ($(date +%s) - $(stat -c %Y "$latest_backup")) / 86400 ))
-        
-        if [[ "$backup_age_days" -gt 2 ]]; then
-            add_problem "Latest backup is $backup_age_days days old (last: $(basename "$latest_backup"))"
-        elif [[ "$backup_age_days" -gt 1 ]]; then
-            add_warning "Latest backup is $backup_age_days days old"
+        add_warning "Restic config not found - cannot check cloud backups"
+    fi
+
+    # Check local database backups as secondary check
+    if [[ -d /mnt/backups/local-backups ]]; then
+        local latest_db_backup=$(find /mnt/backups/local-backups -type f -name "postgresql-*.sql.gz" 2>/dev/null | sort | tail -1)
+
+        if [[ -n "$latest_db_backup" ]]; then
+            local db_backup_age_hours=$(( ($(date +%s) - $(stat -c %Y "$latest_db_backup")) / 3600 ))
+
+            if [[ "$db_backup_age_hours" -gt 48 ]]; then
+                add_warning "Local DB backup is $(($db_backup_age_hours / 24)) days old (last: $(basename "$latest_db_backup"))"
+            else
+                log "✅ Local DB backup OK: $(basename "$latest_db_backup") (${db_backup_age_hours}h old)"
+            fi
         fi
     fi
 }
 
+# Flag containers whose restart count is high AND that (re)started recently.
 check_container_restarts() {
     log "Checking for excessive container restarts..."
-    
-    while read -r container restart_count; do
-        if [[ "$restart_count" -gt 10 ]]; then
-            add_problem "Container $container has restarted $restart_count times"
-        elif [[ "$restart_count" -gt 5 ]]; then
-            add_warning "Container $container has restarted $restart_count times"
+
+    while read -r container restart_count started_at; do
+        # Calculate container uptime in hours (GNU date first, BSD date as fallback)
+        local started_epoch=$(date -d "$started_at" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%S" "${started_at%%.*}" +%s 2>/dev/null)
+        local current_epoch=$(date +%s)
+        local uptime_hours=0
+
+        # An unparseable start time leaves uptime at 0 ("just restarted") so the
+        # restart-count thresholds below still fire instead of being silenced.
+        if [[ -n "$started_epoch" ]]; then
+            uptime_hours=$(( (current_epoch - started_epoch) / 3600 ))
+        fi
+
+        # Only alert if container has high restart count AND recently restarted (uptime < 24h)
+        if [[ "$restart_count" -gt 10 ]] && [[ "$uptime_hours" -lt 24 ]]; then
+            add_problem "Container $container has restarted $restart_count times (uptime: ${uptime_hours}h)"
+        elif [[ "$restart_count" -gt 5 ]] && [[ "$uptime_hours" -lt 48 ]]; then
+            add_warning "Container $container has restarted $restart_count times (uptime: ${uptime_hours}h)"
         fi
-    done < <(docker ps --format '{{.Names}}' | xargs -I {} sh -c 'echo {} $(docker inspect --format="{{.RestartCount}}" {})')
+    done < <(docker ps --format '{{.Names}}' | xargs -I {} sh -c 'echo {} $(docker inspect --format="{{.RestartCount}} {{.State.StartedAt}}" {})')
 }
 
 ################################################################################