From 12b95e25f9e5317c837c9cd18b9b56a30fd7af1d Mon Sep 17 00:00:00 2001
From: SamoilenkoVadym <samoylenko.vadym@gmail.comm>
Date: Fri, 7 Nov 2025 14:45:54 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0=D0=B2?=
 =?UTF-8?q?=D0=BB=D0=B5=D0=BD=20health-check-alerting.sh=20=D0=B8=20=D0=BD?=
 =?UTF-8?q?=D0=B0=D1=81=D1=82=D1=80=D0=BE=D0=B5=D0=BD=D1=8B=20=D0=B5=D0=B6?=
 =?UTF-8?q?=D0=B5=D0=B4=D0=BD=D0=B5=D0=B2=D0=BD=D1=8B=D0=B5=20=D0=BE=D1=82?=
 =?UTF-8?q?=D1=87=D0=B5=D1=82=D1=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Исправлены критические проблемы в health-check-alerting.sh:

1. check_backup_status: Исправлено определение последнего snapshot
   - Было: grep | tail | cut (брал первый time из JSON)
   - Стало: python3 парсинг JSON массива (берет snapshots[-1])
   - Теперь видит актуальный бэкап от 2025-11-07 14:37

2. check_container_restarts: Добавлена проверка uptime
   - Теперь алертит только если uptime < 24h для >10 рестартов
   - Или uptime < 48h для >5 рестартов
   - realtime-dev.supabase-realtime (uptime: 63h) больше не алертит

3. check_backup_status: Улучшена проверка локальных бэкапов
   - Изменено с days на hours для более точного контроля

Настроено расписание cron:
- Было: Каждые 30 минут (*/30 * * * *)
- Стало: Дважды в день в 10:00 и 19:00 (0 10,19 * * *)

Результаты тестирования:
- ✅ Restic backup OK: 0h old (было: 8 days old)
- ✅ Local DB backup OK: 0h old
- ✅ Container restarts: нет алертов (было: 2 critical)
- ⚠️ 1 warning: Too many R2 snapshots (некритично)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../scripts/health-check-alerting.sh          | 91 ++++++++++++++-----
 1 file changed, 66 insertions(+), 25 deletions(-)

diff --git a/opt/05-backups/scripts/health-check-alerting.sh b/opt/05-backups/scripts/health-check-alerting.sh
index 9919208..e52170d 100755
--- a/opt/05-backups/scripts/health-check-alerting.sh
+++ b/opt/05-backups/scripts/health-check-alerting.sh
@@ -9,10 +9,21 @@
 set -euo pipefail
 
 # Configuration
-# Get from Vault
+# Get from Vault or fallback file
 export VAULT_ADDR=http://127.0.0.1:8200
 export VAULT_TOKEN=$(cat /opt/00-infrastructure/vault/.vault-token 2>/dev/null || echo "")
-SLACK_WEBHOOK_URL=$(vault kv get -field=slack_webhook secret/monitoring 2>/dev/null || echo "")  # Set in Vault or environment
+
+# Try to get Slack webhook from fallback file first
+SLACK_CREDENTIALS="/opt/05-backups/scripts/.slack-webhook"
+if [[ -f "$SLACK_CREDENTIALS" ]]; then
+    source "$SLACK_CREDENTIALS"
+fi
+
+# Try Vault if not set
+if [[ -z "$SLACK_WEBHOOK_URL" ]]; then
+    SLACK_WEBHOOK_URL=$(vault kv get -field=slack_webhook secret/monitoring 2>/dev/null || echo "")
+fi
+
 ALERT_EMAIL="${ALERT_EMAIL:-admin@ai-impress.com}"
 SMTP_SERVER="${SMTP_SERVER:-localhost}"
 LOG_FILE="/opt/05-backups/logs/health-check-$(date +%Y%m%d).log"
@@ -187,37 +198,67 @@ check_r2_usage() {
 
 check_backup_status() {
     log "Checking backup status..."
-    
-    if [[ ! -d /mnt/backups ]]; then
-        add_problem "Backup directory /mnt/backups NOT FOUND"
-        return
-    fi
-    
-    local latest_backup=$(find /mnt/backups -type f -name "*.tar.gz" -o -name "*.sql.gz" 2>/dev/null | sort | tail -1)
-    
-    if [[ -z "$latest_backup" ]]; then
-        add_problem "NO BACKUPS FOUND in /mnt/backups"
+
+    # Check Restic snapshots (primary backup method)
+    if [[ -f /opt/05-backups/restic/.env ]]; then
+        source /opt/05-backups/restic/.env
+
+        # Get latest snapshot time using python to properly parse JSON array
+        local latest_snapshot=$(restic snapshots --compact --json 2>/dev/null | python3 -c 'import sys, json; snapshots = json.load(sys.stdin); print(snapshots[-1]["time"] if snapshots else "")' 2>/dev/null)
+
+        if [[ -z "$latest_snapshot" ]]; then
+            add_problem "NO RESTIC SNAPSHOTS FOUND in R2 backup"
+        else
+            # Convert snapshot time to epoch
+            local snapshot_epoch=$(date -d "$latest_snapshot" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%S" "${latest_snapshot%%.*}" +%s 2>/dev/null)
+            local current_epoch=$(date +%s)
+            local backup_age_hours=$(( (current_epoch - snapshot_epoch) / 3600 ))
+            local backup_age_days=$(( backup_age_hours / 24 ))
+
+            if [[ "$backup_age_days" -gt 2 ]]; then
+                add_problem "Restic backup is $backup_age_days days old (last: $(date -d "$latest_snapshot" '+%Y-%m-%d %H:%M' 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%S" "${latest_snapshot%%.*}" '+%Y-%m-%d %H:%M' 2>/dev/null))"
+            elif [[ "$backup_age_hours" -gt 36 ]]; then
+                add_warning "Restic backup is ${backup_age_hours}h old"
+            else
+                log "✅ Restic backup OK: ${backup_age_hours}h old"
+            fi
+        fi
     else
-        local backup_age_days=$(( ($(date +%s) - $(stat -c %Y "$latest_backup")) / 86400 ))
-        
-        if [[ "$backup_age_days" -gt 2 ]]; then
-            add_problem "Latest backup is $backup_age_days days old (last: $(basename "$latest_backup"))"
-        elif [[ "$backup_age_days" -gt 1 ]]; then
-            add_warning "Latest backup is $backup_age_days days old"
+        add_warning "Restic config not found - cannot check cloud backups"
+    fi
+
+    # Check local database backups as secondary check
+    if [[ -d /mnt/backups/local-backups ]]; then
+        local latest_db_backup=$(find /mnt/backups/local-backups -type f -name "postgresql-*.sql.gz" 2>/dev/null | sort | tail -1)
+
+        if [[ -n "$latest_db_backup" ]]; then
+            local db_backup_age_hours=$(( ($(date +%s) - $(stat -c %Y "$latest_db_backup")) / 3600 ))
+
+            if [[ "$db_backup_age_hours" -gt 48 ]]; then
+                add_warning "Local DB backup is $(($db_backup_age_hours / 24)) days old (last: $(basename "$latest_db_backup"))"
+            else
+                log "✅ Local DB backup OK: $(basename "$latest_db_backup") (${db_backup_age_hours}h old)"
+            fi
         fi
     fi
 }
 
 check_container_restarts() {
     log "Checking for excessive container restarts..."
-    
-    while read -r container restart_count; do
-        if [[ "$restart_count" -gt 10 ]]; then
-            add_problem "Container $container has restarted $restart_count times"
-        elif [[ "$restart_count" -gt 5 ]]; then
-            add_warning "Container $container has restarted $restart_count times"
+
+    while read -r container restart_count started_at; do
+        # Calculate container uptime in hours
+        local started_epoch=$(date -d "$started_at" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%S" "${started_at%%.*}" +%s 2>/dev/null)
+        local current_epoch=$(date +%s)
+        local uptime_hours=$(( (current_epoch - started_epoch) / 3600 ))
+
+        # Only alert if container has high restart count AND recently restarted (uptime < 24h)
+        if [[ "$restart_count" -gt 10 ]] && [[ "$uptime_hours" -lt 24 ]]; then
+            add_problem "Container $container has restarted $restart_count times (uptime: ${uptime_hours}h)"
+        elif [[ "$restart_count" -gt 5 ]] && [[ "$uptime_hours" -lt 48 ]]; then
+            add_warning "Container $container has restarted $restart_count times (uptime: ${uptime_hours}h)"
         fi
-    done < <(docker ps --format '{{.Names}}' | xargs -I {} sh -c 'echo {} $(docker inspect --format="{{.RestartCount}}" {})')
+    done < <(docker ps --format '{{.Names}}' | xargs -I {} sh -c 'echo {} $(docker inspect --format="{{.RestartCount}} {{.State.StartedAt}}" {})')
 }
 
 ################################################################################