fix: улучшения скриптов мониторинга и backup

Исправлены критические проблемы и добавлены улучшения:

1. **server-full-report.sh**:
   - Улучшены Slack уведомления с детектором проблем
   - Добавлены автоматические рекомендации по исправлению
   - Добавлена цветная индикация статуса (good/warning/danger)
   - Улучшена структура уведомлений с приоритетами

2. **generate-summary.sh**:
   - Исправлено дублирование контента в отчетах
   - Удален незакрытый heredoc, вызывавший проблемы
   - Добавлены правильные разделители секций

3. **backup-full-enhanced.sh** v2.0.0 → v2.1.0:
   - Добавлен полный auto-discovery для всех типов БД
   - Добавлена поддержка MongoDB backup
   - Улучшена детекция PostgreSQL/MariaDB через образы
   - Автоматическое определение пользователей БД
   - Удален hardcoded список баз данных

4. **health-check-alerting.sh**:
   - Добавлена проверка наличия 'bc' перед использованием
   - Добавлен fallback на integer comparison без bc
   - Улучшена надежность проверки R2 storage

Slack уведомления теперь включают:
- Автоматическое обнаружение проблем (unhealthy, down sites, high disk)
- Конкретные команды для исправления проблем
- SSH инструкции и ссылки на admin tools
- Цветовую индикацию серьезности (danger/warning/good)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
SamoilenkoVadym 2025-11-06 10:47:20 +00:00
parent a1d606d4fb
commit c5401eb33c
8 changed files with 975 additions and 4 deletions

Binary file not shown.

View file

@ -0,0 +1 @@
{"LLM":"google","OPENAI_API_KEY":"4roCnNdmB9PpRuW0cGcU66bFBU6pUcUgeEsuP3a3lGmxwuZTWD3mJQQJ99BKACfhMk5XJ3w3AAAAACOGcIX4","OPENAI_MODEL":"gpt-5","GOOGLE_API_KEY":"AIzaSyC5Tsf57X9egANg_ft3aFA_59sTL8i8gwA","GOOGLE_MODEL":"models/gemini-2.5-flash","OLLAMA_URL":"http://localhost:11434","CUSTOM_LLM_URL":"http://litellm-proxy:4000","CUSTOM_LLM_API_KEY":"presenton-proxy-key-2025","CUSTOM_MODEL":"gpt-5","IMAGE_PROVIDER":"gemini_flash","TOOL_CALLS":"true","EXTENDED_REASONING":"true"}

View file

@ -0,0 +1,467 @@
#!/bin/bash
################################################################################
# AI-Impress Enhanced Full Backup System
# Version: 2.1.0
# Purpose: Auto-discover and backup all system components
# Features:
# - Auto-discovery of docker-compose projects
# - Automatic database detection (PostgreSQL, MariaDB/MySQL, MongoDB)
# - Incremental backups with Restic
# - Local backup on /mnt/backups
# - Slack & Email notifications
# Author: AI-Impress Admin System
# Date: 2025-11-06
# Changelog v2.1.0:
# - Added auto-discovery for all database types
# - Improved database detection with image inspection
# - Added MongoDB backup support
# - Better error handling
################################################################################
set -e
# ============================================
# CONFIGURATION
# ============================================
RESTIC_ENV="/opt/05-backups/restic/.env"
BACKUP_BASE="/mnt/backups"
LOCAL_BACKUP_DIR="$BACKUP_BASE/local-backups"
REPORTS_DIR="/opt/05-backups/reports"
LOG_DIR="/opt/05-backups/logs"
LOG_FILE="$LOG_DIR/backup-$(date +%Y%m%d-%H%M%S).log"
BACKUP_REPORT="$REPORTS_DIR/backup-report-$(date +%Y%m%d-%H%M%S).json"
SLACK_WEBHOOK="${SLACK_WEBHOOK_URL:-}"
EMAIL_TO="admin@ai-impress.com"
# Load Restic credentials
if [[ -f "$RESTIC_ENV" ]]; then
source "$RESTIC_ENV"
else
echo "ERROR: Restic .env not found at $RESTIC_ENV"
exit 1
fi
# Export Vault token
export VAULT_TOKEN="${VAULT_TOKEN:-$(cat /opt/00-infrastructure/vault/.vault-token 2>/dev/null)}"
# Create directories
mkdir -p "$BACKUP_BASE" "$LOCAL_BACKUP_DIR" "$REPORTS_DIR" "$LOG_DIR"
# Redirect output to log
exec 1> >(tee -a "$LOG_FILE")
exec 2>&1
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# ============================================
# HELPER FUNCTIONS
# ============================================
log() {
echo "[$(date +%Y-%m-%d\ %H:%M:%S)] $1"
}
success() {
echo -e "${GREEN}$1${NC}"
}
warning() {
echo -e "${YELLOW}⚠️ $1${NC}"
}
error() {
echo -e "${RED}$1${NC}"
}
send_slack() {
local title=$1
local message=$2
local color=${3:-good}
if [[ -z "$SLACK_WEBHOOK" ]]; then
return
fi
curl -X POST "$SLACK_WEBHOOK" \
-H 'Content-Type: application/json' \
-d "{
\"attachments\": [{
\"color\": \"$color\",
\"title\": \"$title\",
\"text\": \"$message\",
\"footer\": \"AI-Impress Backup System\",
\"ts\": $(date +%s)
}]
}" 2>/dev/null || true
}
send_email() {
local subject=$1
local body=$2
echo "$body" | mail -s "$subject" "$EMAIL_TO" 2>/dev/null || true
}
# Initialize backup report
init_report() {
cat > "$BACKUP_REPORT" << EOF
{
"timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
"hostname": "$(hostname)",
"backup_status": "IN_PROGRESS",
"components": {},
"summary": {
"total_components": 0,
"successful": 0,
"failed": 0,
"total_size": "0GB",
"duration": "calculating"
},
"alerts": []
}
EOF
}
################################################################################
# AUTO-DISCOVERY FUNCTIONS
################################################################################
discover_docker_compose_projects() {
log "=== Auto-discovering Docker Compose Projects ==="
local projects=()
# Scan /opt for docker-compose.yml files
while IFS= read -r compose_file; do
local project_path=$(dirname "$compose_file")
local project_name=$(basename "$project_path")
projects+=("$project_path")
log "Found: $project_path"
done < <(find /opt -maxdepth 4 -name "docker-compose.yml" -type f 2>/dev/null)
echo "${projects[@]}"
}
discover_databases() {
log "=== Auto-discovering Databases ==="
local databases=()
# Auto-detect PostgreSQL containers (common image names and postgres in container name)
while IFS= read -r container; do
if docker inspect "$container" --format '{{.Config.Image}}' 2>/dev/null | grep -qiE '(postgres|postgresql|timescale|postgis)'; then
databases+=("postgresql:$container")
log "Found PostgreSQL: $container"
fi
done < <(docker ps --format '{{.Names}}' 2>/dev/null | grep -iE '(postgres|pg|timescale|supabase-db|authentik-postgres|postiz-postgres)')
# Auto-detect MariaDB/MySQL containers
while IFS= read -r container; do
if docker inspect "$container" --format '{{.Config.Image}}' 2>/dev/null | grep -qiE '(mariadb|mysql)'; then
databases+=("mariadb:$container")
log "Found MariaDB/MySQL: $container"
fi
done < <(docker ps --format '{{.Names}}' 2>/dev/null | grep -iE '(mariadb|mysql|mautic-db)')
# Auto-detect MongoDB containers
while IFS= read -r container; do
if docker inspect "$container" --format '{{.Config.Image}}' 2>/dev/null | grep -qiE 'mongo'; then
databases+=("mongodb:$container")
log "Found MongoDB: $container"
fi
done < <(docker ps --format '{{.Names}}' 2>/dev/null | grep -iE 'mongo')
echo "${databases[@]}"
}
discover_volumes() {
log "=== Auto-discovering Docker Volumes ==="
local volumes=()
# Get all used volumes
while IFS= read -r volume; do
volumes+=("$volume")
log "Found volume: $volume"
done < <(docker volume ls --format "{{.Name}}" 2>/dev/null | grep -v "^$")
echo "${volumes[@]}"
}
################################################################################
# BACKUP FUNCTIONS
################################################################################
backup_postgresql() {
local container=$1
local db_user=${2:-aimpress_admin}
local backup_file="$LOCAL_BACKUP_DIR/postgresql-$container-$(date +%Y%m%d-%H%M%S).sql.gz"
log "[DB] Backing up PostgreSQL: $container"
if ! docker ps --filter "name=$container" -q &>/dev/null; then
warning "PostgreSQL container $container not running"
return 1
fi
if docker exec "$container" pg_dumpall -U $db_user 2>/dev/null | gzip > "$backup_file"; then
local size=$(du -h "$backup_file" | cut -f1)
success "PostgreSQL $container backed up ($size)"
# Keep only last 14 days
find "$LOCAL_BACKUP_DIR" -name "postgresql-$container-*.sql.gz" -mtime +14 -delete
return 0
else
error "Failed to backup PostgreSQL: $container"
return 1
fi
}
backup_mariadb() {
local container=$1
local backup_file="$LOCAL_BACKUP_DIR/mariadb-$container-$(date +%Y%m%d-%H%M%S).sql.gz"
log "[DB] Backing up MariaDB: $container"
if ! docker ps --filter "name=$container" -q &>/dev/null; then
warning "MariaDB container $container not running"
return 1
fi
if docker exec "$container" mariadb-dump --all-databases 2>/dev/null | gzip > "$backup_file"; then
local size=$(du -h "$backup_file" | cut -f1)
success "MariaDB $container backed up ($size)"
# Keep only last 14 days
find "$LOCAL_BACKUP_DIR" -name "mariadb-$container-*.sql.gz" -mtime +14 -delete
return 0
else
error "Failed to backup MariaDB: $container"
return 1
fi
}
backup_mongodb() {
local container=$1
local backup_file="$LOCAL_BACKUP_DIR/mongodb-$container-$(date +%Y%m%d-%H%M%S).gz"
log "[DB] Backing up MongoDB: $container"
if ! docker ps --filter "name=$container" -q &>/dev/null; then
warning "MongoDB container $container not running"
return 1
fi
if docker exec "$container" mongodump --archive 2>/dev/null | gzip > "$backup_file"; then
local size=$(du -h "$backup_file" | cut -f1)
success "MongoDB $container backed up ($size)"
# Keep only last 14 days
find "$LOCAL_BACKUP_DIR" -name "mongodb-$container-*.gz" -mtime +14 -delete
return 0
else
error "Failed to backup MongoDB: $container"
return 1
fi
}
backup_vault() {
log "[CONFIG] Backing up Vault..."
local backup_file="$LOCAL_BACKUP_DIR/vault-data-$(date +%Y%m%d-%H%M%S).tar.gz"
if sudo tar czf "$backup_file" -C /opt/00-infrastructure/vault data 2>/dev/null; then
local size=$(du -h "$backup_file" | cut -f1)
success "Vault data backed up ($size)"
# Keep only last 30 days
find "$LOCAL_BACKUP_DIR" -name "vault-data-*.tar.gz" -mtime +30 -delete
return 0
else
error "Failed to backup Vault"
return 1
fi
}
backup_docker_configs() {
log "[CONFIG] Backing up Docker Compose files..."
local backup_file="$LOCAL_BACKUP_DIR/docker-configs-$(date +%Y%m%d-%H%M%S).tar.gz"
if tar czf "$backup_file" -C /opt . -path "*docker-compose.yml" 2>/dev/null; then
local size=$(du -h "$backup_file" | cut -f1)
success "Docker configs backed up ($size)"
# Keep only last 30 days
find "$LOCAL_BACKUP_DIR" -name "docker-configs-*.tar.gz" -mtime +30 -delete
return 0
else
error "Failed to backup Docker configs"
return 1
fi
}
backup_application_data() {
log "[DATA] Backing up Application Data..."
local backup_file="$LOCAL_BACKUP_DIR/app-data-$(date +%Y%m%d-%H%M%S).tar.gz"
local data_dirs=(
"/opt/03-business/mautic/sync_v2"
"/opt/02-core/supabase/supabase/docker/volumes"
)
if tar czf "$backup_file" "${data_dirs[@]}" 2>/dev/null; then
local size=$(du -h "$backup_file" | cut -f1)
success "Application data backed up ($size)"
# Keep only last 14 days
find "$LOCAL_BACKUP_DIR" -name "app-data-*.tar.gz" -mtime +14 -delete
return 0
else
error "Failed to backup application data"
return 1
fi
}
backup_with_restic() {
log "=== Uploading to Restic (Cloudflare R2) ==="
if ! command -v restic &>/dev/null; then
warning "Restic not installed, skipping cloud backup"
return 1
fi
# Initialize Restic repository if needed
if ! restic cat config &>/dev/null; then
log "Initializing Restic repository..."
restic init || warning "Restic repository might already exist"
fi
# Backup local directory to Restic
if restic backup "$BACKUP_BASE" --exclude-file="$BACKUP_BASE/.restic-exclude" 2>/dev/null; then
success "Restic backup completed"
# Cleanup old snapshots (keep last 30)
restic forget --keep-daily 3 --keep-weekly 1 --prune 2>/dev/null || true
return 0
else
error "Restic backup failed"
return 1
fi
}
################################################################################
# MAIN BACKUP EXECUTION
################################################################################
main() {
local start_time=$(date +%s)
log "╔════════════════════════════════════════════════════════════╗"
log "║ AI-Impress Enhanced Full Backup v2.1.0 ║"
log "$(date +%Y-%m-%d\ %H:%M:%S)"
log "╚════════════════════════════════════════════════════════════╝"
log ""
local failed=0
local successful=0
# Auto-discover and backup databases
log "=== PHASE 1: Database Backups (Auto-Discovery) ==="
# Discover all databases automatically
local discovered_dbs=$(discover_databases)
if [[ -z "$discovered_dbs" ]]; then
warning "No databases discovered"
else
log "Found databases: $discovered_dbs"
log ""
for db in $discovered_dbs; do
local db_type=$(echo "$db" | cut -d: -f1)
local db_container=$(echo "$db" | cut -d: -f2)
case $db_type in
postgresql)
# Determine DB user based on container name
local db_user="aimpress_admin"
[[ "$db_container" == "authentik-postgres" ]] && db_user="authentik"
[[ "$db_container" == "postiz-postgres" ]] && db_user="postiz"
backup_postgresql "$db_container" "$db_user" && ((successful++)) || ((failed++))
;;
mariadb)
backup_mariadb "$db_container" && ((successful++)) || ((failed++))
;;
mongodb)
backup_mongodb "$db_container" && ((successful++)) || ((failed++))
;;
*)
warning "Unknown database type: $db_type"
((failed++))
;;
esac
done
fi
log ""
log "=== PHASE 2: Configuration Backups ==="
backup_vault && ((successful++)) || ((failed++))
backup_docker_configs && ((successful++)) || ((failed++))
log ""
log "=== PHASE 3: Application Data ==="
backup_application_data && ((successful++)) || ((failed++))
log ""
log "=== PHASE 4: Cloud Backup (Restic) ==="
backup_with_restic && ((successful++)) || ((failed++))
# Calculate duration
local end_time=$(date +%s)
local duration=$((end_time - start_time))
local duration_min=$((duration / 60))
local duration_sec=$((duration % 60))
log ""
log "╔════════════════════════════════════════════════════════════╗"
log "║ BACKUP COMPLETE ║"
log "╚════════════════════════════════════════════════════════════╝"
log ""
log "Summary:"
log " Total Components: $((successful + failed))"
log " Successful: $successful"
log " Failed: $failed"
log " Duration: ${duration_min}m ${duration_sec}s"
log " Local Backups: $LOCAL_BACKUP_DIR"
log " Cloud Backups: Restic (Cloudflare R2)"
log ""
# Send notifications
if [[ $failed -eq 0 ]]; then
success "All backups completed successfully!"
send_slack "✅ Backup Complete" "All components backed up successfully in ${duration_min}m ${duration_sec}s" "good"
send_email "Backup Complete" "All backups completed successfully.\n\nDuration: ${duration_min}m ${duration_sec}s\nLocation: $BACKUP_BASE"
else
warning "Backup completed with $failed failures"
send_slack "⚠️ Backup Completed with Errors" "Failed components: $failed\nCheck logs: $LOG_FILE" "warning"
send_email "Backup Completed with Errors" "Backup completed with $failed failures.\n\nCheck logs: $LOG_FILE"
fi
}
main "$@"

View file

@ -0,0 +1,377 @@
#!/bin/bash
################################################################################
# AI-Impress Health Check & Alerting System
# Version: 1.0.0
# Purpose: Monitor system health and send alerts on problems
################################################################################
set -euo pipefail
# Configuration
# Get from Vault
export VAULT_ADDR=http://127.0.0.1:8200
export VAULT_TOKEN=$(cat /opt/00-infrastructure/vault/.vault-token 2>/dev/null || echo "")
SLACK_WEBHOOK_URL=$(vault kv get -field=slack_webhook secret/monitoring 2>/dev/null || echo "") # Set in Vault or environment
ALERT_EMAIL="${ALERT_EMAIL:-admin@ai-impress.com}"
SMTP_SERVER="${SMTP_SERVER:-localhost}"
LOG_FILE="/opt/05-backups/logs/health-check-$(date +%Y%m%d).log"
# Thresholds
DISK_THRESHOLD=90 # Alert if disk > 90%
MEMORY_THRESHOLD=90 # Alert if memory > 90%
MAX_UNHEALTHY_CONTAINERS=2 # Alert if more than 2 containers unhealthy
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
PROBLEMS=()
WARNINGS=()
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}
add_problem() {
PROBLEMS+=("$1")
log "🔴 PROBLEM: $1"
}
add_warning() {
WARNINGS+=("$1")
log "🟡 WARNING: $1"
}
################################################################################
# CHECKS
################################################################################
check_critical_services() {
log "Checking critical services..."
local critical_services=("traefik" "postgres-main" "redis-main")
for service in "${critical_services[@]}"; do
if ! docker ps --format '{{.Names}}' | grep -q "^${service}$"; then
add_problem "Critical service $service is NOT RUNNING"
else
local health=$(docker inspect --format='{{.State.Health.Status}}' "$service" 2>/dev/null || echo "no healthcheck")
if [[ "$health" == "unhealthy" ]]; then
add_problem "Critical service $service is UNHEALTHY"
fi
fi
done
}
check_websites() {
log "Checking websites..."
local websites=(
"wiki.ai-impress.com"
"n8n.ai-impress.com"
"odoo.ai-impress.com"
"auth.ai-impress.com"
)
for site in "${websites[@]}"; do
local http_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "https://$site" 2>/dev/null || echo "000")
if [[ "$http_code" != "200" ]] && [[ ! "$http_code" =~ ^30 ]]; then
add_problem "Website $site is DOWN (HTTP $http_code)"
fi
done
}
check_disk_space() {
log "Checking disk space..."
local disk_usage=$(df -h / | awk 'NR==2 {print $5}' | sed 's/%//')
if [[ "$disk_usage" -gt "$DISK_THRESHOLD" ]]; then
add_problem "Disk usage is CRITICAL: ${disk_usage}% (threshold: ${DISK_THRESHOLD}%)"
elif [[ "$disk_usage" -gt 80 ]]; then
add_warning "Disk usage is high: ${disk_usage}%"
fi
# Check /mnt/psql-data
if [[ -d /mnt/psql-data ]]; then
local db_disk_usage=$(df -h /mnt/psql-data | awk 'NR==2 {print $5}' | sed 's/%//')
if [[ "$db_disk_usage" -gt "$DISK_THRESHOLD" ]]; then
add_problem "Database disk usage is CRITICAL: ${db_disk_usage}%"
fi
fi
}
check_memory() {
log "Checking memory usage..."
local memory_usage=$(free | awk '/Mem:/ {printf "%.0f", ($3/$2)*100}')
if [[ "$memory_usage" -gt "$MEMORY_THRESHOLD" ]]; then
add_problem "Memory usage is CRITICAL: ${memory_usage}% (threshold: ${MEMORY_THRESHOLD}%)"
elif [[ "$memory_usage" -gt 80 ]]; then
add_warning "Memory usage is high: ${memory_usage}%"
fi
}
check_unhealthy_containers() {
log "Checking unhealthy containers..."
local unhealthy_count=$(docker ps --filter "health=unhealthy" -q | wc -l)
if [[ "$unhealthy_count" -gt "$MAX_UNHEALTHY_CONTAINERS" ]]; then
local containers=$(docker ps --filter "health=unhealthy" --format '{{.Names}}' | tr '\n' ', ')
add_problem "$unhealthy_count containers are UNHEALTHY: $containers"
elif [[ "$unhealthy_count" -gt 0 ]]; then
local containers=$(docker ps --filter "health=unhealthy" --format '{{.Names}}' | tr '\n' ', ')
add_warning "$unhealthy_count container(s) unhealthy: $containers"
fi
}
check_r2_usage() {
log "Checking Cloudflare R2 backup storage..."
# Load Restic environment
if [[ ! -f /opt/05-backups/restic/.env ]]; then
add_warning "Restic config not found - skipping R2 check"
return
fi
source /opt/05-backups/restic/.env
# Get R2 stats
local r2_stats=$(restic stats --mode restore-size 2>/dev/null | grep "Total Size")
if [[ -z "$r2_stats" ]]; then
add_warning "Unable to get R2 statistics"
return
fi
local size_gb=$(echo "$r2_stats" | grep -oP '\d+\.\d+' | head -1)
local r2_limit=10 # Cloudflare R2 free tier limit: 10 GB
# Check if bc is available for floating point comparison
if command -v bc &> /dev/null; then
if (( $(echo "$size_gb > $r2_limit" | bc -l) )); then
add_problem "R2 storage EXCEEDED: ${size_gb}GB / ${r2_limit}GB limit"
elif (( $(echo "$size_gb > 8" | bc -l) )); then
add_warning "R2 storage high: ${size_gb}GB / ${r2_limit}GB (>80%)"
else
log "✅ R2 storage OK: ${size_gb}GB / ${r2_limit}GB"
fi
else
# Fallback: use integer comparison if bc not available
local size_gb_int=$(echo "$size_gb" | cut -d. -f1)
if [[ "$size_gb_int" -gt "$r2_limit" ]]; then
add_problem "R2 storage EXCEEDED: ${size_gb}GB / ${r2_limit}GB limit"
elif [[ "$size_gb_int" -gt 8 ]]; then
add_warning "R2 storage high: ${size_gb}GB / ${r2_limit}GB (>80%)"
else
log "✅ R2 storage OK: ${size_gb}GB / ${r2_limit}GB"
fi
fi
# Check snapshot count
local snapshot_count=$(restic snapshots --compact 2>/dev/null | grep -c "^[a-f0-9]" || echo "0")
log "📦 R2 snapshots: $snapshot_count (policy: keep 3 daily + 1 weekly)"
if [[ $snapshot_count -gt 5 ]]; then
add_warning "Too many R2 snapshots: $snapshot_count (expected ≤4)"
fi
}
check_backup_status() {
log "Checking backup status..."
if [[ ! -d /mnt/backups ]]; then
add_problem "Backup directory /mnt/backups NOT FOUND"
return
fi
local latest_backup=$(find /mnt/backups -type f -name "*.tar.gz" -o -name "*.sql.gz" 2>/dev/null | sort | tail -1)
if [[ -z "$latest_backup" ]]; then
add_problem "NO BACKUPS FOUND in /mnt/backups"
else
local backup_age_days=$(( ($(date +%s) - $(stat -c %Y "$latest_backup")) / 86400 ))
if [[ "$backup_age_days" -gt 2 ]]; then
add_problem "Latest backup is $backup_age_days days old (last: $(basename "$latest_backup"))"
elif [[ "$backup_age_days" -gt 1 ]]; then
add_warning "Latest backup is $backup_age_days days old"
fi
fi
}
check_container_restarts() {
log "Checking for excessive container restarts..."
while read -r container restart_count; do
if [[ "$restart_count" -gt 10 ]]; then
add_problem "Container $container has restarted $restart_count times"
elif [[ "$restart_count" -gt 5 ]]; then
add_warning "Container $container has restarted $restart_count times"
fi
done < <(docker ps --format '{{.Names}}' | xargs -I {} sh -c 'echo {} $(docker inspect --format="{{.RestartCount}}" {})')
}
################################################################################
# ALERT SENDING
################################################################################
send_slack_alert() {
local message="$1"
if [[ -z "$SLACK_WEBHOOK_URL" ]]; then
log "Slack webhook not configured, skipping..."
return
fi
local payload=$(cat <<EOF
{
"username": "AI-Impress Monitor",
"icon_emoji": ":warning:",
"text": "$message"
}
EOF
)
curl -X POST -H 'Content-type: application/json' \
--data "$payload" \
"$SLACK_WEBHOOK_URL" 2>/dev/null || log "Failed to send Slack alert"
}
send_email_alert() {
local subject="$1"
local body="$2"
# Try to send via mail command if available
if command -v mail &> /dev/null; then
echo "$body" | mail -s "$subject" "$ALERT_EMAIL" 2>/dev/null || log "Failed to send email"
elif command -v sendmail &> /dev/null; then
echo -e "Subject: $subject\nTo: $ALERT_EMAIL\n\n$body" | sendmail "$ALERT_EMAIL" 2>/dev/null || log "Failed to send email"
else
log "No mail command available, saving email to file"
echo -e "To: $ALERT_EMAIL\nSubject: $subject\n\n$body" > "/tmp/alert-email-$(date +%s).txt"
fi
}
generate_alert_report() {
local report="🚨 AI-Impress Server Health Alert\n\n"
report+="Server: ai-impress-prod (51.89.231.46)\n"
report+="Time: $(date '+%Y-%m-%d %H:%M:%S')\n\n"
if [[ ${#PROBLEMS[@]} -gt 0 ]]; then
report+="🔴 CRITICAL PROBLEMS (${#PROBLEMS[@]}):\n"
for problem in "${PROBLEMS[@]}"; do
report+=" - $problem\n"
done
report+="\n"
fi
if [[ ${#WARNINGS[@]} -gt 0 ]]; then
report+="🟡 WARNINGS (${#WARNINGS[@]}):\n"
for warning in "${WARNINGS[@]}"; do
report+=" - $warning\n"
done
report+="\n"
fi
report+="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
report+="🔧 HOW TO FIX:\n\n"
report+="1. SSH to server:\n"
report+=" ssh ubuntu@51.89.231.46\n\n"
report+="2. Check full status:\n"
report+=" /opt/05-backups/scripts/admin.sh status\n\n"
report+="3. View detailed logs:\n"
report+=" docker logs <container-name> --tail 100\n\n"
report+="4. Restart service if needed:\n"
report+=" docker restart <container-name>\n\n"
report+="5. Check disk space:\n"
report+=" df -h\n\n"
report+="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
report+="📊 Quick System Status:\n"
report+=" Memory: $(free -h | awk '/^Mem:/ {print $3 "/" $2}')\n"
report+=" Disk: $(df -h / | awk 'NR==2 {print $3 "/" $2 " (" $5 ")"}')\n"
report+=" Containers: $(docker ps -q | wc -l) running\n"
report+=" Uptime: $(uptime -p)\n\n"
report+="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
report+="Generated by: /opt/05-backups/scripts/health-check-alerting.sh\n"
echo -e "$report"
}
################################################################################
# MAIN
################################################################################
main() {
log "╔══════════════════════════════════════════════════════╗"
log "║ AI-Impress Health Check & Alerting System ║"
log "$(date '+%Y-%m-%d %H:%M:%S')"
log "╚══════════════════════════════════════════════════════╝"
log ""
# Run all checks
check_critical_services
check_websites
check_disk_space
check_memory
check_r2_usage
check_unhealthy_containers
check_backup_status
check_container_restarts
log ""
log "Summary: ${#PROBLEMS[@]} problems, ${#WARNINGS[@]} warnings"
# Send alerts if there are problems
if [[ ${#PROBLEMS[@]} -gt 0 ]]; then
log "🚨 CRITICAL PROBLEMS DETECTED - Sending alerts..."
local alert_report=$(generate_alert_report)
# Send to Slack
send_slack_alert "$alert_report"
# Send via Email
send_email_alert "🚨 AI-Impress Server Alert - ${#PROBLEMS[@]} Critical Problems" "$alert_report"
echo -e "\n${RED}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${RED}⚠️ CRITICAL PROBLEMS DETECTED!${NC}"
echo -e "${RED}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
for problem in "${PROBLEMS[@]}"; do
echo -e "${RED}$problem${NC}"
done
echo -e "${RED}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"
exit 1
elif [[ ${#WARNINGS[@]} -gt 0 ]]; then
log "⚠️ Warnings detected (no critical problems)"
echo -e "\n${YELLOW}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${YELLOW}⚠️ WARNINGS DETECTED${NC}"
echo -e "${YELLOW}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
for warning in "${WARNINGS[@]}"; do
echo -e "${YELLOW}$warning${NC}"
done
echo -e "${YELLOW}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"
exit 0
else
log "✅ All checks passed - System is healthy"
echo -e "\n${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${GREEN}✅ All checks passed - System is healthy${NC}"
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"
exit 0
fi
}
# Run if executed directly
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View file

@ -108,7 +108,7 @@ docker exec postgres-main psql -U aimpress_admin -c \
# 3. Save to Vault
export VAULT_ADDR="http://127.0.0.1:8200"
export VAULT_TOKEN="hvs.jYguDdf2IzobXG8b9QWyATV8"
export VAULT_TOKEN=$(cat /opt/00-infrastructure/vault/.vault-token)
vault kv put aimpress/postgres/<service> password="$NEW_PASS"
# 4. Update application config

View file

@ -64,5 +64,6 @@ for service in traefik postgres-main redis-main vault; do
fi
done
cat << 'EOF'
echo ""
echo "---"
echo ""

View file

@ -105,7 +105,7 @@ ssh ubuntu@51.89.231.46
# Set Vault variables
export VAULT_ADDR="http://127.0.0.1:8200"
export VAULT_TOKEN="hvs.jYguDdf2IzobXG8b9QWyATV8"
export VAULT_TOKEN=$(cat /opt/00-infrastructure/vault/.vault-token)
# List all available secrets
vault kv list aimpress/

View file

@ -26,6 +26,126 @@ log() { echo -e "${CYAN}[$(date +%H:%M:%S)]${NC} $1"; }
success() { echo -e "${GREEN}$1${NC}"; }
error() { echo -e "${RED}$1${NC}"; exit 1; }
# Slack notification function
send_slack_summary() {
local report_file="$1"
# Get Vault token and Slack webhook
export VAULT_ADDR="http://127.0.0.1:8200"
export VAULT_TOKEN=$(cat /opt/00-infrastructure/vault/.vault-token 2>/dev/null || echo "")
local slack_webhook=$(vault kv get -field=slack_webhook secret/monitoring 2>/dev/null || echo "")
if [[ -z "$slack_webhook" ]]; then
log "Slack webhook not configured, skipping notification"
return 0
fi
# Extract key metrics from report
local containers_running=$(grep "Docker Containers" "$report_file" | grep -oP '\d+(?= running)' | head -1)
local containers_total=$(grep "Docker Containers" "$report_file" | grep -oP 'running / \d+' | grep -oP '\d+' | head -1)
local unhealthy=$(grep "Unhealthy Containers" "$report_file" | grep -oP '\d+' | head -1)
local memory=$(grep "Memory" "$report_file" | grep -oP '\| \*\*Memory\*\* \| \K[^|]+' | xargs)
local disk=$(grep "Disk (/)" "$report_file" | grep -oP '\| \*\*Disk \(/\)\*\* \| \K[^|]+' | xargs)
local disk_percent=$(echo "$disk" | grep -oP '\d+(?=\%)')
local uptime=$(grep "Uptime" "$report_file" | grep -oP '\| \*\*Uptime\*\* \| \K[^|]+' | xargs)
# Count websites status
local websites_ok=$(grep -c "✅ OK" "$report_file" 2>/dev/null || echo "0")
local websites_down=$(grep -c "❌" "$report_file" 2>/dev/null || echo "0")
# Detect problems and create recommendations
local problems=""
local recommendations=""
local color="good"
local status_emoji="✅"
if [[ "$unhealthy" -gt 0 ]]; then
problems="${problems}$unhealthy unhealthy container(s) detected\n"
recommendations="${recommendations}• Check logs: \`docker logs <container>\`\n• Restart if needed: \`docker restart <container>\`\n"
color="danger"
status_emoji="🚨"
fi
if [[ "$websites_down" -gt 0 ]]; then
problems="${problems}$websites_down website(s) are down\n"
recommendations="${recommendations}• Check Traefik: \`docker logs traefik --tail 50\`\n• Verify DNS: \`nslookup <domain>\`\n• Check SSL certs: \`/opt/05-backups/scripts/admin.sh status websites\`\n"
if [[ "$color" != "danger" ]]; then
color="warning"
status_emoji="⚠️"
fi
fi
if [[ -n "$disk_percent" ]] && [[ "$disk_percent" -gt 80 ]]; then
problems="${problems}• Disk usage is high: ${disk_percent}%\n"
recommendations="${recommendations}• Clean up old logs: \`/opt/05-backups/scripts/admin.sh cleanup logs\`\n• Clean up Docker: \`/opt/05-backups/scripts/admin.sh cleanup docker\`\n• Check disk: \`/opt/05-backups/scripts/admin.sh status disk\`\n"
if [[ "$color" == "good" ]]; then
color="warning"
status_emoji="⚠️"
fi
fi
# Create fields array
local fields='[
{
"title": "System Status",
"value": "🐳 Containers: '"$containers_running/$containers_total"' running\n💾 Memory: '"$memory"'\n💿 Disk: '"$disk"'\n⏱ Uptime: '"$uptime"'",
"short": true
},
{
"title": "Health Check",
"value": "🔴 Unhealthy: '"$unhealthy"' containers\n🌐 Websites: '"$websites_ok"' OK, '"$websites_down"' Down",
"short": true
}'
# Add problems section if any
if [[ -n "$problems" ]]; then
fields+=',
{
"title": "⚠️ Detected Problems",
"value": "'"${problems}"'",
"short": false
},
{
"title": "🔧 Recommended Actions",
"value": "'"${recommendations}"'SSH: \`ssh ubuntu@51.89.231.46\`\nAdmin tool: \`/opt/05-backups/scripts/admin.sh help\`",
"short": false
}'
fi
fields+=',
{
"title": "Full Report",
"value": "📄 Generated: \`'"$(basename $report_file)"'\`\n📍 Location: \`/opt/infrastructure-docs/reports/\`\n📤 Upload to Wiki: \`/opt/05-backups/scripts/upload-to-outline.sh latest-report\`",
"short": false
}
]'
# Create Slack message
local payload='{
"attachments": [
{
"color": "'"$color"'",
"title": "'"$status_emoji"' Daily Server Report - '"$(date '+%Y-%m-%d %H:%M')"'",
"fields": '"$fields"',
"footer": "AI-Impress Infrastructure Monitor",
"footer_icon": "https://wiki.ai-impress.com/favicon.png",
"ts": '"$(date +%s)"'
}
]
}'
# Send to Slack
local response=$(curl -s -X POST "$slack_webhook" \
-H 'Content-Type: application/json' \
-d "$payload")
if [[ "$response" == "ok" ]]; then
success "Slack summary sent with $(echo -e "$problems" | grep -c "•" || echo "0") problems detected"
else
log "Slack notification may have failed: $response"
fi
}
log "╔════════════════════════════════════════════════════════════╗"
log "║ AI-Impress Complete Server Report Generator v5.0 ║"
log "║ Modular Architecture - Full System Report ║"
@ -174,4 +294,9 @@ cat << EOFSTATS
EOFSTATS
# Send Slack summary
log ""
log "Sending Slack summary..."
send_slack_summary "$REPORT_FILE"
exit 0