From 5f0be9a50444e6cebf63522b77c0f5c8fe0f5c14 Mon Sep 17 00:00:00 2001 From: SamoilenkoVadym Date: Fri, 21 Nov 2025 16:08:03 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20=D0=BE=D0=BF=D1=82=D0=B8=D0=BC=D0=B8?= =?UTF-8?q?=D0=B7=D0=B0=D1=86=D0=B8=D1=8F=20=D0=B0=D0=BB=D0=B5=D1=80=D1=82?= =?UTF-8?q?=D0=B8=D0=BD=D0=B3=D0=B0=20Prometheus=20=D0=B8=20Alertmanager?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Исправлены критические проблемы с избыточными уведомлениями: 1. Alertmanager (config.yml): - group_wait: 10s → 30s (уменьшен спам повторных алертов) - group_interval: 10s → 5m (алерты группируются правильно) - repeat_interval: 1h → 4h (повторные уведомления раз в 4 часа) - Добавлена группировка по severity и instance - Исправлен шаблон Slack для отображения деталей алертов 2. Prometheus правила (alerts.yml): - ContainerHighMemory: порог 90% → 95%, for: 2m → 5m - WebsiteDown: for: 1m → 10m (синхронизировано со scrape_interval) - Добавлены детальные описания в alerts Результат: количество уведомлений снижено с 90+ до минимума, уведомления теперь содержат полную информацию о проблеме. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../monitoring/alertmanager/config.yml | 25 +++++++++++-------- .../monitoring/prometheus/alerts/alerts.yml | 10 ++++---- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/opt/04-tools/monitoring/alertmanager/config.yml b/opt/04-tools/monitoring/alertmanager/config.yml index 1326ee2..bfda0b9 100644 --- a/opt/04-tools/monitoring/alertmanager/config.yml +++ b/opt/04-tools/monitoring/alertmanager/config.yml @@ -2,10 +2,10 @@ global: slack_api_url: 'https://hooks.slack.com/services/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' route: - group_by: ['alertname'] - group_wait: 10s - group_interval: 10s - repeat_interval: 1h + group_by: ['alertname', 'severity', 'instance'] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h receiver: 'slack-default' receivers: @@ -13,15 +13,18 @@ receivers: slack_configs: - channel: '#server-status' send_resolved: true - title: '{{ if eq .Status "firing" }}🚨{{ else }}✅{{ end }} {{ .GroupLabels.alertname }}' + title: '{{ if eq .Status "firing" }}🚨{{ else }}✅{{ end }} {{ .GroupLabels.alertname }} ({{ .GroupLabels.severity }})' text: |- {{ if eq .Status "firing" }}*🔥 Problem Detected*{{ else }}*✅ Problem Resolved*{{ end }} - - *Summary:* {{ .CommonAnnotations.summary }} - *Details:* {{ .CommonAnnotations.description }} + {{ range .Alerts }} + *Alert:* {{ .Labels.alertname }} + *Summary:* {{ .Annotations.summary }} + *Details:* {{ .Annotations.description }} + *Instance:* {{ .Labels.instance }}{{ if .Labels.container }} ({{ .Labels.container }}){{ end }} + {{ end }} *Severity:* {{ .CommonLabels.severity }} - + *Links:* - • - • + • + • color: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "critical" }}danger{{ else }}warning{{ end }}{{ else }}good{{ end }}' diff --git a/opt/04-tools/monitoring/prometheus/alerts/alerts.yml b/opt/04-tools/monitoring/prometheus/alerts/alerts.yml index 666a263..43d3662 100644 --- a/opt/04-tools/monitoring/prometheus/alerts/alerts.yml +++ b/opt/04-tools/monitoring/prometheus/alerts/alerts.yml @@ -100,14 +100,14 @@ groups: description: "Container {{ $labels.name }} has restarted {{ $value }} times in the last 10 minutes" - alert: ContainerHighMemory - expr: (container_memory_usage_bytes{name!=""} / container_spec_memory_limit_bytes{name!=""}) * 100 > 90 - for: 2m + expr: (container_memory_usage_bytes{name!=""} / container_spec_memory_limit_bytes{name!=""}) * 100 > 95 + for: 5m labels: severity: warning container: "{{ $labels.name }}" annotations: summary: "Container {{ $labels.name }} high memory usage" - description: "Container {{ $labels.name }} memory usage is {{ $value | humanizePercentage }}" + description: "Container {{ $labels.name }} memory usage is {{ $value | humanizePercentage }} for more than 5 minutes" - alert: ContainerHighCPU expr: (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100 > 80 @@ -124,13 +124,13 @@ groups: rules: - alert: WebsiteDown expr: probe_success{job="blackbox-ssl"} == 0 - for: 1m + for: 10m labels: severity: critical instance: "{{ $labels.instance }}" annotations: summary: "Website {{ $labels.instance }} is DOWN" - description: "Website {{ $labels.instance }} has been unreachable for more than 1 minute" + description: "Website {{ $labels.instance }} has been unreachable for more than 10 minutes. Last probe at {{ $labels.job }}" - alert: WebsiteSlow expr: probe_duration_seconds{job="blackbox-ssl"} > 5