Files
home-kubernetes/vms/utility-101-shadow/docker/monitoring/alertmanager.yml
Jan Novak dda6a9d032 vms: add monitoring stack and node-exporter for docker host
utility-101-shadow:
- Add full monitoring stack (Prometheus + Blackbox Exporter + Alertmanager)
  with Docker Compose and a systemd unit (monitoring.service)
- Prometheus scrapes: itself, blackbox-exporter, and node-exporter on
  the docker host (docker:9100); blackbox probes cover HTTPS endpoints
  with TLS cert monitoring
- Alertmanager routes warnings to Slack/Discord, critical alerts also
  to email (Gmail SMTP); inhibit rule suppresses SSLCertExpiringSoon
  when SSLCertExpired already fires
- Alert rules: 11 node-exporter alerts (host down, CPU, memory, disk
  fill/prediction, iowait, OOM kill, systemd failed units) + 3 blackbox
  alerts (probe failed, SSL expiring, SSL expired)
- readme: add services list and Docker Engine installation steps

docker host:
- Add node-exporter container running with host pid/network and
  read-only mounts of /proc, /sys, / for full host metrics visibility
- Enable --collector.systemd for systemd unit state metrics
- Add systemd unit (node-exporter.service) to manage the container

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-07 23:07:44 +01:00

69 lines
2.7 KiB
YAML

# Defaults applied to all receivers below.
global:
  resolve_timeout: 5m

  # ── Email (SMTP) defaults ──────────────────────────────────────────────────
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'kacerr.cz+utility-101-shadow@gmail.com'
  smtp_auth_username: 'kacerr.cz@gmail.com'
  # The SMTP password is read from this file rather than stored inline.
  smtp_auth_password_file: '/run/secrets/smtp_password'
  smtp_require_tls: true
# Routing tree. The root receiver is `default`; the child routes below pick a
# receiver by the alert's `severity` label.
route:
  receiver: default
  group_by: [alertname, instance]
  group_wait: 30s  # wait before sending the first alert in a group
  group_interval: 5m  # wait before sending alerts for a group that has been updated
  repeat_interval: 4h  # resend still-firing alerts after this interval
  routes:
    # warnings only go to slack + discord, not email
    - matchers:
        - severity = warning
      receiver: non-critical
    # critical alerts go to all channels
    - matchers:
        - severity = critical
      receiver: default
# Notification targets referenced by name from the routing tree.
# NOTE(review): the Slack and Discord webhook URLs below are placeholders and
# must be replaced before deployment. Consider loading them from files (as is
# done for the SMTP password) if the deployed Alertmanager version supports
# `api_url_file` / `webhook_url_file` — confirm against its documentation.
receivers:
  # Sends to all three channels
  - name: default
    email_configs:
      - to: 'kacerr.cz@gmail.com'
        send_resolved: true
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/REPLACE/WITH/YOUR_WEBHOOK'
        channel: '#alerts'
        send_resolved: true
        # Red circle while firing, green circle once resolved.
        title: '{{ if eq .Status "firing" }}:red_circle:{{ else }}:large_green_circle:{{ end }} {{ .CommonAnnotations.summary }}'
        # One description line per alert in the group.
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
    discord_configs:
      - webhook_url: 'https://discord.com/api/webhooks/REPLACE/WITH_YOUR_WEBHOOK'
        send_resolved: true
        title: '{{ if eq .Status "firing" }}🔴{{ else }}🟢{{ end }} {{ .CommonAnnotations.summary }}'
        message: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
  # Warnings: slack + discord only
  - name: non-critical
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/REPLACE/WITH/YOUR_WEBHOOK'
        channel: '#alerts'
        send_resolved: true
        # Warning sign while firing, green circle once resolved.
        title: '{{ if eq .Status "firing" }}:warning:{{ else }}:large_green_circle:{{ end }} {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
    discord_configs:
      - webhook_url: 'https://discord.com/api/webhooks/REPLACE/WITH_YOUR_WEBHOOK'
        send_resolved: true
        title: '{{ if eq .Status "firing" }}⚠️{{ else }}🟢{{ end }} {{ .CommonAnnotations.summary }}'
        message: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
# Rules that mute one alert while another, related alert is firing.
inhibit_rules:
  # Suppress SSLCertExpiringSoon if SSLCertExpired is already firing for the same instance
  - source_matchers:
      - alertname = SSLCertExpired
    target_matchers:
      - alertname = SSLCertExpiringSoon
    # Only inhibit when source and target share the same `instance` label.
    equal: [instance]