Files
home-kubernetes/vms/utility-101-shadow/docker/monitoring/alerts.yml
Jan Novak dda6a9d032 vms: add monitoring stack and node-exporter for docker host
utility-101-shadow:
- Add full monitoring stack (Prometheus + Blackbox Exporter + Alertmanager)
  with Docker Compose and a systemd unit (monitoring.service)
- Prometheus scrapes: itself, blackbox-exporter, and node-exporter on
  the docker host (docker:9100); blackbox probes cover HTTPS endpoints
  with TLS cert monitoring
- Alertmanager routes warnings to Slack/Discord, critical alerts also
  to email (Gmail SMTP); inhibit rule suppresses SSLCertExpiringSoon
  when SSLCertExpired already fires
- Alert rules: 11 node-exporter alerts (host down, CPU, memory, disk
  fill/prediction, iowait, OOM kill, systemd failed units) + 3 blackbox
  alerts (probe failed, SSL expiring, SSL expired)
- readme: add services list and Docker Engine installation steps

docker host:
- Add node-exporter container running with host pid/network and
  read-only mounts of /proc, /sys, / for full host metrics visibility
- Enable --collector.systemd for systemd unit state metrics
- Add systemd unit (node-exporter.service) to manage the container

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-07 23:07:44 +01:00

147 lines
5.7 KiB
YAML

groups:
# Host-level alerts built on node_exporter metrics. Each rule's `for:` is how
# long the expression must keep matching before the alert fires.
- name: node-exporter
  rules:
  # The node_exporter scrape target has been down (up == 0) for 2m.
  - alert: HostDown
    expr: up{job=~"node-exporter.*"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Host down: {{ $labels.instance }}'
      description: >-
        node_exporter on {{ $labels.instance }} has been unreachable
        for more than 2 minutes.

  # Busy CPU share > 85% for 5m.
  # Busy is computed as 1 - idle, averaged over the cores of each instance.
  # Averaging the rates of all non-idle mode series instead yields the mean
  # per mode, which stays far below the threshold and never fires.
  - alert: HighCPULoad
    expr: (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU load: {{ $labels.instance }}'
      description: >-
        CPU usage on {{ $labels.instance }} is
        {{ $value | humanizePercentage }} (threshold: 85%).

  # Busy CPU share > 95% for 5m (same 1 - idle computation as HighCPULoad).
  - alert: CriticalCPULoad
    expr: (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.95
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Critical CPU load: {{ $labels.instance }}'
      description: >-
        CPU usage on {{ $labels.instance }} is
        {{ $value | humanizePercentage }} (threshold: 95%).

  # Used memory (1 - MemAvailable / MemTotal) > 90% for 5m.
  - alert: HighMemoryUsage
    expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High memory usage: {{ $labels.instance }}'
      description: >-
        Memory usage on {{ $labels.instance }} is
        {{ $value | humanizePercentage }} (threshold: 90%).

  # Used memory (1 - MemAvailable / MemTotal) > 95% for 5m.
  - alert: CriticalMemoryUsage
    expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Critical memory usage: {{ $labels.instance }}'
      description: >-
        Memory usage on {{ $labels.instance }} is
        {{ $value | humanizePercentage }} (threshold: 95%).

  # Filesystem used share > 85% for 5m; tmpfs, overlay and squashfs
  # filesystems are excluded by the fstype matcher.
  - alert: DiskSpaceLow
    expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes) > 0.85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Low disk space: {{ $labels.instance }} {{ $labels.mountpoint }}'
      description: >-
        Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} is
        {{ $value | humanizePercentage }} (threshold: 85%).

  # Filesystem used share > 95% for 5m (same fstype exclusions).
  - alert: DiskSpaceCritical
    expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes) > 0.95
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Critical disk space: {{ $labels.instance }} {{ $labels.mountpoint }}'
      description: >-
        Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} is
        {{ $value | humanizePercentage }} (threshold: 95%).

  # Linear extrapolation of the last 1h of available bytes goes below zero
  # within 4h (4 * 3600 seconds); must hold for 30m before firing.
  - alert: DiskWillFillIn4h
    expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}[1h], 4 * 3600) < 0
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: 'Disk filling up: {{ $labels.instance }} {{ $labels.mountpoint }}'
      description: >-
        Disk on {{ $labels.instance }}:{{ $labels.mountpoint }} is projected
        to run out of space within 4 hours.

  # iowait share, averaged over the cores of each instance, > 20% for 10m.
  - alert: HighDiskIOWait
    expr: avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > 0.20
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High disk I/O wait: {{ $labels.instance }}'
      description: >-
        I/O wait on {{ $labels.instance }} is
        {{ $value | humanizePercentage }} (threshold: 20%).

  # The OOM-kill counter increased during the last 5m; fires immediately
  # (for: 0m).
  - alert: OOMKillDetected
    expr: increase(node_vmstat_oom_kill[5m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: 'OOM kill detected: {{ $labels.instance }}'
      description: >-
        An OOM kill event was detected on {{ $labels.instance }}.

  # A systemd unit has reported state "failed" for 2m.
  # Requires node_exporter to run with --collector.systemd.
  - alert: SystemdServiceFailed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'Systemd service failed: {{ $labels.instance }}'
      description: >-
        Service {{ $labels.name }} on {{ $labels.instance }} has been in
        failed state for more than 2 minutes.
- name: blackbox-https
  rules:
  # The blackbox probe has reported failure (probe_success == 0) for 2m.
  - alert: ProbeFailed
    expr: probe_success == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Probe failed: {{ $labels.instance }}'
      description: >-
        Endpoint {{ $labels.instance }} is unreachable or returned a
        non-2xx response for more than 2 minutes.

  # Earliest certificate expiry is fewer than 21 days away, held for 1h.
  # The difference to time() is divided by 86400 to express it in days.
  # An already-expired certificate also satisfies this condition (< 0 < 21).
  - alert: SSLCertExpiringSoon
    expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 21
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'SSL cert expiring soon: {{ $labels.instance }}'
      description: >-
        SSL certificate for {{ $labels.instance }} expires in
        {{ $value | printf "%.0f" }} days.

  # Earliest certificate expiry is already in the past; fires immediately
  # (for: 0m).
  - alert: SSLCertExpired
    expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: 'SSL cert expired: {{ $labels.instance }}'
      description: >-
        SSL certificate for {{ $labels.instance }} has expired.