utility-101-shadow: - Add full monitoring stack (Prometheus + Blackbox Exporter + Alertmanager) with Docker Compose and a systemd unit (monitoring.service) - Prometheus scrapes: itself, blackbox-exporter, and node-exporter on the docker host (docker:9100); blackbox probes cover HTTPS endpoints with TLS cert monitoring - Alertmanager routes warnings to Slack/Discord, critical alerts also to email (Gmail SMTP); inhibit rule suppresses SSLCertExpiringSoon when SSLCertExpired already fires - Alert rules: 11 node-exporter alerts (host down, CPU, memory, disk fill/prediction, iowait, OOM kill, systemd failed units) + 3 blackbox alerts (probe failed, SSL expiring, SSL expired) - readme: add services list and Docker Engine installation steps docker host: - Add node-exporter container running with host pid/network and read-only mounts of /proc, /sys, / for full host metrics visibility - Enable --collector.systemd for systemd unit state metrics - Add systemd unit (node-exporter.service) to manage the container Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
147 lines
5.7 KiB
YAML
147 lines
5.7 KiB
YAML
# Prometheus alerting rules, organised as a list of named rule groups.
groups:
|
|
# Host-level alerts evaluated against node_exporter series
# (node_* metrics plus the `up` series of node-exporter jobs).
- name: node-exporter
|
|
# Alert definitions that belong to the node-exporter group.
rules:
|
|
|
|
# Scrape target unreachable.
# Matches the `up` series of every job whose name starts with "node-exporter"
# and fires once that series has reported 0 for 2 minutes.
- alert: HostDown
  expr: up{job=~"node-exporter.*"} == 0
  for: 2m
  labels: {severity: critical}
  annotations:
    summary: "Host down: {{ $labels.instance }}"
    description: "node_exporter on {{ $labels.instance }} has been unreachable for more than 2 minutes."
|
|
|
|
# CPU busy fraction above 85% for 5m.
# Utilisation is computed as 1 minus the per-instance average idle rate.
# Averaging the non-idle series directly would also average across the `mode`
# label (user, system, iowait, ...), dividing the real utilisation by the
# number of non-idle modes so the threshold could never be reached.
# $value is the busy fraction (0..1), rendered with humanizePercentage.
- alert: HighCPULoad
  expr: (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.85
  for: 5m
  labels: {severity: warning}
  annotations:
    summary: "High CPU load: {{ $labels.instance }}"
    description: "CPU usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 85%)."
|
|
|
|
# CPU busy fraction above 95% for 5m.
# Utilisation is computed as 1 minus the per-instance average idle rate.
# Averaging the non-idle series directly would also average across the `mode`
# label (user, system, iowait, ...), dividing the real utilisation by the
# number of non-idle modes so the threshold could never be reached.
# $value is the busy fraction (0..1), rendered with humanizePercentage.
- alert: CriticalCPULoad
  expr: (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.95
  for: 5m
  labels: {severity: critical}
  annotations:
    summary: "Critical CPU load: {{ $labels.instance }}"
    description: "CPU usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 95%)."
|
|
|
|
# Memory pressure, warning tier.
# Used fraction is derived as 1 - MemAvailable / MemTotal; the alert fires
# after that fraction has stayed above 0.90 for 5 minutes.
- alert: HighMemoryUsage
  expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
  for: 5m
  labels: {severity: warning}
  annotations:
    summary: "High memory usage: {{ $labels.instance }}"
    description: "Memory usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 90%)."
|
|
|
|
# Memory pressure, critical tier.
# Same used-fraction formula as the warning tier (1 - MemAvailable / MemTotal),
# with the threshold raised to 0.95 and held for 5 minutes.
- alert: CriticalMemoryUsage
  expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95
  for: 5m
  labels: {severity: critical}
  annotations:
    summary: "Critical memory usage: {{ $labels.instance }}"
    description: "Memory usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 95%)."
|
|
|
|
# Filesystem usage, warning tier.
# Used fraction is 1 - avail / size per filesystem series; tmpfs, overlay and
# squashfs filesystems are excluded by the selector. Fires after 5 minutes
# above 0.85, with the mountpoint reported in the annotations.
- alert: DiskSpaceLow
  expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes) > 0.85
  for: 5m
  labels: {severity: warning}
  annotations:
    summary: "Low disk space: {{ $labels.instance }} {{ $labels.mountpoint }}"
    description: "Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} is {{ $value | humanizePercentage }} (threshold: 85%)."
|
|
|
|
# Filesystem usage, critical tier.
# Same used-fraction formula as the warning tier (1 - avail / size, excluding
# tmpfs, overlay and squashfs), with the threshold raised to 0.95 for 5 minutes.
- alert: DiskSpaceCritical
  expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes) > 0.95
  for: 5m
  labels: {severity: critical}
  annotations:
    summary: "Critical disk space: {{ $labels.instance }} {{ $labels.mountpoint }}"
    description: "Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} is {{ $value | humanizePercentage }} (threshold: 95%)."
|
|
|
|
# Filesystem projected to run out of space.
# predict_linear extrapolates the last 1h of available bytes 4 * 3600 seconds
# (4 hours) ahead; the alert fires when that projection has stayed below zero
# for 30 minutes. tmpfs, overlay and squashfs filesystems are excluded.
- alert: DiskWillFillIn4h
  expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}[1h], 4 * 3600) < 0
  for: 30m
  labels: {severity: warning}
  annotations:
    summary: "Disk filling up: {{ $labels.instance }} {{ $labels.mountpoint }}"
    description: "Disk on {{ $labels.instance }}:{{ $labels.mountpoint }} is projected to run out of space within 4 hours."
|
|
|
|
# Sustained I/O wait.
# Takes the 5m rate of the iowait-mode CPU counter and averages it per
# instance; fires after the average has exceeded 0.20 for 10 minutes.
- alert: HighDiskIOWait
  expr: avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > 0.20
  for: 10m
  labels: {severity: warning}
  annotations:
    summary: "High disk I/O wait: {{ $labels.instance }}"
    description: "I/O wait on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 20%)."
|
|
|
|
# Kernel OOM killer activity.
# Fires when the node_vmstat_oom_kill counter has increased within the last
# 5 minutes; `for: 0m` means no hold period is applied.
- alert: OOMKillDetected
  expr: increase(node_vmstat_oom_kill[5m]) > 0
  for: 0m
  labels: {severity: critical}
  annotations:
    summary: "OOM kill detected: {{ $labels.instance }}"
    description: "An OOM kill event was detected on {{ $labels.instance }}."
|
|
|
|
# Failed systemd unit (requires node_exporter to run with --collector.systemd).
# Selects unit-state series with state="failed" whose value is 1 and fires
# once a unit has stayed that way for 2 minutes. The unit name is taken from
# the `name` label in the description.
- alert: SystemdServiceFailed
  expr: node_systemd_unit_state{state="failed"} == 1
  for: 2m
  labels: {severity: warning}
  annotations:
    summary: "Systemd service failed: {{ $labels.instance }}"
    description: "Service {{ $labels.name }} on {{ $labels.instance }} has been in failed state for more than 2 minutes."
|
|
|
|
# Endpoint alerts evaluated against Blackbox Exporter probe series
# (probe_success and probe_ssl_earliest_cert_expiry).
- name: blackbox-https
|
|
# Alert definitions that belong to the blackbox-https group.
rules:
|
|
|
|
# Endpoint probe failing.
# probe_success is 0 when the probe did not succeed (non-2xx response or
# timeout); the alert fires after 2 minutes of continuous failure.
- alert: ProbeFailed
  expr: probe_success == 0
  for: 2m
  labels: {severity: critical}
  annotations:
    summary: "Probe failed: {{ $labels.instance }}"
    description: "Endpoint {{ $labels.instance }} is unreachable or returned a non-2xx response for more than 2 minutes."
|
|
|
|
# TLS certificate close to expiry.
# Remaining lifetime is (expiry timestamp - now) converted from seconds to
# days by dividing by 86400; fires after it has been under 21 days for 1 hour.
# The condition also holds once the certificate has expired, because the
# value then goes negative.
- alert: SSLCertExpiringSoon
  expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 21
  for: 1h
  labels: {severity: warning}
  annotations:
    summary: "SSL cert expiring soon: {{ $labels.instance }}"
    description: "SSL certificate for {{ $labels.instance }} expires in {{ $value | printf \"%.0f\" }} days."
|
|
|
|
# TLS certificate past its expiry time.
# Fires as soon as the remaining lifetime in days
# ((expiry timestamp - now) / 86400) drops below zero; `for: 0m` means no
# hold period is applied.
- alert: SSLCertExpired
  expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 0
  for: 0m
  labels: {severity: critical}
  annotations:
    summary: "SSL cert expired: {{ $labels.instance }}"
    description: "SSL certificate for {{ $labels.instance }} has expired."
|