---
# Prometheus alerting rules.
#
# Two rule groups are defined:
#   * node-exporter  - host-level alerts built on node_exporter metrics
#   * blackbox-https - endpoint and TLS-certificate alerts built on probe_* metrics
#
# Every rule sets a `severity` label (warning | critical) and provides
# `summary` / `description` annotations rendered with Go templating.

groups:
  - name: node-exporter
    rules:
      # Scrape target for a node-exporter job has reported up == 0 for 2m.
      - alert: HostDown
        expr: up{job=~"node-exporter.*"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Host down: {{ $labels.instance }}"
          description: "node_exporter on {{ $labels.instance }} has been unreachable for more than 2 minutes."

      # CPU busy > 85% for 5m.
      # Busy is computed as 1 - (idle fraction averaged over all CPUs).
      # node_cpu_seconds_total has one series per (cpu, mode); averaging the
      # non-idle series directly would yield the mean share *per mode*, which
      # stays far below the threshold even on a saturated host.
      - alert: HighCPULoad
        expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load: {{ $labels.instance }}"
          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 85%)."

      # CPU busy > 95% for 5m (same computation as HighCPULoad).
      - alert: CriticalCPULoad
        expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical CPU load: {{ $labels.instance }}"
          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 95%)."

      # RAM used > 90% for 5m, where used = 1 - MemAvailable / MemTotal.
      - alert: HighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage: {{ $labels.instance }}"
          description: "Memory usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 90%)."

      # RAM used > 95% for 5m.
      - alert: CriticalMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical memory usage: {{ $labels.instance }}"
          description: "Memory usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 95%)."

      # Disk used > 85% for 5m; tmpfs, overlay and squashfs filesystems are excluded.
      - alert: DiskSpaceLow
        expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes) > 0.85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space: {{ $labels.instance }} {{ $labels.mountpoint }}"
          description: "Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} is {{ $value | humanizePercentage }} (threshold: 85%)."

      # Disk used > 95% for 5m; same filesystem exclusions as DiskSpaceLow.
      - alert: DiskSpaceCritical
        expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes) > 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical disk space: {{ $labels.instance }} {{ $labels.mountpoint }}"
          description: "Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} is {{ $value | humanizePercentage }} (threshold: 95%)."

      # Linear extrapolation of the last 1h of available bytes goes negative
      # within 4h (4 * 3600 s); must hold for 30m before firing.
      - alert: DiskWillFillIn4h
        expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}[1h], 4 * 3600) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Disk filling up: {{ $labels.instance }} {{ $labels.mountpoint }}"
          description: "Disk on {{ $labels.instance }}:{{ $labels.mountpoint }} is projected to run out of space within 4 hours."

      # iowait > 20% (averaged over all CPUs) for 10m.
      - alert: HighDiskIOWait
        expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > 0.20
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk I/O wait: {{ $labels.instance }}"
          description: "I/O wait on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 20%)."

      # OOM-kill counter increased during the last 5m; fires immediately (for: 0m).
      - alert: OOMKillDetected
        expr: increase(node_vmstat_oom_kill[5m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "OOM kill detected: {{ $labels.instance }}"
          description: "An OOM kill event was detected on {{ $labels.instance }}."

      # systemd unit in failed state for 2m (requires --collector.systemd).
      - alert: SystemdServiceFailed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Systemd service failed: {{ $labels.instance }}"
          description: "Service {{ $labels.name }} on {{ $labels.instance }} has been in failed state for more than 2 minutes."

  - name: blackbox-https
    rules:
      # Probe returned non-2xx or timed out (probe_success == 0) for 2m.
      - alert: ProbeFailed
        expr: probe_success == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Probe failed: {{ $labels.instance }}"
          description: "Endpoint {{ $labels.instance }} is unreachable or returned a non-2xx response for more than 2 minutes."

      # SSL certificate expires within 21 days (value is days remaining).
      # NOTE(review): an already-expired certificate yields a negative value and
      # therefore also matches this rule, so it can fire together with
      # SSLCertExpired; consider an Alertmanager inhibit rule if that is noisy.
      - alert: SSLCertExpiringSoon
        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 21
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL cert expiring soon: {{ $labels.instance }}"
          description: "SSL certificate for {{ $labels.instance }} expires in {{ $value | printf \"%.0f\" }} days."

      # SSL certificate already expired; fires immediately (for: 0m).
      - alert: SSLCertExpired
        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "SSL cert expired: {{ $labels.instance }}"
          description: "SSL certificate for {{ $labels.instance }} has expired."