# home-kubernetes/vms/utility-101-shadow/docker/monitoring/alerts.yml

groups:
  - name: node-exporter
    rules:

      # Scrape target down: `up` is 0 for any job whose name matches
      # "node-exporter.*", sustained for 2 minutes.
      - alert: HostDown
        expr: 'up{job=~"node-exporter.*"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Host down: {{ $labels.instance }}'
          description: >-
            node_exporter on {{ $labels.instance }} has been unreachable
            for more than 2 minutes.

      # CPU busy > 85% for 5m.
      # Busy fraction is computed as 1 minus the idle fraction, averaged over
      # every core of an instance. Averaging the non-idle series directly would
      # divide by (cores x non-idle modes), capping the result far below the
      # threshold so the alert could never fire.
      - alert: HighCPULoad
        expr: (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load: {{ $labels.instance }}"
          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 85%)."

      # CPU busy > 95% for 5m.
      # Busy fraction is computed as 1 minus the idle fraction, averaged over
      # every core of an instance. Averaging the non-idle series directly would
      # divide by (cores x non-idle modes), capping the result far below the
      # threshold so the alert could never fire.
      - alert: CriticalCPULoad
        expr: (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical CPU load: {{ $labels.instance }}"
          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 95%)."

      # Memory pressure (warning): used fraction, taken as
      # 1 - MemAvailable / MemTotal, stays above 90% for 5 minutes.
      - alert: HighMemoryUsage
        expr: '(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High memory usage: {{ $labels.instance }}'
          description: >-
            Memory usage on {{ $labels.instance }} is
            {{ $value | humanizePercentage }} (threshold: 90%).

      # Memory pressure (critical): used fraction, taken as
      # 1 - MemAvailable / MemTotal, stays above 95% for 5 minutes.
      - alert: CriticalMemoryUsage
        expr: '(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Critical memory usage: {{ $labels.instance }}'
          description: >-
            Memory usage on {{ $labels.instance }} is
            {{ $value | humanizePercentage }} (threshold: 95%).

      # Filesystem usage (warning): 1 - avail / size above 85% for 5 minutes.
      # tmpfs, overlay and squashfs filesystems are excluded by the selector.
      - alert: DiskSpaceLow
        expr: '(1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes) > 0.85'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Low disk space: {{ $labels.instance }} {{ $labels.mountpoint }}'
          description: >-
            Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} is
            {{ $value | humanizePercentage }} (threshold: 85%).

      # Filesystem usage (critical): 1 - avail / size above 95% for 5 minutes.
      # tmpfs, overlay and squashfs filesystems are excluded by the selector.
      - alert: DiskSpaceCritical
        expr: '(1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes) > 0.95'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Critical disk space: {{ $labels.instance }} {{ $labels.mountpoint }}'
          description: >-
            Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} is
            {{ $value | humanizePercentage }} (threshold: 95%).

      # Fill forecast: a linear extrapolation of the last 1h of available bytes,
      # projected 4h ahead, drops below zero and stays that way for 30 minutes.
      # tmpfs, overlay and squashfs filesystems are excluded by the selector.
      - alert: DiskWillFillIn4h
        expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}[1h], 4 * 3600) < 0'
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: 'Disk filling up: {{ $labels.instance }} {{ $labels.mountpoint }}'
          description: >-
            Disk on {{ $labels.instance }}:{{ $labels.mountpoint }} is
            projected to run out of space within 4 hours.

      # I/O wait: per-instance average of the iowait-mode CPU rate (5m window)
      # above 20%, sustained for 10 minutes.
      - alert: HighDiskIOWait
        expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > 0.20'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High disk I/O wait: {{ $labels.instance }}'
          description: >-
            I/O wait on {{ $labels.instance }} is
            {{ $value | humanizePercentage }} (threshold: 20%).

      # OOM killer activity: the node_vmstat_oom_kill counter increased within
      # the last 5 minutes. `for: 0m` makes the alert fire on first evaluation.
      - alert: OOMKillDetected
        expr: 'increase(node_vmstat_oom_kill[5m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'OOM kill detected: {{ $labels.instance }}'
          description: >-
            An OOM kill event was detected on
            {{ $labels.instance }}.

      # A systemd unit reports state="failed" for 2 minutes.
      # Needs node_exporter started with --collector.systemd.
      - alert: SystemdServiceFailed
        expr: 'node_systemd_unit_state{state="failed"} == 1'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Systemd service failed: {{ $labels.instance }}'
          description: >-
            Service {{ $labels.name }} on {{ $labels.instance }} has been
            in failed state for more than 2 minutes.

  - name: blackbox-https
    rules:

      # Probe failure: probe_success has been 0 for 2 minutes
      # (non-2xx response or timeout).
      - alert: ProbeFailed
        expr: 'probe_success == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Probe failed: {{ $labels.instance }}'
          description: >-
            Endpoint {{ $labels.instance }} is unreachable or returned a
            non-2xx response for more than 2 minutes.

      # Certificate nearing expiry: remaining lifetime, converted from seconds
      # to days (/ 86400), is under 21 days for at least 1 hour.
      - alert: SSLCertExpiringSoon
        expr: '(probe_ssl_earliest_cert_expiry - time()) / 86400 < 21'
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: 'SSL cert expiring soon: {{ $labels.instance }}'
          description: >-
            SSL certificate for {{ $labels.instance }} expires in
            {{ $value | printf "%.0f" }} days.

      # Certificate expired: remaining lifetime in days is negative.
      # `for: 0m` makes the alert fire on first evaluation.
      - alert: SSLCertExpired
        expr: '(probe_ssl_earliest_cert_expiry - time()) / 86400 < 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'SSL cert expired: {{ $labels.instance }}'
          description: >-
            SSL certificate for {{ $labels.instance }} has
            expired.