vms: add monitoring stack and node-exporter for docker host

utility-101-shadow: - Add full monitoring stack (Prometheus + Blackbox Exporter + Alertmanager) with Docker Compose and a systemd unit (monitoring.service) - Prometheus scrapes: itself, blackbox-exporter, and node-exporter on the docker host (docker:9100); blackbox probes cover HTTPS endpoints with TLS cert monitoring - Alertmanager routes warnings to Slack/Discord, critical alerts also to email (Gmail SMTP); inhibit rule suppresses SSLCertExpiringSoon when SSLCertExpired already fires - Alert rules: 11 node-exporter alerts (host down, CPU, memory, disk fill/prediction, iowait, OOM kill, systemd failed units) + 3 blackbox alerts (probe failed, SSL expiring, SSL expired) - readme: add services list and Docker Engine installation steps docker host: - Add node-exporter container running with host pid/network and read-only mounts of /proc, /sys, / for full host metrics visibility - Enable --collector.systemd for systemd unit state metrics - Add systemd unit (node-exporter.service) to manage the container Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-07 23:07:44 +01:00
parent 1b6015f732
commit dda6a9d032
11 changed files with 632 additions and 0 deletions
--- a/vms/utility-101-shadow/docker/monitoring/alertmanager.yml
+++ b/vms/utility-101-shadow/docker/monitoring/alertmanager.yml
@@ -0,0 +1,68 @@
+global:
+  resolve_timeout: 5m
+
+  # ── Email (SMTP) defaults, shared by every email_configs entry below ───────
+  smtp_smarthost: 'smtp.gmail.com:587'
+  smtp_from: 'kacerr.cz+utility-101-shadow@gmail.com'
+  smtp_auth_username: 'kacerr.cz@gmail.com'
+  smtp_auth_password_file: '/run/secrets/smtp_password'  # bind-mounted from ./smtp_password in docker-compose.yaml
+  smtp_require_tls: true
+
+route:
+  receiver: default     # root receiver: used when no child route below matches
+  group_by: [alertname, instance]
+  group_wait: 30s       # wait before sending the first alert in a group
+  group_interval: 5m    # wait before sending alerts for a group that has been updated
+  repeat_interval: 4h   # resend still-firing alerts after this interval
+
+  routes:
+    # warnings only go to slack + discord, not email
+    - matchers:
+        - severity = warning
+      receiver: non-critical
+
+    # critical alerts go to all channels (same receiver as the root route, kept explicit)
+    - matchers:
+        - severity = critical
+      receiver: default
+
+receivers:
+
+  # Sends to all three channels; titles fall back to alertname when the grouped alerts share no summary
+  - name: default
+    email_configs:
+      - to: 'kacerr.cz@gmail.com'
+        send_resolved: true
+    slack_configs:
+      - api_url: 'https://hooks.slack.com/services/REPLACE/WITH/YOUR_WEBHOOK'
+        channel: '#alerts'
+        send_resolved: true
+        title: '{{ if eq .Status "firing" }}:red_circle:{{ else }}:large_green_circle:{{ end }} {{ or .CommonAnnotations.summary .CommonLabels.alertname }}'
+        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
+    discord_configs:
+      - webhook_url: 'https://discord.com/api/webhooks/REPLACE/WITH_YOUR_WEBHOOK'
+        send_resolved: true
+        title: '{{ if eq .Status "firing" }}🔴{{ else }}🟢{{ end }} {{ or .CommonAnnotations.summary .CommonLabels.alertname }}'
+        message: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
+
+  # Warnings: slack + discord only; titles fall back to alertname when the grouped alerts share no summary
+  - name: non-critical
+    slack_configs:
+      - api_url: 'https://hooks.slack.com/services/REPLACE/WITH/YOUR_WEBHOOK'
+        channel: '#alerts'
+        send_resolved: true
+        title: '{{ if eq .Status "firing" }}:warning:{{ else }}:large_green_circle:{{ end }} {{ or .CommonAnnotations.summary .CommonLabels.alertname }}'
+        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
+    discord_configs:
+      - webhook_url: 'https://discord.com/api/webhooks/REPLACE/WITH_YOUR_WEBHOOK'
+        send_resolved: true
+        title: '{{ if eq .Status "firing" }}⚠️{{ else }}🟢{{ end }} {{ or .CommonAnnotations.summary .CommonLabels.alertname }}'
+        message: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
+
+inhibit_rules:
+  # An expired cert satisfies both SSL rules in alerts.yml (< 21 days and < 0 days): mute the warning while the critical one fires
+  - source_matchers:
+      - alertname = SSLCertExpired
+    target_matchers:
+      - alertname = SSLCertExpiringSoon
+    equal: [instance]  # only inhibit when both alerts carry the same instance label
--- a/vms/utility-101-shadow/docker/monitoring/alerts.yml
+++ b/vms/utility-101-shadow/docker/monitoring/alerts.yml
@@ -0,0 +1,146 @@
+groups:
+  - name: node-exporter
+    rules:
+
+      # A scrape target of any node-exporter* job has stopped answering
+      - alert: HostDown
+        expr: up{job=~"node-exporter.*"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Host down: {{ $labels.instance }}'
+          description: 'node_exporter on {{ $labels.instance }} has been unreachable for more than 2 minutes.'
+
+      # CPU busy > 85% for 5m. Busy = 1 - idle fraction averaged over cores; averaging the non-idle modes directly is capped near 1/7 and never fires.
+      - alert: HighCPULoad
+        expr: (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU load: {{ $labels.instance }}"
+          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 85%)."
+
+      # CPU busy > 95% for 5m. Busy = 1 - idle fraction averaged over cores; averaging the non-idle modes directly is capped near 1/7 and never fires.
+      - alert: CriticalCPULoad
+        expr: (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.95
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Critical CPU load: {{ $labels.instance }}"
+          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 95%)."
+
+      # Memory in use (1 - MemAvailable / MemTotal) above 90% for 5m
+      - alert: HighMemoryUsage
+        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'High memory usage: {{ $labels.instance }}'
+          description: 'Memory usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 90%).'
+
+      # Same ratio, critical tier: above 95% for 5m
+      - alert: CriticalMemoryUsage
+        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Critical memory usage: {{ $labels.instance }}'
+          description: 'Memory usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 95%).'
+
+      # Filesystem usage (1 - avail / size) above 85% for 5m; tmpfs, overlay and squashfs are skipped
+      - alert: DiskSpaceLow
+        expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes) > 0.85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Low disk space: {{ $labels.instance }} {{ $labels.mountpoint }}'
+          description: 'Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} is {{ $value | humanizePercentage }} (threshold: 85%).'
+
+      # Same ratio, critical tier: above 95% for 5m
+      - alert: DiskSpaceCritical
+        expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes) > 0.95
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Critical disk space: {{ $labels.instance }} {{ $labels.mountpoint }}'
+          description: 'Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} is {{ $value | humanizePercentage }} (threshold: 95%).'
+
+      # Linear extrapolation of the last 1h of free space reaches zero within 4h (4 * 3600 s), sustained for 30m
+      - alert: DiskWillFillIn4h
+        expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}[1h], 4 * 3600) < 0
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Disk filling up: {{ $labels.instance }} {{ $labels.mountpoint }}'
+          description: 'Disk on {{ $labels.instance }}:{{ $labels.mountpoint }} is projected to run out of space within 4 hours.'
+
+      # Share of CPU time in iowait, averaged per instance, above 20% for 10m
+      - alert: HighDiskIOWait
+        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > 0.20
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'High disk I/O wait: {{ $labels.instance }}'
+          description: 'I/O wait on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 20%).'
+
+      # OOM-kill counter increased within the last 5m; fires immediately (for: 0m)
+      - alert: OOMKillDetected
+        expr: increase(node_vmstat_oom_kill[5m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'OOM kill detected: {{ $labels.instance }}'
+          description: 'An OOM kill event was detected on {{ $labels.instance }}.'
+
+      # A systemd unit reports state "failed" for 2m (metric requires node-exporter --collector.systemd)
+      - alert: SystemdServiceFailed
+        expr: node_systemd_unit_state{state="failed"} == 1
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Systemd service failed: {{ $labels.instance }}'
+          description: 'Service {{ $labels.name }} on {{ $labels.instance }} has been in failed state for more than 2 minutes.'
+
+  - name: blackbox-https
+    rules:
+
+      # Any blackbox probe reporting probe_success == 0 for 2m
+      - alert: ProbeFailed
+        expr: probe_success == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Probe failed: {{ $labels.instance }}'
+          description: 'Endpoint {{ $labels.instance }} is unreachable or returned a non-2xx response for more than 2 minutes.'
+
+      # Days until the earliest certificate expiry (seconds / 86400) below 21, sustained for 1h
+      - alert: SSLCertExpiringSoon
+        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 21
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          summary: 'SSL cert expiring soon: {{ $labels.instance }}'
+          description: 'SSL certificate for {{ $labels.instance }} expires in {{ $value | printf "%.0f" }} days.'
+
+      # Expiry timestamp already in the past; fires immediately (for: 0m)
+      - alert: SSLCertExpired
+        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'SSL cert expired: {{ $labels.instance }}'
+          description: 'SSL certificate for {{ $labels.instance }} has expired.'
--- a/vms/utility-101-shadow/docker/monitoring/blackbox.yml
+++ b/vms/utility-101-shadow/docker/monitoring/blackbox.yml
@@ -0,0 +1,113 @@
+modules:
+
+  # ── HTTP ──────────────────────────────────────────────────────────────────
+
+  # GET over HTTP/1.1 or HTTP/2.0, following redirects; passes on a 2xx response
+  http_2xx:
+    prober: http
+    timeout: 10s
+    http:
+      method: GET
+      valid_http_versions: ['HTTP/1.1', 'HTTP/2.0']
+      valid_status_codes: []  # defaults to 2xx
+      follow_redirects: true
+      preferred_ip_protocol: 'ip4'
+
+  # POST of an empty JSON object ({}), following redirects; passes on a 2xx response
+  http_post_2xx:
+    prober: http
+    timeout: 10s
+    http:
+      method: POST
+      headers:
+        Content-Type: application/json
+      body: '{}'
+      valid_status_codes: []  # defaults to 2xx
+      follow_redirects: true
+      preferred_ip_protocol: 'ip4'
+
+  # GET that passes only on 401, without following redirects (e.g. unauthenticated API)
+  http_401_expected:
+    prober: http
+    timeout: 10s
+    http:
+      method: GET
+      valid_status_codes: [401]
+      follow_redirects: false
+      preferred_ip_protocol: 'ip4'
+
+  # ── HTTPS ─────────────────────────────────────────────────────────────────
+
+  # HTTPS GET with the TLS certificate verified (chain + expiry)
+  # Intended for production endpoints — probe_ssl_earliest_cert_expiry is populated
+  https_2xx:
+    prober: http
+    timeout: 10s
+    http:
+      method: GET
+      valid_status_codes: []  # defaults to 2xx
+      follow_redirects: true
+      preferred_ip_protocol: 'ip4'
+      tls_config:
+        insecure_skip_verify: false
+
+  # HTTPS GET with TLS certificate verification switched off
+  # Intended for self-signed certs or internal CAs the exporter does not trust
+  https_insecure:
+    prober: http
+    timeout: 10s
+    http:
+      method: GET
+      valid_status_codes: []  # defaults to 2xx
+      follow_redirects: true
+      preferred_ip_protocol: 'ip4'
+      tls_config:
+        insecure_skip_verify: true
+
+  # ── TCP ───────────────────────────────────────────────────────────────────
+
+  # Plain TCP connect — passes when the port accepts a connection
+  # Targets are written as host:port  (e.g. "192.168.0.30:5432")
+  tcp_connect:
+    prober: tcp
+    timeout: 10s
+    tcp:
+      preferred_ip_protocol: 'ip4'
+
+  # ── ICMP ──────────────────────────────────────────────────────────────────
+
+  # ICMP echo — host reachability and latency
+  # Note: the exporter needs the NET_RAW capability or must run as root
+  icmp:
+    prober: icmp
+    timeout: 10s
+    icmp:
+      preferred_ip_protocol: 'ip4'
+
+  # ── DNS ───────────────────────────────────────────────────────────────────
+
+  # DNS lookup over UDP (standard)
+  # Target: resolver IP  (e.g. "8.8.8.8")
+  # Query name and type are fixed in this module (below); prometheus.yml passes only module and target
+  dns_udp:
+    prober: dns
+    timeout: 10s
+    dns:
+      transport_protocol: 'udp'
+      preferred_ip_protocol: 'ip4'
+      query_name: 'example.com'
+      query_type: 'A'
+      valid_rcodes:
+        - NOERROR
+
+  # Same query as dns_udp, sent over TCP
+  dns_tcp:
+    prober: dns
+    timeout: 10s
+    dns:
+      transport_protocol: 'tcp'
+      preferred_ip_protocol: 'ip4'
+      query_name: 'example.com'
+      query_type: 'A'
+      valid_rcodes:
+        - NOERROR
--- a/vms/utility-101-shadow/docker/monitoring/docker-compose.yaml
+++ b/vms/utility-101-shadow/docker/monitoring/docker-compose.yaml
@@ -0,0 +1,57 @@
+# Compose Specification file: the obsolete top-level "version" key is omitted ("docker compose" v2 ignores it and prints a warning)
+
+services:
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: prometheus
+    restart: unless-stopped
+    ports:
+      - '9090:9090'
+    volumes:
+      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./alerts.yml:/etc/prometheus/alerts.yml:ro  # loaded through rule_files in prometheus.yml
+      - ./data:/prometheus  # TSDB directory; install.md chowns it to 65534:65534
+      - /etc/timezone:/etc/timezone:ro
+      - /etc/localtime:/etc/localtime:ro
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--storage.tsdb.retention.time=60d'
+      - '--web.enable-lifecycle'  # NOTE(review): lifecycle HTTP endpoints are on while 9090 is published on the host — confirm network exposure
+    networks:
+      - monitoring-network
+
+  blackbox-exporter:
+    image: prom/blackbox-exporter:latest
+    container_name: blackbox-exporter
+    restart: unless-stopped
+    ports:
+      - '9115:9115'
+    volumes:
+      - ./blackbox.yml:/etc/blackbox_exporter/config.yml:ro  # probe module definitions
+      - /etc/timezone:/etc/timezone:ro
+      - /etc/localtime:/etc/localtime:ro
+    networks:
+      - monitoring-network
+
+  alertmanager:
+    image: prom/alertmanager:latest
+    container_name: alertmanager
+    restart: unless-stopped
+    ports:
+      - '9093:9093'
+    volumes:
+      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - ./alertmanager-data:/alertmanager  # passed to --storage.path below
+      - ./smtp_password:/run/secrets/smtp_password:ro  # read via smtp_auth_password_file in alertmanager.yml
+      - /etc/timezone:/etc/timezone:ro
+      - /etc/localtime:/etc/localtime:ro
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+    networks:
+      - monitoring-network
+
+networks:
+  monitoring-network:  # shared by all three services; prometheus.yml addresses alertmanager and blackbox-exporter by service name
+    driver: bridge
--- a/vms/utility-101-shadow/docker/monitoring/install.md
+++ b/vms/utility-101-shadow/docker/monitoring/install.md
@@ -0,0 +1,20 @@
+## extra installation needs
+
+```bash
+mkdir -p /srv/docker/monitoring/data
+chown 65534:65534 /srv/docker/monitoring/data
+
+# google smtp password
+echo -n 'correct google app password' > smtp_password
+chmod 600 smtp_password
+
+chown 65534:65534 /srv/docker/monitoring/smtp_password
+
+# enable systemd unit
+cp /srv/docker/monitoring/monitoring.service /etc/systemd/system/monitoring.service
+systemctl daemon-reload
+systemctl enable --now monitoring
+
+
+
+```
--- a/vms/utility-101-shadow/docker/monitoring/monitoring.service
+++ b/vms/utility-101-shadow/docker/monitoring/monitoring.service
@@ -0,0 +1,15 @@
+[Unit]
+Description=Monitoring Stack (Prometheus + Blackbox Exporter + Alertmanager)
+After=docker.service network-online.target
+Requires=docker.service
+# oneshot + RemainAfterExit: ExecStart returns after "compose up -d", and the unit then stays active until it is stopped
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+WorkingDirectory=/srv/docker/monitoring
+ExecStart=/usr/bin/docker compose up -d --remove-orphans
+ExecStop=/usr/bin/docker compose down
+TimeoutStartSec=300
+# enabled with "systemctl enable --now monitoring" (see install.md)
+[Install]
+WantedBy=multi-user.target
--- a/vms/utility-101-shadow/docker/monitoring/prometheus.yml
+++ b/vms/utility-101-shadow/docker/monitoring/prometheus.yml
@@ -0,0 +1,142 @@
+global:
+  scrape_interval: 60s
+  evaluation_interval: 60s
+  scrape_timeout: 15s  # longer than the 10s probe timeout set on every module in blackbox.yml
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - alertmanager:9093  # compose service name on monitoring-network
+
+rule_files:
+  - /etc/prometheus/alerts.yml  # ./alerts.yml, bind-mounted read-only by docker-compose.yaml
+
+scrape_configs:
+
+  # ── Prometheus itself ──────────────────────────────────────────────────────
+  - job_name: prometheus
+    static_configs:
+      - targets:
+          - localhost:9090
+
+  # ── Blackbox Exporter own metrics ─────────────────────────────────────────
+  - job_name: blackbox
+    static_configs:
+      - targets:
+          - blackbox-exporter:9115
+
+  # ── Node Exporter (host: docker) ──────────────────────────────────────────
+  - job_name: node-exporter-docker  # must keep matching job=~"node-exporter.*" used by HostDown in alerts.yml
+    static_configs:
+      - targets:
+          - docker:9100
+        labels:
+          instance: docker  # alerts show "docker" rather than "docker:9100"
+
+  # ── HTTPS probes (TLS verified) ───────────────────────────────────────────
+  - job_name: blackbox-https
+    metrics_path: /probe
+    params:
+      module: [https_2xx]  # module defined in blackbox.yml
+    static_configs:
+      - targets:
+          - https://fuj-management.home.hrajfrisbee.cz/
+          - https://gitea.home.hrajfrisbee.cz/
+          - https://vault.hrajfrisbee.cz/
+          - https://idm.home.hrajfrisbee.cz/
+          - https://maru-hleda-byt.home.hrajfrisbee.cz/mapa_bytu.html
+          # - https://nonexistent.home.hrajfrisbee.cz/
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target  # the listed URL becomes the ?target= query parameter
+      - source_labels: [__param_target]
+        target_label: instance  # alerts show the probed URL as instance
+      - target_label: __address__
+        replacement: blackbox-exporter:9115  # the scrape itself is sent to the exporter
+
+  # ── HTTP probes (plain HTTP, no TLS) ──────────────────────────────────────
+  - job_name: blackbox-http
+    metrics_path: /probe
+    params:
+      module: [http_2xx]  # module defined in blackbox.yml
+    static_configs:
+      - targets:  # no targets yet: the list stays empty until an entry below is uncommented
+          # - http://192.168.0.30:8080/
+          # - http://some-internal-service:port/healthz
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target  # the listed URL becomes the ?target= query parameter
+      - source_labels: [__param_target]
+        target_label: instance  # alerts show the probed URL as instance
+      - target_label: __address__
+        replacement: blackbox-exporter:9115  # the scrape itself is sent to the exporter
+
+  # ── HTTP POST probes ───────────────────────────────────────────────────────
+  - job_name: blackbox-http-post
+    metrics_path: /probe
+    params:
+      module: [http_post_2xx]  # module defined in blackbox.yml
+    static_configs:
+      - targets:  # no targets yet: the list stays empty until an entry below is uncommented
+          # - http://some-api/endpoint
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target  # the listed URL becomes the ?target= query parameter
+      - source_labels: [__param_target]
+        target_label: instance  # alerts show the probed URL as instance
+      - target_label: __address__
+        replacement: blackbox-exporter:9115  # the scrape itself is sent to the exporter
+
+  # ── TCP port probes ────────────────────────────────────────────────────────
+  - job_name: blackbox-tcp
+    metrics_path: /probe
+    params:
+      module: [tcp_connect]  # module defined in blackbox.yml
+    static_configs:
+      - targets:  # no targets yet: the list stays empty until an entry below is uncommented
+          # - 192.168.0.30:5432   # postgres
+          # - 192.168.0.30:6379   # redis
+          # - 192.168.0.30:22     # ssh
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target  # the listed host:port becomes the ?target= query parameter
+      - source_labels: [__param_target]
+        target_label: instance  # alerts show the probed host:port as instance
+      - target_label: __address__
+        replacement: blackbox-exporter:9115  # the scrape itself is sent to the exporter
+
+  # ── ICMP ping probes ───────────────────────────────────────────────────────
+  - job_name: blackbox-icmp
+    metrics_path: /probe
+    params:
+      module: [icmp]  # module defined in blackbox.yml
+    static_configs:
+      - targets:  # no targets yet: the list stays empty until an entry below is uncommented
+          # - 192.168.0.30
+          # - 192.168.0.1    # gateway
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target  # the listed host becomes the ?target= query parameter
+      - source_labels: [__param_target]
+        target_label: instance  # alerts show the probed host as instance
+      - target_label: __address__
+        replacement: blackbox-exporter:9115  # the scrape itself is sent to the exporter
+
+  # ── DNS probes ─────────────────────────────────────────────────────────────
+  - job_name: blackbox-dns
+    metrics_path: /probe
+    params:
+      module: [dns_udp]  # queried name/type (example.com, A) are fixed by this module in blackbox.yml
+    static_configs:
+      - targets:  # no targets yet: the list stays empty until an entry below is uncommented
+          # - 8.8.8.8           # Google DNS
+          # - 1.1.1.1           # Cloudflare DNS
+          # - 192.168.0.1       # local resolver
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target  # the listed resolver becomes the ?target= query parameter
+      - source_labels: [__param_target]
+        target_label: instance  # alerts show the probed resolver as instance
+      - target_label: __address__
+        replacement: blackbox-exporter:9115  # the scrape itself is sent to the exporter