vms: add monitoring stack and node-exporter for docker host
utility-101-shadow:
- Add full monitoring stack (Prometheus + Blackbox Exporter + Alertmanager) with Docker Compose and a systemd unit (monitoring.service)
- Prometheus scrapes: itself, blackbox-exporter, and node-exporter on the docker host (docker:9100); blackbox probes cover HTTPS endpoints with TLS cert monitoring
- Alertmanager routes warnings to Slack/Discord, critical alerts also to email (Gmail SMTP); inhibit rule suppresses SSLCertExpiringSoon when SSLCertExpired already fires
- Alert rules: 11 node-exporter alerts (host down, CPU, memory, disk fill/prediction, iowait, OOM kill, systemd failed units) + 3 blackbox alerts (probe failed, SSL expiring, SSL expired)
- readme: add services list and Docker Engine installation steps

docker host:
- Add node-exporter container running with host pid/network and read-only mounts of /proc, /sys, / for full host metrics visibility
- Enable --collector.systemd for systemd unit state metrics
- Add systemd unit (node-exporter.service) to manage the container

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
113
vms/utility-101-shadow/docker/monitoring/blackbox.yml
Normal file
113
vms/utility-101-shadow/docker/monitoring/blackbox.yml
Normal file
@@ -0,0 +1,113 @@
|
||||
---
# Blackbox Exporter probe modules.
# Each key under `modules` is a named probe definition; a probe request
# selects one of them by name.
modules:

  # ── HTTP ──────────────────────────────────────────────────────────────────

  # Plain HTTP GET, expects 2xx response
  http_2xx:
    prober: http
    timeout: 10s
    http:
      method: GET
      valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
      valid_status_codes: []  # defaults to 2xx
      follow_redirects: true
      preferred_ip_protocol: "ip4"

  # HTTP POST with JSON body, expects 2xx response
  http_post_2xx:
    prober: http
    timeout: 10s
    http:
      method: POST
      headers:
        Content-Type: application/json
      body: '{}'
      valid_status_codes: []  # defaults to 2xx
      follow_redirects: true
      preferred_ip_protocol: "ip4"

  # HTTP GET for endpoints that must return 401 (e.g. unauthenticated API)
  http_401_expected:
    prober: http
    timeout: 10s
    http:
      method: GET
      valid_status_codes: [401]
      # Redirects are not followed, so the 401 must come from the probed URL
      # itself rather than from wherever it redirects to.
      follow_redirects: false
      preferred_ip_protocol: "ip4"

  # ── HTTPS ─────────────────────────────────────────────────────────────────

  # HTTPS GET, TLS certificate verified (chain + expiry)
  # Use this for production endpoints — probe_ssl_earliest_cert_expiry is populated
  https_2xx:
    prober: http
    timeout: 10s
    http:
      method: GET
      valid_status_codes: []  # defaults to 2xx
      follow_redirects: true
      # Fail the probe when the final response was not served over TLS
      # (e.g. an http:// target, or a redirect that ends on plain HTTP);
      # otherwise such a probe would pass without any certificate check.
      fail_if_not_ssl: true
      preferred_ip_protocol: "ip4"
      tls_config:
        insecure_skip_verify: false

  # HTTPS GET, skip TLS certificate verification
  # Use for self-signed certs or internal CAs not trusted by the exporter
  https_insecure:
    prober: http
    timeout: 10s
    http:
      method: GET
      valid_status_codes: []  # defaults to 2xx
      follow_redirects: true
      # Still require TLS on the final response; only verification of the
      # certificate is skipped, not the use of HTTPS.
      fail_if_not_ssl: true
      preferred_ip_protocol: "ip4"
      tls_config:
        insecure_skip_verify: true

  # ── TCP ───────────────────────────────────────────────────────────────────

  # Raw TCP connect — checks port is open and accepting connections
  # Target format: host:port (e.g. "192.168.0.30:5432")
  tcp_connect:
    prober: tcp
    timeout: 10s
    tcp:
      preferred_ip_protocol: "ip4"

  # ── ICMP ──────────────────────────────────────────────────────────────────

  # ICMP ping — checks host reachability and latency
  # Note: requires NET_RAW capability or running as root
  icmp:
    prober: icmp
    timeout: 10s
    icmp:
      preferred_ip_protocol: "ip4"

  # ── DNS ───────────────────────────────────────────────────────────────────

  # DNS lookup via UDP (standard)
  # Target: resolver IP (e.g. "8.8.8.8")
  # The query name and type are fixed in this module (query_name / query_type
  # below); a probe request cannot override them, so define one module per
  # record that needs to be checked.
  dns_udp:
    prober: dns
    timeout: 10s
    dns:
      transport_protocol: "udp"
      preferred_ip_protocol: "ip4"
      query_name: "example.com"
      query_type: "A"
      valid_rcodes:
        - NOERROR

  # DNS lookup via TCP
  # Same fixed query as dns_udp, sent over TCP instead of UDP.
  dns_tcp:
    prober: dns
    timeout: 10s
    dns:
      transport_protocol: "tcp"
      preferred_ip_protocol: "ip4"
      query_name: "example.com"
      query_type: "A"
      valid_rcodes:
        - NOERROR
|
||||
Reference in New Issue
Block a user