vms: add monitoring stack and node-exporter for docker host
utility-101-shadow: - Add full monitoring stack (Prometheus + Blackbox Exporter + Alertmanager) with Docker Compose and a systemd unit (monitoring.service) - Prometheus scrapes: itself, blackbox-exporter, and node-exporter on the docker host (docker:9100); blackbox probes cover HTTPS endpoints with TLS cert monitoring - Alertmanager routes warnings to Slack/Discord, critical alerts also to email (Gmail SMTP); inhibit rule suppresses SSLCertExpiringSoon when SSLCertExpired already fires - Alert rules: 11 node-exporter alerts (host down, CPU, memory, disk fill/prediction, iowait, OOM kill, systemd failed units) + 3 blackbox alerts (probe failed, SSL expiring, SSL expired) - readme: add services list and Docker Engine installation steps docker host: - Add node-exporter container running with host pid/network and read-only mounts of /proc, /sys, / for full host metrics visibility - Enable --collector.systemd for systemd unit state metrics - Add systemd unit (node-exporter.service) to manage the container Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
3
vms/docker/install.md
Normal file
3
vms/docker/install.md
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
## expected services
|
||||||
|
|
||||||
|
- tailscaled
|
||||||
19
vms/docker/node-exporter/docker-compose.yaml
Normal file
19
vms/docker/node-exporter/docker-compose.yaml
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# Node Exporter for the docker host.
# Runs in the host PID and network namespaces with read-only views of
# /proc, /sys and / so the exporter reports host metrics rather than
# container metrics. Prometheus scrapes it at docker:9100.
services:
  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    restart: unless-stopped
    pid: host
    network_mode: host
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/host/root:ro,rslave
      - /etc/timezone:/etc/timezone:ro
      - /etc/localtime:/etc/localtime:ro
      # --collector.systemd queries systemd over the D-Bus system bus; without
      # this socket the collector cannot connect from inside the container and
      # no node_systemd_* metrics are produced.
      # NOTE(review): if the collector still reports permission errors, the
      # container may additionally need `user: root` - confirm on the host.
      - /run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro
    command:
      # Point the collectors at the host mounts declared above.
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/host/root'
      # Skip pseudo filesystems and per-container Docker mounts.
      - '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/docker/.+)($|/)'
      # Needed by the SystemdServiceFailed alert (node_systemd_unit_state).
      - '--collector.systemd'
|
||||||
15
vms/docker/node-exporter/node-exporter.service
Normal file
15
vms/docker/node-exporter/node-exporter.service
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# systemd wrapper that brings the node-exporter Compose project up and down.
[Unit]
Description=Node Exporter
# After= only orders units; Wants= is what actually pulls
# network-online.target into the transaction so the ordering has an effect.
Wants=network-online.target
After=docker.service network-online.target
Requires=docker.service

[Service]
# `docker compose up -d` returns once the containers are started, so the unit
# is a oneshot that stays "active" until ExecStop tears the project down.
Type=oneshot
RemainAfterExit=yes
WorkingDirectory=/srv/docker/node-exporter
ExecStart=/usr/bin/docker compose up -d --remove-orphans
ExecStop=/usr/bin/docker compose down
# Allow time for an image pull on first start.
TimeoutStartSec=120

[Install]
WantedBy=multi-user.target
|
||||||
68
vms/utility-101-shadow/docker/monitoring/alertmanager.yml
Normal file
68
vms/utility-101-shadow/docker/monitoring/alertmanager.yml
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
# Alertmanager configuration for the utility-101-shadow monitoring stack.
global:
  resolve_timeout: 5m

  # Outgoing mail settings (Gmail SMTP) shared by every email receiver.
  smtp_smarthost: "smtp.gmail.com:587"
  smtp_from: "kacerr.cz+utility-101-shadow@gmail.com"
  smtp_auth_username: "kacerr.cz@gmail.com"
  # The password is read from a file mounted into the container.
  smtp_auth_password_file: "/run/secrets/smtp_password"
  smtp_require_tls: true

route:
  # Fallback receiver for anything not matched by a child route.
  receiver: default
  group_by:
    - alertname
    - instance
  # Delay before the first notification of a new group.
  group_wait: 30s
  # Delay before notifying about changes to an already-notified group.
  group_interval: 5m
  # Re-notify about still-firing alerts after this long.
  repeat_interval: 4h

  routes:
    # Warnings: Slack + Discord only, no email.
    - receiver: non-critical
      matchers:
        - severity = warning

    # Critical alerts: every channel.
    - receiver: default
      matchers:
        - severity = critical

receivers:
  # Email + Slack + Discord.
  - name: default
    email_configs:
      - to: "kacerr.cz@gmail.com"
        send_resolved: true
    slack_configs:
      - api_url: "https://hooks.slack.com/services/REPLACE/WITH/YOUR_WEBHOOK"
        channel: "#alerts"
        send_resolved: true
        title: >-
          {{ if eq .Status "firing" }}:red_circle:{{ else }}:large_green_circle:{{ end }} {{ .CommonAnnotations.summary }}
        text: >-
          {{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}
    discord_configs:
      - webhook_url: "https://discord.com/api/webhooks/REPLACE/WITH_YOUR_WEBHOOK"
        send_resolved: true
        title: >-
          {{ if eq .Status "firing" }}🔴{{ else }}🟢{{ end }} {{ .CommonAnnotations.summary }}
        message: >-
          {{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}

  # Slack + Discord only (used for warnings).
  - name: non-critical
    slack_configs:
      - api_url: "https://hooks.slack.com/services/REPLACE/WITH/YOUR_WEBHOOK"
        channel: "#alerts"
        send_resolved: true
        title: >-
          {{ if eq .Status "firing" }}:warning:{{ else }}:large_green_circle:{{ end }} {{ .CommonAnnotations.summary }}
        text: >-
          {{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}
    discord_configs:
      - webhook_url: "https://discord.com/api/webhooks/REPLACE/WITH_YOUR_WEBHOOK"
        send_resolved: true
        title: >-
          {{ if eq .Status "firing" }}⚠️{{ else }}🟢{{ end }} {{ .CommonAnnotations.summary }}
        message: >-
          {{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}

inhibit_rules:
  # An already-expired certificate makes the "expiring soon" warning for the
  # same instance redundant, so mute it while SSLCertExpired is firing.
  - source_matchers:
      - alertname = SSLCertExpired
    target_matchers:
      - alertname = SSLCertExpiringSoon
    equal:
      - instance
|
||||||
146
vms/utility-101-shadow/docker/monitoring/alerts.yml
Normal file
146
vms/utility-101-shadow/docker/monitoring/alerts.yml
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
# Prometheus alerting rules: host-level alerts from node_exporter and
# endpoint/TLS alerts from the blackbox exporter.
groups:
  - name: node-exporter
    rules:

      # node_exporter target cannot be scraped.
      - alert: HostDown
        expr: up{job=~"node-exporter.*"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Host down: {{ $labels.instance }}"
          description: "node_exporter on {{ $labels.instance }} has been unreachable for more than 2 minutes."

      # CPU > 85% for 5m.
      # Usage is computed as 1 - idle fraction. Averaging the non-idle series
      # directly would average across modes as well as CPUs and yield roughly
      # usage / number-of-modes, which can never reach the threshold.
      - alert: HighCPULoad
        expr: (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load: {{ $labels.instance }}"
          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 85%)."

      # CPU > 95% for 5m (same 1 - idle formulation as HighCPULoad).
      - alert: CriticalCPULoad
        expr: (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical CPU load: {{ $labels.instance }}"
          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 95%)."

      # RAM used > 90% for 5m.
      - alert: HighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage: {{ $labels.instance }}"
          description: "Memory usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 90%)."

      # RAM used > 95% for 5m.
      - alert: CriticalMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical memory usage: {{ $labels.instance }}"
          description: "Memory usage on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 95%)."

      # Disk used > 85% for 5m (tmpfs/overlay/squashfs mounts are ignored).
      - alert: DiskSpaceLow
        expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes) > 0.85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space: {{ $labels.instance }} {{ $labels.mountpoint }}"
          description: "Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} is {{ $value | humanizePercentage }} (threshold: 85%)."

      # Disk used > 95% for 5m.
      - alert: DiskSpaceCritical
        expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes) > 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical disk space: {{ $labels.instance }} {{ $labels.mountpoint }}"
          description: "Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} is {{ $value | humanizePercentage }} (threshold: 95%)."

      # Linear extrapolation of the last hour predicts zero free bytes
      # within 4 hours.
      - alert: DiskWillFillIn4h
        expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}[1h], 4 * 3600) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Disk filling up: {{ $labels.instance }} {{ $labels.mountpoint }}"
          description: "Disk on {{ $labels.instance }}:{{ $labels.mountpoint }} is projected to run out of space within 4 hours."

      # iowait > 20% for 10m (single mode, so averaging over CPUs is correct).
      - alert: HighDiskIOWait
        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > 0.20
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk I/O wait: {{ $labels.instance }}"
          description: "I/O wait on {{ $labels.instance }} is {{ $value | humanizePercentage }} (threshold: 20%)."

      # OOM-kill counter increased within the last 5 minutes.
      - alert: OOMKillDetected
        expr: increase(node_vmstat_oom_kill[5m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "OOM kill detected: {{ $labels.instance }}"
          description: "An OOM kill event was detected on {{ $labels.instance }}."

      # systemd unit in failed state (requires --collector.systemd).
      - alert: SystemdServiceFailed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Systemd service failed: {{ $labels.instance }}"
          description: "Service {{ $labels.name }} on {{ $labels.instance }} has been in failed state for more than 2 minutes."

  - name: blackbox-https
    rules:

      # Probe returned non-2xx or timed out.
      - alert: ProbeFailed
        expr: probe_success == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Probe failed: {{ $labels.instance }}"
          description: "Endpoint {{ $labels.instance }} is unreachable or returned a non-2xx response for more than 2 minutes."

      # SSL certificate expires within 21 days ($value is days remaining).
      - alert: SSLCertExpiringSoon
        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 21
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL cert expiring soon: {{ $labels.instance }}"
          description: "SSL certificate for {{ $labels.instance }} expires in {{ $value | printf \"%.0f\" }} days."

      # SSL certificate already expired.
      - alert: SSLCertExpired
        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "SSL cert expired: {{ $labels.instance }}"
          description: "SSL certificate for {{ $labels.instance }} has expired."
|
||||||
113
vms/utility-101-shadow/docker/monitoring/blackbox.yml
Normal file
113
vms/utility-101-shadow/docker/monitoring/blackbox.yml
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
# Blackbox exporter probe modules. A scrape job selects one of these by name
# through its `module` URL parameter.
modules:

  # ---- HTTP ----------------------------------------------------------------

  # GET over HTTP/1.1 or HTTP/2; any 2xx status is a success.
  http_2xx:
    prober: http
    timeout: 10s
    http:
      method: GET
      valid_http_versions:
        - HTTP/1.1
        - HTTP/2.0
      valid_status_codes: []  # empty list = default (2xx)
      follow_redirects: true
      preferred_ip_protocol: ip4

  # POST with an empty JSON object as body; any 2xx status is a success.
  http_post_2xx:
    prober: http
    timeout: 10s
    http:
      method: POST
      headers:
        Content-Type: application/json
      body: '{}'
      valid_status_codes: []  # empty list = default (2xx)
      follow_redirects: true
      preferred_ip_protocol: ip4

  # GET that must be answered with 401 (e.g. an API that requires auth).
  # Redirects are not followed, so the first response is the one checked.
  http_401_expected:
    prober: http
    timeout: 10s
    http:
      method: GET
      valid_status_codes:
        - 401
      follow_redirects: false
      preferred_ip_protocol: ip4

  # ---- HTTPS ---------------------------------------------------------------

  # GET with the server certificate verified (chain + expiry).
  # Intended for production endpoints; populates
  # probe_ssl_earliest_cert_expiry for the SSL alerts.
  https_2xx:
    prober: http
    timeout: 10s
    http:
      method: GET
      valid_status_codes: []  # empty list = default (2xx)
      follow_redirects: true
      preferred_ip_protocol: ip4
      tls_config:
        insecure_skip_verify: false

  # GET without certificate verification, for self-signed certificates or
  # internal CAs the exporter does not trust.
  https_insecure:
    prober: http
    timeout: 10s
    http:
      method: GET
      valid_status_codes: []  # empty list = default (2xx)
      follow_redirects: true
      preferred_ip_protocol: ip4
      tls_config:
        insecure_skip_verify: true

  # ---- TCP -----------------------------------------------------------------

  # Plain TCP connect: succeeds when the port accepts a connection.
  # Target format: host:port (e.g. "192.168.0.30:5432").
  tcp_connect:
    prober: tcp
    timeout: 10s
    tcp:
      preferred_ip_protocol: ip4

  # ---- ICMP ----------------------------------------------------------------

  # Ping: host reachability and latency.
  # Note: needs the NET_RAW capability or root.
  icmp:
    prober: icmp
    timeout: 10s
    icmp:
      preferred_ip_protocol: ip4

  # ---- DNS -----------------------------------------------------------------

  # DNS query over UDP. The probe target is the resolver IP (e.g. "8.8.8.8");
  # the query name and type are fixed here in the module.
  dns_udp:
    prober: dns
    timeout: 10s
    dns:
      transport_protocol: udp
      preferred_ip_protocol: ip4
      query_name: example.com
      query_type: A
      valid_rcodes:
        - NOERROR

  # Same query as dns_udp, sent over TCP.
  dns_tcp:
    prober: dns
    timeout: 10s
    dns:
      transport_protocol: tcp
      preferred_ip_protocol: ip4
      query_name: example.com
      query_type: A
      valid_rcodes:
        - NOERROR
|
||||||
57
vms/utility-101-shadow/docker/monitoring/docker-compose.yaml
Normal file
57
vms/utility-101-shadow/docker/monitoring/docker-compose.yaml
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
# Monitoring stack: Prometheus + Blackbox Exporter + Alertmanager.
# The top-level `version` key is intentionally omitted: Docker Compose v2
# (the `docker compose` plugin used by monitoring.service) treats it as
# obsolete and only emits a warning for it.
services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./alerts.yml:/etc/prometheus/alerts.yml:ro
      # TSDB storage; the host directory must be writable by the container
      # user (see install.md).
      - ./data:/prometheus
      - /etc/timezone:/etc/timezone:ro
      - /etc/localtime:/etc/localtime:ro
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=60d'
      # Enables the HTTP reload/quit endpoints.
      # NOTE(review): port 9090 is published on all host interfaces, so these
      # endpoints are reachable by anything that can reach the host.
      - '--web.enable-lifecycle'
    networks:
      - monitoring-network

  blackbox-exporter:
    image: prom/blackbox-exporter:latest
    container_name: blackbox-exporter
    restart: unless-stopped
    ports:
      - "9115:9115"
    volumes:
      - ./blackbox.yml:/etc/blackbox_exporter/config.yml:ro
      - /etc/timezone:/etc/timezone:ro
      - /etc/localtime:/etc/localtime:ro
    networks:
      - monitoring-network

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: unless-stopped
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - ./alertmanager-data:/alertmanager
      # Gmail app password referenced by smtp_auth_password_file.
      - ./smtp_password:/run/secrets/smtp_password:ro
      - /etc/timezone:/etc/timezone:ro
      - /etc/localtime:/etc/localtime:ro
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    networks:
      - monitoring-network

# Shared bridge network so the services can reach each other by service name
# (prometheus.yml addresses alertmanager:9093 and blackbox-exporter:9115).
networks:
  monitoring-network:
    driver: bridge
|
||||||
20
vms/utility-101-shadow/docker/monitoring/install.md
Normal file
20
vms/utility-101-shadow/docker/monitoring/install.md
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
## extra installation needs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
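# data directories are bind-mounted into prometheus and alertmanager
mkdir -p /srv/docker/monitoring/data /srv/docker/monitoring/alertmanager-data
chown 65534:65534 /srv/docker/monitoring/data /srv/docker/monitoring/alertmanager-data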
|
||||||
|
|
||||||
|
# google smtp password
|
||||||
|
echo -n 'correct google app password' > /srv/docker/monitoring/smtp_password
chmod 600 /srv/docker/monitoring/smtp_password
chown 65534:65534 /srv/docker/monitoring/smtp_password
|
||||||
|
|
||||||
|
# enable systemd unit
|
||||||
|
cp /srv/docker/monitoring/monitoring.service /etc/systemd/system/monitoring.service
|
||||||
|
systemctl daemon-reload
|
||||||
|
systemctl enable --now monitoring
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
15
vms/utility-101-shadow/docker/monitoring/monitoring.service
Normal file
15
vms/utility-101-shadow/docker/monitoring/monitoring.service
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# systemd wrapper that brings the monitoring Compose project up and down.
[Unit]
Description=Monitoring Stack (Prometheus + Blackbox Exporter + Alertmanager)
# After= only orders units; Wants= is what actually pulls
# network-online.target into the transaction so the ordering has an effect.
Wants=network-online.target
After=docker.service network-online.target
Requires=docker.service

[Service]
# `docker compose up -d` returns once the containers are started, so the unit
# is a oneshot that stays "active" until ExecStop tears the project down.
Type=oneshot
RemainAfterExit=yes
WorkingDirectory=/srv/docker/monitoring
ExecStart=/usr/bin/docker compose up -d --remove-orphans
ExecStop=/usr/bin/docker compose down
# Three images may need to be pulled on first start.
TimeoutStartSec=300

[Install]
WantedBy=multi-user.target
|
||||||
142
vms/utility-101-shadow/docker/monitoring/prometheus.yml
Normal file
142
vms/utility-101-shadow/docker/monitoring/prometheus.yml
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
# Prometheus configuration for the utility-101-shadow monitoring stack.
global:
  scrape_interval: 60s
  evaluation_interval: 60s
  scrape_timeout: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

rule_files:
  - /etc/prometheus/alerts.yml

scrape_configs:

  # ---- Prometheus itself ---------------------------------------------------
  - job_name: prometheus
    static_configs:
      - targets:
          - localhost:9090

  # ---- Blackbox Exporter own metrics ---------------------------------------
  - job_name: blackbox
    static_configs:
      - targets:
          - blackbox-exporter:9115

  # ---- Node Exporter (host: docker) ----------------------------------------
  # The job name must keep the "node-exporter" prefix: the HostDown alert
  # selects on job=~"node-exporter.*".
  - job_name: node-exporter-docker
    static_configs:
      - targets:
          - docker:9100
        labels:
          instance: docker

  # All blackbox-* probe jobs below share the same relabelling:
  #   1. the configured target becomes the ?target= URL parameter,
  #   2. that value is also kept as the `instance` label,
  #   3. the actual scrape address is rewritten to the blackbox exporter.

  # ---- HTTPS probes (TLS verified) -----------------------------------------
  - job_name: blackbox-https
    metrics_path: /probe
    params:
      module: [https_2xx]
    static_configs:
      - targets:
          - https://fuj-management.home.hrajfrisbee.cz/
          - https://gitea.home.hrajfrisbee.cz/
          - https://vault.hrajfrisbee.cz/
          - https://idm.home.hrajfrisbee.cz/
          - https://maru-hleda-byt.home.hrajfrisbee.cz/mapa_bytu.html
          # - https://nonexistent.home.hrajfrisbee.cz/
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

  # ---- HTTP probes (plain HTTP, no TLS) ------------------------------------
  - job_name: blackbox-http
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      # No targets yet. Replace [] with a block list, e.g.:
      #   - http://192.168.0.30:8080/
      #   - http://some-internal-service:port/healthz
      - targets: []
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

  # ---- HTTP POST probes ----------------------------------------------------
  - job_name: blackbox-http-post
    metrics_path: /probe
    params:
      module: [http_post_2xx]
    static_configs:
      # No targets yet. Replace [] with a block list, e.g.:
      #   - http://some-api/endpoint
      - targets: []
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

  # ---- TCP port probes -----------------------------------------------------
  - job_name: blackbox-tcp
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      # No targets yet. Replace [] with a block list, e.g.:
      #   - 192.168.0.30:5432   # postgres
      #   - 192.168.0.30:6379   # redis
      #   - 192.168.0.30:22     # ssh
      - targets: []
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

  # ---- ICMP ping probes ----------------------------------------------------
  - job_name: blackbox-icmp
    metrics_path: /probe
    params:
      module: [icmp]
    static_configs:
      # No targets yet. Replace [] with a block list, e.g.:
      #   - 192.168.0.30
      #   - 192.168.0.1   # gateway
      - targets: []
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

  # ---- DNS probes ----------------------------------------------------------
  - job_name: blackbox-dns
    metrics_path: /probe
    params:
      module: [dns_udp]
    static_configs:
      # No targets yet. Replace [] with a block list of resolver IPs, e.g.:
      #   - 8.8.8.8       # Google DNS
      #   - 1.1.1.1       # Cloudflare DNS
      #   - 192.168.0.1   # local resolver
      - targets: []
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115
|
||||||
@@ -1,3 +1,37 @@
|
|||||||
|
## Services
|
||||||
|
|
||||||
|
- wireguard
|
||||||
|
- openvpn server
|
||||||
|
- dns server (bind)
|
||||||
|
|
||||||
|
## Docker
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# installation
# 1-2. Install prerequisites
sudo apt-get update
sudo apt-get install -y ca-certificates curl
|
||||||
|
|
||||||
|
# 3. Add Docker's official GPG key
|
||||||
|
sudo install -m 0755 -d /etc/apt/keyrings
|
||||||
|
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
|
||||||
|
sudo chmod a+r /etc/apt/keyrings/docker.asc
|
||||||
|
|
||||||
|
# 4. Add Docker apt repository
|
||||||
|
echo \
|
||||||
|
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
|
||||||
|
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
|
||||||
|
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
|
||||||
|
|
||||||
|
# 5. Install Docker Engine
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
|
||||||
|
|
||||||
|
# 6. Enable and start Docker
|
||||||
|
sudo systemctl enable docker
|
||||||
|
sudo systemctl start docker
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
## named tweaks
|
## named tweaks
|
||||||
|
|
||||||
1. Generate TSIG key
|
1. Generate TSIG key
|
||||||
|
|||||||
Reference in New Issue
Block a user