vms: add monitoring stack and node-exporter for docker host
utility-101-shadow:
- Add full monitoring stack (Prometheus + Blackbox Exporter + Alertmanager) with Docker Compose and a systemd unit (monitoring.service)
- Prometheus scrapes: itself, blackbox-exporter, and node-exporter on the docker host (docker:9100); blackbox probes cover HTTPS endpoints with TLS cert monitoring
- Alertmanager routes warnings to Slack/Discord, critical alerts also to email (Gmail SMTP); inhibit rule suppresses SSLCertExpiringSoon when SSLCertExpired already fires
- Alert rules: 11 node-exporter alerts (host down, CPU, memory, disk fill/prediction, iowait, OOM kill, systemd failed units) + 3 blackbox alerts (probe failed, SSL expiring, SSL expired)
- readme: add services list and Docker Engine installation steps

docker host:
- Add node-exporter container running with host pid/network and read-only mounts of /proc, /sys, / for full host metrics visibility
- Enable --collector.systemd for systemd unit state metrics
- Add systemd unit (node-exporter.service) to manage the container

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
68
vms/utility-101-shadow/docker/monitoring/alertmanager.yml
Normal file
68
vms/utility-101-shadow/docker/monitoring/alertmanager.yml
Normal file
@@ -0,0 +1,68 @@
|
||||
# Process-wide defaults for this Alertmanager instance.
global:
  resolve_timeout: 5m

  # Outbound mail settings: Gmail SMTP on port 587 with TLS required.
  # The account password is read from a secrets file instead of being
  # stored inline in this config.
  smtp_smarthost: "smtp.gmail.com:587"
  smtp_from: "kacerr.cz+utility-101-shadow@gmail.com"
  smtp_auth_username: "kacerr.cz@gmail.com"
  smtp_auth_password_file: "/run/secrets/smtp_password"
  smtp_require_tls: true
|
||||
|
||||
# Routing tree. The root route sends to `default`; the child routes below
# pick a receiver by the alert's `severity` label.
route:
  receiver: default
  group_by:
    - alertname
    - instance
  # How long to wait before sending the first alert in a group.
  group_wait: 30s
  # How long to wait before sending alerts for a group that has been updated.
  group_interval: 5m
  # How long to wait before resending alerts that are still firing.
  repeat_interval: 4h

  routes:
    # Warnings: Slack + Discord only, never email.
    - receiver: non-critical
      matchers:
        - 'severity = warning'

    # Critical alerts: delivered on every channel.
    - receiver: default
      matchers:
        - 'severity = critical'
|
||||
|
||||
# Notification targets referenced by name from the routing tree.
receivers:
  # `default` fans out to all three channels: email, Slack and Discord.
  - name: default
    email_configs:
      - to: "kacerr.cz@gmail.com"
        send_resolved: true
    slack_configs:
      # Placeholder webhook URL: replace before deploying.
      - api_url: "https://hooks.slack.com/services/REPLACE/WITH/YOUR_WEBHOOK"
        channel: "#alerts"
        send_resolved: true
        # Red circle while firing, green circle once resolved.
        title: '{{ if eq .Status "firing" }}:red_circle:{{ else }}:large_green_circle:{{ end }} {{ .CommonAnnotations.summary }}'
        # One description line per alert in the group.
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
    discord_configs:
      # Placeholder webhook URL: replace before deploying.
      - webhook_url: "https://discord.com/api/webhooks/REPLACE/WITH_YOUR_WEBHOOK"
        send_resolved: true
        title: '{{ if eq .Status "firing" }}🔴{{ else }}🟢{{ end }} {{ .CommonAnnotations.summary }}'
        message: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'

  # `non-critical` is for warnings: Slack and Discord, no email.
  - name: non-critical
    slack_configs:
      # Placeholder webhook URL: replace before deploying.
      - api_url: "https://hooks.slack.com/services/REPLACE/WITH/YOUR_WEBHOOK"
        channel: "#alerts"
        send_resolved: true
        # Warning sign while firing, green circle once resolved.
        title: '{{ if eq .Status "firing" }}:warning:{{ else }}:large_green_circle:{{ end }} {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
    discord_configs:
      # Placeholder webhook URL: replace before deploying.
      - webhook_url: "https://discord.com/api/webhooks/REPLACE/WITH_YOUR_WEBHOOK"
        send_resolved: true
        title: '{{ if eq .Status "firing" }}⚠️{{ else }}🟢{{ end }} {{ .CommonAnnotations.summary }}'
        message: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
|
||||
|
||||
# Alert suppression rules.
inhibit_rules:
  # While SSLCertExpired is firing for an instance, mute the
  # SSLCertExpiringSoon alert for that same instance.
  - source_matchers:
      - 'alertname = SSLCertExpired'
    target_matchers:
      - 'alertname = SSLCertExpiringSoon'
    equal:
      - instance
|
||||
Reference in New Issue
Block a user