# utility-101-shadow:
#   - Add full monitoring stack (Prometheus + Blackbox Exporter + Alertmanager)
#     with Docker Compose and a systemd unit (monitoring.service)
#   - Prometheus scrapes: itself, blackbox-exporter, and node-exporter on the
#     docker host (docker:9100); blackbox probes cover HTTPS endpoints with
#     TLS cert monitoring
#   - Alertmanager routes warnings to Slack/Discord, critical alerts also to
#     email (Gmail SMTP); inhibit rule suppresses SSLCertExpiringSoon when
#     SSLCertExpired already fires
#   - Alert rules: 11 node-exporter alerts (host down, CPU, memory, disk
#     fill/prediction, iowait, OOM kill, systemd failed units) + 3 blackbox
#     alerts (probe failed, SSL expiring, SSL expired)
#   - readme: add services list and Docker Engine installation steps
#
# docker host:
#   - Add node-exporter container running with host pid/network and read-only
#     mounts of /proc, /sys, / for full host metrics visibility
#   - Enable --collector.systemd for systemd unit state metrics
#   - Add systemd unit (node-exporter.service) to manage the container
#
# Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
69 lines
2.7 KiB
YAML
69 lines
2.7 KiB
YAML
# Settings shared by all receivers below.
global:
  resolve_timeout: 5m

  # Outgoing mail: Gmail SMTP on port 587 with TLS enforced.
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_require_tls: true
  smtp_from: 'kacerr.cz+utility-101-shadow@gmail.com'
  # Credentials: the password is read from a file instead of being inlined here.
  smtp_auth_username: 'kacerr.cz@gmail.com'
  smtp_auth_password_file: '/run/secrets/smtp_password'

# Root route: uses the `default` receiver; the child routes under `routes`
# select a receiver by the alert's `severity` label.
route:
  receiver: default
  group_by:
    - alertname
    - instance
  group_wait: 30s  # delay before the first notification for a new group
  group_interval: 5m  # delay before notifying about an updated group
  repeat_interval: 4h  # re-send interval for alerts that are still firing

  routes:
    # severity = warning: Slack and Discord only, no email
    - matchers:
        - severity = warning
      receiver: non-critical

    # severity = critical: every channel
    - matchers:
        - severity = critical
      receiver: default

# Notification targets referenced by name from the routing tree.
# NOTE(review): the Slack and Discord webhook URLs below are placeholders
# ("REPLACE/...") and must be substituted before this config can deliver.
receivers:

  # `default`: email + Slack + Discord.
  # Titles show a red marker while the group status is "firing" and a green
  # one otherwise; bodies list each alert's description, one per line.
  - name: default
    email_configs:
      - to: 'kacerr.cz@gmail.com'
        send_resolved: true
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/REPLACE/WITH/YOUR_WEBHOOK'
        channel: '#alerts'
        send_resolved: true
        title: '{{ if eq .Status "firing" }}:red_circle:{{ else }}:large_green_circle:{{ end }} {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
    discord_configs:
      - webhook_url: 'https://discord.com/api/webhooks/REPLACE/WITH_YOUR_WEBHOOK'
        send_resolved: true
        title: '{{ if eq .Status "firing" }}🔴{{ else }}🟢{{ end }} {{ .CommonAnnotations.summary }}'
        message: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'

  # `non-critical`: Slack + Discord only (no email).
  # Same layout as `default`, but the firing marker is a warning sign.
  - name: non-critical
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/REPLACE/WITH/YOUR_WEBHOOK'
        channel: '#alerts'
        send_resolved: true
        title: '{{ if eq .Status "firing" }}:warning:{{ else }}:large_green_circle:{{ end }} {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
    discord_configs:
      - webhook_url: 'https://discord.com/api/webhooks/REPLACE/WITH_YOUR_WEBHOOK'
        send_resolved: true
        title: '{{ if eq .Status "firing" }}⚠️{{ else }}🟢{{ end }} {{ .CommonAnnotations.summary }}'
        message: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'

inhibit_rules:
  # While SSLCertExpired is firing for an instance, mute SSLCertExpiringSoon
  # for that same instance.
  - source_matchers:
      - alertname = SSLCertExpired
    target_matchers:
      - alertname = SSLCertExpiringSoon
    equal:
      - instance