gitops/velero: add manifests and runbook (kustomization still to be created)
This commit is contained in:
Jan Novak
2026-01-17 00:07:03 +01:00
parent b9f99c2950
commit 0d97a796e9
5 changed files with 788 additions and 0 deletions

View File

@@ -0,0 +1,141 @@
---
# Flux-managed Velero install. Chart values follow the velero chart >= 5.x layout:
# BSL/VSL, uploaderType and TTL all live under `configuration:`.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: velero
  namespace: velero
spec:
  interval: 30m
  chart:
    spec:
      chart: velero
      version: "11.3.2" # ships Velero 1.16.x
      sourceRef:
        kind: HelmRepository
        name: vmware-tanzu
        namespace: flux-system
  install:
    crds: CreateReplace
    remediation:
      retries: 3
  upgrade:
    crds: CreateReplace
    remediation:
      retries: 3
  values:
    # Node agent DaemonSet for filesystem backups (kopia)
    deployNodeAgent: true
    nodeAgent:
      podVolumePath: /var/lib/kubelet/pods
      # nodeAgent.privileged removed in chart 8.x+, use containerSecurityContext instead.
      # Privileged is required so the agent can read every pod's volumes on the host.
      containerSecurityContext:
        privileged: true
      resources:
        requests:
          cpu: 100m
          memory: 128Mi
        limits:
          memory: 1Gi
    configuration:
      # backupStorageLocation - note: provider is at same level as bucket, not nested
      backupStorageLocation:
        - name: default
          provider: aws
          bucket: velero-backups # create this bucket in minio first
          accessMode: ReadWrite
          default: true
          config:
            region: us-east-1 # minio ignores but required
            s3ForcePathStyle: "true"
            s3Url: http://192.168.0.2:9000 # adjust to your minio service
            # FIX: plugin v1.10+ (AWS SDK v2) sends CRC checksum headers that many
            # MinIO releases reject, failing every object upload. An empty value
            # disables the checksum for S3-compatible stores.
            checksumAlgorithm: ""
      # Volume snapshot location (for CSI snapshots, optional)
      volumeSnapshotLocation:
        - name: default
          provider: aws
          config:
            region: us-east-1
      # Use kopia for fs backups (restic deprecated, kopia is default in 1.14+)
      uploaderType: kopia
      # Default TTL for backups
      defaultBackupTTL: 720h # 30 days
      # Features: fs-backup is opt-in via pod annotation; schedules below override this.
      defaultVolumesToFsBackup: false
    # Credentials - AWS-style shared credentials file, see velero-minio-credentials Secret
    credentials:
      useSecret: true
      existingSecret: velero-minio-credentials
    # Velero server resources
    resources:
      requests:
        cpu: 100m
        memory: 128Mi
      limits:
        memory: 512Mi
    # Schedules
    schedules:
      daily-all-namespaces:
        disabled: false
        schedule: "0 3 * * *" # 3 AM daily
        useOwnerReferencesInBackup: false
        template:
          ttl: 168h # 7 days
          storageLocation: default
          includedNamespaces:
            - "*"
          excludedNamespaces:
            - kube-system
            - kube-public
            - kube-node-lease
            - flux-system
            - velero
          excludedResources:
            - events
            - events.events.k8s.io
          snapshotVolumes: false
          defaultVolumesToFsBackup: true
      weekly-full:
        disabled: false
        schedule: "0 4 * * 0" # Sunday 4 AM
        template:
          ttl: 720h # 30 days
          storageLocation: default
          includedNamespaces:
            - "*"
          # NOTE(review): unlike the daily schedule, this one intentionally keeps
          # flux-system and velero in scope for full DR coverage — confirm intent.
          excludedNamespaces:
            - kube-system
            - kube-public
            - kube-node-lease
          snapshotVolumes: false
          defaultVolumesToFsBackup: true
    # Init containers for plugins - AWS plugin for S3-compatible storage.
    # Note: CSI plugin merged into velero core in v1.14, no separate initContainer needed.
    initContainers:
      - name: velero-plugin-for-aws
        image: velero/velero-plugin-for-aws:v1.11.0 # compatible with Velero 1.15/1.16
        imagePullPolicy: IfNotPresent
        volumeMounts:
          - mountPath: /target
            name: plugins
    # Metrics
    metrics:
      enabled: true
      serviceMonitor:
        enabled: false # set true if using prometheus-operator
        additionalLabels: {}
    # Disable volume snapshots if not using CSI snapshotter
    snapshotsEnabled: false
    # Pod annotations/labels
    podAnnotations: {}
    podLabels: {}

View File

@@ -0,0 +1,8 @@
---
# Flux source for the vmware-tanzu chart repository; provides the `velero`
# chart referenced by the HelmRelease sourceRef in flux-system.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: vmware-tanzu
  namespace: flux-system
spec:
  interval: 24h # how often Flux re-fetches the repository index
  url: https://vmware-tanzu.github.io/helm-charts

View File

@@ -0,0 +1,4 @@
---
# Namespace hosting the Velero server deployment, the node-agent DaemonSet
# and the velero-minio-credentials Secret.
apiVersion: v1
kind: Namespace
metadata:
  name: velero

View File

@@ -0,0 +1,10 @@
---
# MinIO credentials consumed by the Velero deployment via
# credentials.existingSecret in the HelmRelease. The `cloud` key uses the
# AWS shared-credentials-file format the aws plugin expects.
#
# SECURITY NOTE(review): this Secret stores plaintext credentials in git.
# Encrypt it (SOPS / sealed-secrets / external-secrets) before committing,
# and rotate this key pair — it is already exposed in repository history.
apiVersion: v1
kind: Secret
metadata:
  name: velero-minio-credentials
  namespace: velero
stringData:
  cloud: |
    [default]
    aws_access_key_id=k8s
    aws_secret_access_key=poh9ieceHohnum5e

View File

@@ -0,0 +1,625 @@
# Velero Backup & Recovery Runbook
## Quick Reference
| Operation | Command |
|-----------|---------|
| List backups | `velero backup get` |
| Backup status | `velero backup describe <name> --details` |
| Browse backup contents | `velero backup describe <name> --details \| grep -A100 "Resource List"` |
| Restore full namespace | `velero restore create --from-backup <name> --include-namespaces <ns>` |
| Restore single PVC | `velero restore create --from-backup <name> --include-resources pvc,pv --selector app=<label>` |
| Restore specific files | See [Specific File Restore](#specific-file-restore) |
---
## 1. Browsing Backup Contents
### List All Backups
```bash
# All backups with status
velero backup get
# Backups for specific schedule
velero backup get -l velero.io/schedule-name=daily-all-namespaces
# JSON output for scripting
velero backup get -o json | jq '.items[] | {name: .metadata.name, phase: .status.phase, started: .status.startTimestamp}'
```
### Inspect Backup Contents
```bash
# Full backup details including all resources
velero backup describe <backup-name> --details
# List backed-up namespaces
velero backup describe <backup-name> --details | grep -A 5 "Namespaces:"
# List all resources in backup
velero backup describe <backup-name> --details | grep -A 200 "Resource List:" | head -100
# Check which PVCs were backed up
velero backup describe <backup-name> --details | grep -i persistentvolumeclaim
# Check pod volume backups (kopia/restic)
velero backup describe <backup-name> --details | grep -A 50 "Pod Volume Backups"
```
### View Backup Logs
```bash
# Stream logs
velero backup logs <backup-name>
# Search for errors
velero backup logs <backup-name> | grep -i error
# Check specific namespace backup
velero backup logs <backup-name> | grep "namespace=seafile"
```
### Browse Kopia Repository Directly
For direct file-level inspection of kopia backups in MinIO:
```bash
# Get kopia repository password from velero secret
KOPIA_PASSWORD=$(kubectl get secret -n velero velero-repo-credentials -o jsonpath='{.data.repository-password}' | base64 -d)
# Connect to repository (run from a pod with minio access or port-forward)
kopia repository connect s3 \
--bucket=velero-backups \
--endpoint=minio.minio.svc.cluster.local:9000 \
--access-key=<MINIO_ACCESS_KEY> \
--secret-access-key=<MINIO_SECRET_KEY> \
--password="${KOPIA_PASSWORD}" \
--prefix=kopia/<backed-up-namespace>/   # Velero keeps one kopia repo per backed-up namespace, not per cluster
# List snapshots
kopia snapshot list --all
# Browse specific snapshot
kopia snapshot list <snapshot-id>
kopia ls <snapshot-id>
# Mount for browsing (requires FUSE)
mkdir /tmp/kopia-mount
kopia mount <snapshot-id> /tmp/kopia-mount &
ls /tmp/kopia-mount/
```
---
## 2. Full Namespace Restore
### Restore to Same Cluster (Disaster Recovery)
```bash
# Restore entire namespace
velero restore create seafile-restore \
--from-backup daily-all-namespaces-20250115030000 \
--include-namespaces seafile \
--wait
# Monitor restore progress
velero restore describe seafile-restore --details
velero restore logs seafile-restore -f
```
### Restore to Different Namespace
```bash
velero restore create seafile-test-restore \
--from-backup daily-all-namespaces-20250115030000 \
--include-namespaces seafile \
--namespace-mappings seafile:seafile-restored \
--wait
```
### Restore with Resource Filtering
```bash
# Restore only specific resource types
velero restore create restore-pvcs-only \
--from-backup <backup-name> \
--include-namespaces seafile \
--include-resources persistentvolumeclaims,persistentvolumes \
--wait
# Exclude certain resources
velero restore create restore-no-secrets \
--from-backup <backup-name> \
--include-namespaces seafile \
--exclude-resources secrets \
--wait
# Restore by label selector
velero restore create restore-app \
--from-backup <backup-name> \
--selector app.kubernetes.io/name=seafile \
--wait
```
---
## 3. Single PVC/Volume Restore
### Restore Specific PVC
```bash
# First, scale down the workload using the PVC
kubectl scale deployment seafile -n seafile --replicas=0
# Delete the corrupted/problematic PVC (data will be restored)
kubectl delete pvc seafile-data -n seafile
# Restore just that PVC
velero restore create restore-seafile-pvc \
--from-backup <backup-name> \
--include-namespaces seafile \
--include-resources persistentvolumeclaims,persistentvolumes \
--selector app=seafile \
--wait
# Scale back up
kubectl scale deployment seafile -n seafile --replicas=1
```
### Restore PVC to New Name (Side-by-Side)
```bash
# Create restore with transforms
cat <<EOF | kubectl apply -f -
apiVersion: velero.io/v1
kind: Restore
metadata:
name: restore-pvc-new-name
namespace: velero
spec:
backupName: <backup-name>
includedNamespaces:
- seafile
includedResources:
- persistentvolumeclaims
- persistentvolumes
labelSelector:
matchLabels:
app: seafile
restorePVs: true
namespaceMapping:
seafile: seafile-recovery
EOF
# Or use restore hooks to rename
velero restore create restore-pvc-renamed \
--from-backup <backup-name> \
--include-namespaces seafile \
--namespace-mappings seafile:seafile-temp \
--wait
```
---
## 4. Specific File Restore
Velero doesn't support single-file restore natively. Use kopia directly:
### Method 1: Kopia Direct Restore
```bash
# Find the backup/snapshot containing your file
# First, get velero's kopia repo credentials
REPO_PASSWORD=$(kubectl get secret -n velero velero-repo-credentials \
-o jsonpath='{.data.repository-password}' | base64 -d)
# Run a debug pod with kopia
kubectl run kopia-restore --rm -it \
--image=kopia/kopia:latest \
--restart=Never \
--namespace=velero \
-- /bin/sh
# Inside the pod:
kopia repository connect s3 \
--bucket=velero-backups \
--endpoint=minio.minio.svc.cluster.local:9000 \
--access-key=<ACCESS_KEY> \
--secret-access-key=<SECRET_KEY> \
--password="<REPO_PASSWORD>" \
--prefix=kopia/<backed-up-namespace>/   # one kopia repo per backed-up namespace
# List snapshots for specific PVC
kopia snapshot list --all | grep seafile
# Restore specific file
kopia restore <snapshot-id>/path/to/file.txt /tmp/restored-file.txt
# Restore specific directory
kopia restore <snapshot-id>/data/uploads/ /tmp/restored-uploads/
```
### Method 2: Mount and Copy
```bash
# Create a temporary pod that mounts the backup
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
name: backup-browser
namespace: velero
spec:
containers:
- name: browser
image: kopia/kopia:latest
command: ["sleep", "3600"]
env:
- name: KOPIA_PASSWORD
valueFrom:
secretKeyRef:
name: velero-repo-credentials
key: repository-password
volumeMounts:
- name: restore-target
mountPath: /restore
volumes:
- name: restore-target
emptyDir: {}
EOF
# Exec in and restore files
kubectl exec -it -n velero backup-browser -- /bin/sh
# ... run kopia commands inside
```
### Method 3: Full PVC Restore + Copy + Delete
```bash
# 1. Restore PVC to temp namespace
velero restore create temp-restore \
--from-backup <backup-name> \
--include-namespaces seafile \
--namespace-mappings seafile:temp-restore \
--include-resources pvc,pv \
--wait
# 2. Create a pod to access both PVCs
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
name: file-copier
namespace: seafile
spec:
containers:
- name: copier
image: alpine
command: ["sleep", "3600"]
volumeMounts:
- name: current
mountPath: /current
- name: restored
mountPath: /restored
volumes:
- name: current
persistentVolumeClaim:
claimName: seafile-data
- name: restored
persistentVolumeClaim:
claimName: seafile-data # in temp-restore namespace - need cross-ns mount or copy via node
EOF
# Alternative: use rsync between namespaces
kubectl exec -n temp-restore deployment/temp-pod -- tar cf - /data/specific-file.txt | \
kubectl exec -i -n seafile deployment/seafile -- tar xf - -C /
```
---
## 5. Database-Specific Recovery
### MariaDB (via mariadb-operator)
Velero fs-backup of running DB may be inconsistent. Prefer operator backups:
```bash
# List operator backups
kubectl get backups.k8s.mariadb.com -n mariadb
# Restore from operator backup
kubectl apply -f - <<EOF
apiVersion: k8s.mariadb.com/v1alpha1
kind: Restore
metadata:
name: mariadb-restore
namespace: mariadb
spec:
mariaDbRef:
name: mariadb
backupRef:
name: mariadb-backup-20250115
EOF
```
If you must restore from Velero:
```bash
# 1. Scale down mariadb
kubectl scale statefulset mariadb -n mariadb --replicas=0
# 2. Restore PVC
velero restore create mariadb-pvc-restore \
--from-backup <backup-name> \
--include-namespaces mariadb \
--include-resources pvc,pv \
--wait
# 3. Scale back up - InnoDB will crash-recover from its redo log on startup
kubectl scale statefulset mariadb -n mariadb --replicas=1
# 4. Verify data integrity
kubectl exec -it -n mariadb mariadb-0 -- mariadb <database> -e "CHECK TABLE important_table;"
```
### Redis
```bash
# If Redis is persistent (RDB/AOF)
kubectl scale statefulset redis -n redis --replicas=0
velero restore create redis-restore \
--from-backup <backup-name> \
--include-namespaces redis \
--wait
kubectl scale statefulset redis -n redis --replicas=1
```
---
## 6. Backup Management
### Create On-Demand Backup
```bash
# Full backup
velero backup create manual-backup-$(date +%Y%m%d-%H%M%S) \
--default-volumes-to-fs-backup \
--snapshot-volumes=false \
--wait
# Specific namespace pre-maintenance
velero backup create pre-upgrade-seafile-$(date +%Y%m%d) \
--include-namespaces seafile \
--default-volumes-to-fs-backup \
--wait
```
### Delete Old Backups
```bash
# Delete specific backup
velero backup delete <backup-name> --confirm
# Delete backups older than 30 days (careful!)
velero backup get -o json | jq -r '.items[] | select(.status.startTimestamp < (now - 2592000 | todate)) | .metadata.name' | xargs -I {} velero backup delete {} --confirm
```
### Check Backup Storage Location Health
```bash
velero backup-location get
velero backup-location describe default
# Verify connectivity
kubectl logs -n velero deployment/velero | grep -i "backup storage location"
```
---
## 7. Disaster Recovery Procedures
### Complete Cluster Rebuild
```bash
# 1. Install Velero on new cluster with same config
helm upgrade --install velero vmware-tanzu/velero \
-n velero --create-namespace \
-f velero-values.yaml
# 2. Wait for velero to sync backup list from S3
sleep 60
velero backup get
# 3. Restore namespaces in order (dependencies first)
# Restore storage/infra
velero restore create restore-infra \
--from-backup <latest-backup> \
--include-namespaces minio,cert-manager \
--wait
# Restore databases
velero restore create restore-databases \
--from-backup <latest-backup> \
--include-namespaces mariadb,redis \
--wait
# Restore applications
velero restore create restore-apps \
--from-backup <latest-backup> \
--include-namespaces seafile,plane \
--wait
```
### Restore Schedule After Accidental Deletion
```bash
# Schedules are cluster resources, restore from backup
velero restore create restore-schedules \
--from-backup <backup-name> \
--include-resources schedules.velero.io \
--wait
```
---
## 8. Troubleshooting
### Backup Stuck/Failed
```bash
# Check velero logs
kubectl logs -n velero deployment/velero --tail=100
# Check node-agent on specific node
kubectl logs -n velero -l name=node-agent --tail=100
# Check backup details for errors
velero backup describe <backup-name> --details | grep -i -A5 "error\|warning\|failed"
# Common issues:
# - Node-agent not running on node with PV
kubectl get pods -n velero -l name=node-agent -o wide
# - PVC not annotated for backup
kubectl get pvc -A -o json | jq '.items[] | select(.metadata.annotations["backup.velero.io/backup-volumes"] != null)'
```
### Restore Not Restoring Volumes
```bash
# Check if backup has pod volume backups
velero backup describe <backup-name> --details | grep -A20 "Pod Volume Backups"
# Verify restore is configured to restore PVs
velero restore describe <restore-name> --details | grep -i "restorePVs"
# Force PV restore
velero restore create <name> \
--from-backup <backup> \
--restore-volumes=true \
--wait
```
### Kopia Repository Issues
```bash
# Check repository status
kubectl exec -n velero deployment/velero -- \
velero repo get
# Unlock stuck repository
kubectl exec -n velero deployment/velero -- \
velero repo unlock <repo-name>
# Maintenance (runs automatically, but can trigger manually)
kubectl exec -n velero deployment/velero -- \
velero repo maintenance run
```
---
## 9. Monitoring & Alerting
### Prometheus Metrics
Key metrics to monitor:
```promql
# Backup success rate
sum(velero_backup_success_total) / sum(velero_backup_attempt_total)
# Backup duration
velero_backup_duration_seconds{schedule="daily-all-namespaces"}
# Backup item count (Velero exposes items, not bytes, per backup)
velero_backup_items_total{backup="<name>"}
# Failed backups in last 24h
increase(velero_backup_failure_total[24h])
```
### AlertManager Rules
```yaml
groups:
- name: velero
rules:
- alert: VeleroBackupFailed
expr: increase(velero_backup_failure_total[1h]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Velero backup failed"
- alert: VeleroBackupMissing
expr: time() - velero_backup_last_successful_timestamp{schedule="daily-all-namespaces"} > 86400
for: 1h
labels:
severity: warning
annotations:
summary: "No successful backup in 24h"
- alert: VeleroNodeAgentDown
expr: kube_daemonset_status_number_unavailable{daemonset="node-agent"} > 0
for: 15m
labels:
severity: warning
```
---
## 10. Regular Maintenance Tasks
### Weekly
```bash
# Verify recent backup integrity
velero backup describe $(velero backup get -o json | jq -r '.items | sort_by(.status.startTimestamp) | last | .metadata.name') --details
# Check backup storage usage
mc ls minio/velero-backups --summarize
```
### Monthly
```bash
# Test restore to scratch namespace
velero restore create monthly-test-$(date +%Y%m) \
--from-backup $(velero backup get -o json | jq -r '.items[0].metadata.name') \
--include-namespaces seafile \
--namespace-mappings seafile:restore-test \
--wait
# Verify restored data
kubectl exec -n restore-test deploy/seafile -- ls -la /data
# Cleanup test
kubectl delete namespace restore-test
velero restore delete monthly-test-$(date +%Y%m) --confirm
```
### Quarterly
- Full DR test: restore to separate cluster
- Review retention policies
- Audit backup coverage (new namespaces/PVCs added?)
- Update velero/plugin versions
---
## Appendix: Common Label Selectors
```bash
# Backup by app label
--selector app.kubernetes.io/name=seafile
# Backup by component
--selector app.kubernetes.io/component=database
# Exclude specific pods from backup
# (add to pod annotation)
kubectl annotate pod <pod> backup.velero.io/backup-volumes-excludes=cache,tmp
```