Backup & Disaster Recovery - Kubernetes Data Protection
Tổng quan
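Backup và disaster recovery đảm bảo data protection và business continuity cho Kubernetes clusters: etcd snapshot bảo vệ trạng thái của cluster, còn Velero và CSI snapshot bảo vệ tài nguyên và dữ liệu của ứng dụng.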
etcd Backup
Manual Backup
# Create etcd snapshot
# The file name is computed once so the verify step inspects the snapshot that was
# just written; re-running $(date ...) later would produce a different name.
BACKUP_FILE="backup-$(date +%Y%m%d-%H%M%S).db"
ETCDCTL_API=3 etcdctl snapshot save "${BACKUP_FILE}" \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/server.crt \
  --key=/etc/kubernetes/pki/etcd/server.key

# Verify backup
ETCDCTL_API=3 etcdctl snapshot status "${BACKUP_FILE}" --write-out=table
Automated Backup
# CronJob that saves an etcd snapshot to a PersistentVolumeClaim once a day.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: etcd-backup
  namespace: kube-system
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: etcd-backup
              # NOTE(review): k8s.gcr.io is the legacy registry; confirm the tag is
              # available on registry.k8s.io before switching.
              image: k8s.gcr.io/etcd:3.5.0-0
              command:
                - /bin/sh
                - -c
                # Trailing backslashes keep the flags on the same etcdctl invocation.
                - |
                  ETCDCTL_API=3 etcdctl snapshot save /backup/etcd-$(date +%Y%m%d-%H%M%S).db \
                    --endpoints=https://etcd:2379 \
                    --cacert=/etc/etcd/ca.crt \
                    --cert=/etc/etcd/server.crt \
                    --key=/etc/etcd/server.key
              volumeMounts:
                # Snapshot destination.
                - name: backup-storage
                  mountPath: /backup
                # Client TLS material referenced by the etcdctl flags above.
                - name: etcd-certs
                  mountPath: /etc/etcd
          volumes:
            - name: backup-storage
              persistentVolumeClaim:
                claimName: backup-pvc
            - name: etcd-certs
              secret:
                secretName: etcd-certs
          restartPolicy: OnFailure
etcd Restore
Restore Process
# Stop API server by moving its manifest out of the manifests directory
sudo mv /etc/kubernetes/manifests/kube-apiserver.yaml /tmp/

# Stop etcd
sudo systemctl stop etcd

# Restore from backup into a fresh data directory
# (trailing backslashes keep all flags on the same etcdctl invocation)
ETCDCTL_API=3 etcdctl snapshot restore backup.db \
  --data-dir=/var/lib/etcd-restore \
  --name etcd-1 \
  --initial-cluster etcd-1=https://10.0.0.1:2380 \
  --initial-advertise-peer-urls https://10.0.0.1:2380

# Update data directory: keep the old data aside, then move the restored data into place
sudo mv /var/lib/etcd /var/lib/etcd-backup
sudo mv /var/lib/etcd-restore /var/lib/etcd
sudo chown -R etcd:etcd /var/lib/etcd

# Start etcd
sudo systemctl start etcd

# Start API server by putting its manifest back
sudo mv /tmp/kube-apiserver.yaml /etc/kubernetes/manifests/
Application Data Backup
Velero Installation
# Download Velero
wget https://github.com/vmware-tanzu/velero/releases/download/v1.9.0/velero-v1.9.0-linux-amd64.tar.gz

# Install Velero: unpack the archive and put the CLI binary on the PATH
tar -zxvf velero-v1.9.0-linux-amd64.tar.gz
sudo mv velero-v1.9.0-linux-amd64/velero /usr/local/bin/

# Install on cluster (AWS example)
# Trailing backslashes keep all flags on the same `velero install` invocation.
velero install \
  --provider aws \
  --plugins velero/velero-plugin-for-aws:v1.5.0 \
  --bucket velero-backup \
  --backup-location-config region=us-west-2 \
  --snapshot-location-config region=us-west-2 \
  --secret-file ./credentials-velero
Backup Configuration
# One-off backup of every namespace except velero and kube-system, including
# cluster-scoped resources and volume snapshots, kept for 720 hours.
apiVersion: velero.io/v1
kind: Backup
metadata:
  name: full-cluster-backup
  # Velero only processes Backup objects in the namespace its server runs in.
  namespace: velero
spec:
  includedNamespaces:
    - '*'
  excludedNamespaces:
    - velero
    - kube-system
  includedResources:
    - '*'
  excludedResources:
    - events
    - events.events.k8s.io
  includeClusterResources: true
  snapshotVolumes: true
  ttl: 720h0m0s
Scheduled Backups
# Recurring backup of the production and staging namespaces.
apiVersion: velero.io/v1
kind: Schedule
metadata:
  name: daily-backup
  # Velero only processes Schedule objects in the namespace its server runs in.
  namespace: velero
spec:
  schedule: "0 1 * * *"  # Daily at 1 AM
  # Backup spec applied to every backup this schedule creates.
  template:
    includedNamespaces:
      - production
      - staging
    snapshotVolumes: true
    ttl: 168h0m0s  # 7 days
Volume Backup
CSI Snapshots
# Requests a CSI snapshot of the PersistentVolumeClaim my-pvc using the
# csi-hostpath-snapclass snapshot class.
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
  name: pvc-snapshot
spec:
  volumeSnapshotClassName: csi-hostpath-snapclass
  source:
    persistentVolumeClaimName: my-pvc
Volume Snapshot Class
# Snapshot class for the hostpath CSI driver.
# driver, deletionPolicy and parameters are top-level fields of a
# VolumeSnapshotClass; they are not nested under spec.
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshotClass
metadata:
  name: csi-hostpath-snapclass
driver: hostpath.csi.k8s.io
deletionPolicy: Delete
parameters:
  # NOTE(review): parameters are passed to the CSI driver as-is; confirm that
  # hostpath.csi.k8s.io actually honours a `retention` parameter.
  retention: "7d"
Disaster Recovery Strategies
Multi-Cluster Setup
# Primary cluster configuration
apiVersion: v1
kind: ConfigMap
metadata:
  name: cluster-config
data:
  cluster-role: "primary"
  backup-schedule: "0 */6 * * *"
  replication-target: "dr-cluster.example.com"
---
# DR cluster configuration
# Uses the same ConfigMap name as the primary; each document is meant to be
# applied to its own cluster.
apiVersion: v1
kind: ConfigMap
metadata:
  name: cluster-config
data:
  cluster-role: "disaster-recovery"
  sync-schedule: "*/15 * * * *"
  primary-cluster: "primary-cluster.example.com"
Cross-Region Replication
# Argo CD Application that deploys the overlays/dr path of the config repository
# into the production namespace of the DR cluster.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: app-replica
  namespace: argocd
spec:
  project: default
  source:
    repoURL: https://github.com/myorg/app-config
    targetRevision: HEAD
    path: overlays/dr
  destination:
    server: https://dr-cluster-api.example.com
    namespace: production
  syncPolicy:
    automated:
      # Automated sync with pruning and self-healing enabled.
      prune: true
      selfHeal: true
Recovery Procedures
Application Recovery
# List available backups
velero backup get

# Restore specific backup
# The restore gets an explicit name so the next command can refer to it; a literal
# <restore-name> placeholder would be parsed by the shell as redirection.
RESTORE_NAME="full-cluster-restore-20231201"
velero restore create "${RESTORE_NAME}" --from-backup full-cluster-backup-20231201

# Monitor restore progress
velero restore describe "${RESTORE_NAME}"

# Check restored resources
kubectl get all -n production
Namespace Recovery
# Restore specific namespace
# (trailing backslashes keep the flags on the same velero invocation)
velero restore create ns-restore \
  --from-backup daily-backup-20231201 \
  --include-namespaces production

# Restore with label selector: only resources labelled app=my-application
velero restore create app-restore \
  --from-backup daily-backup-20231201 \
  --selector app=my-application
Point-in-Time Recovery
# Restore the state captured by a specific backup, including its volumes.
# The recovery point is the time that backup was taken: pick the backup closest to
# the desired moment, since an arbitrary timestamp cannot be requested.
velero restore create time-restore \
  --from-backup daily-backup-20231201 \
  --restore-volumes=true \
  --wait
Monitoring & Testing
Backup Monitoring
# Prometheus alerts
groups:
  - name: backup-alerts
    rules:
      - alert: BackupFailed
        # The metric is a counter, so `> 0` on the raw value would stay true forever
        # after the first failure; alert on failures recorded in the last hour instead.
        expr: increase(velero_backup_failure_total[1h]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Velero backup failed"
          # NOTE(review): Velero labels this metric with `schedule`, not
          # `backup_name`; confirm against the metrics of your Velero version.
          description: "Backup for schedule {{ $labels.schedule }} has failed"
      - alert: BackupTooOld
        # 86400 seconds = 24 hours since the last successful backup.
        expr: time() - velero_backup_last_successful_timestamp > 86400
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Backup is too old"
          description: "Last successful backup was more than 24 hours ago"
DR Testing
#!/bin/bash
# Regular DR testing script
# The shebang must be the first line of the file to take effect.
# Abort on the first failing step so a broken restore is not reported as a success.
set -euo pipefail

# Confirm the test backup exists and show its status.
# It is looked up by name: Backup objects do not carry a velero.io/backup-name
# label, so a selector on that label would match nothing.
velero backup get test-backup

# Test restore process
velero restore create test-restore \
  --from-backup test-backup \
  --include-namespaces test \
  --wait

# Validate restored applications
kubectl get pods -n test
# --restart=Never runs the pod once instead of restarting it after the echo exits;
# --rm -i streams its output and removes the pod afterwards.
kubectl run test-pod --image=busybox -n test --restart=Never --rm -i -- /bin/sh -c "echo 'DR test successful'"

# Cleanup test resources
# --confirm skips the interactive prompt, which would block an unattended run.
velero restore delete test-restore --confirm
kubectl delete namespace test
Backup Storage
S3 Configuration
# AWS credentials file consumed by Velero.
# The values below are placeholders; do not commit real credentials to version control.
apiVersion: v1
kind: Secret
metadata:
  name: cloud-credentials
  namespace: velero
type: Opaque
# stringData takes the file as plain text and the API server encodes it on write.
# Under `data` the whole file must be ONE base64 value; encoding each line
# separately leaves padding in the middle and loses the newlines between lines.
stringData:
  cloud: |
    [default]
    aws_access_key_id=AKIAYOURACCESSKEYID
    aws_secret_access_key=YOURSECRETACCESSKEY
Azure Blob Storage
# Backup storage location backed by an Azure Blob Storage container.
apiVersion: velero.io/v1
kind: BackupStorageLocation
metadata:
  name: azure-backup-location
  # Velero only reads storage locations from the namespace its server runs in.
  namespace: velero
spec:
  provider: azure
  objectStorage:
    # For the azure provider, `bucket` names the blob container.
    bucket: velero-backups
  config:
    resourceGroup: velero-rg
    storageAccount: velerostorage
    # Quoted so the ID is always read as a string.
    subscriptionId: "12345678-1234-1234-1234-123456789012"
Security Considerations
Backup Encryption
# Encrypt backups at rest
apiVersion: velero.io/v1
kind: Backup
metadata:
name: encrypted-backup
spec:
storageLocation: encrypted-location
encryptionConfig:
encryptionKey: backup-encryption-key
encryptionAlgorithm: AES256
Access Control
# Permissions for an operator who manages Velero backups and restores.
# Velero's custom resources live in the velero namespace, so a namespaced Role
# there is enough for them.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: backup-operator
  namespace: velero
rules:
  - apiGroups: ["velero.io"]
    resources: ["backups", "restores"]
    verbs: ["get", "list", "create", "delete"]
---
# PersistentVolumes are cluster-scoped, and a namespaced Role cannot grant access
# to them. Read access to volumes therefore comes from a ClusterRole, bound with a
# ClusterRoleBinding.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: backup-operator-volumes
rules:
  - apiGroups: [""]
    resources: ["persistentvolumes", "persistentvolumeclaims"]
    verbs: ["get", "list"]
Best Practices
Backup Strategy
- Regular automated backups
- Test restore procedures regularly
- Multiple backup locations
- Encrypt sensitive data
- Document recovery procedures
- Monitor backup health
Recovery Planning
- Define RTO (Recovery Time Objective)
- Define RPO (Recovery Point Objective)
- Create runbooks for different scenarios
- Train team on recovery procedures
- Regular disaster recovery drills
Compliance & Auditing
Backup Policies
# Retention and compliance policy for backups.
# NOTE(review): these spec fields are illustrative. They do not match the schema of
# Velero's BackupRepository CRD, so confirm which controller is expected to consume
# this object before applying it. In Velero itself, retention is set per backup or
# schedule through `ttl`.
apiVersion: velero.io/v1
kind: BackupRepository
metadata:
  name: compliance-backups
spec:
  # Number of backups to keep per interval (presumably; verify with the consumer).
  retentionPolicy:
    daily: 30
    weekly: 12
    monthly: 12
    yearly: 7
  # NOTE(review): placed as siblings of retentionPolicy; the original nesting was
  # lost, so confirm the intended level.
  encryptionRequired: true
  offSiteReplication: true
Java Example: Automating Velero Operations
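Bạn có thể tự động hóa các tác vụ sao lưu và khôi phục của Velero bằng Java theo hai cách: thực thi các lệnh Velero CLI, hoặc tương tác trực tiếp với các Custom Resource của Velero (ví dụ: Backup, Restore, Schedule) thông qua Kubernetes API. Ví dụ dưới đây dùng cách thứ nhất.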
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
public class VeleroAutomation {
public static void executeCommand(String command) throws IOException, InterruptedException {
System.out.println("Executing: " + command);
ProcessBuilder processBuilder = new ProcessBuilder();
processBuilder.command("bash", "-c", command);
processBuilder.redirectErrorStream(true);
Process process = processBuilder.start();
BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
String line;
while ((line = reader.readLine()) != null) {
System.out.println(line);
}
boolean finished = process.waitFor(5, TimeUnit.MINUTES); // Wait up to 5 minutes
if (!finished) {
process.destroyForcibly();
throw new RuntimeException("Command timed out: " + command);
}
int exitVal = process.exitValue();
if (exitVal == 0) {
System.out.println("Command completed successfully.");
} else {
throw new RuntimeException("Command failed with exit code: " + exitVal + ": " + command);
}
}
public static void main(String[] args) {
try {
// Example 1: Create a full cluster backup
executeCommand("velero backup create my-daily-full-backup --include-cluster-resources=true");
// Example 2: List all backups
executeCommand("velero backup get");
// Example 3: Restore a specific backup
// executeCommand("velero restore create --from-backup my-daily-full-backup");
// Example 4: Schedule a backup
// Note: This creates a Velero Schedule resource, which is then managed by Velero itself.
String scheduleYaml = """
apiVersion: velero.io/v1
kind: Schedule
metadata:
name: my-app-daily
spec:
schedule: "0 3 * * *"
template:
includedNamespaces:
- my-app-namespace
snapshotVolumes: true
ttl: 720h0m0s # 30 days
""";
// To apply this, you'd typically save it to a file and use kubectl apply
// For demonstration, we'll just print it.
System.out.println("\nExample Velero Schedule YAML:\n" + scheduleYaml);
// In a real scenario, you might write this to a temp file and then kubectl apply -f
// executeCommand("echo \"" + scheduleYaml.replace("\n", "\\n") + "\" | kubectl apply -f -");
} catch (IOException | InterruptedException e) {
System.err.println("Automation failed: " + e.getMessage());
e.printStackTrace();
} catch (RuntimeException e) {
System.err.println("Automation error: " + e.getMessage());
e.printStackTrace();
}
}
}
Tài liệu này bao gồm các kịch bản khôi phục chi tiết, các yêu cầu về compliance, cùng ví dụ tự động hóa Velero bằng Java.