Backup & Disaster Recovery - Kubernetes Data Protection

Tổng quan

Backup và disaster recovery đảm bảo data protection và business continuity cho Kubernetes clusters.

etcd Backup

Manual Backup

# Create etcd snapshot (capture the filename so the verify step checks the same file)
BACKUP_FILE=backup-$(date +%Y%m%d-%H%M%S).db
ETCDCTL_API=3 etcdctl snapshot save "$BACKUP_FILE" \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/server.crt \
  --key=/etc/kubernetes/pki/etcd/server.key

# Verify backup (re-running $(date ...) here would name a different, nonexistent file)
ETCDCTL_API=3 etcdctl snapshot status "$BACKUP_FILE" --write-out=table

Automated Backup

apiVersion: batch/v1
kind: CronJob
metadata:
  name: etcd-backup
  namespace: kube-system
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: etcd-backup
            image: registry.k8s.io/etcd:3.5.0-0  # k8s.gcr.io is frozen; registry.k8s.io is the current registry
            command:
            - /bin/sh
            - -c
            - |
              ETCDCTL_API=3 etcdctl snapshot save /backup/etcd-$(date +%Y%m%d-%H%M%S).db \
                --endpoints=https://etcd:2379 \
                --cacert=/etc/etcd/ca.crt \
                --cert=/etc/etcd/server.crt \
                --key=/etc/etcd/server.key
            volumeMounts:
            - name: backup-storage
              mountPath: /backup
            - name: etcd-certs
              mountPath: /etc/etcd
          volumes:
          - name: backup-storage
            persistentVolumeClaim:
              claimName: backup-pvc
          - name: etcd-certs
            secret:
              secretName: etcd-certs
          restartPolicy: OnFailure

etcd Restore

Restore Process

# Stop API server
sudo mv /etc/kubernetes/manifests/kube-apiserver.yaml /tmp/

# Stop etcd
sudo systemctl stop etcd

# Restore from backup
ETCDCTL_API=3 etcdctl snapshot restore backup.db \
  --data-dir=/var/lib/etcd-restore \
  --name etcd-1 \
  --initial-cluster etcd-1=https://10.0.0.1:2380 \
  --initial-advertise-peer-urls https://10.0.0.1:2380

# Update data directory
sudo mv /var/lib/etcd /var/lib/etcd-backup
sudo mv /var/lib/etcd-restore /var/lib/etcd
sudo chown -R etcd:etcd /var/lib/etcd

# Start etcd
sudo systemctl start etcd

# Start API server
sudo mv /tmp/kube-apiserver.yaml /etc/kubernetes/manifests/

Application Data Backup

Velero Installation

# Download Velero
wget https://github.com/vmware-tanzu/velero/releases/download/v1.9.0/velero-v1.9.0-linux-amd64.tar.gz

# Install Velero
tar -zxvf velero-v1.9.0-linux-amd64.tar.gz
sudo mv velero-v1.9.0-linux-amd64/velero /usr/local/bin/

# Install on cluster (AWS example)
velero install \
  --provider aws \
  --plugins velero/velero-plugin-for-aws:v1.5.0 \
  --bucket velero-backup \
  --backup-location-config region=us-west-2 \
  --snapshot-location-config region=us-west-2 \
  --secret-file ./credentials-velero

Backup Configuration

apiVersion: velero.io/v1
kind: Backup
metadata:
  name: full-cluster-backup
spec:
  includedNamespaces:
  - '*'
  excludedNamespaces:
  - velero
  - kube-system
  includedResources:
  - '*'
  excludedResources:
  - events
  - events.events.k8s.io
  includeClusterResources: true
  snapshotVolumes: true
  ttl: 720h0m0s

Scheduled Backups

apiVersion: velero.io/v1
kind: Schedule
metadata:
  name: daily-backup
spec:
  schedule: "0 1 * * *"  # Daily at 1 AM
  template:
    includedNamespaces:
    - production
    - staging
    snapshotVolumes: true
    ttl: 168h0m0s  # 7 days

Volume Backup

CSI Snapshots

apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
  name: pvc-snapshot
spec:
  volumeSnapshotClassName: csi-hostpath-snapclass
  source:
    persistentVolumeClaimName: my-pvc

Volume Snapshot Class

apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshotClass
metadata:
  name: csi-hostpath-snapclass
driver: hostpath.csi.k8s.io
deletionPolicy: Delete
parameters:
  retention: "7d"

Disaster Recovery Strategies

Multi-Cluster Setup

# Primary cluster configuration
apiVersion: v1
kind: ConfigMap
metadata:
  name: cluster-config
data:
  cluster-role: "primary"
  backup-schedule: "0 */6 * * *"
  replication-target: "dr-cluster.example.com"

---
# DR cluster configuration  
apiVersion: v1
kind: ConfigMap
metadata:
  name: cluster-config
data:
  cluster-role: "disaster-recovery"
  sync-schedule: "*/15 * * * *"
  primary-cluster: "primary-cluster.example.com"

Cross-Region Replication

apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: app-replica
  namespace: argocd
spec:
  project: default
  source:
    repoURL: https://github.com/myorg/app-config
    targetRevision: HEAD
    path: overlays/dr
  destination:
    server: https://dr-cluster-api.example.com
    namespace: production
  syncPolicy:
    automated:
      prune: true
      selfHeal: true

Recovery Procedures

Application Recovery

# List available backups
velero backup get

# Restore specific backup
velero restore create --from-backup full-cluster-backup-20231201

# Monitor restore progress
velero restore describe <restore-name>

# Check restored resources
kubectl get all -n production

Namespace Recovery

# Restore specific namespace
velero restore create ns-restore \
  --from-backup daily-backup-20231201 \
  --include-namespaces production

# Restore with label selector
velero restore create app-restore \
  --from-backup daily-backup-20231201 \
  --selector app=my-application

Point-in-Time Recovery

# Restore to specific timestamp
velero restore create time-restore \
  --from-backup daily-backup-20231201 \
  --restore-volumes=true \
  --wait

Monitoring & Testing

Backup Monitoring

# Prometheus alerts
groups:
- name: backup-alerts
  rules:
  - alert: BackupFailed
    expr: velero_backup_failure_total > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Velero backup failed"
      description: "Backup {{ $labels.backup_name }} has failed"

  - alert: BackupTooOld
    expr: time() - velero_backup_last_successful_timestamp > 86400
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "Backup is too old"
      description: "Last successful backup was more than 24 hours ago"

DR Testing

# Regular DR testing script
#!/bin/bash

# Test backup integrity
velero backup get --selector="velero.io/backup-name=test-backup"

# Test restore process
velero restore create test-restore \
  --from-backup test-backup \
  --include-namespaces test \
  --wait

# Validate restored applications
kubectl get pods -n test
kubectl run test-pod --image=busybox -n test -- /bin/sh -c "echo 'DR test successful'"

# Cleanup test resources
velero restore delete test-restore
kubectl delete namespace test

Backup Storage

S3 Configuration

apiVersion: v1
kind: Secret
metadata:
  name: cloud-credentials
  namespace: velero
type: Opaque
data:
  cloud: |
    W2RlZmF1bHRd
    YXdzX2FjY2Vzc19rZXlfaWQ9QUtJQVlPVVJBQ0NFU1NLRVlJRA==
    YXdzX3NlY3JldF9hY2Nlc3Nfa2V5PVlPVVJTRUNSRVRBQ0NFU1NLRVk=

Azure Blob Storage

apiVersion: velero.io/v1
kind: BackupStorageLocation
metadata:
  name: azure-backup-location
spec:
  provider: azure
  objectStorage:
    bucket: velero-backups
  config:
    resourceGroup: velero-rg
    storageAccount: velerostorage
    subscriptionId: 12345678-1234-1234-1234-123456789012

Security Considerations

Backup Encryption

# Encrypt backups at rest
# NOTE(review): verify against your Velero version — upstream Velero's Backup
# spec has no encryptionConfig field; encryption is normally configured at the
# storage layer (e.g. S3 SSE-KMS via the BackupStorageLocation config).
apiVersion: velero.io/v1
kind: Backup
metadata:
  name: encrypted-backup
spec:
  storageLocation: encrypted-location
  encryptionConfig:
    encryptionKey: backup-encryption-key
    encryptionAlgorithm: AES256

Access Control

apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: backup-operator
rules:
- apiGroups: ["velero.io"]
  resources: ["backups", "restores"]
  verbs: ["get", "list", "create", "delete"]
- apiGroups: [""]
  resources: ["persistentvolumes", "persistentvolumeclaims"]
  verbs: ["get", "list"]

Best Practices

Backup Strategy

  • Regular automated backups
  • Test restore procedures regularly
  • Multiple backup locations
  • Encrypt sensitive data
  • Document recovery procedures
  • Monitor backup health

Recovery Planning

  • Define RTO (Recovery Time Objective)
  • Define RPO (Recovery Point Objective)
  • Create runbooks for different scenarios
  • Train team on recovery procedures
  • Regular disaster recovery drills

Compliance & Auditing

Backup Policies

# NOTE(review): verify against your Velero version — upstream Velero's
# BackupRepository does not define retentionPolicy / encryptionRequired /
# offSiteReplication; GFS-style retention like the example below is typically
# provided by commercial backup-platform CRDs, not core Velero.
apiVersion: velero.io/v1
kind: BackupRepository
metadata:
  name: compliance-backups
spec:
  retentionPolicy:
    daily: 30
    weekly: 12
    monthly: 12
    yearly: 7
  encryptionRequired: true
  offSiteReplication: true

Java Example: Automating Velero Operations

Bạn có thể tự động hóa các tác vụ sao lưu và khôi phục Velero bằng Java thông qua việc thực thi các lệnh Velero CLI hoặc tương tác với Custom Resources của Velero (nếu có).

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.concurrent.TimeUnit;

/**
 * Demonstrates automating Velero backup/restore operations from Java by
 * shelling out to the Velero CLI via {@code bash -c}.
 */
public class VeleroAutomation {

    /**
     * Runs a shell command via {@code bash -c}, streaming its combined
     * stdout/stderr to {@code System.out}.
     *
     * @param command the shell command line to execute
     * @throws IOException          if the process cannot be started or its output read
     * @throws InterruptedException if interrupted while waiting for the process
     * @throws RuntimeException     if the command times out (5 minutes) or exits non-zero
     */
    public static void executeCommand(String command) throws IOException, InterruptedException {
        System.out.println("Executing: " + command);
        ProcessBuilder processBuilder = new ProcessBuilder();
        processBuilder.command("bash", "-c", command);
        // Merge stderr into stdout so a single reader drains everything and the
        // child cannot block on a full stderr pipe.
        processBuilder.redirectErrorStream(true);

        Process process = processBuilder.start();
        try {
            // try-with-resources: the reader (and the underlying process stream)
            // is closed even if readLine() throws — the original leaked it.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(process.getInputStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line);
                }
            }

            boolean finished = process.waitFor(5, TimeUnit.MINUTES); // Wait up to 5 minutes
            if (!finished) {
                throw new RuntimeException("Command timed out: " + command);
            }

            int exitVal = process.exitValue();
            if (exitVal == 0) {
                System.out.println("Command completed successfully.");
            } else {
                throw new RuntimeException("Command failed with exit code: " + exitVal + ": " + command);
            }
        } finally {
            // Ensure no orphaned child process survives a timeout or exception path.
            if (process.isAlive()) {
                process.destroyForcibly();
            }
        }
    }

    /**
     * Example driver: creates a backup, lists backups, and prints a sample
     * Velero Schedule manifest. Requires the {@code velero} CLI on PATH and a
     * configured cluster; commented-out lines show further operations.
     */
    public static void main(String[] args) {
        try {
            // Example 1: Create a full cluster backup
            executeCommand("velero backup create my-daily-full-backup --include-cluster-resources=true");

            // Example 2: List all backups
            executeCommand("velero backup get");

            // Example 3: Restore a specific backup
            // executeCommand("velero restore create --from-backup my-daily-full-backup");

            // Example 4: Schedule a backup
            // Note: This creates a Velero Schedule resource, which is then managed by Velero itself.
            String scheduleYaml = """
apiVersion: velero.io/v1
kind: Schedule
metadata:
  name: my-app-daily
spec:
  schedule: "0 3 * * *"
  template:
    includedNamespaces:
    - my-app-namespace
    snapshotVolumes: true
    ttl: 720h0m0s # 30 days
""";
            // To apply this, you'd typically save it to a file and use kubectl apply
            // For demonstration, we'll just print it.
            System.out.println("\nExample Velero Schedule YAML:\n" + scheduleYaml);
            // In a real scenario, write scheduleYaml to a temp file and then:
            // executeCommand("kubectl apply -f /path/to/schedule.yaml");
            // (Escaping the YAML inline through echo is fragile — prefer a temp file.)

        } catch (IOException | InterruptedException e) {
            System.err.println("Automation failed: " + e.getMessage());
            e.printStackTrace();
        } catch (RuntimeException e) {
            System.err.println("Automation error: " + e.getMessage());
            e.printStackTrace();
        }
    }
}

Nội dung đã được mở rộng với detailed recovery scenarios và compliance frameworks, cùng các ví dụ Java.