# Cluster Management - Kubernetes Operations

## Overview

Cluster management covers the installation, configuration, upgrades, and maintenance of Kubernetes clusters.
## Cluster Installation

### kubeadm Installation

```bash
# Initialize the control plane
sudo kubeadm init --pod-network-cidr=10.244.0.0/16

# Configure kubectl for the current user
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config

# Install a network plugin (Flannel)
kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/master/Documentation/kube-flannel.yml

# Join worker nodes (run on each worker)
kubeadm join <master-ip>:6443 --token <token> --discovery-token-ca-cert-hash sha256:<hash>
```
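Once kubectl is configured, the same sanity checks can be run from code. A minimal sketch using the Fabric8 Kubernetes Client (6.x assumed; it reads `$HOME/.kube/config` by default):

```java
import io.fabric8.kubernetes.api.model.Node;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientBuilder;

public class ClusterCheck {
    public static void main(String[] args) {
        // Connects using the kubeconfig written above ($HOME/.kube/config)
        try (KubernetesClient client = new KubernetesClientBuilder().build()) {
            // Report the API server version
            System.out.println("Server version: " + client.getKubernetesVersion().getGitVersion());
            // List every node that has joined the cluster
            for (Node node : client.nodes().list().getItems()) {
                System.out.println("Node joined: " + node.getMetadata().getName());
            }
        }
    }
}
```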
### Cluster Configuration

```yaml
# kubeadm-config.yaml
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
kubernetesVersion: v1.27.0
controlPlaneEndpoint: "cluster-endpoint:6443"
networking:
  serviceSubnet: "10.96.0.0/12"
  podSubnet: "10.244.0.0/16"
etcd:
  external:
    endpoints:
      - https://10.2.0.1:2379
      - https://10.2.0.2:2379
      - https://10.2.0.3:2379
```
## Node Management

### Adding Nodes

```bash
# Generate a join token along with the full join command
kubeadm token create --print-join-command

# Label nodes
kubectl label nodes worker-1 node-role.kubernetes.io/worker=worker

# Taint the control-plane node so regular workloads are not scheduled on it
kubectl taint nodes master-1 node-role.kubernetes.io/control-plane:NoSchedule
```
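Node labeling can also be automated with the Fabric8 client used in the Java example later in this section. A minimal sketch, assuming Fabric8 6.x and a node named `worker-1`:

```java
import io.fabric8.kubernetes.api.model.NodeBuilder;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientBuilder;

public class NodeLabeler {
    public static void main(String[] args) {
        try (KubernetesClient client = new KubernetesClientBuilder().build()) {
            // Equivalent of: kubectl label nodes worker-1 node-role.kubernetes.io/worker=worker
            client.nodes().withName("worker-1").edit(node -> new NodeBuilder(node)
                .editMetadata()
                    .addToLabels("node-role.kubernetes.io/worker", "worker")
                .endMetadata()
                .build());
        }
    }
}
```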
### Node Maintenance

```bash
# Drain a node for maintenance
kubectl drain node-1 --ignore-daemonsets --delete-emptydir-data

# Mark a node unschedulable
kubectl cordon node-1

# Mark a node schedulable again
kubectl uncordon node-1

# Remove a node from the cluster
kubectl delete node node-1
```
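`cordon` and `uncordon` simply toggle `spec.unschedulable` on the Node object, which makes them easy to script. A hedged Fabric8 sketch (the node name `node-1` is a placeholder):

```java
import io.fabric8.kubernetes.api.model.NodeBuilder;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientBuilder;

public class NodeCordon {
    // Sets spec.unschedulable, which is what kubectl cordon/uncordon does under the hood
    static void setUnschedulable(KubernetesClient client, String nodeName, boolean unschedulable) {
        client.nodes().withName(nodeName).edit(node -> new NodeBuilder(node)
            .editOrNewSpec().withUnschedulable(unschedulable).endSpec()
            .build());
    }

    public static void main(String[] args) {
        try (KubernetesClient client = new KubernetesClientBuilder().build()) {
            setUnschedulable(client, "node-1", true);   // cordon
            // ... perform maintenance ...
            setUnschedulable(client, "node-1", false);  // uncordon
        }
    }
}
```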
## Cluster Upgrades

### kubeadm Upgrade Process

```bash
# Upgrade the kubeadm package first
sudo apt-mark unhold kubeadm && \
sudo apt-get update && sudo apt-get install -y kubeadm=1.27.1-00 && \
sudo apt-mark hold kubeadm

# Check the upgrade plan
sudo kubeadm upgrade plan

# Upgrade the control plane
sudo kubeadm upgrade apply v1.27.1

# Upgrade kubelet and kubectl, then restart the kubelet
sudo apt-mark unhold kubelet kubectl && \
sudo apt-get update && sudo apt-get install -y kubelet=1.27.1-00 kubectl=1.27.1-00 && \
sudo apt-mark hold kubelet kubectl
sudo systemctl restart kubelet
```
### Rolling Node Upgrades

```bash
# For each node:

# 1. Drain the node
kubectl drain node-1 --ignore-daemonsets --delete-emptydir-data

# 2. Upgrade packages on the node
sudo apt-get update
sudo apt-get install -y kubelet=1.27.1-00 kubectl=1.27.1-00
sudo systemctl restart kubelet

# 3. Uncordon the node
kubectl uncordon node-1
```
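The drain step itself can also be scripted: cordon the node, then evict every pod not owned by a DaemonSet. A rough Fabric8 sketch, not a full replacement for `kubectl drain` (evictions blocked by a PodDisruptionBudget simply fail here instead of being retried):

```java
import io.fabric8.kubernetes.api.model.NodeBuilder;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientBuilder;

public class DrainSketch {
    public static void main(String[] args) {
        String nodeName = "node-1"; // placeholder
        try (KubernetesClient client = new KubernetesClientBuilder().build()) {
            // Cordon: mark the node unschedulable
            client.nodes().withName(nodeName).edit(n -> new NodeBuilder(n)
                .editOrNewSpec().withUnschedulable(true).endSpec().build());

            // Evict all pods scheduled on the node, skipping DaemonSet-managed pods
            for (Pod pod : client.pods().inAnyNamespace()
                    .withField("spec.nodeName", nodeName).list().getItems()) {
                boolean daemonSetPod = pod.getMetadata().getOwnerReferences().stream()
                    .anyMatch(ref -> "DaemonSet".equals(ref.getKind()));
                if (!daemonSetPod) {
                    client.pods()
                        .inNamespace(pod.getMetadata().getNamespace())
                        .withName(pod.getMetadata().getName())
                        .evict(); // uses the Eviction API, so PodDisruptionBudgets are respected
                }
            }
        }
    }
}
```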
## etcd Management

### etcd Backup

```bash
# Create a snapshot
ETCDCTL_API=3 etcdctl snapshot save backup.db \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/server.crt \
  --key=/etc/kubernetes/pki/etcd/server.key

# Verify the snapshot
ETCDCTL_API=3 etcdctl snapshot status backup.db --write-out=table
```
### Automated etcd Backup with a CronJob

```yaml
apiVersion: batch/v1
kind: CronJob
metadata:
  name: etcd-backup
  namespace: kube-system
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: etcd-backup
              image: k8s.gcr.io/etcd:3.5.0-0
              command:
                - /bin/sh
                - -c
                - |
                  ETCDCTL_API=3 etcdctl snapshot save /backup/etcd-$(date +%Y%m%d-%H%M%S).db \
                    --endpoints=https://etcd:2379 \
                    --cacert=/etc/etcd/ca.crt \
                    --cert=/etc/etcd/server.crt \
                    --key=/etc/etcd/server.key
              volumeMounts:
                - name: backup-storage
                  mountPath: /backup
                - name: etcd-certs
                  mountPath: /etc/etcd
          volumes:
            - name: backup-storage
              persistentVolumeClaim:
                claimName: etcd-backup-pvc  # Ensure this PVC exists and is backed by persistent storage
            - name: etcd-certs
              secret:
                secretName: etcd-certs  # Secret containing the etcd client certificates
          restartPolicy: OnFailure
```
### etcd Restore

```bash
# Stop kube-apiserver by moving its static pod manifest away
sudo mv /etc/kubernetes/manifests/kube-apiserver.yaml /tmp/

# Restore from the snapshot
ETCDCTL_API=3 etcdctl snapshot restore backup.db \
  --data-dir=/var/lib/etcd-restore \
  --name etcd-1 \
  --initial-cluster etcd-1=https://10.0.0.1:2380 \
  --initial-advertise-peer-urls https://10.0.0.1:2380

# Swap in the restored data directory
sudo mv /var/lib/etcd /var/lib/etcd-backup
sudo mv /var/lib/etcd-restore /var/lib/etcd
sudo chown -R etcd:etcd /var/lib/etcd

# Start kube-apiserver again
sudo mv /tmp/kube-apiserver.yaml /etc/kubernetes/manifests/
```
## Resource Management

### Resource Quotas

```yaml
apiVersion: v1
kind: ResourceQuota
metadata:
  name: compute-quota
  namespace: production
spec:
  hard:
    requests.cpu: "100"
    requests.memory: 200Gi
    limits.cpu: "200"
    limits.memory: 400Gi
    persistentvolumeclaims: "10"
    pods: "100"
```
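The same quota can be created programmatically; a minimal Fabric8 sketch mirroring the YAML above (the `production` namespace is assumed to exist):

```java
import io.fabric8.kubernetes.api.model.Quantity;
import io.fabric8.kubernetes.api.model.ResourceQuota;
import io.fabric8.kubernetes.api.model.ResourceQuotaBuilder;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientBuilder;

public class QuotaCreator {
    public static void main(String[] args) {
        try (KubernetesClient client = new KubernetesClientBuilder().build()) {
            // Build the same quota as the YAML above
            ResourceQuota quota = new ResourceQuotaBuilder()
                .withNewMetadata().withName("compute-quota").withNamespace("production").endMetadata()
                .withNewSpec()
                    .addToHard("requests.cpu", new Quantity("100"))
                    .addToHard("requests.memory", new Quantity("200Gi"))
                    .addToHard("limits.cpu", new Quantity("200"))
                    .addToHard("limits.memory", new Quantity("400Gi"))
                    .addToHard("persistentvolumeclaims", new Quantity("10"))
                    .addToHard("pods", new Quantity("100"))
                .endSpec()
                .build();
            client.resourceQuotas().inNamespace("production").createOrReplace(quota);
        }
    }
}
```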
### Limit Ranges

```yaml
apiVersion: v1
kind: LimitRange
metadata:
  name: limit-range
  namespace: production
spec:
  limits:
    - default:
        memory: "512Mi"
        cpu: "500m"
      defaultRequest:
        memory: "256Mi"
        cpu: "100m"
      type: Container
```
## High Availability

### Control Plane HA

The API servers sit behind a TCP load balancer, which is what `controlPlaneEndpoint` in the kubeadm config points at. Example nginx configuration (this must live inside a `stream {}` block, since it balances raw TCP connections to port 6443):

```nginx
upstream kubernetes {
    server 10.0.0.1:6443;
    server 10.0.0.2:6443;
    server 10.0.0.3:6443;
}

server {
    listen 6443;
    proxy_pass kubernetes;
}
```
### etcd HA

A 3-node etcd cluster keeps quorum (2 of 3) through the loss of a single member:

```
etcd-1: https://10.0.0.1:2379
etcd-2: https://10.0.0.2:2379
etcd-3: https://10.0.0.3:2379
```
## Monitoring & Health Checks

### Cluster Health

```bash
# Check cluster info
kubectl cluster-info

# Check node status
kubectl get nodes

# Check component status (deprecated since v1.19)
kubectl get cs

# Check pod status across namespaces
kubectl get pods --all-namespaces
```
### Performance Monitoring

```bash
# Top nodes (requires metrics-server)
kubectl top nodes

# Top pods
kubectl top pods --all-namespaces

# Describe node resources
kubectl describe node node-1
```
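The capacity and allocatable figures printed by `kubectl describe node` live on the Node status, so they are easy to collect from code. A small Fabric8 sketch:

```java
import io.fabric8.kubernetes.api.model.Node;
import io.fabric8.kubernetes.api.model.Quantity;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientBuilder;

import java.util.Map;

public class NodeResources {
    public static void main(String[] args) {
        try (KubernetesClient client = new KubernetesClientBuilder().build()) {
            for (Node node : client.nodes().list().getItems()) {
                // Allocatable = capacity minus system reservations
                Map<String, Quantity> alloc = node.getStatus().getAllocatable();
                System.out.printf("%s: cpu=%s, memory=%s, pods=%s%n",
                    node.getMetadata().getName(),
                    alloc.get("cpu"), alloc.get("memory"), alloc.get("pods"));
            }
        }
    }
}
```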
## Troubleshooting

### Common Issues

```bash
# Check kubelet logs
sudo journalctl -u kubelet -f

# Check system pods
kubectl get pods -n kube-system

# Check events
kubectl get events --sort-by=.metadata.creationTimestamp

# Check certificate expiration
sudo kubeadm certs check-expiration
```
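For a quick programmatic sweep of `kube-system`, the sketch below flags pods whose phase is neither `Running` nor `Succeeded` (a rough heuristic, not a full diagnosis):

```java
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientBuilder;

public class SystemPodCheck {
    public static void main(String[] args) {
        try (KubernetesClient client = new KubernetesClientBuilder().build()) {
            for (Pod pod : client.pods().inNamespace("kube-system").list().getItems()) {
                String phase = pod.getStatus().getPhase();
                // Flag anything that is not Running or Succeeded
                if (!"Running".equals(phase) && !"Succeeded".equals(phase)) {
                    System.out.println("Unhealthy: " + pod.getMetadata().getName()
                        + " (phase: " + phase + ")");
                }
            }
        }
    }
}
```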
## Best Practices
- Regular backups of etcd
- Monitor cluster health metrics
- Plan upgrade windows
- Use infrastructure as code
- Implement proper RBAC
- Regular security updates
- Document procedures
- Test disaster recovery
## Automation Tools
| Tool | Purpose | Use Case |
|---|---|---|
| Cluster API | Declarative cluster management | Multi-cloud |
| kops | Production cluster management | AWS |
| Rancher | Cluster management platform | Enterprise |
| Terraform | Infrastructure as code | Cloud provisioning |
## Java Example: Automating Cluster Operations with Fabric8

You can use the Fabric8 Kubernetes Client to automate cluster-management tasks such as checking node status, deploying applications, or performing maintenance operations.
```java
import io.fabric8.kubernetes.api.model.Node;
import io.fabric8.kubernetes.api.model.NodeList;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.api.model.PodList;
import io.fabric8.kubernetes.api.model.apps.Deployment;
import io.fabric8.kubernetes.api.model.apps.DeploymentBuilder;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientBuilder;

public class ClusterAutomation {
    public static void main(String[] args) {
        try (KubernetesClient client = new KubernetesClientBuilder().build()) {
            // 1. Check node health
            System.out.println("\n--- Node Health ---");
            NodeList nodeList = client.nodes().list();
            for (Node node : nodeList.getItems()) {
                boolean isReady = node.getStatus().getConditions().stream()
                    .anyMatch(c -> c.getType().equals("Ready") && c.getStatus().equals("True"));
                System.out.println("Node: " + node.getMetadata().getName() + ", Ready: " + isReady);
            }

            // 2. Deploy a new application (if not already deployed)
            String appName = "automated-nginx";
            if (client.apps().deployments().inNamespace("default").withName(appName).get() == null) {
                Deployment deployment = new DeploymentBuilder()
                    .withNewMetadata().withName(appName).endMetadata()
                    .withNewSpec()
                        .withReplicas(2)
                        .withNewSelector().addToMatchLabels("app", appName).endSelector()
                        .withNewTemplate()
                            .withNewMetadata().addToLabels("app", appName).endMetadata()
                            .withNewSpec()
                                .addNewContainer()
                                    .withName(appName)
                                    .withImage("nginx:latest")
                                    .addNewPort().withContainerPort(80).endPort()
                                .endContainer()
                            .endSpec()
                        .endTemplate()
                    .endSpec()
                    .build();
                client.apps().deployments().inNamespace("default").createOrReplace(deployment);
                System.out.println("\nDeployment " + appName + " created.");
            } else {
                System.out.println("\nDeployment " + appName + " already exists.");
            }

            // 3. Scale up the deployment
            System.out.println("\nScaling deployment " + appName + " to 3 replicas...");
            client.apps().deployments().inNamespace("default").withName(appName).scale(3, true);
            System.out.println("Deployment " + appName + " scaled.");

            // 4. List the pods belonging to the deployment
            System.out.println("\nPods for " + appName + ":");
            PodList pods = client.pods().inNamespace("default").withLabel("app", appName).list();
            for (Pod pod : pods.getItems()) {
                System.out.println("- " + pod.getMetadata().getName() + " (Status: " + pod.getStatus().getPhase() + ")");
            }
        } catch (Exception e) {
            System.err.println("Error during cluster automation: " + e.getMessage());
            e.printStackTrace();
        }
    }
}
```
## Next Steps

- 📚 Learn about Troubleshooting
- 🎯 Practice cluster upgrades
- 🏗️ Explore cluster automation
- 💻 Set up monitoring