Monitoring & Logging - Kubernetes Observability
Tổng quan
Monitoring và logging là hai thành phần thiết yếu khi vận hành Kubernetes. Observability bao gồm ba trụ cột: metrics, logs và traces.
Prometheus & Grafana
Prometheus Installation
# prometheus-config.yaml
# ConfigMap that carries the Prometheus server configuration file.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
data:
  # The block scalar below is the literal content of prometheus.yml.
  prometheus.yml: |
    global:
      scrape_interval: 15s
    scrape_configs:
      # Discover scrape targets from the pods known to the Kubernetes API.
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
Grafana Dashboard
# Service that exposes the Grafana pods (label app=grafana) on port 3000.
apiVersion: v1
kind: Service
metadata:
  name: grafana
spec:
  ports:
    - port: 3000
      targetPort: 3000
  selector:
    app: grafana
  # LoadBalancer asks the cluster's provider for an externally reachable address.
  type: LoadBalancer
Metrics
Key Metrics
- CPU utilization
- Memory usage
- Network I/O
- Disk usage
- Pod status
- API server latency
Custom Metrics (Java Example)
Bạn có thể sử dụng thư viện Prometheus Java client để tạo custom metrics trong ứng dụng Java của mình.
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import io.prometheus.client.exporter.HTTPServer;
import java.io.IOException;
/**
 * Demonstrates custom Prometheus metrics: a request counter and a request
 * duration histogram, exposed over HTTP so that Prometheus can scrape them.
 */
public class CustomMetricsExample {
    /** Incremented once per call to {@link #processRequest()}. */
    private static final Counter requests = Counter.build()
            .name("my_app_requests_total").help("Total requests.").register();

    /** Records how long each call to {@link #processRequest()} takes, in seconds. */
    private static final Histogram requestLatency = Histogram.build()
            .name("my_app_request_duration_seconds").help("Request duration in seconds.").register();

    /**
     * Simulates handling one request: bumps the counter and observes the
     * elapsed time in the histogram, even when the simulated work is interrupted.
     */
    public static void processRequest() {
        requests.inc(); // Count every request
        Histogram.Timer requestTimer = requestLatency.startTimer();
        try {
            // Simulate some work
            Thread.sleep(100);
        } catch (InterruptedException e) {
            // Preserve the interrupt so the caller can decide to stop.
            Thread.currentThread().interrupt();
        } finally {
            requestTimer.observeDuration(); // Record the request processing time
        }
    }

    /**
     * Starts the metrics endpoint on port 8080 and generates simulated requests
     * until the thread is interrupted.
     *
     * @param args unused
     * @throws IOException if the HTTP server cannot be started
     */
    public static void main(String[] args) throws IOException {
        // Start the HTTP server Prometheus scrapes; try-with-resources stops it on exit.
        try (HTTPServer server = new HTTPServer(8080)) {
            System.out.println("Prometheus metrics exposed on port 8080");
            // Stop once interrupted: with the flag set every sleep throws
            // immediately, so an unconditional loop would spin at full speed.
            while (!Thread.currentThread().isInterrupted()) {
                processRequest();
                try {
                    Thread.sleep(500);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        }
    }
}
Logging
Container Logs
# Print the logs of a pod
kubectl logs pod-name
# Keep streaming new log lines as they are written
kubectl logs --follow pod-name
# Print the logs of one named container inside the pod
kubectl logs pod-name --container container-name
ELK Stack
- Elasticsearch: Log storage
- Logstash: Log processing
- Kibana: Log visualization
Fluentd
# DaemonSet that runs one Fluentd pod per node.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: fluentd
spec:
  selector:
    matchLabels:
      name: fluentd
  template:
    metadata:
      labels:
        # Must match spec.selector.matchLabels above.
        name: fluentd
    spec:
      containers:
        - name: fluentd
          image: fluent/fluentd-kubernetes-daemonset:v1-debian-elasticsearch
          # NOTE(review): no volumes are mounted here; reading node log files
          # presumably needs hostPath mounts (e.g. /var/log) - confirm against
          # the image's reference manifest.
Structured Logging in Java (Logback/Log4j2)
Sử dụng thư viện logging trong Java để tạo structured logs, giúp dễ dàng phân tích và tìm kiếm.
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;
/**
 * Shows how to attach per-request context (order and user identifiers) to log
 * events through the SLF4J MDC.
 */
public class StructuredLoggingExample {
    private static final Logger logger = LoggerFactory.getLogger(StructuredLoggingExample.class);

    /**
     * Logs the processing of a single order with {@code orderId} and
     * {@code userId} present in the MDC for the duration of the call.
     *
     * @param orderId identifier stored under the MDC key {@code orderId}
     * @param userId  identifier stored under the MDC key {@code userId}
     */
    public void processOrder(String orderId, String userId) {
        MDC.put("orderId", orderId);
        MDC.put("userId", userId);
        try {
            logger.info("Processing order.");
            // Simulated outcome: roughly one order in ten fails at payment.
            boolean paymentFailed = Math.random() < 0.1;
            if (!paymentFailed) {
                logger.debug("Order processed successfully.");
            } else {
                RuntimeException cause = new RuntimeException("Payment gateway error");
                logger.error("Failed to process payment for order.", cause);
            }
        } finally {
            // Always clear the keys so they do not leak into later log events on this thread.
            MDC.remove("orderId");
            MDC.remove("userId");
        }
    }

    /**
     * Processes two sample orders.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        StructuredLoggingExample example = new StructuredLoggingExample();
        String[][] sampleOrders = {
            {"ORD-12345", "USER-67890"},
            {"ORD-54321", "USER-09876"},
        };
        for (String[] order : sampleOrders) {
            example.processOrder(order[0], order[1]);
        }
    }
}
Để Logback/Log4j2 xuất JSON logs, bạn cần cấu hình logback.xml hoặc log4j2.xml tương ứng. Ví dụ với Logback:
<!-- logback.xml -->
<configuration>
  <!-- Human-readable output on the console. -->
  <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
    <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
      <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
    </encoder>
  </appender>
  <!-- One JSON-shaped object per line, written to logs/app.json. -->
  <appender name="JSON_FILE" class="ch.qos.logback.core.FileAppender">
    <file>logs/app.json</file>
    <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
      <!-- %mdc prints the MDC map (%m is only an alias for the message). The
           closing brace comes before %n so each object stays on one line.
           Values are not JSON-escaped by a pattern layout: a message that
           contains a double quote still yields an invalid line. -->
      <pattern>{"timestamp":"%d{yyyy-MM-dd'T'HH:mm:ss.SSSZ}", "level":"%level", "thread":"%thread", "logger":"%logger{36}", "message":"%msg", "mdc":"%mdc"}%n</pattern>
    </encoder>
  </appender>
  <root level="info">
    <appender-ref ref="CONSOLE" />
    <appender-ref ref="JSON_FILE" />
  </root>
</configuration>
Alerting
Prometheus Alerting Rules (AlertManager)
# Prometheus alerting rule file.
groups:
  - name: kubernetes
    rules:
      # Fires when a container's restart counter has kept increasing over the
      # last 15 minutes and the condition has held for 5 minutes.
      - alert: PodCrashLooping
        expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pod {{ $labels.pod }} is crash looping"
Tracing
Jaeger
Distributed tracing cho microservices.
OpenTelemetry (Java Example)
Sử dụng OpenTelemetry để instrument ứng dụng Java và gửi traces đến Jaeger.
import io.opentelemetry.api.OpenTelemetry;
import io.opentelemetry.api.trace.Span;
import io.opentelemetry.api.trace.Tracer;
import io.opentelemetry.sdk.OpenTelemetrySdk;
import io.opentelemetry.sdk.trace.SdkTracerProvider;
import io.opentelemetry.sdk.trace.export.BatchSpanProcessor;
import io.opentelemetry.exporter.jaeger.thrift.JaegerThriftSpanExporter;
public class OpenTelemetryExample {
private static final String SERVICE_NAME = "my-java-app";
private static final String JAEGER_HOST = "localhost";
private static final int JAEGER_PORT = 6831;
private static OpenTelemetry openTelemetry;
private static Tracer tracer;
public static void initOpenTelemetry() {
JaegerThriftSpanExporter jaegerExporter = JaegerThriftSpanExporter.builder()
.setEndpoint(String.format("http://%s:%d", JAEGER_HOST, JAEGER_PORT))
.build();
SdkTracerProvider tracerProvider = SdkTracerProvider.builder()
.addSpanProcessor(BatchSpanProcessor.builder(jaegerExporter).build())
.build();
openTelemetry = OpenTelemetrySdk.builder()
.setTracerProvider(tracerProvider)
.buildAndRegisterGlobal();
tracer = openTelemetry.getTracer(SERVICE_NAME);
}
public void doSomeWork() {
Span parentSpan = tracer.spanBuilder("parent-operation").startSpan();
try {
// Put the span into the current Context
// Make sure to close the scope after the operation
try (io.opentelemetry.context.Scope scope = parentSpan.makeCurrent()) {
System.out.println("Doing some work...");
Span childSpan = tracer.spanBuilder("child-operation").startSpan();
try {
Thread.sleep(50);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
} finally {
childSpan.end();
}
}
} finally {
parentSpan.end();
}
}
public static void main(String[] args) {
initOpenTelemetry();
OpenTelemetryExample app = new OpenTelemetryExample();
app.doSomeWork();
// Ensure spans are flushed before application exits
((SdkTracerProvider) openTelemetry.getTracerProvider()).shutdown();
}
}
Best Practices
- Monitor golden signals (latency, traffic, errors, saturation)
- Set up proper alerting rules
- Use structured logging
- Implement health checks
- Monitor cluster resources
- Set up log retention policies
Tools Comparison
| Tool | Type | Pros | Cons |
|---|---|---|---|
| Prometheus | Metrics | Pull-based, powerful queries | Resource intensive |
| Grafana | Visualization | Rich dashboards | Complex setup |
| ELK Stack | Logging | Full-text search | Heavy resource usage |
| Jaeger | Tracing | Good for microservices | Learning curve |
Nội dung đã được mở rộng với hands-on monitoring setup và alerting best practices, cùng các ví dụ Java.