Linux Docker容器监控与告警实战指南
一 监控方案选型与总体架构
二 快速上手 Prometheus + cAdvisor + Alertmanager + Grafana
# Start cAdvisor in the background; container metrics are served on host port 8080.
# The host paths are mounted read-only so cAdvisor can inspect the root filesystem,
# /var/run, /sys and Docker's data directory.
# The Docker Hub image google/cadvisor is deprecated; releases are published as
# gcr.io/cadvisor/cadvisor. The tag is pinned because "latest" is not reliably
# kept current there -- check the cAdvisor releases page for a newer version.
docker run --name=cadvisor \
  --volume=/:/rootfs:ro \
  --volume=/var/run:/var/run:ro \
  --volume=/sys:/sys:ro \
  --volume=/var/lib/docker/:/var/lib/docker:ro \
  --publish=8080:8080 --detach \
  gcr.io/cadvisor/cadvisor:v0.49.1
# Start Prometheus in the background; the UI/API is published on host port 9090.
# prometheus.yml from the current directory replaces the image's default config.
# The rules directory is mounted as well: rule_files in prometheus.yml points at
# /etc/prometheus/rules/*.rules.yml inside the container, so without this mount
# no alert rule is ever loaded. This assumes the rule files are kept in ./rules
# on the host -- adjust the path if they live elsewhere.
# "$PWD" is quoted so a working directory containing spaces still works.
docker run -d --name=prometheus -p 9090:9090 \
  -v "$PWD/prometheus.yml:/etc/prometheus/prometheus.yml:ro" \
  -v "$PWD/rules:/etc/prometheus/rules:ro" \
  prom/prometheus
# Run Alertmanager detached, publishing port 9093 and supplying alertmanager.yml
# from the current directory as the container's configuration file.
docker run --detach \
  --name=alertmanager \
  --publish 9093:9093 \
  --volume $PWD/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  prom/alertmanager
# Run Grafana detached; the web UI is published on host port 3000.
docker run --detach \
  --name=grafana \
  --publish 3000:3000 \
  grafana/grafana
# prometheus.yml: scrape cAdvisor every 15 seconds.
# Replace <HOST_IP> with the address of the host that publishes cAdvisor on port 8080.
scrape_configs:
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['<HOST_IP>:8080']
    # Job-level settings: they must be nested under the job entry, not at the top level.
    metrics_path: '/metrics'
    scrape_interval: 15s
三 告警规则与通知落地
# Alert rules for containers, based on cAdvisor metrics.
# The name!="" selector keeps only series that carry a container name.
groups:
  - name: container.rules
    rules:
      # Fires when a container's CPU usage rate, summed per container name,
      # stays above 0.8 CPU-seconds per second (80% of one core) for 2 minutes.
      - alert: HighCPUUsage
        expr: sum(rate(container_cpu_usage_seconds_total{name!=""}[1m])) by (name) > 0.8
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "容器 {{ $labels.name }} CPU使用率过高"
          description: "当前使用率: {{ $value | humanizePercentage }}"
      # Fires when memory usage stays above 90% of the configured limit for 5 minutes.
      # cAdvisor reports a limit of 0 for containers without a memory limit; dividing
      # by 0 yields +Inf and would fire for every unlimited container, so series
      # with a zero limit are filtered out before the division.
      - alert: MemoryLeak
        expr: container_memory_usage_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0) > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器 {{ $labels.name }} 内存接近上限"
          description: "已用/上限: {{ $value | humanizePercentage }}"
# prometheus.yml: load every alert rule file matching the glob below.
rule_files:
  - "/etc/prometheus/rules/*.rules.yml"
# Tell Prometheus where Alertmanager listens. Without an alerting section the
# rules are evaluated but firing alerts are never forwarded, so no notification
# is sent. Replace <HOST_IP> with the host that publishes Alertmanager on 9093.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['<HOST_IP>:9093']
# alertmanager.yml: global defaults.
global:
  # Time after which an alert is treated as resolved if it is no longer reported.
  resolve_timeout: 5m
# alertmanager.yml: root route. Every alert is grouped by alert name and sent
# to the 'email' receiver.
route:
  group_by: ['alertname']
  # Wait before sending the first notification for a new group.
  group_wait: 30s
  # Wait before notifying about new alerts added to an already-notified group.
  group_interval: 5m
  # Wait before re-sending a notification for alerts that are still firing.
  repeat_interval: 1h
  receiver: 'email'
# alertmanager.yml: notification targets. The name must match the receiver
# referenced by the route.
receivers:
  - name: 'email'
    email_configs:
      - to: 'ops@example.com'
        from: 'alert@example.com'
        smarthost: 'smtp.example.com:587'
        auth_username: 'user'
        # NOTE(review): placeholder credential. Do not commit a real SMTP
        # password here; recent Alertmanager releases can read it from a file
        # via auth_password_file -- confirm against the deployed version.
        auth_password: 'pass'
# alertmanager.yml: while a critical alert is firing, mute warning alerts that
# share the same alertname and instance label values.
# source_match / target_match are deprecated in current Alertmanager releases;
# the matcher-list form below expresses the same equality matches.
inhibit_rules:
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: ['alertname', 'instance']
四 进阶与运维优化
五 其他工具与场景建议