CentOS 上 Kafka 故障排查手册
一 快速定位流程
二 常见故障与修复要点
三 配置与系统检查清单
四 日志与监控
五 一键排查脚本示例
#!/usr/bin/env bash
#
# Quick health check for a Kafka broker on CentOS.
#
# Reads $KAFKA_HOME/config/server.properties (KAFKA_HOME defaults to
# /opt/kafka, override it through the environment) and reports on:
#   1. listener ports          5. log directories and their permissions
#   2. the Kafka JVM process   6. firewalld port rules
#   3. Zookeeper reachability  7. recent ERROR/Exception lines in server.log
#   4. key broker settings
#
# Every check is best-effort: a missing tool, a missing config file or an
# absent setting is reported and the remaining checks still run.
set -Eeuo pipefail

KAFKA_HOME=${KAFKA_HOME:-/opt/kafka}
KAFKA_CONF=$KAFKA_HOME/config/server.properties

#######################################
# Print the value of a key from a Java properties file.
# Whitespace is stripped; commented-out lines are ignored; the last
# occurrence wins. Prints an empty line and still returns 0 when the key or
# the file is missing, so callers are safe under `set -e` / `pipefail`.
# Arguments: $1 - property key, $2 - file (default: $KAFKA_CONF)
#######################################
get_prop() {
  local key=$1 file=${2:-$KAFKA_CONF}
  local line value=''
  if [[ -r "$file" ]]; then
    while IFS= read -r line || [[ -n "$line" ]]; do
      # Dropping all whitespace also handles "key = value" and CRLF files.
      line=${line//[[:space:]]/}
      [[ "$line" == "$key="* ]] || continue
      value=${line#*=}
    done <"$file"
  fi
  printf '%s\n' "$value"
}

#######################################
# Split a comma-separated list into one non-empty item per line.
# Arguments: $1 - the list (may be empty)
#######################################
split_csv() {
  local rest=${1-} item
  while [[ -n "$rest" ]]; do
    item=${rest%%,*}
    if [[ -n "$item" ]]; then
      printf '%s\n' "$item"
    fi
    if [[ "$rest" == *,* ]]; then
      rest=${rest#*,}
    else
      rest=''
    fi
  done
}

#######################################
# Print the TCP port of every listener, one per line.
# Falls back to 9092 (Kafka's default listener port) when nothing usable
# is configured.
# Arguments: $1 - value of `listeners`, e.g. PLAINTEXT://:9092,SSL://:9093
#######################################
listener_ports() {
  local entry port found=0
  while IFS= read -r entry; do
    port=${entry##*:}
    if [[ "$port" =~ ^[0-9]+$ ]]; then
      printf '%s\n' "$port"
      found=1
    fi
  done < <(split_csv "${1-}")
  if (( found == 0 )); then
    printf '%s\n' 9092
  fi
}

#######################################
# Print "host port" for every server in a Zookeeper connect string.
# Handles multiple hosts, an optional trailing chroot path and a missing
# port (2181 is assumed).
# Arguments: $1 - value of `zookeeper.connect`, e.g. h1:2181,h2:2181/kafka
#######################################
zk_endpoints() {
  local hosts=${1-} entry host port
  hosts=${hosts%%/*} # the chroot path is not part of any host entry
  while IFS= read -r entry; do
    if [[ "$entry" == *:* ]]; then
      host=${entry%:*}
      port=${entry##*:}
    else
      host=$entry
      port=2181
    fi
    # nc expects a bare IPv6 address, not the bracketed [::1] form.
    host=${host#\[}
    host=${host%\]}
    printf '%s %s\n' "$host" "$port"
  done < <(split_csv "$hosts")
}

# [1/7] Report whether each listener port has a listening socket.
check_ports() {
  local port
  echo "[1/7] 端口 ${PORTS[*]} 占用情况"
  if ! command -v ss >/dev/null 2>&1; then
    echo "ss 未安装,跳过端口检查"
    return 0
  fi
  for port in "${PORTS[@]}"; do
    # Leading ':' and trailing whitespace pin the exact port, so 9092 does
    # not also match 90921.
    if ! ss -lntp | grep -E ":${port}[[:space:]]"; then
      echo "端口 $port 未被监听"
    fi
  done
}

# [2/7] Show the Kafka broker JVM(s), matched on the kafka.Kafka main class
# rather than on every java process.
check_process() {
  local pids
  echo "[2/7] Kafka 进程"
  if ! command -v pgrep >/dev/null 2>&1; then
    echo "pgrep 未安装,跳过进程检查"
    return 0
  fi
  pids=$(pgrep -d, -f 'kafka\.Kafka' || true)
  if [[ -n "$pids" ]]; then
    ps -fp "$pids" || echo "未检测到 Kafka Java 进程"
  else
    echo "未检测到 Kafka Java 进程"
  fi
}

# [3/7] Probe every Zookeeper server from zookeeper.connect with nc.
check_zookeeper() {
  local zk_host zk_port
  echo "[3/7] Zookeeper 连通性"
  if [[ -z "$ZK_CONN" ]]; then
    echo "未配置 zookeeper.connect,跳过 Zookeeper 检查 (KRaft 模式无需 Zookeeper)"
    return 0
  fi
  if ! command -v nc >/dev/null 2>&1; then
    echo "nc 未安装,跳过 Zookeeper 端口探测"
    return 0
  fi
  while read -r zk_host zk_port; do
    # stdin comes from /dev/null so nc cannot swallow the loop's input.
    if timeout 3 nc -z "$zk_host" "$zk_port" </dev/null; then
      echo "Zookeeper $zk_host:$zk_port 可达"
    else
      echo "Zookeeper $zk_host:$zk_port 不可达"
    fi
  done < <(zk_endpoints "$ZK_CONN")
}

# [4/7] Print the settings this script relies on.
show_config() {
  echo "[4/7] 关键配置"
  echo "broker.id: $(get_prop broker.id)"
  echo "log.dirs: $LOG_DIR"
  echo "zookeeper.connect: $ZK_CONN"
  echo "listeners: $LISTENERS"
  echo "advertised.listeners: $ADV_LISTENERS"
}

# [5/7] Check every directory listed in log.dirs (it may be a comma list).
check_log_dirs() {
  local dir
  echo "[5/7] 目录与权限"
  if [[ -z "$LOG_DIR" ]]; then
    echo "未配置 log.dirs"
    return 0
  fi
  while IFS= read -r dir; do
    if [[ -d "$dir" ]]; then
      echo "目录 $dir 存在,权限: $(stat -c '%A %U:%G' "$dir")"
    else
      echo "目录 $dir 不存在"
    fi
  done < <(split_csv "$LOG_DIR")
}

# [6/7] Ask firewalld whether each listener port is open.
# Only explicit port rules are checked; a port opened through a service
# definition or a rich rule is reported as not opened.
check_firewall() {
  local port
  echo "[6/7] 防火墙状态"
  if ! command -v firewall-cmd >/dev/null 2>&1; then
    echo "firewalld 未安装"
    return 0
  fi
  if ! firewall-cmd --state >/dev/null 2>&1; then
    echo "firewalld 未运行"
    return 0
  fi
  for port in "${PORTS[@]}"; do
    # --query-port is an exact match, unlike grepping --list-ports
    # (where 9092 would also match 19092/tcp).
    if firewall-cmd --quiet --query-port="${port}/tcp"; then
      echo "防火墙已放行 $port/tcp"
    else
      echo "防火墙未放行 $port/tcp"
    fi
  done
}

# [7/7] Show up to 20 ERROR/Exception lines from the last 50 lines of the
# first server.log found.
check_server_log() {
  local candidate server_log=''
  echo "[7/7] 最近错误日志 (server.log)"
  for candidate in "$KAFKA_HOME/logs/server.log" "/var/log/kafka/server.log"; do
    if [[ -f "$candidate" ]]; then
      server_log=$candidate
      break
    fi
  done
  if [[ -z "$server_log" ]]; then
    echo "未找到 server.log"
    return 0
  fi
  # With pipefail the pipeline is non-zero when grep finds nothing.
  if ! tail -n 50 -- "$server_log" | grep -i -E 'ERROR|Exception' | tail -n 20; then
    echo "最近 50 行中未检索到 ERROR/Exception"
  fi
}

# Load the configuration once, then run the seven checks in order.
main() {
  local port

  LOG_DIR=$(get_prop log.dirs)
  ZK_CONN=$(get_prop zookeeper.connect)
  LISTENERS=$(get_prop listeners)
  ADV_LISTENERS=$(get_prop advertised.listeners)
  PORTS=()
  while IFS= read -r port; do
    PORTS+=("$port")
  done < <(listener_ports "$LISTENERS")

  echo "=== Kafka 快速健康检查 ==="
  echo "时间: $(date '+%F %T')"
  if [[ ! -r "$KAFKA_CONF" ]]; then
    echo "警告: 无法读取配置文件 $KAFKA_CONF" >&2
  fi
  echo
  check_ports
  echo
  check_process
  echo
  check_zookeeper
  echo
  show_config
  echo
  check_log_dirs
  echo
  check_firewall
  echo
  check_server_log
}

main "$@"
提示:运行前请确认 KAFKA_HOME 与实际安装路径一致。脚本默认使用 /opt/kafka,也可以在执行时通过环境变量覆盖(例如 KAFKA_HOME=/usr/local/kafka bash <脚本文件>),无需修改脚本本身;若使用 systemd 管理服务,可先执行 systemctl status kafka 获取更详细的启动失败原因。