Nagios检测一些记序

发布时间：2020-06-25 14:14:15 作者：煮酒品茶
来源：网络阅读：6291

检测命令篇：
文中内容包括：叙述 nagios 从发现主机、到 web 界面显示出状态、再到邮件报警的整个过程，方便以后进行排错；另外还介绍如何为特定应用编写特定的检测程序。
煮酒品茶：文章需要改进的有如何做触发报警的条件，警告等。

更新：
1、补充说明一些宏定义的资料。

$ARGn$：The nth argument passed to the command (notification, event handler, service check, etc.). Nagios supports up to 32 argument macros ($ARG1$ through $ARG32$).
$USERn$：The nth user-definable macro. User macros can be defined in one or more resource files. Nagios supports up to 256 user macros ($USER1$ through $USER256$).

2、解决问题：报警的四种状态（OK、WARNING、CRITICAL、UNKNOWN）。

Nagios检测一些记序

服务和主机配置文件中的“check_command check-host-alive”是什么意思呢？

[root@weihack objects]# pwd

/usr/local/nagios/etc/objects

[root@weihack objects]# cat commands.cfg #发现这么一项：

define command{

command_name check-host-alive

command_line $USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5

}

看看命令行：$USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5

[root@weihack libexec]# pwd

/usr/local/nagios/libexec

[root@weihack libexec]# ./check_ping -H 192.168.100.85 -w 3000.0,80% -c 5000.0,100% -p 5

PING OK - Packet loss = 0%, RTA = 0.05 ms|rta=0.055000ms;3000.000000;5000.000000;0.000000 pl=0%;80;100;0

# Web界面 Status Information 里面是不是出现了PING OK - Packet loss = 0%, RTA = 0.05 ms这样的东东。

# 用 -h 可以列出使用方法（见下）。由此可以看出：对 192.168.100.85 执行 ping，平均往返时间（RTA）达到 3000ms 或丢包率达到 80% 时警告，达到 5000ms 或丢包率达到 100% 时严重报警；每次发送五个包，目前 RTA 为 0.055ms。这样就很清晰了。

[root@weihack libexec]# ./check_ping -h

Use ping to check connection statistics for a remote host.

Usage:check_ping -H <host_address> -w <wrta>,<wpl>% -c <crta>,<cpl>%

[-p packets] [-t timeout] [-4|-6]

Options:

-h, --help

Print detailed help screen

-V, --version

Print version information

-4, --use-ipv4

Use IPv4 connection

-6, --use-ipv6

Use IPv6 connection

-H, --hostname=HOST

host to ping

-w, --warning=THRESHOLD

warning threshold pair

-c, --critical=THRESHOLD

critical threshold pair

-p, --packets=INTEGER

number of ICMP ECHO packets to send (Default: 5)

-L, --link

show HTML in the plugin output (obsoleted by urlize)

-t, --timeout=INTEGER

Seconds before connection times out (default: 10)

# 到底有多少个这样定义的命令呢？

[root@weihack objects]# cat commands.cfg |grep command_name

command_name notify-host-by-email

command_name notify-service-by-email

command_name check-host-alive

command_name check_local_disk

command_name check_local_load

command_name check_local_procs

command_name check_local_users

command_name check_local_swap

command_name check_local_mrtgtraf

command_name check_ftp

command_name check_hpjd

command_name check_snmp

command_name check_http

command_name check_ssh

command_name check_dhcp

command_name check_ping

command_name check_pop

command_name check_imap

command_name check_smtp

command_name check_tcp

command_name check_udp

command_name check_nt

command_name process-host-perfdata

command_name process-service-perfdata

#挑一条notify-host-by-email，可以清楚的看到发送邮件的过程。

define command{

command_name notify-host-by-email

command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /bin/mail -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$

}

#打散开来看，是不是可以定制邮件发送格式了？

/usr/bin/printf "%b" "***** Nagios *****\n\n

Notification Type: $NOTIFICATIONTYPE$\n

Host: $HOSTNAME$\nState: $HOSTSTATE$\n

Address: $HOSTADDRESS$\n

Info: $HOSTOUTPUT$\n\n

Date/Time: $LONGDATETIME$\n

" | /bin/mail -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$

#到手的邮件是这样子的。

主　题： ** RECOVERY Host Alert: rsync-89 is UP ** [新窗口打开]

时　间： 2013-03-13 22:57 (星期三)

发件人： nagios<nagios@phx2-ss-5-lb.cnet.com> [添加联系人] [邮件往来] [拒收]

收件人：我<zwhset@163.com>

***** Nagios *****

Notification Type: RECOVERY

Host: rsync-89

State: UP

Address: 192.168.100.89

Info: PING OK - Packet loss = 0%, RTA = 0.32 ms

Date/Time: Wed Mar 13 22:57:44 CST 2013

#那我们加一个监控服务，看看全程是如何工作的：用 check_tcp 检测某个 TCP 端口（例如 22；下文服务定义中实际配置的是 80）是否能正常连接。我们先看看 check_tcp 的用法。

[root@weihack libexec]# ./check_tcp -h

Usage:check_tcp -H host -p port [-w <warning time>] [-c <critical time>] [-s <send string>]

[-e <expect string>] [-q <quit string>][-m <maximum bytes>] [-d <delay>]

[-t <timeout seconds>] [-r <refuse state>] [-M <mismatch state>] [-v] [-4|-6] [-j]

[-D <days to cert expiry>] [-S <use SSL>] [-E]

Options:

-h, --help

Print detailed help screen

-V, --version

Print version information

-H, --hostname=ADDRESS

Host name, IP Address, or unix socket (must be an absolute path)

-p, --port=INTEGER

Port number (default: none)

-4, --use-ipv4

Use IPv4 connection

-6, --use-ipv6

Use IPv6 connection

-E, --escape

Can use \n, \r, \t or \ in send or quit string. Must come before send or quit option

Default: nothing added to send, \r\n added to end of quit

-s, --send=STRING

String to send to the server

-e, --expect=STRING

String to expect in server response (may be repeated)

-A, --all

All expect strings need to occur in server response. Default is any

-q, --quit=STRING

String to send server to initiate a clean close of the connection

-r, --refuse=ok|warn|crit

Accept TCP refusals with states ok, warn, crit (default: crit)

-M, --mismatch=ok|warn|crit

Accept expected string mismatches with states ok, warn, crit (default: warn)

-j, --jail

Hide output from TCP socket

-m, --maxbytes=INTEGER

Close connection once more than this number of bytes are received

-d, --delay=INTEGER

Seconds to wait between sending string and polling for response

-w, --warning=DOUBLE

Response time to result in warning status (seconds)

-c, --critical=DOUBLE

Response time to result in critical status (seconds)

-t, --timeout=INTEGER

Seconds before connection times out (default: 10)

-v, --verbose

Show details for command-line debugging (Nagios may truncate output)

#这似乎是具体方法，让我们看看command里定义的。

define command{

command_name check_tcp

command_line $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ $ARG2$

}

#对照上面的用法：-H 后面是主机地址（$HOSTADDRESS$），-p 后面是端口，由第 1 个参数 $ARG1$ 传入；第 2 个参数 $ARG2$ 可用来传入其他附加选项。

#一开始我找不到这些宏定义在哪个文件里。$USER1$ 是插件所在的路径，也就是 /usr/local/nagios/libexec；后面的 $HOSTADDRESS$、$ARG1$、$ARG2$ 三个同样是宏。那么在服务定义里可以构造 check_tcp!22 这样的写法：命令名与 $ARG1$、$ARG2$ 各参数之间用 ! 号隔开。$USER1$ 的定义在下面这个文件中：

[root@weihack objects]# cat /usr/local/nagios/etc/resource.cfg |grep USER1

# Nagios supports up to 32 $USERx$ macros ($USER1$ through $USER32$)

# Sets $USER1$ to be the path to the plugins

$USER1$=/usr/local/nagios/libexec

$ARGn$：The nth argument passed to the command (notification, event handler, service check, etc.). Nagios supports up to 32 argument macros ($ARG1$ through $ARG32$).

$USERn$：The nth user-definable macro. User macros can be defined in one or more resource files. Nagios supports up to 256 user macros ($USER1$ through $USER256$).
我们添加服务

[root@weihack objects]# vim services.cfg

define service {

host_name rsync-89

service_description check_tcp 80

check_period 24x7

max_check_attempts 4

normal_check_interval 3

retry_check_interval 2

contact_groups ktm

notification_interval 10

notification_period 24x7

notification_options w,u,c,r

check_command check_tcp!80

}

#先验证一下配置文件，再向 nagios 进程发送 HUP 信号，平滑地重新加载配置。

[root@weihack objects]# /usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg

[root@weihack nagios]# kill -Hup 8670

#运行成功，那我们想自由自在的构造一些检测，如何实现?做个实验.

#check_ping 主要输出这些：PING OK - Packet loss = 0%, RTA = 0.06 ms|rta=0.061000ms;3000.000000;5000.000000;0.000000 pl=0%;80;90;0

[root@weihack libexec]# cat /test/passwd |wc -l

#假设有用户则显示用户数，无用户为空则报警。如何设计？

[root@weihack libexec]# touch /test/passwda

[root@weihack libexec]# cat check_user

#check_user_nagios cwtea

#blog: cwtea.blog.51cto.com

cu=`cat /test/passwda |wc -l`

if [ $cu -ne 0 ]; then

echo "User OK - User is running (UserNumber: ${cu})"

else

echo "User CRITICAL,"User is none""

fi

[root@weihack libexec]# ./check_user

User CRITICAL,User is none

[root@weihack libexec]# ./check_user

User OK - User is running (UserNumber: 25)

#添加一个定义check_user

[root@weihack objects]# vim commands.cfg

#check user

define command{

command_name check_user

command_line $USER1$/check_user

}

#添加一项服务

[root@weihack objects]# vim services.cfg

define service {

host_name rsync-89

service_description check_user

check_period 24x7

max_check_attempts 4

normal_check_interval 3

retry_check_interval 2

contact_groups ktm

notification_interval 10

notification_period 24x7

notification_options w,u,c,r

check_command check_user

}

#kill -Hup 23377

#web界面看看，已经出现了。

Nagios检测一些记序

#我们把文件弄成空的。

[root@weihack objects]# rm -rf /test/passwd

[root@weihack objects]# touch /test/passwd

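#状态显示仍然是 OK，但是信息栏已经出现了我们想要的内容。原因是 Nagios 根据插件的退出状态码（0 为 OK，1 为 WARNING，2 为 CRITICAL，3 为 UNKNOWN）来判断状态，而不是根据输出的文字；脚本目前总是以 0 退出，所以状态一直是 OK。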

check_user OK 03-14-2013 00:20:29 0d 0h 3m 25s 1/4 User CRITICAL,User is none

#我们加个返回状态码 exit 2

[root@weihack libexec]# cat check_user

#check_user_nagios cwtea

#blog: cwtea.blog.51cto.com

cu=`cat /test/passwd |wc -l`

if [ $cu -ne 0 ]; then

echo "User OK - User is running (UserNumber: ${cu})"

#Is OK.

exit 0

else

echo "User CRITICAL,"User is none""

exit 2

fi

#过会儿，再看已经down掉了。

#邮件报警等了好久才来

主　题： ** PROBLEM Service Alert: rsync 89/check_user is CRITICAL ** [新窗口打开]

时　间： 2013-03-14 00:32 (星期四)

发件人： nagios<nagios@phx2-ss-5-lb.cnet.com> [添加联系人] [邮件往来] [拒收]

收件人：我<zwhset@163.com>

***** Nagios *****

Notification Type: PROBLEM

Service: check_user

Host: rsync 89

Address: 192.168.100.89

State: CRITICAL

Date/Time: Thu Mar 14 00:32:39 CST 2013

Additional Info:

User CRITICAL,User is none

Nagios检测一些记序

相关阅读