任务:需要在 Nagios 中定义服务,检测 3 个 DC 的状态(1. 主机状态、2. Consul cluster 状态、3. Nomad cluster 状态)。只要某个 DC 中任意一项检测失败,就触发 Nagios event handler 去更改 DNS 服务器上的链接文件,如上图所示。
脚本说明:以下脚本中的服务器地址均为示例,与实际环境中的地址不同。
脚本1:该脚本检测 3 个 DC 的服务状态,并根据检测结果输出 DNS 当前应该链接的文件名,该文件名会显示在 Nagios 上。如果 DNS 没有链接到正确的文件,脚本会以 CRITICAL(退出码 2)结束,Nagios 随即报警并触发 event handler。
#!/bin/bash
#
# check_dc_status - Nagios check that decides which PowerDNS geo file the DNS
# servers should currently be linked to.
#
# A datacenter (DC) counts as healthy only when all of its probes succeed:
#   1. host status    - check_ping against both tier hosts of the DC
#   2. consul cluster - check_nrpe check_consul_cluster on the DC's first host
#   3. nomad cluster  - check_nrpe check_nomad_cluster on the DC's first host
#
#   DC US: tier1001 and tier1002
#   DC EU: tier2001 and tier2002
#   DC AS: tier3001 and tier3002
#
# Link file per situation:
#   all DCs up -> axel-geo_us_eu_as.yml (default)
#   DC-EU down -> axel-geo_us_as.yml
#   DC-AS down -> axel-geo_us_eu.yml
#   DC-US down -> axel-geo_eu_as.yml
#
# Exit status / output:
#   0 - the current link needs no change
#   1 - the current link name could not be read (empty answer)
#   2 - the link has to change; stdout is exactly the wanted file name so
#       that the event handler can use the service output as its argument

readonly PLUGIN_DIR=/usr/lib64/nagios/plugins

# Prints the status word reported by check_ping for host $1 ("OK" when the
# host answers within the thresholds).
ping_state() {
  "${PLUGIN_DIR}/check_ping" -4 -H "$1" -w 3000.0,80% -c 5000.0,100% -p 5 \
    | awk '{print $2}'
}

# Prints 0 when NRPE command $2 succeeds on host $1, 1 otherwise.
nrpe_state() {
  if "${PLUGIN_DIR}/check_nrpe" -H "$1" -c "$2" &>/dev/null; then
    echo 0
  else
    echo 1
  fi
}

# Prints 0 when a DC is fully healthy, 1 otherwise.
# Arguments: $1 $2 - ping status words of the two hosts
#            $3    - consul state (0 = ok)
#            $4    - nomad state (0 = ok)
dc_state() {
  if [[ "$1" == "OK" && "$2" == "OK" && "$3" == 0 && "$4" == 0 ]]; then
    echo 0
  else
    echo 1
  fi
}

# Host status of every DC member.
ping_1001=$(ping_state tier1001)
ping_1002=$(ping_state tier1002)
ping_2001=$(ping_state tier2001)
ping_2002=$(ping_state tier2002)
ping_3001=$(ping_state tier3001)
ping_3002=$(ping_state tier3002)

# Consul cluster status, checked by a script on the remote host via NRPE.
con_us=$(nrpe_state tier1001.axel.network check_consul_cluster)
con_eu=$(nrpe_state tier2001.axel.network check_consul_cluster)
con_as=$(nrpe_state tier3001.axel.network check_consul_cluster)

# Nomad cluster status, checked by a script on the remote host via NRPE.
nom_us=$(nrpe_state tier1001.axel.network check_nomad_cluster)
nom_eu=$(nrpe_state tier2001.axel.network check_nomad_cluster)
nom_as=$(nrpe_state tier3001.axel.network check_nomad_cluster)

# Name of the file the DNS server is linked to right now.
current_file=$("${PLUGIN_DIR}/check_nrpe" -H romeo.zencoo.com -c check_pdns_link)
if [[ -z "${current_file}" ]]; then
  echo '$FILE is NULL'
  exit 1
fi

# Both hosts of a DC must answer; the EU line used to test tier2001 twice and
# never looked at tier2002.
us=$(dc_state "${ping_1001}" "${ping_1002}" "${con_us}" "${nom_us}")
eu=$(dc_state "${ping_2001}" "${ping_2002}" "${con_eu}" "${nom_eu}")
as=$(dc_state "${ping_3001}" "${ping_3002}" "${con_as}" "${nom_as}")

# Decide whether the link has to be switched. Exit status 2 makes Nagios raise
# a CRITICAL alert, which in turn triggers the event handler.
if [[ "${us}" == 0 && "${eu}" == 0 && "${as}" == 0 ]]; then
  if [[ "${current_file}" == "axel-geo_us_eu_as.yml" ]]; then
    echo "all-DC-is ok,->already axel-geo_us_eu_as.yml"
    exit 0
  fi
  echo "axel-geo_us_eu_as.yml"
  exit 2
fi

# NOTE(review): there is no link file for "two or more DCs down". In that case
# the branches below request a different file on every run, so the link keeps
# flipping between them - confirm the wanted policy for that situation.
if [[ "${us}" == 1 && "${current_file}" != "axel-geo_eu_as.yml" ]]; then
  echo "axel-geo_eu_as.yml"
  exit 2
elif [[ "${eu}" == 1 && "${current_file}" != "axel-geo_us_as.yml" ]]; then
  echo "axel-geo_us_as.yml"
  exit 2
elif [[ "${as}" == 1 && "${current_file}" != "axel-geo_us_eu.yml" ]]; then
  echo "axel-geo_us_eu.yml"
  exit 2
else
  echo "link file is ${current_file}"
  exit 0
fi
脚本2:触发event-handler的脚本
#!/bin/bash
#
# change_dns_linkfile - Nagios event handler for the check_dc_status service.
#
# Arguments:
#   $1 - service state handed over by Nagios; only CRITICAL triggers a switch
#   $2 - service output, i.e. the link file name printed by the check script:
#          axel-geo_us_eu_as.yml  all DCs up (default)
#          axel-geo_us_as.yml     DC-EU down
#          axel-geo_us_eu.yml     DC-AS down
#          axel-geo_eu_as.yml     DC-US down
#
# Everything this script prints (stdout and stderr) is appended to ${LOG}.
# Exit status: 1 when $2 is not one of the known file names, 0 otherwise.

readonly WORKDIR=/usr/lib64/nagios/plugins
readonly LOG=/tmp/.dns_linkfile
DATE=$(date +%Y%m%d%H%M%S)
readonly DATE

exec &>>"${LOG}"

case "${1:-}" in
  OK)
    # The link file is already correct.
    exit 0
    ;;
  CRITICAL)
    # The link file must be switched: map the requested file name to the
    # NRPE command that performs the switch on the DNS servers.
    case "${2:-}" in
      axel-geo_us_eu_as.yml)
        # DC-EU, DC-AS and DC-US are ok
        REMOTE_CMD=update_us_eu_as
        ;;
      axel-geo_us_as.yml)
        # DC-EU down
        REMOTE_CMD=update_us_as
        ;;
      axel-geo_us_eu.yml)
        # DC-AS down
        REMOTE_CMD=update_us_eu
        ;;
      axel-geo_eu_as.yml)
        # DC-US down
        REMOTE_CMD=update_eu_as
        ;;
      *)
        echo "${DATE}--warining,no file match"
        exit 1
        ;;
    esac

    echo "${DATE}--${WORKDIR}/check_nrpe -H {ns1,ns2}.zencoo.com -c ${REMOTE_CMD}"
    # Both DNS servers are always attempted; a failure on one of them is
    # logged instead of being dropped silently.
    for dns_host in DNS1 DNS2; do
      "${WORKDIR}/check_nrpe" -H "${dns_host}" -c "${REMOTE_CMD}" \
        || echo "${DATE}--${dns_host}: ${REMOTE_CMD} failed"
    done
    ;;
esac

exit 0
脚本3:更改DNS服务上的链接文件
#!/bin/bash
#
# dns_file_check.sh - reports or switches the PowerDNS geo link file.
# Called through NRPE by the check_dc_status and change_dns_linkfile scripts.
#
# Argument $1:
#   check     print the bare name of the file ${DIR}/${LN} points to
#   us_eu_as  link to axel-geo_us_eu_as.yml
#   us_as     link to axel-geo_us_as.yml
#   us_eu     link to axel-geo_us_eu.yml
#   eu_as     link to axel-geo_eu_as.yml
#
# Exit status: 0 on success or when nothing had to change; 1 on an unknown
# argument, a missing target file or a failed link update.

readonly LOG=/tmp/.dns_linkfile
readonly DIR=/etc/pdns
readonly LN=axel-geo.yml
DATE=$(date +%Y%m%d%H%M%S)
readonly DATE

# Prints the bare file name the link points to; prints nothing when
# ${DIR}/${LN} is not a symbolic link. The directory part is stripped because
# the link is created with an absolute target while callers compare against
# bare file names.
current_target() {
  local target
  target=$(readlink -- "${DIR}/${LN}") || return 0
  printf '%s\n' "${target##*/}"
}

FILE=$(current_target)

case "${1:-}" in
  check)
    echo "${FILE}"
    exit 0
    ;;
  us_eu_as)
    target_file="${DIR}/axel-geo_us_eu_as.yml"
    ;;
  us_as)
    target_file="${DIR}/axel-geo_us_as.yml"
    ;;
  us_eu)
    target_file="${DIR}/axel-geo_us_eu.yml"
    ;;
  eu_as)
    target_file="${DIR}/axel-geo_eu_as.yml"
    ;;
  *)
    echo "${1:-} error" >>"${LOG}"
    exit 1
    ;;
esac

if [[ ! -f "${target_file}" ]]; then
  echo "${target_file} does not exist/${DATE}" >>"${LOG}"
  exit 1
fi

# Compare bare names so that a relative and an absolute link to the same file
# are both recognised as already correct.
if [[ "${FILE}" == "${target_file##*/}" ]]; then
  echo "${DATE}-Link file is correct, no need to switch" >>"${LOG}"
  exit 0
fi

echo "${HOSTNAME}/${DATE} ln -snf ${target_file} ${DIR}/${LN}" >>"${LOG}"
# Do not reload PowerDNS when the link could not be replaced.
if ! sudo /usr/bin/ln -snf "${target_file}" "${DIR}/${LN}"; then
  echo "${DATE}-ln failed" >>"${LOG}"
  exit 1
fi

if sudo /bin/pdns_control reload; then
  echo "${DATE}-reload dns ok" >>"${LOG}"
else
  echo "${DATE}-reload dns failed" >>"${LOG}"
fi
exit 0
Nagios 配置:定义检测服务以及对应的 event handler 命令
# Service that runs the DC status check (script 1) through NRPE and hands
# every state change to the change_dns_linkfile event handler.
define service{
        use                     generic-service
        host_name               xxx
        service_description     check_dc_status
        contact_groups          admins,admins_jabber
        check_command           check_nrpe_t60!check_dc_status
        event_handler           change_dns_linkfile
        }

# Event handler command (script 2). $SERVICESTATE$ becomes $1 and
# $SERVICEOUTPUT$ becomes $2 of the script; the output is quoted so that it
# always arrives as one single argument.
define command{
        command_name    change_dns_linkfile
        command_line    $USER1$/eventhandlers/change_dns_linkfile $SERVICESTATE$ "$SERVICEOUTPUT$"
        }
Puppet 配置(NRPE 配置模板):脚本1和脚本2会通过 NRPE 调用 DNS 服务器上的脚本3,因此需要在 DNS 服务器上定义相应的 NRPE 命令及其参数
<% if @fqdn == 'dns1xxxx' or @fqdn == 'dns2xxxx' -%>
# NRPE commands that expose dns_file_check.sh: one to report the current
# link target and one per link file to switch to.
command[check_pdns_link]=<%= @pluginsdir %>/dns_file_check.sh check
command[update_us_eu_as]=<%= @pluginsdir %>/dns_file_check.sh us_eu_as
command[update_us_eu]=<%= @pluginsdir %>/dns_file_check.sh us_eu
command[update_us_as]=<%= @pluginsdir %>/dns_file_check.sh us_as
command[update_eu_as]=<%= @pluginsdir %>/dns_file_check.sh eu_as
<% end -%>
第一次配置 Nagios event handler,感觉还比较乱,脚本还需要继续完善。