一步步撸脚本监控
监控什么
目标:通过脚本快速采集并监控服务器和各中间件(nginx、tomcat、mq、redis等)的信息。
一步步实现
环境
考虑到监控脚本可能会监控多台服务器,或者监控不同环境(dev,uat,prod)下的多台服务器。
所以第一步我们需要通过读取不同的配置文件,来监控不同的服务器,以及每台服务器上指定的中间件。
我们以两台服务器为例子:
192.168.2.217 需要监控os信息,监控redis信息,监控rocketmq信息
192.168.2.218 需要监控os信息,监控nginx信息,监控tomcat信息
两台服务器的用户是app,通过ssh-copy-id设置无密码登录,如果不允许,可以考虑加入expect,但是这里我们不引入expect。
配置文件config
#################### config start ####################
# This file is sourced by monitor.sh. Every *_info value is a comma-separated
# list of entries; the fields of one entry are separated by "--".
ip1=192.168.2.217
ip2=192.168.2.218
server_user=app
# Ping check -> ping_info=ip; separate multiple hosts with ","
ping_info=${ip1},${ip2}
# OS monitoring -> os_info=ip--user; separate multiple hosts with ","
os_info=${ip1}--${server_user},${ip2}--${server_user}
# nginx monitoring -> nginx_info=ip--user--port--access.log path--x%
# (x% = abnormal-request percentage at which a red alert is printed);
# separate multiple hosts with ","
nginx_info=${ip2}--${server_user}--80--/var/log/nginx/access.log--10%
# redis monitoring -> redis_info=ip--user--port--redis bin path--redis password
# (for a redis cluster any single node is enough; write "no" when there is no
# password); separate multiple hosts with ","
# NOTE: the key must be spelled redis_info; it used to be misspelled
# "reids_info", which monitor.sh does not read, so redis was never monitored.
redis_info=${ip1}--${server_user}--8379--/app/redis/bin--no
# rocketmq monitoring -> rocketmq_info=ip--user--port--rocketmq bin path--N
# (N = number of unconsumed messages above which a red warning is printed;
# for a cluster configure one node only); separate multiple hosts with ","
rocketmq_info=${ip1}--${server_user}--9876--/app/rocketmq/bin--1000
# tomcat monitoring -> tomcat_info=ip--user--port--log file path;
# separate multiple hosts with ","
tomcat_info=${ip2}--${server_user}--8501--/app/servers/tomcat/logs/catalina.out
#################### config stop ####################
采集信息monitor.sh
#!/bin/bash
#===============================================================================
# FILE: monitor.sh
# DESCRIPTION: Monitoring driver. Sources a config file, pings every host in
#              ping_info, then for each configured *_info entry feeds the
#              matching monitor_*.sh script to `bash -s` on the remote host
#              over ssh.
# OPTIONS: $1 - path to the config file
# EXIT: 1 on usage/config errors or when a host fails the ping check
# AUTHOR: 余很多
# ORGANIZATION: yumore
# VERSION: 1.4
# CREATED: 17/02/20xx 13:31
#===============================================================================
if [ $# -eq 0 ]; then
  echo "请指定配置文件路径" >&2
  exit 1
fi
set -o nounset

# Resolve helper scripts next to this file so the driver works from any cwd.
script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) || exit 1

# `source` looks a slash-less name up in PATH (and, in POSIX mode, never in
# the current directory), so anchor bare file names to the cwd explicitly.
config_file=$1
if [[ $config_file != */* ]]; then
  config_file=./$config_file
fi
if [ ! -r "$config_file" ]; then
  echo "配置文件不可读: $1" >&2
  exit 1
fi
# shellcheck source=/dev/null
source "$config_file"

#######################################
# Split one "a--b--c" config entry into the global array `fields`.
# Arguments: $1 - the entry
#######################################
split_fields() {
  local rest=$1
  fields=()
  while [[ $rest == *--* ]]; do
    fields+=("${rest%%--*}")
    rest=${rest#*--}
  done
  fields+=("$rest")
}

#######################################
# Run one monitor script on every host listed in a *_info value.
# Arguments:
#   $1 - section name used in messages (os, redis, ...)
#   $2 - prefix printed before the ip in the section heading (may be empty)
#   $3 - the *_info value, or "no" when the section is not configured
#   $4 - helper script file name, looked up in script_dir
#   $5 - minimum number of "--" separated fields an entry must have
#   $6... - indexes of the fields passed to the helper, in argument order
# Fields 0 and 1 of every entry are always the ip and the ssh user.
#######################################
run_checks() {
  local name=$1 title=$2 entries=$3 script=$4 min_fields=$5
  shift 5
  local -a arg_indexes=("$@")
  local -a entry_list
  local entry idx remote_cmd

  if [ "$entries" == "no" ]; then
    echo "no ${name} info"
    return 0
  fi

  IFS=', ' read -r -a entry_list <<< "$entries"
  for entry in "${entry_list[@]}"; do
    [ -n "$entry" ] || continue
    split_fields "$entry"
    if [ "${#fields[@]}" -lt "$min_fields" ]; then
      # Report the bad entry instead of dying on an unbound array element.
      echo -e "tinyError->\033[31m${name}_info配置不完整: ${entry}\033[0m"
      continue
    fi
    echo "-->${title}${fields[0]}基本信息:"
    # ssh joins its arguments into one string that the remote shell parses
    # again, so quote every value before it goes over the wire.
    remote_cmd="bash -s --"
    for idx in "${arg_indexes[@]}"; do
      remote_cmd+=" $(printf '%q' "${fields[idx]}")"
    done
    # The helper script is sent on stdin and executed by the remote bash.
    ssh -o StrictHostKeyChecking=no "${fields[1]}@${fields[0]}" "$remote_cmd" \
      < "${script_dir}/${script}"
  done
}

## ping: stop early when any host is unreachable
if [ -n "${ping_info:-}" ]; then
  pinfo=$(bash "${script_dir}/monitor_ping.sh" "$ping_info" | grep "error")
  if [[ $pinfo == *error* ]]; then
    echo "$pinfo"
    exit 1
  fi
  echo "ping ok next..."
else
  echo "no ping info"
fi

## basic machine information
run_checks os "" "${os_info:-no}" monitor_os.sh 2 0
## redis (also accepts the misspelled legacy key "reids_info")
run_checks redis redis "${redis_info:-${reids_info:-no}}" monitor_redis.sh 5 3 0 2 4
## rocketmq
run_checks rocketmq rocketmq "${rocketmq_info:-no}" monitor_rocketmq.sh 5 3 0 2 4
## nginx
run_checks nginx nginx "${nginx_info:-no}" monitor_nginx.sh 5 2 3 4
## tomcat
run_checks tomcat tomcat "${tomcat_info:-no}" monitor_tomcat.sh 4 0 2 3
- 必须有参数:配置文件,如果没有结束执行
- set -o nounset:引用未定义(unset)的变量时直接报错退出,避免拼错的变量名被悄悄当成空值
- 先ping下,如果有服务器ping不通,就结束
- 判断xx_info是否配置,有配置信息才开始采集监控信息
- ssh -o StrictHostKeyChecking=no ${infos[1]}@${infos[0]} 'bash -s' < monitor_os.sh ${infos[0]}:通过ssh连接服务器,把本地的monitor_os.sh通过标准输入交给远程的bash执行(最后的${infos[0]}是传给脚本的参数),其它类似。
- monitor_ping.sh 判断服务器是否ping通
- monitor_os.sh 采集监控服务器内存,cpu,硬盘信息
- monitor_redis.sh 采集监控redis信息
- monitor_rocketmq.sh 采集监控mq信息,支持未处理消息数多少打印红色警告数量
- monitor_nginx.sh 采集监控nginx信息,支持异常数百分比超过多少打印红色提醒
- monitor_tomcat.sh 采集监控tomcat信息
具体单个信息监控脚本
monitor_ping.sh
#!/bin/bash
#===============================================================================
# FILE: monitor_ping.sh
# DESCRIPTION: Check whether each server answers ping.
# OPTIONS: $1 - comma-separated ip list, e.g. ip1,ip2,ip3
# OUTPUT: one line per host on stdout: "ping success->..." for reachable
#         hosts, "ping error->..." (host printed in red) for unreachable ones
# EXIT: 1 when no argument is given, otherwise 0
# AUTHOR: 余很多
# ORGANIZATION: yumore
# CREATED: 17/02/20xx 16:21
#===============================================================================
if [ $# -eq 0 ]; then
  echo "请给定参数" >&2
  exit 1
fi

# Split on commas (and blanks) without relying on unquoted word splitting.
IFS=', ' read -r -a ips <<< "$1"

for ip in "${ips[@]}"; do
  # Consecutive commas yield empty items; skip them.
  [ -n "$ip" ] || continue
  # 2 probes, 0.3 s apart, 1 s reply timeout; only the exit status matters.
  if ping -c 2 -i 0.3 -W 1 "$ip" &> /dev/null; then
    echo -e "ping success->$ip is up"
  else
    echo -e "ping error->\033[31m$ip is down\033[0m"
  fi
done
monitor_os.sh
#!/bin/bash
#===============================================================================
# FILE: monitor_os.sh
# DESCRIPTION: Collect basic machine information: cpu, memory and disk usage.
# OPTIONS: $1 - ip of this host (only used in alert messages)
#          $2 - optional disk usage alert threshold in percent (default 90)
# EXIT: 1 when no argument is given, otherwise 0
# AUTHOR: 余很多
# ORGANIZATION: yumore
# CREATED: 17/02/20xx 16:21
#===============================================================================
if [ $# -eq 0 ]; then
  echo "请给定参数" >&2
  exit 1
fi
set -o nounset
host=$1
disk_limit=${2:-90}

cpu_num=$(grep -c "model name" /proc/cpuinfo)
echo "cpu监控(总核数:$cpu_num)"

# 1. CPU utilisation. Take a single top snapshot and read every value from it.
# LC_ALL=C keeps "." as the decimal separator so the comma handling below
# stays valid.
top_cpu_line=$(LC_ALL=C top -b -n 1 | grep -m 1 'Cpu(s)')

#######################################
# Print the value that precedes a label (us, sy, id, wa) in the top summary
# line. Works for both "0.3%us," and "0.3 us," layouts because "%" and ","
# are turned into blanks before the lookup.
# Arguments: $1 - label
#######################################
cpu_field() {
  awk -v label="$1" '{
    gsub(/[%,]/, " ")
    for (i = 2; i <= NF; i++) {
      if ($i == label) { print $(i - 1); exit }
    }
  }' <<< "$top_cpu_line"
}

# user space cpu percentage
cpu_user=$(cpu_field us)
echo " 用户空间占用CPU百分比:${cpu_user}"
# kernel space cpu percentage
cpu_system=$(cpu_field sy)
echo " 内核空间占用CPU百分比:${cpu_system}"
# idle cpu percentage
cpu_idle=$(cpu_field id)
echo " 空闲CPU百分比:${cpu_idle}"
# io-wait cpu percentage (the "wa" value, not the system column)
cpu_iowait=$(cpu_field wa)
echo " 等待输入输出占CPU百分比:${cpu_iowait}"

# Load averages: the first two fields of /proc/loadavg are the 1 and 5 minute
# values. Column positions in `uptime` output change with the uptime format.
read -r cpu_load_1min cpu_load_5min _ < /proc/loadavg
echo " CPU 5分钟前到现在的负载平均值:${cpu_load_5min}"
echo " CPU 1分钟前到现在的负载平均值:${cpu_load_1min}"

# 2. Memory (MiB), taken from one `free -m` snapshot.
mem_report=$(LC_ALL=C free -m)
mem_total=$(awk '$1 == "Mem:" {print $2}' <<< "$mem_report")
echo "内存监控:(物理内存总量:$mem_total)"
mem_sys_used=$(awk '$1 == "Mem:" {print $3}' <<< "$mem_report")
echo " 已使用内存总量(操作系统):${mem_sys_used}"
mem_sys_free=$(awk '$1 == "Mem:" {print $4}' <<< "$mem_report")
echo " 剩余内存总量(操作系统):${mem_sys_free}"
if grep -q 'buffers/cache' <<< "$mem_report"; then
  # Older free prints a "-/+ buffers/cache:" row with application used/free.
  mem_user_used=$(awk '$2 == "buffers/cache:" {print $3}' <<< "$mem_report")
  mem_user_free=$(awk '$2 == "buffers/cache:" {print $4}' <<< "$mem_report")
else
  # Newer free has no such row (line 3 is Swap there). Fall back to the Mem
  # row: "used" and column 7, which is "available" in the procps-ng layout.
  # NOTE(review): confirm the column layout on the target distribution.
  mem_user_used=$mem_sys_used
  mem_user_free=$(awk '$1 == "Mem:" {print $7}' <<< "$mem_report")
fi
echo " 已使用内存总量(应用程序):${mem_user_used}"
echo " 剩余内存总量(应用程序):${mem_user_free}"
mem_swap_used=$(awk '$1 == "Swap:" {print $3}' <<< "$mem_report")
echo " 已使用交换分区大小:${mem_swap_used}"

# 3. Disk usage. -P keeps every filesystem on one line so column 5 is always
# the usage percentage.
echo "磁盘监控:"
df_info=$(df -hP | awk '$5 ~ /^[0-9]+%$/ {print $5}')
echo " 磁盘使用:${df_info//$'\n'/ }"
while IFS= read -r usage; do
  [ -n "$usage" ] || continue
  # Strip the trailing "%" before the numeric comparison.
  if [ "${usage%\%}" -gt "$disk_limit" ]; then
    echo -e "tinyError->\033[31m${host}磁盘使用异常\033[0m"
  fi
done <<< "$df_info"
monitor_redis.sh
#!/bin/bash
# -*- coding: utf-8 -*-
#===============================================================================
# FILE: monitor_redis.sh
# USAGE: ./monitor_redis.sh <redis-cli dir> <ip> <port> [password]
# DESCRIPTION: Report redis status from the output of `redis-cli info`.
# OPTIONS: $1 - directory containing redis-cli
#          $2 - ip redis listens on
#          $3 - redis port
#          $4 - redis password; omit or pass "no" when there is none
# EXIT: 1 on usage errors or when `info` cannot be read, otherwise 0
# AUTHOR: 余很多
# ORGANIZATION: yumore
# CREATED: 17/02/20xx 16:21
#===============================================================================
if [ $# -lt 3 ]; then
  echo "请给定参数" >&2
  exit 1
fi
set -o nounset
cli_dir=$1
host=$2
port=$3
password=${4:-no}

# The port must be followed by a non-digit so that e.g. 8379 does not match
# a listener on 83790.
if netstat -an | grep LISTEN | grep -Eq ":${port}([^0-9]|\$)"; then
  echo -e "redis is running"
else
  echo -e "tinyError->\033[31mredis is stop\033[0m"
  exit
fi

# Only authenticate when a real password is configured ("no" = no password).
cli_args=(-h "$host" -p "$port")
if [ "$password" != "no" ]; then
  cli_args+=(-a "$password")
fi

# Keep the report in memory (no temp file to leak) and drop the carriage
# returns that end every INFO line.
redis_report=$("${cli_dir}/redis-cli" "${cli_args[@]}" info 2>&1 | tr -d '\r')
if ! grep -q '^redis_version:' <<< "$redis_report"; then
  echo -e "tinyError->\033[31mredis info failed: ${redis_report}\033[0m"
  exit 1
fi

#######################################
# Print the value of one "key:value" line of the INFO report.
# Arguments: $1 - exact key name
#######################################
info_field() {
  awk -F: -v key="$1" '$1 == key {print $2; exit}' <<< "$redis_report"
}

used_memory_human=$(info_field used_memory_human)
echo "内存的使用量:$used_memory_human"
connected_clients=$(info_field connected_clients)
echo "连接客户端数:$connected_clients"
total_connections_received=$(info_field total_connections_received)
echo "已接受请求数:$total_connections_received"
uptime_in_seconds=$(info_field uptime_in_seconds)
vv=$(awk -v s="$uptime_in_seconds" 'BEGIN {print int(s)/60/60}')
echo "运行至今时长:$vv小时"
evicted_keys=$(info_field evicted_keys)
# Compare the counter itself; the exit status of awk says nothing about it.
if [ "${evicted_keys:-0}" -gt 0 ]; then
  echo -e "\033[31m因内存达到上限被剔除的键数:$evicted_keys\033[0m"
else
  echo "无内存达到上限被剔除的键"
fi
monitor_rocketmq.sh
#!/bin/bash
# -*- coding: utf-8 -*-
#===============================================================================
# FILE: monitor_rocketmq.sh
# USAGE: ./monitor_rocketmq.sh <bin dir> <ip> <namesrv port> <limit> [broker port]
# DESCRIPTION: Report rocketmq status: namesrv/broker listeners, unconsumed
#              messages per queue and today's produced/consumed totals.
# OPTIONS: $1 - rocketmq bin directory (contains mqadmin)
#          $2 - rocketmq ip
#          $3 - namesrv port
#          $4 - unconsumed-message count above which a red alert is printed
#          $5 - optional broker port (default 10911)
# EXIT: 1 on usage errors or an unusable bin directory, otherwise 0
# AUTHOR: 余很多
# ORGANIZATION: yumore
# CREATED: 19/02/20xx 14:11
#===============================================================================
if [ $# -lt 4 ]; then
  echo "请给定参数" >&2
  exit 1
fi
set -o nounset
host=$2
port=$3
error_num=$4
broker_port=${5:-10911}

# Load the login profile (presumably provides the Java environment mqadmin
# needs; verify on the target host). nounset is relaxed while sourcing so a
# profile that references unset variables does not abort the check.
if [ -f ~/.bash_profile ]; then
  set +o nounset
  # shellcheck source=/dev/null
  source ~/.bash_profile
  set -o nounset
fi

#######################################
# Succeed when something listens on the given port. The port must be
# followed by a non-digit so 9876 does not match 98765.
# Arguments: $1 - port
#######################################
port_listening() {
  netstat -an | grep LISTEN | grep -Eq ":$1([^0-9]|\$)"
}

# listener checks
if port_listening "$port"; then
  echo "rocketmq namesrv is running"
else
  echo -e "tinyError->\033[31mrocketmq namesrv is stop\033[0m"
  exit
fi
if port_listening "$broker_port"; then
  echo "rocketmq broker is running"
else
  echo -e "tinyError->\033[31mrocketmq broker is stop\033[0m"
  exit
fi

# Enter the bin directory and remember its absolute path, so a relative $1
# still resolves after the cd.
cd "$1" || exit 1
bin_dir=$PWD

#./mqadmin topicStatus -n $host:$port -t test_app
# broker node information (brokerStatus)
# ./mqadmin brokerStatus -n $host:$port -b $host:10911
# message queue load (allocateMQ)

# Unconsumed messages: column 5 of every consumerProgress row whose column 5
# is a plain number.
diff_list=$(sh "${bin_dir}/mqadmin" consumerProgress -n "${host}:${port}" 2>&1 \
  | awk '$5 ~ /^[0-9]+$/ {print $5}')
while IFS= read -r offset; do
  [ -n "$offset" ] || continue
  if [ "$offset" -gt "$error_num" ]; then
    echo -e "tinyError->\033[31m消息队列${offset}消息未被消费\033[0m"
  elif [ "$offset" -gt 0 ]; then
    echo "消息队列${offset}消息未被消费"
  fi
done <<< "$diff_list"

# Today's totals. Values are looked up by key, so the result does not depend
# on the order in which brokerStatus prints its rows.
broker_status=$(sh "${bin_dir}/mqadmin" brokerStatus -n "${host}:${port}" \
  -b "${host}:${broker_port}" 2>&1)

#######################################
# Print the numeric third column of the brokerStatus row starting with a key.
# Arguments: $1 - key name
#######################################
status_value() {
  awk -v key="$1" 'index($1, key) == 1 && $3 ~ /^[0-9]+$/ {print $3; exit}' \
    <<< "$broker_status"
}

echo "总的生产消息:$(status_value msgPutTotalTodayNow)"
echo "总的消费消息:$(status_value msgGetTotalTodayNow)"
monitor_nginx.sh
#!/bin/bash -
#===============================================================================
# FILE: monitor_nginx.sh
# DESCRIPTION: Report nginx status: listener check plus total and abnormal
#              (status 404/500) request counts from the access log.
# OPTIONS: $1 - nginx port
#          $2 - access.log path
#          $3 - abnormal-request percentage at or above which a red alert is
#               printed, e.g. 10%
# EXIT: 1 on usage errors or an unreadable log, otherwise 0
# AUTHOR: 余很多
# ORGANIZATION: yumore
# CREATED: 19/02/20xx 15:21
#===============================================================================
if [ $# -ne 3 ]; then
  echo "请给定参数" >&2
  exit 1
fi
set -o nounset
port=$1
NGINX_LOG_PATH=$2
# Threshold without its trailing "%".
limit=${3%\%}

# The port must be followed by a non-digit so 80 does not match 8080.
# A stopped nginx is reported but the log is still analysed.
if netstat -an | grep LISTEN | grep -Eq ":${port}([^0-9]|\$)"; then
  echo -e "nginx is running"
else
  echo -e "error->\033[31mnginx is stop\033[0m"
fi

if [ ! -r "$NGINX_LOG_PATH" ]; then
  echo -e "error->\033[31mnginx日志不可读:${NGINX_LOG_PATH}\033[0m"
  exit 1
fi

allcount=$(wc -l < "$NGINX_LOG_PATH")
echo "今天nginx访问总数:$allcount"
# Field 9 is tested as a whole against both codes. The earlier form
# `$9 ~ /404/||/500/` matched "500" anywhere in the line.
count=$(awk '$9 ~ /^(404|500)$/ {n++} END {print n + 0}' "$NGINX_LOG_PATH")
echo "今天nginx访问异常数:$count"

# An empty log has no ratio to compute (and would divide by zero).
if [ "$allcount" -gt 0 ]; then
  percent=$(( count * 100 / allcount ))
  if [ "$percent" -ge "$limit" ]; then
    echo -e "error->\033[31mnginx异常数${percent}% \033[0m"
  fi
fi
monitor_tomcat.sh
#!/bin/bash
# -*- coding: utf-8 -*-
#===============================================================================
# FILE: monitor_tomcat.sh
# DESCRIPTION: Report tomcat status: listener check plus the number of
#              today's ERROR lines in the log.
# OPTIONS: $1 - tomcat ip (only used in messages)
#          $2 - tomcat port
#          $3 - log file path
# EXIT: 1 on usage errors or an unreadable log, otherwise 0
# AUTHOR: 余很多
# ORGANIZATION: sunline
# CREATED: 19/02/20xx 23:21
#===============================================================================
#echo $1
if [ $# -ne 3 ]; then
  echo "请给定参数" >&2
  exit 1
fi
set -o nounset
host=$1
port=$2
log_path=$3

# Listener check; the port must be followed by a non-digit so 8501 does not
# match 85010.
if netstat -an | grep LISTEN | grep -Eq ":${port}([^0-9]|\$)"; then
  echo "tomcat is running"
else
  echo -e "tinyError->\033[31mtomcat is stop\033[0m"
  exit
fi

# Number of ERROR lines carrying today's date (YYYY-MM-DD).
today=$(date "+%Y-%m-%d")
echo
if [ ! -r "$log_path" ]; then
  # Without this check a missing log would be reported as "no errors today".
  echo -e "tinyError->\033[31mtomcat${host}:${port}-日志不可读:${log_path}\033[0m"
  exit 1
fi
error_count=$(grep -F -- "$today" "$log_path" | grep -c ERROR)
if [ "$error_count" -eq 0 ]; then
  echo "tomcat${host}:${port}-今天无异常"
else
  echo -e "tinyError->\033[31mtomcat${host}:${port}-异常数=${error_count}\033[0m"
  exit
fi
# The checks below need JMX to be enabled on tomcat.
# java heap usage
#java -jar cmdline-jmxclient-0.10.3.jar - 10.25.0.39:9999 java.lang:type=Memory HeapMemoryUsage 2>&1 | grep used | awk '{print $2}'
# current_busy_threads
#java -jar cmdline-jmxclient-0.10.3.jar - 10.25.0.39:9999 Catalina:name=\"http-nio-8980\",type=ThreadPool currentThreadsBusy 2>&1 | awk '{print $NF}'
#ThreadCount: 24
#java -jar cmdline-jmxclient-0.10.3.jar - 10.25.0.39:9999 java.lang:type=Threading ThreadCount 2>&1 | awk '{print $NF}'
最后
运行 bash monitor.sh config(config 是一开始的配置文件名字)。脚本里用到了 bash 数组,请用 bash 执行,不要用非 bash 的 sh。
采集的信息收集进行监控,可以通过watch命令实时刷新,也可以落地到文件或者数据库。
略。
网友评论