以A机房为例,安装部署在192.168.10.103服务器上。
- 添加Zabbix安装源
rpm -Uvh https://repo.zabbix.com/zabbix/4.4/rhel/7/x86_64/zabbix-release-4.4-1.el7.noarch.rpm
yum clean all
yum install munin --nogpgcheck
- 安装Agent服务
yum install zabbix-agent
- 引入GPU查询脚本
mkdir -p /etc/zabbix/scripts
把脚本get_gpus_info.sh放入其中,并且添加执行权限
chmod +x /etc/zabbix/scripts/get_gpus_info.sh
get_gpus_info.sh的内容如下
#!/bin/bash
result=$(/usr/bin/nvidia-smi -L | sed 's/^GPU \([0-9]*\):.*(UUID: \(.*\))$/,{"{#GPUINDEX}":"\1","{#GPUUUID}":"\2"}/g')
first=1
echo "{"
echo "\"data\":["
for line in ${result[@]}
do
if [ "$first" == "1" ]; then
echo ${line:1}
first=0
else
echo -n $line
fi
done
echo
echo "]"
echo "}"
- 配置Agent
vi /etc/zabbix/zabbix_agent.conf
Server=192.168.10.101 # Zabbix proxy地址
LogFileSize=512
ServerActive=192.168.10.101
Hostname=DOMAIN_ZONEA_192.168.10.102_CPU
Timeout=10 # 超时时间,默认是3秒,根据网络情况而定,建议设置为10秒
UserParameter=gpu.number,/usr/bin/nvidia-smi -L | /usr/bin/wc -l
UserParameter=gpu.discovery,/etc/zabbix/scripts/get_gpus_info.sh
UserParameter=gpu.fanspeed[*],/usr/bin/nvidia-smi --query-gpu=fan.speed --format=csv,noheader,nounits -i $1 | tr -d "\n"
UserParameter=gpu.power[*],/usr/bin/nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -i $1 | tr -d "\n"
UserParameter=gpu.temp[*],/usr/bin/nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits -i $1 | tr -d "\n"
UserParameter=gpu.utilization[*],/usr/bin/nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i $1 | tr -d "\n"
UserParameter=gpu.memfree[*],/usr/bin/nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i $1 | tr -d "\n"
UserParameter=gpu.memused[*],/usr/bin/nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $1 | tr -d "\n"
UserParameter=gpu.memtotal[*],/usr/bin/nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i $1 | tr -d "\n"
- 在Server端导入GPU监控模板
Configuration ---> Templates ---> Import,导入文件zbx_nvidia-smi-multi-gpu-active.xml,该文件内容如下:
<?xml version="1.0" encoding="UTF-8"?>
<zabbix_export>
<version>3.0</version>
<date>2018-06-05T20:56:12Z</date>
<groups>
<group>
<name>Templates</name>
</group>
</groups>
<templates>
<template>
<template>Template Nvidia GPUs Performance active</template>
<name>Template Nvidia GPUs Performance active</name>
<description/>
<groups>
<group>
<name>Templates</name>
</group>
</groups>
<applications>
<application>
<name>Nvidia</name>
</application>
</applications>
<items>
<item>
<name>Number of GPUs</name>
<type>7</type>
<snmp_community/>
<multiplier>0</multiplier>
<snmp_oid/>
<key>gpu.number</key>
<delay>30</delay>
<history>90</history>
<trends>365</trends>
<status>0</status>
<value_type>0</value_type>
<allowed_hosts/>
<units/>
<delta>0</delta>
<snmpv3_contextname/>
<snmpv3_securityname/>
<snmpv3_securitylevel>0</snmpv3_securitylevel>
<snmpv3_authprotocol>0</snmpv3_authprotocol>
<snmpv3_authpassphrase/>
<snmpv3_privprotocol>0</snmpv3_privprotocol>
<snmpv3_privpassphrase/>
<formula>1</formula>
<delay_flex/>
<params/>
<ipmi_sensor/>
<data_type>0</data_type>
<authtype>0</authtype>
<username/>
<password/>
<publickey/>
<privatekey/>
<port/>
<description>The number of GPUs present on this system.</description>
<inventory_link>0</inventory_link>
<applications>
<application>
<name>Nvidia</name>
</application>
</applications>
<valuemap/>
<logtimefmt/>
</item>
</items>
<discovery_rules>
<discovery_rule>
<name>GPU discovery</name>
<type>7</type>
<snmp_community/>
<snmp_oid/>
<key>gpu.discovery</key>
<delay>600</delay>
<status>0</status>
<allowed_hosts/>
<snmpv3_contextname/>
<snmpv3_securityname/>
<snmpv3_securitylevel>0</snmpv3_securitylevel>
<snmpv3_authprotocol>0</snmpv3_authprotocol>
<snmpv3_authpassphrase/>
<snmpv3_privprotocol>0</snmpv3_privprotocol>
<snmpv3_privpassphrase/>
<delay_flex/>
<params/>
<ipmi_sensor/>
<authtype>0</authtype>
<username/>
<password/>
<publickey/>
<privatekey/>
<port/>
<filter>
<evaltype>0</evaltype>
<formula/>
<conditions/>
</filter>
<lifetime>30</lifetime>
<description>Discovery of graphics cards.</description>
<item_prototypes>
<item_prototype>
<name>GPU $1 Fan Speed</name>
<type>7</type>
<snmp_community/>
<multiplier>1</multiplier>
<snmp_oid/>
<key>gpu.fanspeed[{#GPUINDEX}]</key>
<delay>60</delay>
<history>7</history>
<trends>365</trends>
<status>0</status>
<value_type>3</value_type>
<allowed_hosts/>
<units>%</units>
<delta>0</delta>
<snmpv3_contextname/>
<snmpv3_securityname/>
<snmpv3_securitylevel>0</snmpv3_securitylevel>
<snmpv3_authprotocol>0</snmpv3_authprotocol>
<snmpv3_authpassphrase/>
<snmpv3_privprotocol>0</snmpv3_privprotocol>
<snmpv3_privpassphrase/>
<formula>1</formula>
<delay_flex/>
<params/>
<ipmi_sensor/>
<data_type>0</data_type>
<authtype>0</authtype>
<username/>
<password/>
<publickey/>
<privatekey/>
<port/>
<description/>
<inventory_link>0</inventory_link>
<applications>
<application>
<name>Nvidia</name>
</application>
</applications>
<valuemap/>
<logtimefmt/>
<application_prototypes/>
</item_prototype>
<item_prototype>
<name>GPU $1 Memory Free</name>
<type>7</type>
<snmp_community/>
<multiplier>0</multiplier>
<snmp_oid/>
<key>gpu.memfree[{#GPUINDEX}]</key>
<delay>60</delay>
<history>7</history>
<trends>365</trends>
<status>0</status>
<value_type>3</value_type>
<allowed_hosts/>
<units>MB</units>
<delta>0</delta>
<snmpv3_contextname/>
<snmpv3_securityname/>
<snmpv3_securitylevel>0</snmpv3_securitylevel>
<snmpv3_authprotocol>0</snmpv3_authprotocol>
<snmpv3_authpassphrase/>
<snmpv3_privprotocol>0</snmpv3_privprotocol>
<snmpv3_privpassphrase/>
<formula>1</formula>
<delay_flex/>
<params/>
<ipmi_sensor/>
<data_type>0</data_type>
<authtype>0</authtype>
<username/>
<password/>
<publickey/>
<privatekey/>
<port/>
<description/>
<inventory_link>0</inventory_link>
<applications>
<application>
<name>Nvidia</name>
</application>
</applications>
<valuemap/>
<logtimefmt/>
<application_prototypes/>
</item_prototype>
<item_prototype>
<name>GPU $1 Memory Total</name>
<type>7</type>
<snmp_community/>
<multiplier>0</multiplier>
<snmp_oid/>
<key>gpu.memtotal[{#GPUINDEX}]</key>
<delay>60</delay>
<history>7</history>
<trends>365</trends>
<status>0</status>
<value_type>3</value_type>
<allowed_hosts/>
<units>MB</units>
<delta>0</delta>
<snmpv3_contextname/>
<snmpv3_securityname/>
<snmpv3_securitylevel>0</snmpv3_securitylevel>
<snmpv3_authprotocol>0</snmpv3_authprotocol>
<snmpv3_authpassphrase/>
<snmpv3_privprotocol>0</snmpv3_privprotocol>
<snmpv3_privpassphrase/>
<formula>1</formula>
<delay_flex/>
<params/>
<ipmi_sensor/>
<data_type>0</data_type>
<authtype>0</authtype>
<username/>
<password/>
<publickey/>
<privatekey/>
<port/>
<description/>
<inventory_link>0</inventory_link>
<applications>
<application>
<name>Nvidia</name>
</application>
</applications>
<valuemap/>
<logtimefmt/>
<application_prototypes/>
</item_prototype>
<item_prototype>
<name>GPU $1 Memory Used</name>
<type>7</type>
<snmp_community/>
<multiplier>0</multiplier>
<snmp_oid/>
<key>gpu.memused[{#GPUINDEX}]</key>
<delay>60</delay>
<history>7</history>
<trends>365</trends>
<status>0</status>
<value_type>3</value_type>
<allowed_hosts/>
<units>MB</units>
<delta>0</delta>
<snmpv3_contextname/>
<snmpv3_securityname/>
<snmpv3_securitylevel>0</snmpv3_securitylevel>
<snmpv3_authprotocol>0</snmpv3_authprotocol>
<snmpv3_authpassphrase/>
<snmpv3_privprotocol>0</snmpv3_privprotocol>
<snmpv3_privpassphrase/>
<formula>1</formula>
<delay_flex/>
<params/>
<ipmi_sensor/>
<data_type>0</data_type>
<authtype>0</authtype>
<username/>
<password/>
<publickey/>
<privatekey/>
<port/>
<description/>
<inventory_link>0</inventory_link>
<applications>
<application>
<name>Nvidia</name>
</application>
</applications>
<valuemap/>
<logtimefmt/>
<application_prototypes/>
</item_prototype>
<item_prototype>
<name>GPU $1 Power in decaWatts</name>
<type>7</type>
<snmp_community/>
<multiplier>1</multiplier>
<snmp_oid/>
<key>gpu.power[{#GPUINDEX}]</key>
<delay>60</delay>
<history>7</history>
<trends>365</trends>
<status>0</status>
<value_type>0</value_type>
<allowed_hosts/>
<units>dW</units>
<delta>0</delta>
<snmpv3_contextname/>
<snmpv3_securityname/>
<snmpv3_securitylevel>0</snmpv3_securitylevel>
<snmpv3_authprotocol>0</snmpv3_authprotocol>
<snmpv3_authpassphrase/>
<snmpv3_privprotocol>0</snmpv3_privprotocol>
<snmpv3_privpassphrase/>
<formula>0.1</formula>
<delay_flex/>
<params/>
<ipmi_sensor/>
<data_type>0</data_type>
<authtype>0</authtype>
<username/>
<password/>
<publickey/>
<privatekey/>
<port/>
<description/>
<inventory_link>0</inventory_link>
<applications>
<application>
<name>Nvidia</name>
</application>
</applications>
<valuemap/>
<logtimefmt/>
<application_prototypes/>
</item_prototype>
<item_prototype>
<name>GPU $1 Temperature</name>
<type>7</type>
<snmp_community/>
<multiplier>0</multiplier>
<snmp_oid/>
<key>gpu.temp[{#GPUINDEX}]</key>
<delay>60</delay>
<history>7</history>
<trends>365</trends>
<status>0</status>
<value_type>0</value_type>
<allowed_hosts/>
<units>C</units>
<delta>0</delta>
<snmpv3_contextname/>
<snmpv3_securityname/>
<snmpv3_securitylevel>0</snmpv3_securitylevel>
<snmpv3_authprotocol>0</snmpv3_authprotocol>
<snmpv3_authpassphrase/>
<snmpv3_privprotocol>0</snmpv3_privprotocol>
<snmpv3_privpassphrase/>
<formula>1</formula>
<delay_flex/>
<params/>
<ipmi_sensor/>
<data_type>0</data_type>
<authtype>0</authtype>
<username/>
<password/>
<publickey/>
<privatekey/>
<port/>
<description/>
<inventory_link>0</inventory_link>
<applications>
<application>
<name>Nvidia</name>
</application>
</applications>
<valuemap/>
<logtimefmt/>
<application_prototypes/>
</item_prototype>
<item_prototype>
<name>GPU $1 Utilization</name>
<type>7</type>
<snmp_community/>
<multiplier>0</multiplier>
<snmp_oid/>
<key>gpu.utilization[{#GPUINDEX}]</key>
<delay>60</delay>
<history>7</history>
<trends>365</trends>
<status>0</status>
<value_type>3</value_type>
<allowed_hosts/>
<units>%</units>
<delta>0</delta>
<snmpv3_contextname/>
<snmpv3_securityname/>
<snmpv3_securitylevel>0</snmpv3_securitylevel>
<snmpv3_authprotocol>0</snmpv3_authprotocol>
<snmpv3_authpassphrase/>
<snmpv3_privprotocol>0</snmpv3_privprotocol>
<snmpv3_privpassphrase/>
<formula>1</formula>
<delay_flex/>
<params/>
<ipmi_sensor/>
<data_type>0</data_type>
<authtype>0</authtype>
<username/>
<password/>
<publickey/>
<privatekey/>
<port/>
<description/>
<inventory_link>0</inventory_link>
<applications>
<application>
<name>Nvidia</name>
</application>
</applications>
<valuemap/>
<logtimefmt/>
<application_prototypes/>
</item_prototype>
</item_prototypes>
<trigger_prototypes>
<trigger_prototype>
<expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}>80</expression>
<name>GPU {#GPUINDEX} Temperature is extremely high</name>
<url/>
<status>0</status>
<priority>5</priority>
<description>A GPU's temperature is getting extremely high!</description>
<type>0</type>
<dependencies/>
</trigger_prototype>
<trigger_prototype>
<expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}>70</expression>
<name>GPU {#GPUINDEX} Temperature is high</name>
<url/>
<status>0</status>
<priority>2</priority>
<description>A GPU's temperature is getting high!</description>
<type>0</type>
<dependencies>
<dependency>
<name>GPU {#GPUINDEX} Temperature is very high</name>
<expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}>75</expression>
</dependency>
</dependencies>
</trigger_prototype>
<trigger_prototype>
<expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}>75</expression>
<name>GPU {#GPUINDEX} Temperature is very high</name>
<url/>
<status>0</status>
<priority>4</priority>
<description>A GPU's temperature is getting very high!</description>
<type>0</type>
<dependencies>
<dependency>
<name>GPU {#GPUINDEX} Temperature is extremely high</name>
<expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}>80</expression>
</dependency>
</dependencies>
</trigger_prototype>
</trigger_prototypes>
<graph_prototypes>
<graph_prototype>
<name>GPU {#GPUINDEX} Memory</name>
<width>900</width>
<height>200</height>
<yaxismin>0.0000</yaxismin>
<yaxismax>100.0000</yaxismax>
<show_work_period>1</show_work_period>
<show_triggers>1</show_triggers>
<type>0</type>
<show_legend>1</show_legend>
<show_3d>0</show_3d>
<percent_left>0.0000</percent_left>
<percent_right>0.0000</percent_right>
<ymin_type_1>0</ymin_type_1>
<ymax_type_1>0</ymax_type_1>
<ymin_item_1>0</ymin_item_1>
<ymax_item_1>0</ymax_item_1>
<graph_items>
<graph_item>
<sortorder>0</sortorder>
<drawtype>0</drawtype>
<color>00AA00</color>
<yaxisside>0</yaxisside>
<calc_fnc>2</calc_fnc>
<type>0</type>
<item>
<host>Template Nvidia GPUs Performance</host>
<key>gpu.memfree[{#GPUINDEX}]</key>
</item>
</graph_item>
<graph_item>
<sortorder>1</sortorder>
<drawtype>0</drawtype>
<color>0000DD</color>
<yaxisside>0</yaxisside>
<calc_fnc>2</calc_fnc>
<type>0</type>
<item>
<host>Template Nvidia GPUs Performance</host>
<key>gpu.memused[{#GPUINDEX}]</key>
</item>
</graph_item>
</graph_items>
</graph_prototype>
<graph_prototype>
<name>GPU {#GPUINDEX} Temperature, Fan Speed and Power</name>
<width>900</width>
<height>200</height>
<yaxismin>0.0000</yaxismin>
<yaxismax>100.0000</yaxismax>
<show_work_period>1</show_work_period>
<show_triggers>1</show_triggers>
<type>0</type>
<show_legend>1</show_legend>
<show_3d>0</show_3d>
<percent_left>0.0000</percent_left>
<percent_right>0.0000</percent_right>
<ymin_type_1>0</ymin_type_1>
<ymax_type_1>0</ymax_type_1>
<ymin_item_1>0</ymin_item_1>
<ymax_item_1>0</ymax_item_1>
<graph_items>
<graph_item>
<sortorder>0</sortorder>
<drawtype>0</drawtype>
<color>1A7C11</color>
<yaxisside>0</yaxisside>
<calc_fnc>2</calc_fnc>
<type>0</type>
<item>
<host>Template Nvidia GPUs Performance</host>
<key>gpu.power[{#GPUINDEX}]</key>
</item>
</graph_item>
<graph_item>
<sortorder>1</sortorder>
<drawtype>0</drawtype>
<color>2774A4</color>
<yaxisside>0</yaxisside>
<calc_fnc>2</calc_fnc>
<type>0</type>
<item>
<host>Template Nvidia GPUs Performance</host>
<key>gpu.fanspeed[{#GPUINDEX}]</key>
</item>
</graph_item>
<graph_item>
<sortorder>2</sortorder>
<drawtype>0</drawtype>
<color>F63100</color>
<yaxisside>0</yaxisside>
<calc_fnc>2</calc_fnc>
<type>0</type>
<item>
<host>Template Nvidia GPUs Performance</host>
<key>gpu.temp[{#GPUINDEX}]</key>
</item>
</graph_item>
</graph_items>
</graph_prototype>
<graph_prototype>
<name>GPU {#GPUINDEX} Utilization</name>
<width>900</width>
<height>200</height>
<yaxismin>0.0000</yaxismin>
<yaxismax>100.0000</yaxismax>
<show_work_period>1</show_work_period>
<show_triggers>1</show_triggers>
<type>0</type>
<show_legend>1</show_legend>
<show_3d>0</show_3d>
<percent_left>0.0000</percent_left>
<percent_right>0.0000</percent_right>
<ymin_type_1>0</ymin_type_1>
<ymax_type_1>0</ymax_type_1>
<ymin_item_1>0</ymin_item_1>
<ymax_item_1>0</ymax_item_1>
<graph_items>
<graph_item>
<sortorder>0</sortorder>
<drawtype>0</drawtype>
<color>2774A4</color>
<yaxisside>0</yaxisside>
<calc_fnc>2</calc_fnc>
<type>0</type>
<item>
<host>Template Nvidia GPUs Performance</host>
<key>gpu.utilization[{#GPUINDEX}]</key>
</item>
</graph_item>
</graph_items>
</graph_prototype>
</graph_prototypes>
<host_prototypes/>
</discovery_rule>
</discovery_rules>
<macros/>
<templates/>
<screens/>
</template>
</templates>
</zabbix_export>
- 在Server端创建Host
以管理员身份登录
Configuration ---> Hosts ---> Create host
其中,Host name填写DOMAIN_ZONEA_192.168.10.102_CPU,Visible name填写:机房A_192.168.10.102,Groups:选Linux servers、Templates以及自定义的分组,Agent interfaces:填写机房A的防火墙IP123.123.123.124,Monitored by proxy选择刚创建的代理DOMAIN_ZONEA_192.168.10.101_PROXY,其他默认即可。
添加模板
Hosts --->DOMAIN_ZONEA_192.168.10.102_CPU ---> Templates
选择 “Template OS Linux by Zabbix agent active”和“Template Nvidia GPUs Performance active”两个模板,Update添加即可。 - 启动Proxy服务
systemctl restart zabbix-agent
- 添加为开启自启动
systemctl enable zabbix-agent
- 回到Server查看host
Monitoring ---> Graph
其中,Group选Linux servers,Host选DOMAIN_ZONEA_192.168.10.102_CPU,Graph选想查看的监控项,不出意外的话,几十秒内就会有结果了,或者多等几分钟。
网友评论