美文网首页
Linux GPU Prometheus监控脚本

Linux GPU Prometheus监控脚本

作者: 微笑与礼貌 | 来源:发表于2019-03-22 17:16 被阅读0次

    monitor.sh

    GPU跨平台通用监控脚本

    功能: Usage: monitor.sh fast|mem|gpu|temp|all|[pathToLog sleepTimeNum]

    注意: ./monitor.sh fast 速度最快

    #!/bin/bash

    #. /etc/profile

    #. ~/.bash_profile

    #. ~/.bashrc

    # Pre-flight check: make sure nvidia-smi can be executed successfully before
    # any metric is collected.  The binary is resolved through PATH, the same
    # way every other nvidia-smi call in this script resolves it.
    # Both diagnostics go to stderr so that stdout carries nothing but metric
    # lines (e.g. when running `monitor.sh all > gpu.prom`).
    if nvidia-smi > /dev/null; then
      echo 'nvidia-smi check pass' "$(date)" >&2
    else
      echo 'nvidia-smi not exists' >&2
      exit 1
    fi

    # Print the number of GPUs listed by `nvidia-smi -L`.
    # Only lines that start with "GPU " are counted, so indented sub-entries
    # (such as MIG device lines) and error text do not inflate the count.
    # Outputs: the GPU count on stdout (0 when nothing is listed).
    function get_gpu_list()
    {
      local gpu_count
      # grep -c still prints 0 when nothing matches; only its exit status differs.
      gpu_count=$(nvidia-smi -L | grep -c '^GPU ')
      echo "$gpu_count"
    }

    # Print the UUID of one GPU.
    # Arguments: $1 - GPU id handed to `nvidia-smi -i`
    # Outputs:   the 4th whitespace-separated field of every line containing
    #            "UUID" in `nvidia-smi -q -i <id>` (the value of a
    #            "GPU UUID : <value>" line), on a single line.
    function get_uuid()
    {
      local gpu_uuid
      gpu_uuid=$(nvidia-smi -q -i "$1" | awk '/UUID/ {print $4}')
      # Deliberately unquoted: several matches are folded onto one line.
      # shellcheck disable=SC2086
      echo $gpu_uuid
    }

    # Print the memory usage ratio of one GPU.
    # Takes the 3rd field of the first two lines matching "Total" or "Used" in
    # `nvidia-smi -q -d MEMORY -i <id>` and prints second / first (Used / Total
    # when the report lists Total before Used).
    # Prints "NaN" when fewer than two values are found or the first one is not
    # a non-zero number, instead of aborting on a division by zero and leaving
    # the sample without a value.
    # Arguments: $1 - GPU id handed to `nvidia-smi -i`
    function get_memory_usage()
    {
      local mem_ratio
      mem_ratio=$(nvidia-smi -q -d MEMORY -i "$1" | awk '
        /Total|Used/ { val[++seen] = $3; if (seen == 2) exit }
        END {
          if (seen == 2 && val[1] + 0 != 0) { print val[2] / val[1] } else { print "NaN" }
        }')
      echo "$mem_ratio"
    }

    # Print the first three memory figures of one GPU on a single line.
    # These are the 3rd field of the first three lines matching "Total", "Used"
    # or "Free" in `nvidia-smi -q -d MEMORY -i <id>`, separated by spaces.
    # Arguments: $1 - GPU id handed to `nvidia-smi -i`
    function get_memory_detail()
    {
      local mem_detail
      mem_detail=$(nvidia-smi -q -d MEMORY -i "$1" \
        | grep -E 'Total|Used|Free' \
        | head -3 \
        | awk '{print $3}' \
        | xargs echo)
      echo "$mem_detail"
    }

    # Print a GPU-utilization reading for one GPU.
    # The value is the 3rd field of the last line that `grep -A 5` returns for
    # "GPU Utilization" in `nvidia-smi -q -d UTILIZATION -i <id>`, i.e. normally
    # the 5th line after that heading.
    # NOTE(review): presumably this is the "Avg" row of the
    # "GPU Utilization Samples" section -- confirm against the deployed driver.
    # Arguments: $1 - GPU id handed to `nvidia-smi -i`
    function get_volatile_gpu()
    {
      local gpu_util
      gpu_util=$(nvidia-smi -q -d UTILIZATION -i "$1" \
        | grep -A 5 "GPU Utilization" \
        | tail -1 \
        | awk '{print $3}')
      echo "$gpu_util"
    }

    # Print the current temperature of one GPU.
    # Outputs the 5th whitespace-separated field of every line containing
    # "GPU Current" in `nvidia-smi -q -d Temperature -i <id>` (the number in a
    # "GPU Current Temp : <n> C" line), on a single line.
    # Arguments: $1 - GPU id handed to `nvidia-smi -i`
    function get_temperature()
    {
      local gpu_temp
      gpu_temp=$(nvidia-smi -q -d Temperature -i "$1" | awk '/GPU Current/ {print $5}')
      # Deliberately unquoted: several matches are folded onto one line.
      # shellcheck disable=SC2086
      echo $gpu_temp
    }

    # Print the identifier used as the pod id: the output of `hostname`.
    function get_pod_id()
    {
      local host_name
      host_name=$(hostname)
      # shellcheck disable=SC2086
      echo $host_name
    }

    # Print one sample line in Prometheus text format:
    #   <metric>{podid="<pod>",gpu="<gpu>",uuid="<uuid>"} <value>
    # Arguments:
    #   $1 - metric name
    #   $2 - pod id
    #   $3 - GPU number
    #   $4 - GPU UUID
    #   $5 - sample value; when empty, nothing (not even the separating space)
    #        is printed after the closing brace
    # Every argument is emitted literally: no word splitting, no glob expansion
    # and no option parsing of a leading "-n"/"-e".
    function output()
    {
      printf '%s{podid="%s",gpu="%s",uuid="%s"}%s\n' "$1" "$2" "$3" "$4" "${5:+ $5}"
    }

    # Emit one dcgm_mem_usage sample per GPU through output().
    # The value is whatever get_memory_usage prints for that GPU.
    function mem_prm()
    {
      local gpu_total pod_id gpu uuid value
      # Loop-invariant: query the GPU count and the pod id once, not per GPU.
      gpu_total=$(get_gpu_list)
      pod_id=$(get_pod_id)
      for ((gpu = 0; gpu < gpu_total; gpu++)); do
        uuid=$(get_uuid "$gpu")
        value=$(get_memory_usage "$gpu")
        # Quoted so that an empty field keeps its position instead of shifting
        # the remaining arguments one slot to the left.
        output "dcgm_mem_usage" "$pod_id" "$gpu" "$uuid" "$value"
      done
    }

    # Emit dcgm_fb_total, dcgm_fb_used and dcgm_fb_free for every GPU through
    # output().  The three values are the 1st, 2nd and 3rd word printed by
    # get_memory_detail for that GPU.
    function mem_detail_prm()
    {
      local gpu_total pod_id gpu uuid fb_total fb_used fb_free extra
      # Loop-invariant: query the GPU count and the pod id once, not per GPU.
      gpu_total=$(get_gpu_list)
      pod_id=$(get_pod_id)
      for ((gpu = 0; gpu < gpu_total; gpu++)); do
        uuid=$(get_uuid "$gpu")
        # Split the one-line answer in the shell instead of forking awk three
        # times; "extra" swallows any words beyond the third.
        read -r fb_total fb_used fb_free extra <<< "$(get_memory_detail "$gpu")"
        # Quoted so that an empty field cannot shift the arguments after it.
        output "dcgm_fb_total" "$pod_id" "$gpu" "$uuid" "$fb_total"
        output "dcgm_fb_used" "$pod_id" "$gpu" "$uuid" "$fb_used"
        output "dcgm_fb_free" "$pod_id" "$gpu" "$uuid" "$fb_free"
      done
    }

    # Emit one dcgm_gpu_utilization sample per GPU through output().
    # The value is whatever get_volatile_gpu prints for that GPU.
    function gpu_prm()
    {
      local gpu_total pod_id gpu uuid value
      # Loop-invariant: query the GPU count and the pod id once, not per GPU.
      gpu_total=$(get_gpu_list)
      pod_id=$(get_pod_id)
      for ((gpu = 0; gpu < gpu_total; gpu++)); do
        uuid=$(get_uuid "$gpu")
        value=$(get_volatile_gpu "$gpu")
        # Quoted so that an empty field keeps its position instead of shifting
        # the remaining arguments one slot to the left.
        output "dcgm_gpu_utilization" "$pod_id" "$gpu" "$uuid" "$value"
      done
    }

    # Emit one dcgm_temp sample per GPU through output().
    # The value is whatever get_temperature prints for that GPU.
    # NOTE(review): fast() publishes its temperature reading as dcgm_gpu_temp;
    # the name used here is kept unchanged for existing consumers -- confirm
    # which of the two names dashboards and alerts rely on.
    function temp_prm()
    {
      local gpu_total pod_id gpu uuid value
      # Loop-invariant: query the GPU count and the pod id once, not per GPU.
      gpu_total=$(get_gpu_list)
      pod_id=$(get_pod_id)
      for ((gpu = 0; gpu < gpu_total; gpu++)); do
        uuid=$(get_uuid "$gpu")
        value=$(get_temperature "$gpu")
        # Quoted so that an empty field keeps its position instead of shifting
        # the remaining arguments one slot to the left.
        output "dcgm_temp" "$pod_id" "$gpu" "$uuid" "$value"
      done
    }

    # Run every per-metric emitter in turn: mem_prm, mem_detail_prm, gpu_prm and
    # temp_prm, in that order.  The exit status is that of the last one.
    function allinone()
    {
      local emitter
      for emitter in mem_prm mem_detail_prm gpu_prm temp_prm; do
        "$emitter"
      done
    }

    # Print one sample through output(), preceded by its HELP/TYPE header when
    # requested.
    # Arguments: $1 - 1 to print the header, 0 to skip it
    #            $2 - metric name      $3 - HELP text
    #            $4 - GPU number       $5 - GPU UUID      $6 - sample value
    # Globals:   HOSTNAME (read) - passed to output() as the pod id
    function _fast_emit()
    {
      if (( $1 )); then
        printf '# HELP %s %s\n' "$2" "$3"
        printf '# TYPE %s gauge\n' "$2"
      fi
      output "$2" "${HOSTNAME}" "$4" "$5" "$6"
    }

    # Fast path: emit the metrics of every GPU from a single `nvidia-smi -q`
    # call instead of several nvidia-smi calls per GPU.
    #
    # The report is reduced to the first word after the first ':' of every line
    # matching 'Minor Number|UUID|GPU Current Temp|Gpu|Total|Used|Free'.  The
    # parser is purely positional and assumes 14 such values per GPU:
    #   0      -> UUID
    #   1      -> minor number (used as the gpu label)
    #   2,3,4  -> dcgm_fb_total / dcgm_fb_used / dcgm_fb_free
    #   8      -> dcgm_gpu_utilization
    #   13     -> dcgm_gpu_temp
    #   5-7 and 9-12 are consumed but not published.
    # NOTE(review): a driver whose report yields a different number of matching
    # lines per GPU desynchronises this parser -- confirm against the deployed
    # nvidia-smi version.
    #
    # HELP/TYPE headers are printed only while the first 14 values (the first
    # GPU) are being processed.
    # The report is streamed through a pipe rather than staged in a fixed
    # scratch file under /tmp, so concurrent runs cannot clobber each other and
    # nothing is left behind.
    # Outputs: Prometheus text-format lines on stdout.
    function fast()
    {
      local -r fields_per_gpu=14
      local -r pattern='Minor Number|UUID|GPU Current Temp|Gpu|Total|Used|Free'
      local pos=0 seen=0 minor=0 uuid='' field with_header

      while read -r field; do
        # Lines whose value is blank do not count as a position.
        [[ -n "$field" ]] || continue
        with_header=$(( seen < fields_per_gpu ))
        case "$pos" in
          0) uuid=$field ;;
          1) minor=$field ;;
          2) _fast_emit "$with_header" 'dcgm_fb_total' \
               'Framebuffer memory total (in MiB).' "$minor" "$uuid" "$field" ;;
          3) _fast_emit "$with_header" 'dcgm_fb_used' \
               'Framebuffer memory used (in MiB).' "$minor" "$uuid" "$field" ;;
          4) _fast_emit "$with_header" 'dcgm_fb_free' \
               'Framebuffer memory free (in MiB).' "$minor" "$uuid" "$field" ;;
          8) _fast_emit "$with_header" 'dcgm_gpu_utilization' \
               'GPU utilization (in %).' "$minor" "$uuid" "$field" ;;
          13) _fast_emit "$with_header" 'dcgm_gpu_temp' \
                'GPU temperature (in C).' "$minor" "$uuid" "$field" ;;
        esac
        pos=$(( (pos + 1) % fields_per_gpu ))
        seen=$(( seen + 1 ))
      done < <(nvidia-smi -q | grep -E "$pattern" | cut -d ':' -f2 | awk '{print $1}')
    }

    # Run emitter "$1" forever, overwriting file "$2" with each snapshot and
    # sleeping "$3" seconds between rounds.
    # An empty "$2" selects /run/prometheus/<hostname>_dcgm.prom (the directory
    # is created on demand); an empty "$3" means 15 seconds.
    # NOTE(review): the target is rewritten in place, so a reader may observe a
    # partially written snapshot.
    function _publish_loop()
    {
      local emitter=$1 target=$2 interval=${3:-15}
      if [[ -z "$target" ]]; then
        mkdir -p /run/prometheus
        target="/run/prometheus/$(hostname)_dcgm.prom"
      fi
      while true; do
        "$emitter" > "$target"
        sleep "$interval"
      done
    }

    # Command dispatch.
    #   help | mem | gpu | temp | fast | all  - print once to stdout
    #   onebyone [pathToLog [sleepTimeNum]]    - loop forever using allinone
    #   [pathToLog [sleepTimeNum]]             - loop forever using fast
    case "$1" in
      "help")
        echo 'Useage: monitor.sh fast|mem|gpu|temp|all|[pathToLog sleepTimeNum]'
      ;;
      "mem")
        mem_prm
        mem_detail_prm
      ;;
      "gpu")
        gpu_prm
      ;;
      "temp")
        temp_prm
      ;;
      "fast")
        fast
      ;;
      "all")
        allinone
      ;;
      "onebyone")
        # $1 is the literal word "onebyone" here, so the optional path and
        # interval are the arguments that follow it.
        _publish_loop allinone "$2" "$3"
      ;;
      *)
        _publish_loop fast "$1" "$2"
      ;;
    esac

    相关文章

      网友评论

          本文标题:Linux GPU Prometheus监控脚本

          本文链接:https://www.haomeiwen.com/subject/szrrvqtx.html