美文网首页
python实现系统指标采集

python实现系统指标采集

作者: getyouyou | 来源:发表于2016-10-01 00:01 被阅读0次

    前言

    这周公司新上的项目需要压测,根据各个压测场景,需要拿到linux服务器不同的系统消耗指标。

    思来想去觉得还是使用python更轻量,也更容易被后续的第三方agent来执行,就写了这样的一个指标采集工具。

    指标采集

    指标包括cpu、内存、io、网卡等一系列常见的性能指标,具体的指标以及计算也可以参考github上的淘宝开源项目tsar

    整体的采集思路非常简单,分为两种:

    • 读取特定的文件,解析文件,格式化数据;
    • 执行指定命令,获取输出,格式化数据

    所有的指标都乘以了一个系数,我贪快,所以全都直接写的10000 :(

    具体的数据解析可以自行cat输出对应的文件,结合命令输出来对比

    1.负载

    从/proc/loadavg文件中读取

    def collector_load(path="/proc/loadavg"):
        """Return the system load averages scaled by 10000.

        Args:
            path: File to read. Its first three whitespace-separated fields
                are parsed as the 1, 5 and 15 minute load averages
                (the /proc/loadavg layout).

        Returns:
            dict with integer values under "load1", "load5" and "load15".

        Raises:
            IOError/OSError: if the file cannot be opened.
            IndexError, ValueError: if the file does not start with three
                numeric fields.
        """
        # ``with`` closes the handle even when parsing below fails.
        with open(path) as load_file:
            fields = load_file.read().split()
        # float() replaces string.atof(), which no longer exists in Python 3.
        return {
            "load1": int(float(fields[0]) * 10000),
            "load5": int(float(fields[1]) * 10000),
            "load15": int(float(fields[2]) * 10000),
        }
    

    2. 内存

    从/proc/meminfo中读取

    # Collect memory usage ratios
    def collect_memory_info(path="/proc/meminfo"):
        """Return memory utilisation ratios scaled by 10000.

        Args:
            path: File to read, in "Name:   value [unit]" form, one entry per
                line (the /proc/meminfo layout). It must contain the entries
                MemTotal, MemFree, Buffers and Cached.

        Returns:
            dict with integer values:
              "mem_buff":  Buffers / MemTotal
              "mem_util":  (MemTotal - MemFree - Buffers - Cached) / MemTotal
              "mem_cache": Cached / MemTotal
            each multiplied by 10000 and truncated.

        Raises:
            KeyError: if one of the required entries is missing.
        """
        memory_buffer = {}
        with open(path) as mem_file:
            for line in mem_file:
                name, _, rest = line.partition(":")
                values = rest.split()
                if not values:
                    # No "name: value" pair on this line; nothing to record.
                    continue
                # int() replaces string.atoi(), which is gone in Python 3.
                memory_buffer[name] = int(values[0])

        # Keep only the indicators we care about.
        mem_total = float(memory_buffer["MemTotal"])
        buffers = memory_buffer["Buffers"]
        cached = memory_buffer["Cached"]
        # Buffers and page cache are counted as free memory here.
        mem_free = memory_buffer["MemFree"] + buffers + cached
        return {
            "mem_buff": int(buffers / mem_total * 10000),
            "mem_util": int((mem_total - mem_free) / mem_total * 10000),
            "mem_cache": int(cached / mem_total * 10000),
        }
    

    3. cpu信息

    从/proc/stat中获取

    # Collect the aggregate CPU counters
    def collect_cpu_info(path="/proc/stat"):
        """Read the cumulative counters from the aggregate "cpu" line.

        Args:
            path: File to read (the /proc/stat layout). Only the first line
                whose first field is exactly "cpu" is used.

        Returns:
            dict with the integer counters "User", "Sys", "Idle", "Steal",
            "Wait" and "Total" (the sum of every numeric field on the line),
            or an empty dict when no "cpu" line is present.
        """
        with open(path) as cpu_file:
            for line in cpu_file:
                line_fields = line.split()
                if not line_fields or line_fields[0] != "cpu":
                    continue
                # int() replaces string.atoi(), which is gone in Python 3.
                counters = [int(field) for field in line_fields[1:]]
                return {
                    "User": counters[0],
                    "Sys": counters[2],
                    "Idle": counters[3],
                    # A line with fewer than eight counters has no steal
                    # column; report 0 instead of raising IndexError.
                    "Steal": counters[7] if len(counters) > 7 else 0,
                    "Wait": counters[4],
                    "Total": sum(counters),
                }
        return {}
    

    这个指标在系统中是累加的,因此需要再次进行计算,即本次结果与上次结果的差值才是本段时间内的指标值:

    # Compute CPU usage ratios between two samples
    def calculate_cpu_info():
        """Return CPU usage ratios for the interval since the previous call.

        The counters returned by collect_cpu_info() are cumulative, so the
        ratios are computed from the difference between the current sample
        and the one stored in the module-level ``last_cpu_info``. The current
        sample always replaces ``last_cpu_info``.

        Returns:
            dict with "cpu_user", "cpu_sys", "cpu_wait", "cpu_steal",
            "cpu_idle" and "cpu_util", each a share of the total delta
            multiplied by 10000 and truncated. An empty dict is returned on
            the first call, when either sample is empty, or when the total
            counter did not advance between the two samples.
        """
        global last_cpu_info
        cpu_info = collect_cpu_info()
        previous = last_cpu_info
        last_cpu_info = cpu_info

        # Nothing to diff against yet (first call) or no "cpu" line was found.
        if not previous or not cpu_info:
            return {}

        delta_total = cpu_info["Total"] - previous["Total"]
        if delta_total <= 0:
            # Two samples in the same tick: avoid dividing by zero.
            return {}

        delta_user = cpu_info["User"] - previous["User"]
        delta_sys = cpu_info["Sys"] - previous["Sys"]
        delta_idle = cpu_info["Idle"] - previous["Idle"]
        delta_wait = cpu_info["Wait"] - previous["Wait"]
        delta_steal = cpu_info["Steal"] - previous["Steal"]
        # "Busy" time excludes idle, iowait and steal.
        delta_busy = delta_total - delta_idle - delta_wait - delta_steal

        total = float(delta_total)
        return {
            "cpu_user": int(delta_user / total * 10000),
            "cpu_sys": int(delta_sys / total * 10000),
            "cpu_wait": int(delta_wait / total * 10000),
            "cpu_steal": int(delta_steal / total * 10000),
            "cpu_idle": int(delta_idle / total * 10000),
            "cpu_util": int(delta_busy / total * 10000),
        }
    

    4. IO相关

    从文件/proc/diskstats中读取

    # Collect per-disk I/O counters
    def collect_io_info(path="/proc/diskstats"):
        """Read cumulative I/O counters for the disks of interest.

        Args:
            path: File to read (the /proc/diskstats layout: major, minor,
                device name, then the per-device counters).

        Returns:
            dict keyed by device name. Each value is a dict with the integer
            counters "ReadRequest", "WriteRequest", "MsecRead", "MsecWrite"
            and "MsecTotal", plus "Timestamp" (whole seconds since the
            epoch). Devices whose read counter is "0" and devices rejected by
            should_handle_device() are skipped.
        """
        io_buffer = {}
        # Take the timestamp once so every device in a sample shares it.
        timestamp = int(time.time())
        with open(path) as io_file:
            for line in io_file:
                line_fields = line.split()
                if len(line_fields) < 13:
                    # Too few columns to hold the counters read below.
                    continue
                device_name = line_fields[2]
                if line_fields[3] == "0":
                    continue
                if not should_handle_device(device_name):
                    continue
                # int() replaces string.atoi(), which is gone in Python 3.
                io_buffer[device_name] = {
                    "ReadRequest": int(line_fields[3]),
                    "WriteRequest": int(line_fields[7]),
                    "MsecRead": int(line_fields[6]),
                    "MsecWrite": int(line_fields[10]),
                    "MsecTotal": int(line_fields[12]),
                    "Timestamp": timestamp,
                }
        return io_buffer
    
    # Decide whether a block device should be reported
    def should_handle_device(device):
        """Return True when *device* is one of the disks to be monitored.

        Accepted names:
          * three-character names starting with "sd" or "vd" (e.g. "sda",
            "vdb");
          * names of four or more characters starting with "xvd" or "sda"
            (e.g. "xvda", "xvda1", "sda1").

        The prefix alternatives are passed to startswith() as a tuple so the
        length check applies to both of them. Written as ``a and b or c`` the
        expression parses as ``(a and b) or c``, which let every "vd*" name
        (including partitions such as "vda1") through regardless of length.
        """
        normal = len(device) == 3 and device.startswith(("sd", "vd"))
        aws = len(device) >= 4 and device.startswith(("xvd", "sda"))
        return normal or aws
    

    这个指标也是累加的,需要进行求差:

    # Compute per-disk I/O rates between two samples
    def calculate_io_info():
        """Return I/O rates for the interval since the previous call.

        The counters from collect_io_info() are cumulative, so each metric is
        derived from the difference between the current sample and the one
        stored in the module-level ``last_io_info``. The current sample
        always replaces ``last_io_info``.

        Returns:
            list with one dict per device present in both samples, holding
            (all multiplied by 10000 and truncated):
              "io_rs":    read requests per second
              "io_ws":    write requests per second
              "io_await": milliseconds spent per request (0 when idle)
              "io_util":  share of the interval the device was busy
            The list is empty on the first call. A device is skipped when it
            is missing from the previous sample or when no whole second has
            elapsed between the two samples.
        """
        global last_io_info
        io_info = collect_io_info()
        previous = last_io_info
        last_io_info = io_info

        result = []
        if previous is None:
            return result

        for key in io_info:
            if key not in previous:
                # Device showed up since the last sample; no baseline yet.
                continue
            current = io_info[key]
            before = previous[key]

            total_duration = current["Timestamp"] - before["Timestamp"]
            if total_duration <= 0:
                # Timestamps have one-second resolution; avoid dividing by 0.
                continue

            read_use_io = current["MsecRead"] - before["MsecRead"]
            write_use_io = current["MsecWrite"] - before["MsecWrite"]
            read_io = current["ReadRequest"] - before["ReadRequest"]
            write_io = current["WriteRequest"] - before["WriteRequest"]
            busy_msec = current["MsecTotal"] - before["MsecTotal"]

            readwrite_io = read_io + write_io
            io_await = 0
            if readwrite_io > 0:
                io_await = int(float(read_use_io + write_use_io) / float(readwrite_io) * 10000)

            # Divide by a float: Python 2 integer division would drop the
            # fractional requests/second before the 10000 scaling is applied.
            seconds = float(total_duration)
            result.append({
                "io_rs": int(read_io / seconds * 10000),
                "io_ws": int(write_io / seconds * 10000),
                "io_await": io_await,
                "io_util": int(float(busy_msec) / (total_duration * 1000) * 10000),
            })

        return result
    

    5. 采集网卡

    网卡数据从/proc/net/dev中读取

    # Collect per-interface traffic counters
    def collect_net_info(path="/proc/net/dev"):
        """Read cumulative traffic counters for the interfaces of interest.

        Args:
            path: File to read (the /proc/net/dev layout: "name: counters").
                Lines without a colon, such as the header rows, are ignored.

        Returns:
            dict keyed by interface name. Each value is a dict with the
            integer counters "InBytes", "InPackets", "InErrors", "InDrops",
            "OutBytes", "OutPackets", "OutErrors" and "OutDrops". Interfaces
            rejected by should_collect_card() are skipped.
        """
        net_buffer = {}
        with open(path) as net_file:
            for line in net_file:
                card_name, colon, counters = line.partition(":")
                if not colon:
                    continue
                card_name = card_name.strip()
                if not should_collect_card(card_name):
                    continue
                line_fields = counters.split()
                if len(line_fields) < 12:
                    # Too few columns to hold the transmit counters.
                    continue
                # int() replaces string.atoi(), which is gone in Python 3.
                net_buffer[card_name] = {
                    "InBytes": int(line_fields[0]),
                    "InPackets": int(line_fields[1]),
                    "InErrors": int(line_fields[2]),
                    "InDrops": int(line_fields[3]),
                    "OutBytes": int(line_fields[8]),
                    "OutPackets": int(line_fields[9]),
                    "OutErrors": int(line_fields[10]),
                    "OutDrops": int(line_fields[11]),
                }
        return net_buffer
    
    # Decide whether a network interface should be collected
    def should_collect_card(line):
        """Return True when the interface name *line* starts with "eth" or "em"."""
        wanted_prefixes = ("eth", "em")
        return line.startswith(wanted_prefixes)
    

    网卡指标也是一个累加值,需要求差:

    # Compute per-interface traffic deltas between two samples
    def calculate_net_info():
        """Return traffic deltas for the interval since the previous call.

        The counters from collect_net_info() are cumulative, so each metric
        is the difference between the current sample and the one stored in
        the module-level ``last_net_info``, multiplied by 10000. The current
        sample always replaces ``last_net_info``.

        Returns:
            list with one dict per interface present in both samples, holding
            "in_bytes", "in_packets", "in_errors", "in_drops", "out_bytes",
            "out_packets", "out_errors" and "out_drops". The list is empty on
            the first call.
        """
        global last_net_info
        net_info = collect_net_info()
        previous = last_net_info
        last_net_info = net_info

        result = []
        if previous is None:
            return result

        # (output metric, source counter) pairs.
        metrics = (
            ("in_bytes", "InBytes"),
            ("in_packets", "InPackets"),
            ("in_errors", "InErrors"),
            ("in_drops", "InDrops"),
            ("out_bytes", "OutBytes"),
            ("out_packets", "OutPackets"),
            ("out_errors", "OutErrors"),
            ("out_drops", "OutDrops"),
        )
        for key in net_info:
            if key not in previous:
                # Interface appeared since the last sample; no baseline yet.
                continue
            current = net_info[key]
            before = previous[key]
            result.append(dict(
                (metric, (current[counter] - before[counter]) * 10000)
                for metric, counter in metrics
            ))
        return result
    

    6. 采集tcp指标

    tcp与udp的指标信息都可以从/proc/net/snmp中读取

    # Collect TCP counters
    def collect_tcp_info(path="/proc/net/snmp"):
        """Read the TCP counters from an SNMP statistics file.

        Args:
            path: File to read (the /proc/net/snmp layout: for each protocol
                a "Proto: <column names>" line followed by a
                "Proto: <values>" line).

        Returns:
            dict with the integer values "ActiveOpens", "PassiveOpens",
            "InSegs", "OutSegs", "RetransSegs" and "CurrEstab", or an empty
            dict when the file has no pair of "Tcp" lines.

        Raises:
            KeyError: if the Tcp header line lacks one of those columns.
        """
        wanted = (
            "ActiveOpens",
            "PassiveOpens",
            "InSegs",
            "OutSegs",
            "RetransSegs",
            "CurrEstab",
        )
        header = None
        with open(path) as tcp_file:
            for line in tcp_file:
                protocol_name, _, rest = line.partition(":")
                if protocol_name.strip() != "Tcp":
                    continue
                if header is None:
                    # The first "Tcp:" line carries the column names.
                    header = rest.split()
                    continue
                # Look columns up by name rather than by fixed position.
                values = dict(zip(header, rest.split()))
                # int() replaces string.atoi(), which is gone in Python 3.
                return dict((name, int(values[name])) for name in wanted)
        return {}
    

    里面有累加值也有实时值,当前的连接数为实时值:

    # Compute TCP metrics between two samples
    def calculate_tcp_info():
        """Return TCP metrics for the interval since the previous call.

        Most counters from collect_tcp_info() are cumulative and are reported
        as the difference against the sample stored in the module-level
        ``last_tcp_info``; "CurrEstab" is reported as its current value. The
        current sample always replaces ``last_tcp_info``.

        Returns:
            dict with "tcp_active", "tcp_passive", "tcp_inseg", "tcp_outseg"
            and "tcp_established" (each multiplied by 10000) and "tcp_retran"
            (retransmitted segments / sent segments, multiplied by 10000 and
            truncated; 0 when no segment was sent). An empty dict is returned
            on the first call or when either sample is empty.
        """
        global last_tcp_info
        tcp_info = collect_tcp_info()
        previous = last_tcp_info
        last_tcp_info = tcp_info

        # Nothing to diff against yet, or the Tcp lines were not found.
        if not previous or not tcp_info:
            return {}

        out_segs = tcp_info["OutSegs"] - previous["OutSegs"]
        retrans_segs = tcp_info["RetransSegs"] - previous["RetransSegs"]
        # An idle interval sends no segments; avoid dividing by zero.
        retrans_rate = float(retrans_segs) / float(out_segs) if out_segs > 0 else 0.0

        return {
            "tcp_active": (tcp_info["ActiveOpens"] - previous["ActiveOpens"]) * 10000,
            "tcp_passive": (tcp_info["PassiveOpens"] - previous["PassiveOpens"]) * 10000,
            "tcp_inseg": (tcp_info["InSegs"] - previous["InSegs"]) * 10000,
            "tcp_outseg": out_segs * 10000,
            "tcp_established": tcp_info["CurrEstab"] * 10000,
            "tcp_retran": int(retrans_rate * 10000),
        }
    

    7. 采集指定进程的cpu与内存

    有两种方式,其一是执行ps命令,取到的是当前进程启动之后的平均cpu与内存占用;其二是在/proc/<pid>目录下面读取,在这里用的是第一种。

    指定的进程的名称通过ps auxc | grep "进程名1|进程名2|...."来获取进程id

    # Collect CPU and memory usage of the configured processes
    def collect_process_info():
        """Return CPU and memory usage of the processes named in ``processes``.

        The module-level ``processes`` is a comma-separated list of process
        names. ``ps auxc`` is run without a shell and its output is filtered
        here, keeping the rows whose command column (the 11th field)
        contains one of the names.

        Returns:
            dict keyed by the first word of the command column. Each value
            is a dict with "process_cpu_util" and "process_mem_util": the
            3rd and 4th fields of the row, multiplied by 10000 and truncated.
            When several rows share a command name the last one wins. An
            empty dict is returned when ``processes`` is empty or ``ps``
            cannot be run.
        """
        global processes
        # Function-scope import keeps this block self-contained; subprocess
        # exists on both Python 2.7 and 3, unlike the ``commands`` module.
        import subprocess

        process_info = {}
        if processes == "":
            return process_info

        names = [name.strip() for name in processes.split(",") if name.strip()]
        if not names:
            return process_info

        try:
            # Argument list, no shell: the configured names never reach a
            # command line, so they cannot be interpreted by the shell.
            output = subprocess.check_output(["ps", "auxc"], universal_newlines=True)
        except (OSError, subprocess.CalledProcessError):
            return process_info

        # The first row of the output is the column header.
        for item in output.splitlines()[1:]:
            item_fields = item.split()
            if len(item_fields) < 11:
                continue
            command = item_fields[10]
            # Match on the command column only, not on user names or other
            # columns as a grep over the whole row would.
            if not any(name in command for name in names):
                continue
            process_info[command] = {
                "process_cpu_util": int(float(item_fields[2]) * 10000),
                "process_mem_util": int(float(item_fields[3]) * 10000),
            }
        return process_info
    

    如果需要实时的数据,应该从/proc/<pid>目录下的文件中去读取数据,拿pid的方式和上述的方式是一样的

    相关文章

      网友评论

          本文标题:python实现系统指标采集

          本文链接:https://www.haomeiwen.com/subject/modryttx.html