美文网首页
Prometheus

Prometheus

作者: 胡萝卜苗儿 | 来源:发表于2021-01-18 16:08 被阅读0次

    组件架构图

    Prometheus组件架构.png

    部署

    软件下载:https://prometheus.io/download/
    测试版本:prometheus-2.21.0.linux-amd64.tar.gz

    配置prometheus.yml
    # Global defaults; individual scrape jobs may override them.
    global:
      # How often targets are scraped: once every 15 seconds (default 1m).
      scrape_interval: 15s
      # How long a single scrape may take before it times out (default 10s).
      scrape_timeout: 10s
      # How often alerting/recording rules are evaluated (default 1m).
      evaluation_interval: 15s
    
    # Alertmanager instances that receive the alerts raised by this server.
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                # IP (or host name) and port of the Alertmanager component
                - localhost:9093
    
    # Rule files matching this glob are loaded once and then evaluated
    # periodically, according to the global 'evaluation_interval'.
    rule_files:
      - './rules/rule_*.yml'
       
    # Remote write: forward scraped samples to an external storage endpoint.
    # NOTE(review): port 8086 with /api/v1/prom/write looks like the InfluxDB 1.x
    # Prometheus remote-write API - confirm against the actual backend.
    remote_write:
      - url: 'http://localhost:8086/api/v1/prom/write?db=prometheus'
        # Disabled examples: drop expensive series before sending, or add a
        # second remote endpoint.
        #   write_relabel_configs:
        #     - source_labels: [__name__]
        #       regex: expensive.*
        #       action: drop
      # - url: http://remote2/push
    # Remote read: query samples back from the external storage endpoint.
    # This must be the backend's read endpoint (/api/v1/prom/read), not the
    # write endpoint that remote_write uses.
    remote_read:
      - url: "http://localhost:8086/api/v1/prom/read?db=prometheus"
        # Disabled examples: extra read options, or a second remote endpoint.
        #   read_recent: false/true
        #   required_matchers:
        #     job: special
      # - url: http://remote3/push
      
    # Scrape jobs. The job name is added as a label `job=<job_name>` to every
    # time series scraped from that job.
    #   instance: one scraped endpoint, usually a single process (i.e. the
    #             source of the data, such as one machine).
    #   job:      a set of instances serving the same function or purpose,
    #             such as a group of machines.
    scrape_configs:
      # Prometheus scraping itself.
      # metrics_path defaults to '/metrics'; scheme defaults to 'http'.
      - job_name: 'prometheus'
        # Overrides the global scrape interval for this job: 15s becomes 30s.
        scrape_interval: 30s
        static_configs:
          - targets:
              - 'localhost:9090'

      # Targets come from a discovery file with a 60s refresh interval.
      - job_name: 'base'
        file_sd_configs:
          - files:
              - /usr/local/prometheus/node_discovery.json
            refresh_interval: 60s

      # Same file-based discovery, with a separate target file.
      - job_name: 'mysql'
        file_sd_configs:
          - files:
              - /usr/local/prometheus/mysql_discovery.json
            refresh_interval: 60s

      # One static target on port 9091 carrying the fixed label instance=gateway.
      - job_name: 'openstack'
        static_configs:
          - targets:
              - '127.0.0.1:9091'
            labels:
              instance: gateway
    
    tar -xvzf prometheus-2.21.0.linux-amd64.tar.gz 
    mv prometheus-2.21.0.linux-amd64 /usr/local/prometheus
    
    检查配置正确性
    ./promtool check config prometheus.yml 
    Checking prometheus.yml
      SUCCESS: 2 rule files found
    
    Checking rules/rule_mysql.yml
      SUCCESS: 5 rules found
    
    Checking rules/rule_node.yml
      SUCCESS: 0 rules found
    
    ./promtool check rules rules/rule_mysql.yml 
    Checking rules/rule_mysql.yml
      SUCCESS: 5 rules found
    
    启动prometheus
    ./prometheus --config.file=prometheus.yml &
    启动参数:
    --config.file=prometheus.yml         #指定配置文件
    --storage.tsdb.path=/prometheus      #指定tsdb路径
    --storage.tsdb.retention.time=24h    #指定数据存储时间
    --web.enable-lifecycle               #配置热加载
    --storage.tsdb.no-lockfile           #如果使用k8s的deployment管理要开启
    

    node_exporter

    被监控节点安装node_exporter,获取当前CPU负载、系统负载、内存消耗、硬盘使用量、网络IO等监控项。

    # Install: unpack under /usr/local and shorten the directory name
    tar -zxvf node_exporter-1.0.1.linux-amd64.tar.gz -C /usr/local/
    cd /usr/local; mv node_exporter-1.0.1.linux-amd64 node_exporter-1.0.1
    # Start: the binary lives inside the renamed directory
    /usr/local/node_exporter-1.0.1/node_exporter &
    #监听
    ss -naltp  | grep 9100
    LISTEN     0    4096   :::9100   :::*  users:(("node_exporter",pid=171161,fd=3))
    

    mysqld_exporter

    # Install: unpack under /usr/local and rename the directory there
    tar -zxvf mysqld_exporter-0.12.1.linux-amd64.tar.gz -C /usr/local/
    cd /usr/local/; mv mysqld_exporter-0.12.1.linux-amd64 mysqld_exporter
    # Grant privileges on the target server (run inside the MySQL client).
    # Replace 'IP' with the exporter's address and set the password that
    # .my.cnf below will use.
    GRANT SELECT, PROCESS, REPLICATION CLIENT ON *.* TO 'prometheus'@'IP' IDENTIFIED BY '';
    # Database connection settings; written to the same path that is passed
    # to --config.my-cnf when the exporter is started
    vim /root/.my.cnf
    [client]
    user=prometheus
    password=########    # tested: avoid special characters where possible
    port=####
    host=localhost
    # Start mysqld_exporter from the directory it was installed into
    /usr/local/mysqld_exporter/mysqld_exporter --config.my-cnf=/root/.my.cnf
    #查看监听
    ss -naltp  | grep 9104
    LISTEN     0      4096        :::9104                    :::*                   users:(("mysqld_exporter",pid=3010579,fd=3))
    

    Pushgateway

    使用场景:

    • 自定义监控指标
    • 网络限制
    tar -zxf  pushgateway-1.2.0.linux-amd64.tar.gz -C /usr/local/
    cd /usr/local/;mv pushgateway-1.2.0.linux-amd64 pushgateway-1.2.0;cd pushgateway-1.2.0/
    #启动pushgateway
    ./pushgateway --web.enable-admin-api --persistence.file="push_file" &
    #配置文件添加job
    vim /usr/local/prometheus/prometheus.yml 
      - job_name: gateway
        static_configs:
        - targets: ['127.0.0.1:9091']
          labels:
            instance: gateway
    #热加载生效  
    curl -XPOST http://IP:9090/-/reload
    #查看网页
    http://IP:9091
    
    API推送metrics

    格式:一般标签名采用 instance
    http://pushgatewayIP/metrics/job/job名/标签名/标签值

    • 单条数据
    echo "backup_status `cat /data/galera/backup/mysqldump/dumpdata/\`date +\"%Y%m%d\"\`/full_db.dmp |grep "completed" |wc -l`" | curl --data-binary @- http://IP:9091/metrics/job/backup_node/instance/IP
    
    • 多条数据
    #!/bin/bash
    #Author:mh
    #date:2020/11/17
    # Collects service and MySQL/Galera status values and pushes them to a
    # Pushgateway in one HTTP request.
    
    # Client command lines reused by the metric lines below; the credentials,
    # host and port are masked as ***.
    mysqlbin="/usr/local/mariadb/bin/mysql -uprometheus -p*** -h *** -P ***"
    mysqladminbin="/usr/local/mariadb/bin/mysqladmin -uprometheus -p*** -h *** -P ***"
    
    # The heredoc delimiter is unquoted, so the shell runs every backtick
    # substitution and expands ${mysqlbin} / ${mysqladminbin} before curl reads
    # the text from stdin. Every line up to EOF is part of the pushed payload,
    # including the lines that start with '#'.
    # Each service_* value is the count of systemctl "Active" lines that contain
    # "running" or "exited".
    # NOTE(review): the literal "..." line appears to stand for omitted metric
    # lines - confirm and replace or remove it before running.
    cat <<EOF | curl --data-binary @- http://IP:9091/metrics/job/guizou1/instance/IP*/node/control
    # HA/keepalived/httpd/memcaches
    service_haproxy `systemctl status haproxy | grep Active | grep -E "running|exited" | wc -l`
    service_keepalived `systemctl status keepalived | grep Active | grep -E "running|exited" | wc -l`
    service_httpd `systemctl status httpd | grep Active | grep -E "running|exited" | wc -l`
    service_memcached `systemctl status memcached | grep Active | grep -E "running|exited" | wc -l`
    #OpenStack_Server
    service_neutron_server `systemctl status neutron-server | grep Active | grep -E "running|exited" | wc -l`
    ...
    #DB_galera
    mysql_up2 `${mysqladminbin} ping  | grep -c alive`
    mysql_connections `${mysqlbin} -NBe "use information_schema;select count(*) from PROCESSLIST;"`
    mysql_galera_status `${mysqlbin} -e "SHOW STATUS LIKE 'wsrep_local_state'" |grep -i 'wsrep_local_state' |awk '{print $2}'`
    mysql_galera_cluster_size `${mysqlbin} -e "show status like 'wsrep_cluster_size'"|grep -i 'wsrep_cluster_size' |awk '{print $2}'`
    mysql_galera_cluster_conf_id `${mysqlbin} -e "show status like 'wsrep_cluster_conf_id'" |grep -i 'wsrep_cluster_conf_id' |awk '{print $2}'`
    mysql_status_com_commit `${mysqlbin} -e "show status like 'com_commit'" |grep -i 'com_commit' |awk '{print $2}'`
    mysql_status_com_rollback `${mysqlbin} -e "show status like 'com_rollback'" |grep -i 'com_rollback' |awk '{print $2}'`
    mysql_status_com_select `${mysqlbin} -e "show status like 'com_select'" |grep -i 'com_select' |awk '{print $2}'`
    mysql_status_com_update `${mysqlbin} -e "show status like 'com_update'" |grep -i 'com_update' |awk '{print $2}'`
    mysql_status_com_insert `${mysqlbin} -e "show status like 'com_insert'" |grep -i 'com_insert' |awk '{print $2}'`
    mysql_status_com_delete `${mysqlbin} -e "show status like 'com_delete'" |grep -i 'com_delete' |awk '{print $2}'`
    EOF
    

    Alertmanager

    通过在prometheus.yml配置文件中添加规则的方式,计算触发条件后发出警报

    Alert三种状态
    • pending:警报被激活,但是低于配置的持续时间【rule里for设置的时间】该状态下不发送报警。

    • firing: 警报已被激活,而且超出设置的持续时间。该状态下发送报警。

    • inactive:既不是pending也不是firing时

    触发一条告警过程

    prometheus-->触发阈值-->超出持续时间-->alertmanager-->分组|抑制|静默-->媒体:邮件|钉钉|微信等。

    • 分组: 通过route的group_by进行报警分组,多条消息一起发送,将性质类似告警组合成一条告警发出,从而减少告警数量

    • 抑制: 高级别报警抑制低级别报警。减少由于高级别告警引发的系列低级别告警,从而减少告警数量

    • 静默:故障静默(在页面配置),对已知故障在维护修复期间可以确保在接下来的时间内不会再收到同样报警信息。从而减少告警数量。

    rule_mysql.yml

    更新规则后热加载生效(需要在prometheus启动时加上--web.enable-lifecycle参数)
    curl -XPOST http://172.28.8.143:9090/-/reload

    配置rule

    vim /usr/local/prometheus/rules/rule_mysql.yml

    # Alerting rules for MySQL hosts.
    # NOTE(review): both expressions select job="mysql_galera", while the scrape
    # config shown in this article names the job 'mysql' - confirm the job label
    # matches, otherwise neither expression returns any series.
    groups:
    - name: MySQL                           # alert group name
      rules:
      - alert: OS is down                   # alert name (alertname)
        expr: up{job="mysql_galera"} == 0   # condition expression
        for: 1m                             # how long the condition must hold before the alert fires
        labels:
          severity: critical                # alert severity level
        annotations:                        # annotations; the summary is templated with the instance label
          summary: "Instance {{ $labels.instance }} OS is down"
          description: "MySQL OS is down. This requires immediate action!"
    
      # Same structure as the rule above, keyed on the mysql_up metric instead of up.
      - alert: MySQL server is down
        expr: mysql_up{job="mysql_galera"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} MySQL is down"
          description: "MySQL server is down. This requires immediate action!"
    

    Prometheus-webhook-dingtalk

    github下载地址 https://github.com/timonwong/prometheus-webhook-dingtalk/releases/tag/v1.4.0

    tar -zxf prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz -C /usr/local/
    mv prometheus-webhook-dingtalk-1.4.0.linux-amd64 prometheus-webhook-dingtalk-1.4.0
    ./prometheus-webhook-dingtalk --config.file=webhook_config.yml --web.enable-lifecycle &
    
    配置 webhook_config.yml

    在钉钉设置群机器人获取token

    ## Request timeout
    # timeout: 5s
    templates:
      - contrib/templates/legacy/template.tmpl
    default_message:
      title: '{{ template "legacy.title" . }}'
    ## Targets, previously was known as "profiles"
    targets:
      webhook1:
        url: https://oapi.dingtalk.com/robot/send?access_token=***
        # secret for signature
    #    secret: SEC000000000000000000000
    #  webhook2:
    #    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxx
    #  webhook_legacy:
    #    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxx
    #    # Customize template content
    #    message:
    #      # Use legacy template
    #      title: '{{ template "legacy.title" . }}'
    #      text: '{{ template "legacy.content" . }}'
    #  webhook_mention_all:
    #    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxx
    #    mention:
    #      all: true
      webhook_mention_users:
        url: https://oapi.dingtalk.com/robot/send?access_token=***
    #default_message:
    #  at: { "atMobiles":["17****"] , "isAtAll":"false" }
    

    相关文章

      网友评论

          本文标题:Prometheus

          本文链接:https://www.haomeiwen.com/subject/tosxzktx.html