美文网首页
2.prometheus与alertmanager报警实现、ha

2.prometheus与alertmanager报警实现、ha

作者: 哆啦A梦_ca52 | 来源:发表于2019-12-12 12:34 被阅读0次
    prometheus采集cadvisor数据:
    添加cadvisor
    root@master:~# vim /usr/local/prometheus/prometheus.yml
      - job_name: 'prometheus--cadvisor'
        static_configs:
        - targets: ['192.168.200.206:8080','192.168.200.207:8080']
    重启prometheus:
    root@master:~# systemctl restart prometheus
    导入镜像
    root@master:~# docker load -i cadvisor_v0.33.0.tar.gz 
    打标签
    root@master:~# docker tag gcr.io/google-containers/cadvisor:v0.33.0 harbor.wyh.net/baseimages/cadvisor:v0.33.0
    
    上传镜像
    root@master:~# docker push harbor.wyh.net/baseimages/cadvisor:v0.33.0
    启动cadvisor容器:
    docker run \
    --volume=/:/rootfs:ro \
    --volume=/var/run:/var/run:rw \
    --volume=/sys:/sys:ro \
    --volume=/var/lib/docker/:/var/lib/docker:ro \
    --volume=/dev/disk/:/dev/disk:ro \
    --publish=8080:8080 \
    --detach=true \
    --name=cadvisor \
    harbor.wyh.net/baseimages/cadvisor:v0.33.0
    

    验证cadvisor web界面:
    访问node节点的cadvisor监听端口


    image.png
    image.png
    查看已经监控上了

    grafana添加pod监控模板:
    395 893 容器模板ID
    395模板


    image.png 查看pod信息

    prometheus报警设置:
    prometheus触发一条告警的过程:
    prometheus--->触发阈值--->超出持续时间--->alertmanager--->分组|抑制|静默--->媒体类型--->邮件|钉钉|微信等。

    分组(group): 将类似性质的警报合并为单个通知。
    静默(silences): 是一种简单的特定时间静音的机制,例如:服务器要升级维护可以先设置这个时间段告警静默。
    抑制(inhibition): 当警报发出后,停止重复发送由此警报引发的其他警报,可以消除冗余告警
    解压
    root@master2:/usr/local/src# tar xf alertmanager-0.19.0.linux-amd64.tar.gz 
    做个软连接
    root@master2:/usr/local/src# ln -sv /usr/local/src/alertmanager-0.19.0.linux-amd64 /usr/local/alertmanager
    设置启动脚本
    root@master2:/usr/local/alertmanager# vim /etc/systemd/system/alertmanager.service
    [Unit]
    Description=Prometheus Alertmanager
    Documentation=https://prometheus.io/docs/introduction/overview/
    After=network.target
    [Service]
    Restart=on-failure
    ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
    [Install]
    WantedBy=multi-user.target
    
    配置alertmanager:
    root@master2:/usr/local/alertmanager# grep '^[^#]' alertmanager.yml
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.qq.com:465'
      smtp_from: '50589143@qq.com'
      smtp_auth_username: '50589143@qq.com'
      smtp_auth_password: '<QQ邮箱SMTP授权码>'  # 请替换为自己的授权码,不要在文章或版本库中公开真实授权码
      smtp_hello: 'qq.com'
      smtp_require_tls: false
    route:
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 60s
      receiver: 'web.hook'
    receivers:
    - name: 'web.hook'
      email_configs:
        - to: '2973707860@qq.com'
    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname', 'dev', 'instance']
    重启服务
    root@master2:/usr/local/alertmanager# systemctl restart alertmanager
    查看端口
    root@master2:/usr/local/alertmanager# ss -tnl | grep 9093
    LISTEN  0         128                         *:9093                   *:*    
    验证是否会报警
    root@master2:/usr/local/alertmanager# ./amtool alert --alertmanager.url=http://192.168.200.197:9093
    Alertname  Starts At  Summary  
    配置prometheus报警规则:
    root@master:/etc/ansible# vim /usr/local/prometheus/prometheus.yml
      8 alerting:
      9   alertmanagers:
     10   - static_configs:
     11     - targets:
     12       - 192.168.200.197:9093
     13       # - alertmanager:9093
     14 
     15 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
     16 rule_files:
     17   - "/usr/local/prometheus/rule-linux37.yml"
    
    root@master:/etc/ansible# vim /usr/local/prometheus/rule-linux37.yml
    
    groups:
      - name: linux37_pod.rules
        rules:
        - alert: Pod_all_cpu_usage
          expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
          for: 5m
          labels:
            severity: critical
            service: pods
          annotations:
            description: 容器 {{ $labels.name }} CPU 资源利用率大于 75% , (current value is {{ $value }})
            summary: Dev CPU 负载告警
        - alert: Pod_all_memory_usage
          expr: sort_desc(avg by(name)(container_memory_usage_bytes{name!=""})) > 1024^3*2
          for: 10m
          labels:
            severity: critical
          annotations:
            description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
            summary: Dev Memory 负载告警
        - alert: Pod_all_network_receive_usage
          expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 1024*1024*50
          for: 10m
          labels:
            severity: critical
          annotations:
            description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M , (current value is {{ $value }})
    
    
    root@master:/etc/ansible# systemctl restart prometheus
    
    查看状态
    调整为5个 然后这个就变成红色的
    查看邮件已经发送了
    image.png

    修改为25%

    root@master:/usr/local/prometheus# vim rule-linux37.yml 
     11         description: 容器 {{ $labels.name }} CPU 资源利用率大于 25% , (current value is {{ $value }})
    root@master:/usr/local/prometheus# systemctl restart prometheus
    
    

    停止服务

    root@master:/usr/local/prometheus# systemctl stop prometheus
    root@master2:~# systemctl stop alertmanager.service 
    prometheus监控haproxy:
    部署haproxy_exporter:
    root@harbor:/usr/local/src# ln -sv /usr/local/src/haproxy_exporter-0.10.0.linux-amd64 /usr/local/haproxy_exporter
    启动服务
    ./haproxy_exporter --haproxy.scrape-uri=unix:/run/haproxy/admin.sock
    
    root@master:~# vim /usr/local/prometheus/prometheus.yml
      - job_name: 'prometheus--haproxy'
        static_configs:
        - targets: ['192.168.200.200:9101']
    
    root@master:~# systemctl restart prometheus.service 
    
    
    查看haproxy的 grafana添加模板 image.png

    相关文章

      网友评论

          本文标题:2.prometheus与alertmanager报警实现、ha

          本文链接:https://www.haomeiwen.com/subject/qbrngctx.html