
5. prometheus config

Author: Plenari | Published 2019-10-07 22:12

    prometheus.yml

    global:
      scrape_interval: 1m
      scrape_timeout: 10s
      evaluation_interval: 1m
      external_labels:
        monitor: codelab-monitor
    alerting:
      alertmanagers:
      - static_configs:
        - targets:
          - 127.0.0.1:9093
        scheme: http
        timeout: 10s
      - static_configs:
        - targets:
          - localhost:9093
        scheme: http
        timeout: 10s
    rule_files:
    - /storage/config/alert.rules
    scrape_configs:
    - job_name: NODE
      scrape_interval: 15s
      scrape_timeout: 10s
      metrics_path: /metrics
      scheme: http
      static_configs:
      - targets:
        - 192.168.119.132:9100
        labels:
          group: NODE_exporter
    - job_name: snmp
      params:
        module:
        - default
      scrape_interval: 1m
      scrape_timeout: 10s
      metrics_path: /snmp
      scheme: http
      static_configs:
      - targets:
        - 10.55.173.34
        - 192.168.119.132
      relabel_configs:
      - source_labels: [__address__]
        separator: ;
        regex: (.*)
        target_label: __param_target
        replacement: $1
        action: replace
      - source_labels: [__param_target]
        separator: ;
        regex: (.*)
        target_label: instance
        replacement: $1
        action: replace
      - source_labels: []
        separator: ;
        regex: (.*)
        target_label: __address__
        replacement: 127.0.0.1:9116
        action: replace
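
A few notes on the scrape configuration above. The two alertmanagers entries (127.0.0.1:9093 and localhost:9093) point at the same local Alertmanager, so one of them is redundant. The relabel_configs in the snmp job follow the usual snmp_exporter pattern: the device address in __address__ is copied into the target URL parameter, that parameter is copied into the instance label so graphs and alerts keep showing the device's own IP, and __address__ is finally rewritten to 127.0.0.1:9116, which is what Prometheus actually scrapes (e.g. http://127.0.0.1:9116/snmp?module=default&target=10.55.173.34).

The windows_server_down rule below also expects a job named WIN_NODE, which does not appear in this scrape configuration. A minimal sketch of such a job, assuming the Windows hosts run wmi_exporter on its default port 9182 (the target address below is a placeholder):

    - job_name: WIN_NODE
      scrape_interval: 15s
      scrape_timeout: 10s
      metrics_path: /metrics
      scheme: http
      static_configs:
      - targets:
        - 192.168.119.133:9182  # placeholder Windows host running wmi_exporter
        labels:
          group: WIN_NODE_exporter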
    

    alert.rules

    ALERT cpu_threshold_exceeded
      IF (100 * (1 - avg(irate(node_cpu{mode="idle"}[5m])) BY (instance))) > 70
      FOR 1m
      LABELS {title="CPU usage too high", urgency="immediate"}
      ANNOTATIONS {description="Current CPU usage on this server is {{ $value }}, which exceeds the configured limit", summary="CPU usage on server {{ $labels.instance }} is too high"}
    ALERT net_device_down
      IF up{job="snmp"} == 0
      FOR 1m
      LABELS {title="Network device offline", urgency="immediate"}
      ANNOTATIONS {description="This network device has been unreachable for more than 1 minute; please check it promptly", summary="The network device at {{ $labels.instance }} is unreachable"}
    ALERT server_down
      IF up{job="NODE"} == 0
      FOR 1m
      LABELS {title="Server offline", urgency="immediate"}
      ANNOTATIONS {description="This server has been unreachable for more than 1 minute; the server may be down or the node_exporter service may have a problem", summary="The server at {{ $labels.instance }} is unreachable"}
    ALERT windows_server_down
      IF up{job="WIN_NODE"} == 0
      FOR 1m
      LABELS {title="Server offline", urgency="immediate"}
      ANNOTATIONS {description="This Windows server has been unreachable for more than 1 minute; the server may be down or its exporter service may have a problem", summary="The server at {{ $labels.instance }} is unreachable"}
    ALERT NodeRebootingFrequently
      IF changes(node_boot_time{job="NODE"}[1h]) > 3
      LABELS {title="Server rebooting frequently", urgency="immediate"}
      ANNOTATIONS {description="This server rebooted {{$value}} times in the past hour, more than the limit of 3", summary="Server {{$labels.instance}} is rebooting too frequently"}
    ALERT DiskWillFillIn4Hours
      IF predict_linear(node_filesystem_avail{fstype="ext4",job="NODE"}[1h], 4 * 3600) < 0
      FOR 5m
      LABELS {title="Disk space running out", urgency="immediate"}
      ANNOTATIONS {description="The partition {{$labels.device}} mounted at {{$labels.mountpoint}} on server {{$labels.instance}} will run out of free space within 4 hours", summary="Free disk space on server {{$labels.instance}} is expected to run out within 4 hours"}
    ALERT HighErrorRate
      IF sum(rate(http_requests_total{status=~"5.."}[1m])) BY (job, path) / sum(rate(http_requests_total[1m])) BY (job, path) * 100 > 1
      LABELS {title="Too many HTTP 5xx errors", urgency="immediate"}
      ANNOTATIONS {description="{{$value}}% of requests to web service {{$labels.path}} in job {{$labels.job}} are 5xx errors", summary="Too many 5xx errors in the web service"}
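
These rules are written in the legacy Prometheus 1.x rule syntax and use the pre-0.16 node_exporter metric names (node_cpu, node_boot_time, node_filesystem_avail); newer node_exporter releases expose node_cpu_seconds_total, node_boot_time_seconds and node_filesystem_avail_bytes instead. On Prometheus 2.x the same rules go into a YAML rule file; a sketch of the server_down rule in that format (the group name is arbitrary):

    groups:
    - name: node-alerts
      rules:
      - alert: server_down
        expr: up{job="NODE"} == 0
        for: 1m
        labels:
          title: Server offline
          urgency: immediate
        annotations:
          summary: "The server at {{ $labels.instance }} is unreachable"
          description: "This server has been unreachable for more than 1 minute; the server may be down or the node_exporter service may have a problem"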
    
    

    alertmanager.yml

    global:
      resolve_timeout: 1m
      smtp_from: alertmanager@example.org
      smtp_smarthost: localhost:25
      smtp_auth_username: alertmanager
      smtp_auth_password: <secret>
      smtp_auth_secret: null
      smtp_auth_identity: ""
      smtp_require_tls: true
      slack_api_url: null
      pagerduty_url: ""
      hipchat_url: ""
      hipchat_auth_token: null
      opsgenie_api_host: ""
      victorops_api_url: https://alert.victorops.com/integrations/generic/20131114/alert/
    route:
      receiver: luhya
      group_by:
      - alertname
      group_wait: 1s
      group_interval: 5m
      repeat_interval: 1h
    receivers:
    - name: luhya
      email_configs:
      - send_resolved: false
        to: admin@com.cn
        from: alertmanager@example.org
        smarthost: localhost:25
        auth_username: alertmanager
        auth_password: <secret>
        auth_secret: null
        auth_identity: ""
        headers:
          From: alertmanager@example.org
          Subject: '{{ template "email.default.subject" . }}'
          To: admin@com.cn
        html: '{{ template "email.default.html" . }}'
        require_tls: true
      webhook_configs:
      - send_resolved: true
        url: http://127.0.0.1/portal/api/1.0/alert
    templates: []
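
All of the rules above attach urgency="immediate", and this route sends every alert to the single luhya receiver, both by email and to the webhook at http://127.0.0.1/portal/api/1.0/alert. If rules with other urgency levels were added later, child routes could split the traffic; a minimal sketch, assuming a hypothetical second receiver named luhya-mail for everything that is not immediate:

    route:
      receiver: luhya
      group_by:
      - alertname
      group_wait: 1s
      group_interval: 5m
      repeat_interval: 1h
      routes:
      - match:
          urgency: immediate
        receiver: luhya
      - receiver: luhya-mail  # hypothetical receiver, must also be defined under receivers
        repeat_interval: 12h

Child routes are evaluated in order and the first match wins, so immediate alerts keep going to luhya while anything else falls through to luhya-mail.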
    
