1.prometheus相关配置说明
# cat /root/prometheus-2.25.0.linux-amd64/prometheus.yml
# Global defaults applied to every scrape job and rule group.
global:
  scrape_interval: 15s  # Scrape targets every 15 seconds (Prometheus default: 1 minute).
  evaluation_interval: 15s  # Evaluate alerting rules every 15 seconds (Prometheus default: 1 minute).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.8.124:9093  # Where fired alerts are pushed; normally the Alertmanager address.

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  # Alerting rule files.
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'  # User-chosen name for this monitoring job.
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['192.168.8.124:9090']
2.设置报警规则文件参考
# cat /root/prometheus-2.25.0.linux-amd64/rules/*
groups:
  - name: example  # Name of this alerting rule group.
    rules:
      # Fires when a scrape target has been unreachable for one minute, so that
      # Alertmanager can send a notification.
      - alert: InstanceDown
        expr: up == 0  # `up` is 0 when the target is down and 1 when it is reachable.
        for: 1m  # The condition must hold for one minute before the alert fires.
        labels:
          severity: page  # Custom label attached to the alert (previously misspelled `serverity`).
        annotations:
          summary: "Instance {{ $labels.instance }} down"  # Custom summary.
          # The closing text and quote were missing in the original; wording matches `for: 1m`.
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
labels参数说明
env : 数据源(通常用于区分环境)
instance : 实例名称
job : 应用名
3.AlertManager相关配置说明
# cat /root/alertmanager/alertmanager.yml
# Settings shared by all receivers.
global:
  resolve_timeout: 5m
  http_config:
    proxy_url: 'http://192.168.13.190:7777'

# templates:
#   - 'template/*.tmpl'

# Top-level routing: every alert is grouped by alert name and sent to one receiver.
route:
  group_by: ['alertname']
  group_wait: 10s  # Initial wait before sending the first notification for a new group.
  group_interval: 10s  # Wait before notifying about new alerts added to an already-notified group.
  repeat_interval: 1h  # Wait before re-sending a notification for alerts that are still firing.
  receiver: 'rocketchat'  # Name of the receiver that handles these alerts.

receivers:  # Notification receivers.
  - name: 'rocketchat'  # Receiver name; referenced by `route.receiver` above.
    webhook_configs:
      # NOTE(review): this URL appears to embed an access token; confirm it is
      # safe to publish or keep it out of version control.
      - url: 'http://211.149.224.155:3000/hooks/ZanvEXd5t2Qaoycvm/Wwff7ndDeCq8DqFnaGhi4gdGWHpXJnaWyQoQzESypwwBvApJ'  # Webhook endpoint that receives the alerts.
        send_resolved: true
4.expr表达式设置(通过grafana面板)
登录Grafana面板--Dashboards--选择面板[Alerts-Linux Nodes]--选择视图[Linux Nodes Disk Usage]
编辑视图--查看Metrics--即可获得表达式:
# Disk usage percentage per filesystem. Both metrics are in bytes, so the ratio
# needs no unit conversion (the original mixed /1000 and /1024 scaling, which
# skewed the result).
100.0 - 100 * (node_filesystem_avail_bytes / node_filesystem_size_bytes)
5.告警规则大全参考
URL:https://awesome-prometheus-alerts.grep.to/rules
网友评论