prometheus
global:
  scrape_interval: 1m
  scrape_timeout: 10s
  evaluation_interval: 1m
  external_labels:
    monitor: codelab-monitor
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 127.0.0.1:9093
    scheme: http
    timeout: 10s
  - static_configs:
    - targets:
      - localhost:9093
    scheme: http
    timeout: 10s
rule_files:
- /storage/config/alert.rules
scrape_configs:
- job_name: NODE
  scrape_interval: 15s
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  static_configs:
  - targets:
    - 192.168.119.132:9100
    labels:
      group: NODE_exporter
- job_name: snmp
  params:
    module:
    - default
  scrape_interval: 1m
  scrape_timeout: 10s
  metrics_path: /snmp
  scheme: http
  static_configs:
  - targets:
    - 10.55.173.34
    - 192.168.119.132
  relabel_configs:
  - source_labels: [__address__]
    separator: ;
    regex: (.*)
    target_label: __param_target
    replacement: $1
    action: replace
  - source_labels: [__param_target]
    separator: ;
    regex: (.*)
    target_label: instance
    replacement: $1
    action: replace
  - source_labels: []
    separator: ;
    regex: (.*)
    target_label: __address__
    replacement: 127.0.0.1:9116
    action: replace
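The relabel_configs on the snmp job follow the usual snmp_exporter proxy pattern: the listed device address is copied into the target URL parameter and into the instance label, and __address__ is then rewritten to 127.0.0.1:9116 so Prometheus actually scrapes the local snmp_exporter rather than the device itself. Assuming that exporter is running as configured above, each scrape is roughly equivalent to this request:

  curl 'http://127.0.0.1:9116/snmp?module=default&target=10.55.173.34'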
rules
ALERT cpu_threshold_exceeded
  IF (100 * (1 - avg(irate(node_cpu{mode="idle"}[5m])) BY (instance))) > 70
  FOR 1m
  LABELS {title="CPU usage too high", urgency="immediate"}
  ANNOTATIONS {description="Current CPU usage on the server is {{ $value }}, which exceeds the configured limit", summary="CPU usage on server {{ $labels.instance }} is too high"}
ALERT net_device_down
  IF up{job="snmp"} == 0
  FOR 1m
  LABELS {title="Network device offline", urgency="immediate"}
  ANNOTATIONS {description="This network device has been unreachable for more than 1 minute; please check it promptly", summary="The network device at IP {{ $labels.instance }} is unreachable"}
ALERT server_down
  IF up{job="NODE"} == 0
  FOR 1m
  LABELS {title="Server offline", urgency="immediate"}
  ANNOTATIONS {description="This server has been unreachable for more than 1 minute; it may be down, or the node_exporter service may have a problem", summary="The server at IP {{ $labels.instance }} is unreachable"}
ALERT windows_server_down
  IF up{job="WIN_NODE"} == 0
  FOR 1m
  LABELS {title="Server offline", urgency="immediate"}
  ANNOTATIONS {description="This Windows server has been unreachable for more than 1 minute; it may be down, or the node_exporter service may have a problem", summary="The server at IP {{ $labels.instance }} is unreachable"}
ALERT NodeRebootingFrequently
  IF changes(node_boot_time{job="NODE"}[1h]) > 3
  LABELS {title="Server rebooting frequently", urgency="immediate"}
  ANNOTATIONS {description="This server has rebooted {{ $value }} times in the past hour, exceeding the limit of 3", summary="Server {{ $labels.instance }} is rebooting too frequently"}
ALERT DiskWillFillIn4Hours
  IF predict_linear(node_filesystem_avail{fstype="ext4",job="NODE"}[1h], 4 * 3600) < 0
  FOR 5m
  LABELS {title="Disk space running out", urgency="immediate"}
  ANNOTATIONS {description="Partition {{ $labels.device }} mounted at {{ $labels.mountpoint }} on server {{ $labels.instance }} will run out of free space within 4 hours", summary="The free disk space on server {{ $labels.instance }} is predicted to run out within 4 hours"}
ALERT HighErrorRate
  IF sum(rate(http_requests_total{status=~"5.."}[1m])) BY (job, path) / sum(rate(http_requests_total[1m])) BY (job, path) * 100 > 1
  LABELS {title="Too many HTTP 5xx errors", urgency="immediate"}
  ANNOTATIONS {description="Web service {{ $labels.path }} in job {{ $labels.job }} has a 5xx error rate of {{ $value }}%", summary="Too many 5xx errors in the web service"}
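The rules above use the legacy Prometheus 1.x rule syntax, matching the plain-text alert.rules file listed under rule_files. As an illustration only (a sketch, not taken from the original file), the first rule would translate into the Prometheus 2.x YAML rule-group format roughly as follows; the group name node-alerts is an assumption:

groups:
- name: node-alerts   # group name is assumed, not part of the original rules file
  rules:
  - alert: cpu_threshold_exceeded
    expr: (100 * (1 - avg(irate(node_cpu{mode="idle"}[5m])) by (instance))) > 70
    for: 1m
    labels:
      title: "CPU usage too high"
      urgency: immediate
    annotations:
      description: "Current CPU usage on the server is {{ $value }}, which exceeds the configured limit"
      summary: "CPU usage on server {{ $labels.instance }} is too high"

The metric names themselves (e.g. node_cpu, node_filesystem_avail) follow the older node_exporter naming used throughout this configuration.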
alertmanager
global:
  resolve_timeout: 1m
  smtp_from: alertmanager@example.org
  smtp_smarthost: localhost:25
  smtp_auth_username: alertmanager
  smtp_auth_password: <secret>
  smtp_auth_secret: null
  smtp_auth_identity: ""
  smtp_require_tls: true
  slack_api_url: null
  pagerduty_url: ""
  hipchat_url: ""
  hipchat_auth_token: null
  opsgenie_api_host: ""
  victorops_api_url: https://alert.victorops.com/integrations/generic/20131114/alert/
route:
  receiver: luhya
  group_by:
  - alertname
  group_wait: 1s
  group_interval: 5m
  repeat_interval: 1h
receivers:
- name: luhya
  email_configs:
  - send_resolved: false
    to: admin@com.cn
    from: alertmanager@example.org
    smarthost: localhost:25
    auth_username: alertmanager
    auth_password: <secret>
    auth_secret: null
    auth_identity: ""
    headers:
      From: alertmanager@example.org
      Subject: '{{ template "email.default.subject" . }}'
      To: admin@com.cn
    html: '{{ template "email.default.html" . }}'
    require_tls: true
  webhook_configs:
  - send_resolved: true
    url: http://127.0.0.1/portal/api/1.0/alert
templates: []
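Both configurations can be sanity-checked before reloading. A minimal sketch, assuming the promtool and amtool binaries shipped with the Prometheus and Alertmanager releases are on PATH and the dumps above are saved locally as prometheus.yml and alertmanager.yml (with a 2.x promtool, the legacy-format alert.rules would first need converting to the YAML group format):

  promtool check config prometheus.yml     # validates prometheus.yml and every file listed under rule_files
  amtool check-config alertmanager.yml     # validates the route tree and the luhya receiver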