Alertmanager
配置服务
[root@k8smaster ~]# vi /usr/lib/systemd/system/alertmanager.service
# systemd unit that runs Alertmanager as a managed service.
[Unit]
Description=alertmanager
[Service]
# Start the Alertmanager binary with the configuration file kept next to it
# under /opt/monitor/alertmanager.
ExecStart=/opt/monitor/alertmanager/alertmanager --config.file=/opt/monitor/alertmanager/alertmanager.yml
# "systemctl reload alertmanager" sends SIGHUP to the main process, which
# Alertmanager treats as a request to re-read its configuration.
ExecReload=/bin/kill -HUP $MAINPID
# On stop, signal only the main process rather than every process in the
# unit's control group.
KillMode=process
# Restart automatically when the process exits with a failure status.
Restart=on-failure
[Install]
# "systemctl enable" attaches the unit to the multi-user target.
WantedBy=multi-user.target
启动服务
[root@k8smaster alertmanager]# systemctl daemon-reload
[root@k8smaster alertmanager]# systemctl restart alertmanager
配置邮件发送
[root@k8smaster alertmanager]# vi alertmanager.yml
# Alertmanager configuration: deliver every alert by e-mail through the
# 163.com SMTP server to a single receiver.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: 'lql_h@163.com'
  smtp_auth_username: 'lql_h@163.com'
  # NOTE(review): this credential is stored in plain text. Rotate it and do
  # not commit or publish the real value; newer Alertmanager releases can
  # read it from a file via smtp_auth_password_file instead.
  smtp_auth_password: 'BBTGIGYUNBZNNQEB'
  # NOTE(review): with STARTTLS disabled on port 25 the SMTP login is sent
  # unencrypted; enable TLS if the mail provider supports it.
  smtp_require_tls: false
route:
  # Alerts that share the same alertname are batched into one notification.
  group_by: ['alertname']
  # Delay before the first notification for a new group is sent.
  group_wait: 10s
  # Delay before notifying about new alerts added to an existing group.
  group_interval: 10s
  # Delay before a notification that was already sent is repeated.
  repeat_interval: 1h
  # Default receiver; must match a name under "receivers".
  receiver: 'lql'
receivers:
  - name: 'lql'
    email_configs:
      - to: 'lql_h@163.com'
prometheus配置文件
# Section of prometheus.yml: where to send firing alerts and which rule
# files to load.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Alertmanager listening on the same host, port 9093.
            - localhost:9093
rule_files:
  # Every .yml file in this directory is loaded as a rule file.
  - "/opt/monitor/prometheus/rules/*.yml"
报警规则设置
实时检查服务是否正常
# Rule file: fire an alert when a scrape target has been down for 5 minutes.
groups:
  - name: general.rules
    rules:
      - alert: InstanceDown
        # "up" is 0 when Prometheus failed to scrape the target.
        expr: up == 0
        # The condition must hold for this long before the alert fires.
        for: 5m
        labels:
          severity: error
        annotations:
          summary: "Instance {{ $labels.instance }} 停止工作"
          description: "{{ $labels.instance }}: job {{ $labels.job }} 已经停止5分钟以上."
实时检查cpu、内存、磁盘指标是否正常
# Rule file: node-level resource alerts (filesystem, memory, CPU), each
# firing when usage stays above 80% for 2 minutes.
groups:
  - name: node.rules
    rules:
      # Used percentage of ext4/xfs filesystems: 100 - free / size * 100.
      - alert: NodeFilesystemUsage
        expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: {{$labels.mountpoint }} 分区使用过高"
          description: "{{$labels.instance}}: {{$labels.mountpoint }} 分区使用大于 80% (当前值: {{ $value }})"
      # Used memory percentage; free, cached and buffer memory all count as
      # available.
      - alert: NodeMemoryUsage
        expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: 内存使用过高"
          description: "{{$labels.instance}}: 内存使用大于 80% (当前值: {{ $value }})"
      # CPU busy percentage per instance: 100 minus the average idle rate
      # across CPUs over a 5m window.
      - alert: NodeCPUUsage
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: CPU使用过高"
          description: "{{$labels.instance}}: CPU使用大于 80% (当前值: {{ $value }})"
报警后查看prometheus效果
1635752971404.png
查看是否触发相关指标
1635820036174.png
触发报警后发送邮件
#每个alert发送一个邮件
#PromQL所查询的数据(多条)显示在邮件里,每条都显示
1635820061504.png
网友评论