一、背景
前段时间业务服务器上的CPU和内存告警频发,却无法有效地进行历史回溯分析,因此决定寻找一种方案,用来记录并清晰地展示历史请求状态及趋势。鉴于目前公司业务服务器上的大部分服务都以Docker方式启动,并使用Traefik作为边缘路由器,最终决定采用Grafana+Prometheus监控方案,监控所有经Traefik流入的接口请求。
traefik监控.png
二、操作
1、操作traefik,对外暴露监控指标
可通过traefik的配置文件或者cli配置开启metrics支持
- 方案一,traefik配置文件开启metrics支持
yaml文件尾部添加
# Prometheus metrics: append this to the end of the Traefik static
# configuration (YAML). The nesting is significant: every option below is a
# child of `metrics.prometheus`.
metrics:
  prometheus:
    # Buckets (in seconds) used for the request-latency histograms.
    buckets:
      - 0.1
      - 0.3
      - 1.2
      - 5.0
    # Add metrics labelled per entry point.
    addEntryPointsLabels: true
    # Add metrics labelled per service.
    addServicesLabels: true
    # Entry point that serves /metrics. The default is "traefik" (the
    # management port, 8080/metrics). A custom entry point such as "metrics"
    # must also be defined as an entry point (the CLI variant in this guide
    # binds it to :8082).
    # entryPoint: "traefik"
    entryPoint: "metrics"
toml文件尾部添加
# Prometheus metrics: append this to the end of the Traefik static
# configuration (TOML).
[metrics]
  [metrics.prometheus]
    # Buckets (in seconds) used for the request-latency histograms
    # (default: [0.1, 0.3, 1.2, 5.0]).
    buckets = [0.1, 0.3, 1.2, 5.0]
    # Add metrics labelled per entry point (default: true).
    addEntryPointsLabels = true
    # Add metrics labelled per service (default: true).
    addServicesLabels = true
    # Entry point that serves /metrics (default: "traefik", i.e. the
    # management port 8080/metrics); a custom entry point may be used instead.
    entryPoint = "metrics"
    # Whether to disable the built-in internal router for the metrics
    # endpoint (default: false). It is left at false so that /metrics is
    # served on the entry point above without having to declare a custom
    # router for the `prometheus@internal` service, which is what the YAML
    # and CLI variants of this configuration rely on.
    manualRouting = false
- 方案二,cli配置开启metrics支持
# docker-compose definition of the Traefik edge router with Prometheus
# metrics enabled through CLI flags.
version: "3"

services:
  traefik:
    # Official Traefik docker image; the flags below were written for v2.
    # NOTE(review): `latest` is a floating tag - consider pinning a v2.x tag.
    image: traefik:latest
    restart: always
    container_name: traefik
    command:
      # Enables the web UI / API (published on port 8080 below).
      # NOTE(review): insecure mode has no authentication - do not expose
      # this port publicly.
      - "--api.insecure=true"
      # Tells Traefik to listen to docker.
      - "--providers.docker"
      # Entry points: HTTPS, HTTP and a dedicated one for the metrics.
      - "--entryPoints.port443.address=:443"
      - "--entryPoints.port80.address=:80"
      - "--entryPoints.metrics.address=:8082"
      # ACME certificate resolver `le-ssl` using the TLS challenge.
      - "--certificatesResolvers.le-ssl.acme.tlsChallenge=true"
      - "--certificatesResolvers.le-ssl.acme.email=traefik@163.com"
      - "--certificatesResolvers.le-ssl.acme.storage=/letsencrypt/acme.json"
      # Prometheus metrics, served on the `metrics` entry point (:8082).
      - "--metrics.prometheus=true"
      - "--metrics.prometheus.buckets=0.100000,0.300000,1.200000,5.000000"
      - "--metrics.prometheus.entryPoint=metrics"
      - "--metrics.prometheus.addServicesLabels=true"
      - "--metrics.prometheus.addEntryPointsLabels=true"
    ports:
      # The HTTP port
      - "80:80"
      # The Web UI (enabled by --api.insecure=true)
      - "8080:8080"
      # The metrics entry point
      - "8082:8082"
      # The HTTPS port
      - "443:443"
    volumes:
      # Storage for the ACME certificates (acme.json)
      - /home/admin/traefik/data:/letsencrypt/
      # So that Traefik can listen to the Docker events
      - /var/run/docker.sock:/var/run/docker.sock
    networks:
      - proxy

networks:
  # Pre-existing network, created outside of this compose file.
  proxy:
    external: true
2、操作prometheus,添加traefik监控
vim prometheus/prometheus.yml
# Prometheus server configuration (prometheus/prometheus.yml).
global:
  scrape_interval: 15s  # By default, scrape targets every 15 seconds.
  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'codelab-monitor'

# Remote storage endpoints.
# NOTE(review): the credentials are passed in the URL query string; keep this
# file out of version control.
remote_write:
  - url: "http://xxx.xxx.xxx.xxx:8086/api/v1/prom/write?db=prometheus&u=admin&p=***"
remote_read:
  - url: "http://xxx.xxx.xxx.xxx:8086/api/v1/prom/read?db=prometheus&u=admin&p=***"

# Alerting rule files.
rule_files:
  - "./rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Scrape configurations: Prometheus itself, the host agent and Traefik.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries
  # scraped from this config.
  - job_name: 'prometheus'
    # Override the global default and scrape targets from this job every
    # 5 seconds.
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'agent'
    basic_auth:
      username: admin
      # Placeholder. It must be quoted: a bare `***` is read by YAML as an
      # alias and fails to parse.
      password: '***'
    static_configs:
      - targets: ['xxx.xxx.xxx.xxx:9100']

  # Traefik metrics.
  # NOTE(review): the port must be the one of the entry point that serves the
  # metrics - 8080 only when they are on the default `traefik` entry point,
  # 8082 when the dedicated `metrics` entry point from this guide is used.
  # The alert rules match on this `instance` value, so keep them in sync.
  - job_name: 'traefik'
    basic_auth:
      username: admin
      # Placeholder, quoted for the same reason as above.
      password: '***'
    static_configs:
      - targets: ['xxx.xxx.xxx.xxx:8080']
重启prometheus服务
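测试访问metrics服务:curl -s xxx.xxx.xxx.xxx:8080/metrics | head -10
注意:端口必须与提供metrics的entryPoint一致。若按上文把entryPoint设置为监听8082的metrics入口,则此处以及prometheus.yml中traefik任务的targets都应使用8082端口;只有使用默认的traefik入口时才是8080。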
3、操作grafana
导入traefik监控模板:import->4475。4475面板默认只支持监控一个traefik服务,且仪表盘内容偏少,因此在4475的基础上做了一些小的完善。
完善过的监控面板json结构
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Traefik dashboard prometheus",
"editable": true,
"gnetId": 4475,
"graphTooltip": 0,
"id": 7,
"iteration": 1635214212727,
"links": [],
"panels": [
{
"datasource": null,
"description": "运行时长",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 5,
"x": 0,
"y": 0
},
"id": 14,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.0.3",
"targets": [
{
"exemplar": true,
"expr": "time() - process_start_time_seconds{job=\"traefik\",instance=~\"$node\"}",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "Uptime",
"type": "stat"
},
{
"datasource": null,
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "semi-dark-blue",
"mode": "fixed"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 5,
"x": 5,
"y": 0
},
"id": 18,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.0.3",
"targets": [
{
"exemplar": true,
"expr": "sum(traefik_entrypoint_request_duration_seconds_sum{instance=~\"$node\"}) / sum(traefik_entrypoint_requests_total{instance=~\"$node\"}) * 1000",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "Average response time",
"type": "stat"
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
}
},
"decimals": 0,
"mappings": [],
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 5,
"x": 10,
"y": 0
},
"id": 8,
"interval": null,
"links": [],
"maxDataPoints": 3,
"options": {
"displayLabels": [],
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right",
"values": [
"value"
]
},
"pieType": "pie",
"reduceOptions": {
"calcs": [
"sum"
],
"fields": "",
"values": false
},
"text": {},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"exemplar": true,
"expr": "sum(rate(traefik_entrypoint_requests_total{entrypoint =~ \"$entrypoint\", instance=~\"$node\"}[5m])) by (entrypoint) ",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{ entrypoint }}",
"refId": "A"
}
],
"title": "Requests by protocol",
"type": "piechart"
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
}
},
"decimals": 0,
"mappings": [],
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 9,
"x": 15,
"y": 0
},
"id": 7,
"interval": null,
"links": [],
"maxDataPoints": 3,
"options": {
"displayLabels": [],
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right",
"values": [
"value"
]
},
"pieType": "pie",
"reduceOptions": {
"calcs": [
"sum"
],
"fields": "",
"values": false
},
"text": {},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"exemplar": true,
"expr": "sum(rate(traefik_service_requests_total{instance=~\"$node\"}[5m])) by (service) ",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{ service }}",
"refId": "A"
}
],
"title": "Requests by service",
"type": "piechart"
},
{
"datasource": null,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 8
},
"id": 10,
"title": "$service stats",
"type": "row"
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
}
},
"decimals": 0,
"mappings": [],
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 9
},
"id": 2,
"interval": null,
"links": [],
"maxDataPoints": 3,
"options": {
"displayLabels": [],
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right",
"values": [
"value",
"percent"
]
},
"pieType": "pie",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"exemplar": true,
"expr": "traefik_service_requests_total{service=~\"$service\", instance=~\"$node\"}",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{method}} : {{code}}",
"refId": "A"
}
],
"title": "return code $service ",
"type": "piechart"
},
{
"aliasColors": {},
"bars": true,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 9,
"x": 8,
"y": 9
},
"hiddenSeries": false,
"id": 3,
"legend": {
"alignAsTable": true,
"avg": true,
"current": false,
"max": true,
"min": true,
"rightSide": false,
"show": true,
"total": false,
"values": true
},
"lines": false,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.3",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(rate(traefik_service_requests_total{service=~\"$service\", instance=~\"$node\"}[$interval]))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "Total requests $service",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Total requests $service",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:69",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:70",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "rgb(31, 120, 193)",
"mode": "fixed"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 7,
"x": 17,
"y": 9
},
"id": 4,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.0.3",
"targets": [
{
"exemplar": true,
"expr": "sum(traefik_service_request_duration_seconds_sum{service=~\"$service\",instance=~\"$node\"}) / sum(traefik_service_requests_total{service=~\"$service\",instance=~\"$node\"}) * 1000",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
}
],
"title": "response time $service ",
"type": "stat"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 16
},
"hiddenSeries": false,
"id": 20,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.3",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(increase(traefik_service_requests_total{code=\"404\",service=~\"$service\",instance=~\"$node\"}[$interval])) by (code)",
"interval": "",
"legendFormat": "{{code}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "404 Error $service",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:148",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:149",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"fieldConfig": {
"defaults": {
"unit": "none"
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 16
},
"hiddenSeries": false,
"id": 16,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.3",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(increase(traefik_service_requests_total{protocol=~\"$protocol\",service=~\"$service\",instance=~\"$node\"}[$interval])) by (code)",
"interval": "",
"legendFormat": "{{code}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Status Code Count $service",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:52",
"format": "none",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:53",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"collapsed": false,
"datasource": null,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 25
},
"id": 12,
"panels": [],
"title": "Global stats",
"type": "row"
},
{
"aliasColors": {},
"bars": true,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 26
},
"hiddenSeries": false,
"id": 5,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": false,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.3",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code=\"200\",instance=~\"$node\"}[$interval])",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{method}} : {{code}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Status code 200",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:314",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:315",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": true,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 26
},
"hiddenSeries": false,
"id": 6,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": false,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.3",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code!=\"200\",instance=~\"$node\"}[$interval])",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{ method }} : {{code}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Others status code",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:391",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:392",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": "",
"schemaVersion": 30,
"style": "dark",
"tags": [
"traefik",
"prometheus"
],
"templating": {
"list": [
{
"allValue": null,
"current": {
"selected": true,
"text": [
"dashboard.tzf-foryou.com:80"
],
"value": [
"dashboard.tzf-foryou.com:80"
]
},
"datasource": null,
"definition": "label_values(traefik_config_reloads_total{job=\"traefik\"}, instance)",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": "instance",
"multi": true,
"name": "node",
"options": [],
"query": {
"query": "label_values(traefik_config_reloads_total{job=\"traefik\"}, instance)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": null,
"current": {
"selected": false,
"text": "hsl_bankcard@docker",
"value": "hsl_bankcard@docker"
},
"datasource": "Prometheus",
"definition": "label_values(traefik_service_requests_total{instance=~\"$node\"},service)",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "service",
"options": [],
"query": {
"query": "label_values(traefik_service_requests_total{instance=~\"$node\"},service)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": "Prometheus",
"definition": "label_values(traefik_entrypoint_requests_total{instance=~\"$node\"},entrypoint)",
"description": null,
"error": null,
"hide": 0,
"includeAll": true,
"label": null,
"multi": true,
"name": "entrypoint",
"options": [],
"query": {
"query": "label_values(traefik_entrypoint_requests_total{instance=~\"$node\"},entrypoint)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"selected": true,
"text": [
"http"
],
"value": [
"http"
]
},
"datasource": null,
"definition": "label_values(traefik_service_requests_total{instance=~\"$node\"}, protocol)",
"description": null,
"error": null,
"hide": 0,
"includeAll": true,
"label": "Protocol",
"multi": true,
"name": "protocol",
"options": [],
"query": {
"query": "label_values(traefik_service_requests_total{instance=~\"$node\"}, protocol)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"auto": true,
"auto_count": 30,
"auto_min": "10s",
"current": {
"selected": false,
"text": "5m",
"value": "5m"
},
"description": null,
"error": null,
"hide": 0,
"label": "Interval",
"name": "interval",
"options": [
{
"selected": false,
"text": "auto",
"value": "$__auto_interval_interval"
},
{
"selected": false,
"text": "1m",
"value": "1m"
},
{
"selected": false,
"text": "2m",
"value": "2m"
},
{
"selected": true,
"text": "5m",
"value": "5m"
},
{
"selected": false,
"text": "10m",
"value": "10m"
},
{
"selected": false,
"text": "30m",
"value": "30m"
},
{
"selected": false,
"text": "1h",
"value": "1h"
},
{
"selected": false,
"text": "6h",
"value": "6h"
},
{
"selected": false,
"text": "12h",
"value": "12h"
},
{
"selected": false,
"text": "1d",
"value": "1d"
},
{
"selected": false,
"text": "7d",
"value": "7d"
},
{
"selected": false,
"text": "14d",
"value": "14d"
},
{
"selected": false,
"text": "30d",
"value": "30d"
}
],
"query": "1m,2m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
"queryValue": "",
"refresh": 2,
"skipUrlSync": false,
"type": "interval"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "Traefik监控",
"uid": "QURlqlNnz",
"version": 47
}
4、添加警报规则
vim prometheus/rules/traefik_alert.yml
# Prometheus alerting rules for Traefik (prometheus/rules/traefik_alert.yml).
groups:
  - name: traefik-监控告警
    rules:
      # Per service: more than 50 responses with status 404 within 5 minutes,
      # sustained for 50s.
      - alert: Traefik http状态码404 5分钟内请求量大于50
        expr: sum(increase(traefik_service_requests_total{code="404",instance="xxx.xxx.xxx.xxx:8080"}[5m])) by (service) > 50
        for: 50s
        labels:
          severity: warning
          instance: xxx.xxx.xxx.xxx
        annotations:
          summary: "{{ $labels.service }} 404状态码响应多"
          description: "后端响应状态码404 5分钟内统计量大于50 (当前值为 {{ $value }})"
          link: https://grafana.xxxx.com/d/QURlqlNnz/traefikjian-kong?orgId=1
      # Per service: more than 20 responses with a 5xx status within
      # 5 minutes, sustained for 50s. The alert name states the same
      # threshold as the expression and the description.
      - alert: Traefik http状态码5XX 5分钟内请求量大于20
        expr: sum(increase(traefik_service_requests_total{code=~"5.*",instance="xxx.xxx.xxx.xxx:8080"}[5m])) by (service) > 20
        for: 50s
        labels:
          severity: warning
          instance: xxx.xxx.xxx.xxx
        annotations:
          summary: "{{ $labels.service }} 5xx状态码响应多"
          description: "后端响应状态码5xx 5分钟内统计量大于20 (当前值为 {{ $value }})"
          link: https://grafana.xxxx.com/d/QURlqlNnz/traefikjian-kong?orgId=1
指标及其相关含义
指标项 | 含义 |
---|---|
process_max_fds | traefik进程最大的fd |
process_open_fds | 进程打开的fd |
process_resident_memory_bytes | 进程占用内存 |
process_start_time_seconds | 进程启动时间 |
process_virtual_memory_bytes | 进程占用虚拟内存 |
traefik_entrypoint_open_connections | 入口点当前打开的连接数(按method和protocol划分) |
traefik_entrypoint_request_duration_seconds_bucket | 在入口点处理请求花费的时间(按status code、protocol和method划分) |
traefik_entrypoint_requests_total | 一个入口点处理的总请求数(状态码分布) |
网友评论