在 Spark on YARN 上运行流计算时,如果流任务挂掉却没有任何提醒,实时指标的计算就会停滞。为了保证流任务 7×24 小时运行,需要一套能够监控 Spark on YARN 应用的方案:实现失败重启、失败告警,同时展示 Spark 应用的相关指标,从而实现对数据质量的监控和管理。
一、指标监控模块
使用Prometheus、graphite_exporter、Grafana实现Spark应用指标监控。
1.Spark配置Graphite metrics
spark 是自带 Graphite Sink 的,只需要配置一下metrics.properties:
*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink
*.sink.graphite.protocol=tcp
*.sink.graphite.host=127.0.0.1
*.sink.graphite.port=9109
*.sink.graphite.period=5
*.sink.graphite.unit=seconds
driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource
executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource
提交时记得使用 --files /path/to/spark/conf/metrics.properties 参数将配置文件分发到所有的 Executor,并通过 --conf spark.metrics.conf=metrics.properties 指定使用该配置文件。
2.安装启动graphite_exporter
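Prometheus 提供了一个插件(graphite_exporter),可以将 Graphite metrics 进行转化并写入 Prometheus(本文采用的方式)。graphite_exporter 默认在 9109 端口接收 Graphite 数据(即上面 metrics.properties 中 sink 配置的端口),并在 9108 端口通过 /metrics 暴露转换后的指标。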
- 先去https://prometheus.io/download/下载graphite_exporter
- 启动 graphite_exporter 时加载配置文件
./graphite_exporter --graphite.mapping-config=graphite_exporter_mapping
graphite_exporter_mapping:
# graphite_exporter mapping rules: each rule turns a dotted Graphite metric
# name into a Prometheus metric with labels. Every `*` in `match` captures one
# dot-separated component, referenced below as $1, $2, ... in order.
mappings:
# <application>.<executor>.executor.filesystem.<fs_type>.<quantity>
- match: '*.*.executor.filesystem.*.*'
  name: filesystem_usage
  labels:
    application: $1
    executor_id: $2
    fs_type: $3
    qty: $4
# <application>.<executor>.jvm.<mem_type>.<quantity>
- match: '*.*.jvm.*.*'
  name: jvm_memory_usage
  labels:
    application: $1
    executor_id: $2
    mem_type: $3
    qty: $4
# <application>.<executor>.executor.jvmGCTime.count
- match: '*.*.executor.jvmGCTime.count'
  name: jvm_gcTime_count
  labels:
    application: $1
    executor_id: $2
# <application>.<executor>.jvm.pools.<pool>.<quantity>
- match: '*.*.jvm.pools.*.*'
  name: jvm_memory_pools
  labels:
    application: $1
    executor_id: $2
    mem_type: $3
    qty: $4
# <application>.<executor>.executor.threadpool.<quantity>
- match: '*.*.executor.threadpool.*'
  name: executor_tasks
  labels:
    application: $1
    executor_id: $2
    qty: $3
# <application>.<executor>.BlockManager.<type>.<quantity>
- match: '*.*.BlockManager.*.*'
  name: block_manager
  labels:
    application: $1
    executor_id: $2
    type: $3
    qty: $4
# DAGScheduler.<type>.<quantity>
- match: 'DAGScheduler.*.*'
  name: DAG_scheduler
  labels:
    type: $1
    qty: $2
- 安装启动应用后,如果采集成功,将在 http://127.0.0.1:9108/metrics 页面中看到相应的信息。
3.配置 Prometheus和Grafana
修改/path/to/prometheus/prometheus.yml,增加
# Scrape the metrics exposed by graphite_exporter on port 9108.
scrape_configs:
  - job_name: 'spark'
    static_configs:
      - targets: ['localhost:9108']
配置Grafana并加载Json配置:
spark_prometheus.json
{ "id": 1, "title": "Spark Prometheus", "originalTitle": "Spark Prometheus", "tags": [], "style": "dark", "timezone": "browser", "editable": true, "hideControls": false, "sharedCrosshair": false, "rows": [ { "collapse": false, "editable": true, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "datasource": "Prometheus", "editable": true, "error": false, "fill": 0, "grid": { "leftLogBase": 1, "leftMax": null, "leftMin": 0, "rightLogBase": 1, "rightMax": null, "rightMin": null, "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 1, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "color": "#F2C96D", "linewidth": 7, "yaxis": 2, "zindex": 3 } ], "span": 4, "stack": false, "steppedLine": false, "targets": [ { "expr": "filesystem_usage{exported_job=\"read_bytes\", fs_type=\"hdfs\", application=\"$application_ID\"}", "intervalFactor": 2, "legendFormat": "{{executor_id}}", "metric": "filesystem_usage", "refId": "A", "step": 2, "target": "" }, { "expr": "sum(filesystem_usage{exported_job=\"read_bytes\", fs_type=\"hdfs\", application=\"$application_ID\"})", "intervalFactor": 2, "legendFormat": "total", "metric": "filesystem_usage", "refId": "B", "step": 2, "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Executor HDFS reads", "tooltip": { "shared": true, "value_type": "cumulative" }, "type": "graph", "x-axis": true, "y-axis": true, "y_formats": [ "short", "short" ] }, { "aliasColors": {}, "bars": false, "datasource": "Prometheus", "editable": true, "error": false, "fill": 1, "grid": { "leftLogBase": 1, "leftMax": null, "leftMin": null, "rightLogBase": 1, "rightMax": null, "rightMin": null, 
"threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 2, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 4, "stack": true, "steppedLine": false, "targets": [ { "expr": "filesystem_usage{exported_job=\"write_bytes\", fs_type=\"hdfs\", application=\"$application_ID\"}", "intervalFactor": 2, "legendFormat": "{{executor_id}}", "metric": "filesystem_usage", "refId": "A", "step": 2, "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Executor HDFS writes", "tooltip": { "shared": true, "value_type": "cumulative" }, "type": "graph", "x-axis": true, "y-axis": true, "y_formats": [ "short", "short" ] }, { "aliasColors": {}, "bars": false, "datasource": "Prometheus", "editable": true, "error": false, "fill": 1, "grid": { "leftLogBase": 1, "leftMax": null, "leftMin": null, "rightLogBase": 1, "rightMax": null, "rightMin": null, "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 3, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "yaxis": 2 } ], "span": 4, "stack": true, "steppedLine": false, "targets": [ { "expr": "avg(rate(filesystem_usage{exported_job=\"read_bytes\", fs_type=\"hdfs\", application=\"$application_ID\"}[1m]))", "intervalFactor": 1, "legendFormat": "average", "metric": "filesystem_usage", "refId": "A", "step": 1, "target": "" }, { "expr": 
"sum(rate(filesystem_usage{exported_job=\"read_bytes\", fs_type=\"hdfs\", application=\"$application_ID\"}[1m]))", "intervalFactor": 1, "legendFormat": "total", "metric": "filesystem_usage", "refId": "B", "step": 1, "target": "" } ], "timeFrom": null, "timeShift": null, "title": "HDFS Read Rate / s", "tooltip": { "shared": true, "value_type": "cumulative" }, "type": "graph", "x-axis": true, "y-axis": true, "y_formats": [ "short", "short" ] } ], "title": "HDFS stats" }, { "collapse": false, "editable": true, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "datasource": "Prometheus", "editable": true, "error": false, "fill": 0, "grid": { "leftLogBase": 1, "leftMax": null, "leftMin": 0, "rightLogBase": 1, "rightMax": null, "rightMin": null, "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 4, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "jvm_memory_usage{mem_type=\"heap\", qty=\"usage\", application=\"$application_ID\", executor_id=\"driver\"}", "intervalFactor": 2, "legendFormat": "{{executor_id}}", "metric": "jvm_memory_usage", "refId": "A", "step": 2, "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Driver Heap Usage", "tooltip": { "shared": true, "value_type": "cumulative" }, "type": "graph", "x-axis": true, "y-axis": true, "y_formats": [ "short", "short" ] }, { "aliasColors": {}, "bars": false, "datasource": "Prometheus", "editable": true, "error": false, "fill": 1, "grid": { "leftLogBase": 1, "leftMax": null, "leftMin": null, "rightLogBase": 1, "rightMax": null, "rightMin": null, "threshold1": null, "threshold1Color": 
"rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 5, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "jvm_memory_pools{executor_id=\"driver\", application=\"$application_ID\",qty=\"used\"}", "intervalFactor": 2, "legendFormat": "{{mem_type}}", "metric": "jvm_memory_pools", "refId": "A", "step": 2, "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Driver JVM Memory Pools", "tooltip": { "shared": true, "value_type": "cumulative" }, "type": "graph", "x-axis": true, "y-axis": true, "y_formats": [ "short", "short" ] }, { "aliasColors": {}, "bars": false, "datasource": "Prometheus", "editable": true, "error": false, "fill": 0, "grid": { "leftLogBase": 1, "leftMax": null, "leftMin": null, "rightLogBase": 1, "rightMax": null, "rightMin": null, "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 6, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 4, "stack": false, "steppedLine": false, "targets": [ { "expr": "jvm_memory_usage{mem_type=\"heap\", qty=\"usage\", application=\"$application_ID\", executor_id!~\"driver\"} ", "intervalFactor": 2, "legendFormat": "{{executor_id}}", "metric": "jvm_memory_usage", "refId": "A", "step": 2, "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Executor Heap Usage", "tooltip": { "shared": true, "value_type": 
"cumulative" }, "type": "graph", "x-axis": true, "y-axis": true, "y_formats": [ "short", "short" ] }, { "aliasColors": {}, "bars": false, "datasource": "Prometheus", "editable": true, "error": false, "fill": 0, "grid": { "leftLogBase": 1, "leftMax": null, "leftMin": null, "rightLogBase": 1, "rightMax": null, "rightMin": null, "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 7, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 4, "stack": false, "steppedLine": false, "targets": [ { "expr": "jvm_memory_pools{executor_id!~\"driver\", application=\"$application_ID\",qty=\"usage\", mem_type=\"PS-Eden-Space\"}", "intervalFactor": 2, "legendFormat": "{{executor_id}}", "metric": "jvm_memory_pools", "refId": "A", "step": 2, "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Executor Eden-Space", "tooltip": { "shared": true, "value_type": "cumulative" }, "type": "graph", "x-axis": true, "y-axis": true, "y_formats": [ "short", "short" ] }, { "aliasColors": {}, "bars": false, "datasource": "Prometheus", "editable": true, "error": false, "fill": 0, "grid": { "leftLogBase": 1, "leftMax": null, "leftMin": null, "rightLogBase": 1, "rightMax": null, "rightMin": null, "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 9, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 4, "stack": false, 
"steppedLine": false, "targets": [ { "expr": "jvm_memory_pools{executor_id!~\"driver\", application=\"$application_ID\",qty=\"usage\", mem_type=\"PS-Old-Gen\"}", "intervalFactor": 2, "legendFormat": "{{executor_id}}", "metric": "jvm_memory_pools", "refId": "A", "step": 2, "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Executor Old-Gen", "tooltip": { "shared": true, "value_type": "cumulative" }, "type": "graph", "x-axis": true, "y-axis": true, "y_formats": [ "short", "short" ] }, { "aliasColors": {}, "bars": false, "datasource": "Prometheus", "editable": true, "error": false, "fill": 1, "grid": { "leftLogBase": 1, "leftMax": null, "leftMin": null, "rightLogBase": 1, "rightMax": null, "rightMin": null, "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 10, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "completeTasks", "yaxis": 2 } ], "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(executor_tasks{application=\"$application_ID\", qty!~\"maxPool_size\"}) by (qty)", "intervalFactor": 2, "legendFormat": "{{qty}}", "refId": "B", "step": 2, "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Tasks", "tooltip": { "shared": true, "value_type": "cumulative" }, "type": "graph", "x-axis": true, "y-axis": true, "y_formats": [ "short", "short" ] } ], "title": "New row" } ], "time": { "from": "now-5m", "to": "now" }, "timepicker": { "now": true, "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "templating": { "list": [ { "allFormat": "glob", "current": { "text": 
"application_1564363240533_7587", "value": "application_1564363240533_7587" }, "datasource": null, "includeAll": false, "multi": false, "multiFormat": "glob", "name": "application_ID", "options": [ { "text": "application_1564363240533_7587", "value": "application_1564363240533_7587", "selected": false }, { "text": "application_1564363240533_7592", "value": "application_1564363240533_7592", "selected": false }, { "text": "application_1564363240533_7615", "value": "application_1564363240533_7615", "selected": false } ], "query": "label_values(application)", "refresh": true, "refresh_on_load": false, "type": "query" } ] }, "annotations": { "list": [] }, "refresh": "5s", "schemaVersion": 8, "version": 0, "links": [] }
二、进程监控失败重启和告警模块
监控yarn上指定的Spark应用是否存在,不存在则发出告警。
使用Python脚本查看yarn状态,指定监控应用,应用中断则通过webhook发送报警信息到钉钉群,并且自动重启。脚本每次运行只检查一次,可配合 crontab 等定时任务周期性执行。
#!/usr/bin/python3.5
# -*- coding: utf-8 -*-
import os
import json
import requests
'''
Yarn应用监控:当配置的应用名不在yarn applicaition -list时,钉钉告警
'''
def yarn_list(applicatin_list, restart_commands=None):
    """Report which monitored applications are missing from YARN.

    Runs ``yarn application -list`` once and checks, by plain substring
    match, whether each given application name occurs in its output.
    Applications that are found are reported on stdout; missing ones are
    collected into the returned alert text and, when a restart command is
    configured for them, restarted through ``os.system``.

    Args:
        applicatin_list: Iterable of application names to look for.
        restart_commands: Optional mapping of application name to the shell
            command that restarts it. When omitted, the original hard-coded
            placeholder entry is used.

    Returns:
        The alert text, one line per missing application, or an empty
        string when every application was found.
    """
    if restart_commands is None:
        restart_commands = {"应用名1": "重启命令"}

    # The context manager closes the pipe (and reaps the child process)
    # instead of leaking it until garbage collection.
    # NOTE(review): a failing `yarn` command yields empty output, so every
    # application is then treated as missing — confirm this is acceptable.
    with os.popen('yarn application -list') as pipe:
        yarn_application_list = pipe.read()

    alerts = []
    for app_name in applicatin_list:
        if app_name in yarn_application_list:
            print("应用:%s 正常!" % app_name)
            continue
        alerts.append("告警--应用:%s 中断!" % app_name)
        restart_command = restart_commands.get(app_name)
        if restart_command:
            os.system(restart_command)

    # One alert per line keeps a multi-application message readable.
    return "\n".join(alerts)
def dingding_robot(data, timeout=10):
    """Post a message to a DingTalk group robot through its webhook.

    Args:
        data: JSON-serializable message payload; it is encoded with
            ``json.dumps`` and sent as the request body.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``, so
            an unresponsive endpoint cannot block the monitor indefinitely.

    Returns:
        The response body as text, decoded as UTF-8.
    """
    # How to obtain the robot's webhook address:
    # https://open-doc.dingtalk.com/microapp/serverapi2/qf2nxq
    # The token placeholder carries no trailing space, so pasting a real
    # token does not leave whitespace inside the URL.
    webhook = "https://oapi.dingtalk.com/robot/send?access_token" \
              "=你的token"
    headers = {'content-type': 'application/json'}  # request headers
    r = requests.post(webhook, headers=headers, data=json.dumps(data),
                      timeout=timeout)
    r.encoding = 'utf-8'
    return r.text
if __name__ == '__main__':
    # Names of the YARN applications to watch.
    applicatin_list = ["应用名1", "应用名2", "应用名3"]
    output = yarn_list(applicatin_list)
    print(output)
    if not output:
        # No alert text means nothing needs to be sent.
        print("一切正常!")
    else:
        # Request parameters; these could be moved into a config file.
        mention = {"atMobiles": ["xxxxxxx"], "isAtAll": False}
        data = {"msgtype": "text", "text": {"content": output}, "at": mention}
        res = dingding_robot(data)
        print(res)  # print the webhook response
三、指标监控告警
TBC:使用Prometheus的AlertManager模块实现指标的监控。
网友评论