Configuration file
[server]
hostname=localhost
url_port=8440
secured_url_port=8441
connect_retry_delay=10
max_reconnect_retry_delay=30
[agent]
logdir=/var/log/ambari-agent
piddir=/var/run/ambari-agent
prefix=/var/lib/ambari-agent/data
;loglevel=(DEBUG/INFO)
loglevel=INFO
data_cleanup_interval=86400
data_cleanup_max_age=2592000
data_cleanup_max_size_MB = 100
ping_port=8670
cache_dir=/var/lib/ambari-agent/cache
tolerate_download_failures=true
run_as_user=root
parallel_execution=0
alert_grace_period=5
status_command_timeout=5
alert_kinit_timeout=14400000
system_resource_overrides=/etc/resource_overrides
; memory_threshold_soft_mb=400
; memory_threshold_hard_mb=1000
; ignore_mount_points=/mnt/custom1,/mnt/custom2
[security]
keysdir=/var/lib/ambari-agent/keys
server_crt=ca.crt
passphrase_env_var_name=AMBARI_PASSPHRASE
ssl_verify_cert=0
credential_lib_dir=/var/lib/ambari-agent/cred/lib
credential_conf_dir=/var/lib/ambari-agent/cred/conf
credential_shell_cmd=org.apache.hadoop.security.alias.CredentialShell
force_https_protocol=PROTOCOL_TLSv1_2
[network]
; this option apply only for Agent communication
use_system_proxy_settings=true
[services]
pidLookupPath=/var/run/
[heartbeat]
state_interval_seconds=60
dirs=/etc/hadoop,/etc/hadoop/conf,/etc/hbase,/etc/hcatalog,/etc/hive,/etc/oozie,
  /etc/sqoop,
  /var/run/hadoop,/var/run/zookeeper,/var/run/hbase,/var/run/templeton,/var/run/oozie,
  /var/log/hadoop,/var/log/zookeeper,/var/log/hbase,/var/run/templeton,/var/log/hive
; 0 - unlimited
log_lines_count=300
idle_interval_min=1
idle_interval_max=10
[logging]
syslog_enabled=0
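A minimal sketch of reading a few of the keys above with Python's standard configparser (an illustration only, not the agent's actual loader; the /etc/ambari-agent/conf path is an assumed default location):

# Read a few of the [server] and [agent] keys shown above.
import configparser

CONFIG_PATH = "/etc/ambari-agent/conf/ambari-agent.ini"  # assumed default location

config = configparser.ConfigParser()
config.read(CONFIG_PATH)

server_host = config.get("server", "hostname", fallback="localhost")
secured_port = config.getint("server", "secured_url_port", fallback=8441)
log_level = config.get("agent", "loglevel", fallback="INFO")
ping_port = config.getint("agent", "ping_port", fallback=8670)

print("will heartbeat to https://%s:%d (loglevel=%s, ping_port=%d)"
      % (server_host, secured_port, log_level, ping_port))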
Heartbeat communication
The data sent to ambari-server:
{
    "alerts": [],
    "nodeStatus": {
        "status": "HEALTHY",
        "cause": "NONE"
    },
    "timestamp": 1536306543920,
    "hostname": "xxxxx.com",
    "responseId": 1139,
    "reports": [],
    "mounts": [{
        "available": "97151016",
        "used": "112461784",
        "percent": "54%",
        "device": "/dev/vdb",
        "mountpoint": "/data01",
        "type": "xfs",
        "size": "209612800"
    }],
    "recoveryTimestamp": 1536305501690,
    "agentEnv": {
        "transparentHugePage": "",
        "hostHealth": {
            "agentTimeStampAtReporting": 1536306543990,
            "activeJavaProcs": [{
                "command": "../java/jre/bin/java -Xmx256m -Dflume.monitoring.monitorInterval=120 -XX:+HeapDumpOnOutOfMemoryError -Dflume.monitoring.type=com.suning.flume.monitor.AgentMonitorService -cp /opt/sunflower/conf:/opt/sunflower/lib/*:/opt/sunflower/plugins.d/flume-agent/lib/* -Djava.library.path= org.apache.flume.node.Application --conf-file ../conf/custem.conf --name agent",
                "pid": 12252,
                "hadoop": false,
                "user": "root"
            }],
            "liveServices": [{
                "status": "Healthy",
                "name": "ntpd or chronyd",
                "desc": ""
            }]
        },
        "reverseLookup": true,
        "alternatives": [],
        "hasUnlimitedJcePolicy": false,
        "umask": "18",
        "firewallName": "iptables",
        "stackFoldersAndFiles": [],
        "existingUsers": [],
        "firewallRunning": false
    },
    "recoveryReport": {
        "componentReports": [],
        "summary": "RECOVERABLE"
    },
    "componentStatus": []
}
Field breakdown
nodeStatus: the state of the agent node; once the agent has entered the heartbeat phase the status is always HEALTHY
timestamp: the time at which this heartbeat was generated
hostname: the hostname of the node the agent runs on; it is resolved via the script configured at config.get('agent', 'hostname_script'), and falls back to socket.getfqdn() when that yields nothing (a short sketch follows this list)
responseId: the unique ID of this heartbeat round; -1 for the very first heartbeat
recoveryTimestamp: recovery is not covered here for now
recoveryReport: recovery is not covered here for now
reports: the results of INSTALL/START/STOP and similar operations executed against service components
componentStatus: the liveness check results of service components
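A minimal sketch of the hostname lookup described above, assuming hostname_script points to an executable that prints the hostname and that the config object exposes the same get('agent', 'hostname_script') interface; the resolve_hostname helper is just for illustration:

import socket
import subprocess

def resolve_hostname(config):
    # Try the user-supplied hostname script first; any failure (key missing,
    # script missing, non-zero exit, empty output) falls back to socket.getfqdn().
    try:
        script = config.get("agent", "hostname_script")
        output = subprocess.check_output([script])
        hostname = output.decode().strip()
        if hostname:
            return hostname
    except Exception:
        pass
    return socket.getfqdn()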
reports & componentStatus JSON
{
    "componentStatus": [{
        "status": "HEALTHY",
        "componentName": "DATANODE"
    }],
    "reports": [{
        "status": "FAILED",
        "taskId": 3
    },
    {
        "status": "COMPLETE",
        "taskId": 4
    },
    {
        "status": "IN_PROGRESS",
        "stderr": "...",
        "stdout": "...",
        "clusterName": "cc",
        "structuredOut": "{}",
        "roleCommand": "INSTALL",
        "serviceName": "HDFS",
        "role": "DATANODE",
        "actionId": "1-1",
        "taskId": 5,
        "exitCode": 777
    },
    {
        "status": "IN_PROGRESS",
        "stderr": "...",
        "stdout": "...",
        "structuredOut": {
            "var1": "test1",
            "var2": "test2"
        },
        "clusterName": "cc",
        "roleCommand": "INSTALL",
        "serviceName": "HDFS",
        "role": "DATANODE",
        "actionId": "1-1",
        "taskId": 6,
        "exitCode": 777
    }]
}
reports & componentStatus code snippet
resultReports = []
resultComponentStatus = []
for key, item in self.current_state.items():
    command = item[0]
    report = item[1]
    if command['commandType'] in [ActionQueue.EXECUTION_COMMAND, ActionQueue.BACKGROUND_EXECUTION_COMMAND]:
        if report['status'] != ActionQueue.IN_PROGRESS_STATUS:
            resultReports.append(report)
            # Removing complete/failed command status from dict
            del self.current_state[key]
        else:
            in_progress_report = self.generate_in_progress_report(command, report)
            resultReports.append(in_progress_report)
    elif command['commandType'] == ActionQueue.STATUS_COMMAND:
        resultComponentStatus.append(report)
        # Component status is useful once, removing it
        del self.current_state[key]
    elif command['commandType'] in [ActionQueue.AUTO_EXECUTION_COMMAND]:
        logger.debug("AUTO_EXECUTION_COMMAND task deleted " + str(command['commandId']))
        del self.current_state[key]
result = {
    'reports': resultReports,
    'componentStatus': resultComponentStatus
}
return result
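Note the one-shot design here: COMPLETE/FAILED execution reports and component statuses are deleted from current_state as soon as they are added to the outgoing report, so each result reaches the server exactly once, while IN_PROGRESS commands get a fresh report built by generate_in_progress_report on every heartbeat until they finish.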
mounts: the agent runs df to discover the disks on the host (Linux only). Note that this parser ignores any filesystem and any mount point whose name contains spaces.
Mounts are then filtered on several conditions (a sketch follows the field list below):
- the mount device is not in the ignored-device list
- the mount is accessible to the user the current process runs as
- it is not a file mount (docker environment)
- neither the mount path nor any part of it is on the blacklist (ignore_mount_points)
available: free space
used: space already in use
percent: percentage of space in use
device: the partition backing this filesystem, i.e. the device name
mountpoint: the mount point
type: the filesystem type
size: the size of the partition
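A minimal sketch of collecting and filtering these fields by shelling out to df (an illustration only, not the agent's actual parser; the ignore lists below are stand-ins for the ignored-device rule and for the ignore_mount_points setting shown in the config above):

import subprocess

IGNORED_DEVICES = ("tmpfs", "devtmpfs")                       # illustrative ignored-device list
IGNORED_MOUNT_POINTS = ("/mnt/custom1", "/mnt/custom2")       # stand-in for ignore_mount_points

def list_mounts():
    # `df -PkT` prints one POSIX-format line per filesystem:
    # Filesystem Type 1024-blocks Used Available Capacity Mounted-on
    out = subprocess.check_output(["df", "-PkT"]).decode()
    mounts = []
    for line in out.splitlines()[1:]:
        parts = line.split()
        if len(parts) != 7:
            continue  # skip filesystems/mount points containing spaces, as noted above
        device, fstype, size, used, available, percent, mountpoint = parts
        if device in IGNORED_DEVICES or mountpoint in IGNORED_MOUNT_POINTS:
            continue
        mounts.append({
            "device": device,
            "type": fstype,
            "size": size,
            "used": used,
            "available": available,
            "percent": percent,
            "mountpoint": mountpoint,
        })
    return mounts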
agentEnv: various details about the host
"agentEnv": {
    "transparentHugePage": "",  // transparent huge pages setting
    "hostHealth": {
        "agentTimeStampAtReporting": 1536306543990,  // timestamp at which this information was collected
        "activeJavaProcs": [{  // java processes found by scanning /proc
            "command": "",  // the process command line, read from /proc/<pid>/cmdline
            "pid": 12252,  // process ID
            "hadoop": false,  // true if the process belongs to hadoop, zookeeper and the like, otherwise false
            "user": "root"  // the user the process runs as
        }],
        "liveServices": [{  // status of the ntpd/chronyd service
            "status": "Healthy",
            "name": "ntpd or chronyd",
            "desc": ""
        }]
    },
    "reverseLookup": true,  // whether the host's socket.getfqdn() resolves back to the current host's IP
    "alternatives": [],  // alternative folder paths found for the stack projects
    "hasUnlimitedJcePolicy": false,  // probes the JVM's JCE policy to see whether unlimited key lengths are supported
    "umask": "18",  // umask reported as a decimal value (18 decimal = 022 octal)
    "firewallName": "iptables",  // name of the firewall service (iptables)
    "stackFoldersAndFiles": [],  // stack-related folders and files already present on the host
    "existingUsers": [],  // Linux users from the DEFAULT_USERS list below that already exist on the host (sketch after the list)
    "firewallRunning": false  // whether the firewall is running
}
DEFAULT_USERS = [
"hive", "ambari-qa", "oozie", "hbase", "hcat", "mapred",
"hdfs", "zookeeper", "flume", "sqoop", "sqoop2",
"hue", "yarn", "tez", "storm", "falcon", "kafka", "knox", "ams",
"hadoop", "spark", "accumulo", "atlas", "mahout", "ranger", "kms", "zeppelin"
]
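A minimal sketch of the existingUsers check against this list, using the standard pwd module (an illustration, not the agent's exact code; the homeDir field is included only to show the kind of detail that can be reported):

import pwd

def find_existing_users(candidate_users):
    # Keep only the users from the candidate list that actually exist on this host.
    existing = []
    for name in candidate_users:
        try:
            entry = pwd.getpwnam(name)
        except KeyError:
            continue  # user not present on this host
        existing.append({"name": name, "homeDir": entry.pw_dir})
    return existing

print(find_existing_users(DEFAULT_USERS))  # DEFAULT_USERS is the list above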
To be continued...