ambari-agent Heartbeat field analysis


By Nazzd | Published 2018-09-11 09:56

    Configuration file

    [server]
    hostname=localhost
    url_port=8440
    secured_url_port=8441
    connect_retry_delay=10
    max_reconnect_retry_delay=30
    
    
    [agent]
    logdir=/var/log/ambari-agent
    piddir=/var/run/ambari-agent
    prefix=/var/lib/ambari-agent/data
    ;loglevel=(DEBUG/INFO)
    loglevel=INFO
    data_cleanup_interval=86400
    data_cleanup_max_age=2592000
    data_cleanup_max_size_MB = 100
    ping_port=8670
    cache_dir=/var/lib/ambari-agent/cache
    tolerate_download_failures=true
    run_as_user=root
    parallel_execution=0
    alert_grace_period=5
    status_command_timeout=5
    alert_kinit_timeout=14400000
    system_resource_overrides=/etc/resource_overrides
    ; memory_threshold_soft_mb=400
    ; memory_threshold_hard_mb=1000
    ; ignore_mount_points=/mnt/custom1,/mnt/custom2
    
    [security]
    keysdir=/var/lib/ambari-agent/keys
    server_crt=ca.crt
    passphrase_env_var_name=AMBARI_PASSPHRASE
    ssl_verify_cert=0
    credential_lib_dir=/var/lib/ambari-agent/cred/lib
    credential_conf_dir=/var/lib/ambari-agent/cred/conf
    credential_shell_cmd=org.apache.hadoop.security.alias.CredentialShell
    force_https_protocol=PROTOCOL_TLSv1_2
    
    [network]
    ; this option apply only for Agent communication
    use_system_proxy_settings=true
    
    [services]
    pidLookupPath=/var/run/
    
    [heartbeat]
    state_interval_seconds=60
    dirs=/etc/hadoop,/etc/hadoop/conf,/etc/hbase,/etc/hcatalog,/etc/hive,/etc/oozie,
      /etc/sqoop,
      /var/run/hadoop,/var/run/zookeeper,/var/run/hbase,/var/run/templeton,/var/run/oozie,
      /var/log/hadoop,/var/log/zookeeper,/var/log/hbase,/var/run/templeton,/var/log/hive
    ; 0 - unlimited
    log_lines_count=300
    idle_interval_min=1
    idle_interval_max=10
    
    
    [logging]
    syslog_enabled=0
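
    For reference, the heartbeat-related settings above can be read with the standard-library configparser. This is a minimal sketch, not the agent's own code; the ini path is the usual default and is assumed here:

    # Minimal sketch: read the [heartbeat] options from ambari-agent.ini
    import configparser

    config = configparser.RawConfigParser()
    config.read("/etc/ambari-agent/conf/ambari-agent.ini")  # default path, assumed

    state_interval = config.getint("heartbeat", "state_interval_seconds")  # 60
    log_lines = config.getint("heartbeat", "log_lines_count")              # 300
    idle_min = config.getint("heartbeat", "idle_interval_min")             # 1
    idle_max = config.getint("heartbeat", "idle_interval_max")             # 10

    print(state_interval, log_lines, idle_min, idle_max)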
    

    Heartbeat communication

    Data sent to the ambari-server

    {
        "alerts": [],
        "nodeStatus": {
            "status": "HEALTHY",
            "cause": "NONE"
        },
        "timestamp": 1536306543920,
        "hostname": "xxxxx.com",
        "responseId": 1139,
        "reports": [],
        "mounts": [{
            "available": "97151016",
            "used": "112461784",
            "percent": "54%",
            "device": "/dev/vdb",
            "mountpoint": "/data01",
            "type": "xfs",
            "size": "209612800"
        }],
        "recoveryTimestamp": 1536305501690,
        "agentEnv": {
            "transparentHugePage": "",
            "hostHealth": {
                "agentTimeStampAtReporting": 1536306543990,
                "activeJavaProcs": [ {
                    "command": "../java/jre/bin/java -Xmx256m -Dflume.monitoring.monitorInterval=120 -XX:+HeapDumpOnOutOfMemoryError -Dflume.monitoring.type=com.suning.flume.monitor.AgentMonitorService -cp /opt/sunflower/conf:/opt/sunflower/lib/*:/opt/sunflower/plugins.d/flume-agent/lib/* -Djava.library.path= org.apache.flume.node.Application --conf-file ../conf/custem.conf --name agent",
                    "pid": 12252,
                    "hadoop": false,
                    "user": "root"
                }],
                "liveServices": [{
                    "status": "Healthy",
                    "name": "ntpd or chronyd",
                    "desc": ""
                }]
            },
            "reverseLookup": true,
            "alternatives": [],
            "hasUnlimitedJcePolicy": false,
            "umask": "18",
            "firewallName": "iptables",
            "stackFoldersAndFiles": [],
            "existingUsers": [],
            "firewallRunning": false
        },
        "recoveryReport": {
            "componentReports": [],
            "summary": "RECOVERABLE"
        },
        "componentStatus": []
    }
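
    For illustration, this is roughly how such a body could be POSTed to the server over the secured port from the [server] section above. The '/agent/v1/heartbeat/<hostname>' path and the unverified TLS context (mirroring ssl_verify_cert=0) are assumptions of this sketch, not a copy of the agent's code:

    import json
    import ssl
    import urllib.request

    def send_heartbeat(hostname, body, server="localhost", port=8441):
        # Build the heartbeat URL; the path is an assumption for this sketch.
        url = "https://%s:%s/agent/v1/heartbeat/%s" % (server, port, hostname)
        data = json.dumps(body).encode("utf-8")
        req = urllib.request.Request(url, data=data,
                                     headers={"Content-Type": "application/json"})
        ctx = ssl.create_default_context()
        ctx.check_hostname = False           # mirrors ssl_verify_cert=0 above
        ctx.verify_mode = ssl.CERT_NONE
        with urllib.request.urlopen(req, context=ctx) as resp:
            # The server answers with a JSON heartbeat response body.
            return json.loads(resp.read().decode("utf-8"))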
    

    Field analysis

    nodeStatus: status of the agent node; once the agent is in the heartbeat phase the status is always HEALTHY
    timestamp: the current time when this heartbeat is built
    hostname: hostname of the node the agent runs on; if config.get('agent', 'hostname_script') yields nothing, socket.getfqdn() is used instead (see the sketch after this list)
    responseId: unique ID of this heartbeat round; -1 if this is the first heartbeat
    recoveryTimestamp: recovery is not covered here for now
    recoveryReport: recovery is not covered here for now
    reports: results of INSTALL/START/STOP and similar operations executed against service components
    componentStatus: liveness check results for service components
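
    A rough sketch of that hostname fallback (hostname_script first, then socket.getfqdn()); the helper name is just for illustration:

    import socket
    import subprocess

    def get_agent_hostname(config):
        # Try the configured hostname_script; fall back to socket.getfqdn().
        try:
            script = config.get('agent', 'hostname_script')
            out = subprocess.check_output(script, shell=True).strip()
            if out:
                return out.decode("utf-8")
        except Exception:
            pass
        return socket.getfqdn()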

    reports&componentStatus JSON

    {
        "componentStatus": [{
            "status": "HEALTHY",
            "componentName": "DATANODE"
        }],
        "reports": [{
                "status": "FAILED",
                "taskId": 3
            },
            {
                "status": "COMPLETE",
                "taskId": 4
            },
            {
                "status": "IN_PROGRESS",
                "stderr": "...",
                "stdout": "...",
                "clusterName": "cc",
                "structuredOut": "{}",
                "roleCommand": "INSTALL",
                "serviceName": "HDFS",
                "role": "DATANODE",
                "actionId": "1-1",
                "taskId": 5,
                "exitCode": 777
            },
            {
                "status": "IN_PROGRESS",
                "stderr": "...",
                "stdout": "...",
                "structuredOut": {
                    "var1": "test1",
                    "var2": "test2"
                },
                "clusterName": "cc",
                "roleCommand": "INSTALL",
                "serviceName": "HDFS",
                "role": "DATANODE",
                "actionId": "1-1",
                "taskId": 6,
                "exitCode": 777
            }
        ]
    }
    

    reports&componentStatus code snippet

          resultReports = []
          resultComponentStatus = []
          for key, item in self.current_state.items():
            command = item[0]
            report = item[1]
            if command['commandType'] in [ActionQueue.EXECUTION_COMMAND, ActionQueue.BACKGROUND_EXECUTION_COMMAND]:
              if report['status'] != ActionQueue.IN_PROGRESS_STATUS:
                resultReports.append(report)
                # Removing complete/failed command status from dict
                del self.current_state[key]
              else:
                in_progress_report = self.generate_in_progress_report(command, report)
                resultReports.append(in_progress_report)
            elif command['commandType'] == ActionQueue.STATUS_COMMAND:
              resultComponentStatus.append(report)
              # Component status is useful once, removing it
              del self.current_state[key]
            elif command['commandType'] in [ActionQueue.AUTO_EXECUTION_COMMAND]:
              logger.debug("AUTO_EXECUTION_COMMAND task deleted " + str(command['commandId']))
              del self.current_state[key]
          result = {
            'reports': resultReports,
            'componentStatus': resultComponentStatus
          }
          return result
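
    In other words, reports for COMPLETE/FAILED commands and component status reports are sent once and then removed from current_state, while IN_PROGRESS commands stay in the dict and produce a fresh in-progress report (via generate_in_progress_report) on every heartbeat.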
    

    mounts: runs df to discover the disks on the host. Linux only. Note that this parser skips any filesystem and any mount whose name contains spaces.

    Mounts are filtered by several criteria (see the sketch after this list):

    • the mounted device is not on the ignore list
    • it is accessible to the user the current process runs as
    • it is not a file mount (docker environments)
    • neither the mount path nor any part of it is on the blacklist
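
    A minimal sketch of that df-based discovery and filtering; the ignore list here is hypothetical, and this is not the agent's actual parser:

    import subprocess

    IGNORED_MOUNT_PREFIXES = ("/proc", "/sys", "/dev")  # hypothetical ignore list

    def list_mounts():
        # Run `df -kPT` and keep one dict per usable mount, mirroring the
        # fields shown in the heartbeat: device, type, size, used, available,
        # percent, mountpoint.
        out = subprocess.check_output(["df", "-kPT"]).decode("utf-8")
        mounts = []
        for line in out.splitlines()[1:]:        # skip the header line
            parts = line.split()
            if len(parts) != 7:                  # spaces in device/mount point: skip
                continue
            device, fstype, size, used, available, percent, mountpoint = parts
            if mountpoint.startswith(IGNORED_MOUNT_PREFIXES):
                continue
            mounts.append({
                "device": device, "type": fstype, "size": size, "used": used,
                "available": available, "percent": percent, "mountpoint": mountpoint,
            })
        return mounts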

    available: available space
    used: space already used
    percent: percentage of space used
    device: the device (partition) backing this filesystem, i.e. the device name
    mountpoint: the mount point
    type: filesystem type
    size: total size of the partition
    agentEnv: various details about the host, annotated below

    "agentEnv": {
            "transparentHugePage": "",//透明大页信息
            "hostHealth": {
                "agentTimeStampAtReporting": 1536306543990,//记录该信息的当前时间戳
                "activeJavaProcs": [ {//通过/proc获取进程来进行分析
                    "command": "",///proc/cmdline:系统启动时输入的内核命令行参数
                    "pid": 12252,//进程号
                    "hadoop": false,//为hadoop或者zookeeper一类的则为true,反之为false
                    "user": "root"//执行用户
                }],
                "liveServices": [{//serviceName ntpd status
                    "status": "Healthy",
                    "name": "ntpd or chronyd",
                    "desc": ""
                }]
            },
            "reverseLookup": true,//检查主机`socket.fqdn`是否解析为当前主机ip
            "alternatives": [],//获取项目的替代文件夹路径
            "hasUnlimitedJcePolicy": false,//测试JVM的JCE策略,以查看是否支持无限密钥长度
            "umask": "18",//权限值 777
            "firewallName": "iptables",//iptables
            "stackFoldersAndFiles": [],//获取linux用户目录列表信息
            "existingUsers": [],//获取已存在的以下DEFAULT_USERS 列表中的linux用户
            "firewallRunning": false//查看防火墙信息
        }
    
    DEFAULT_USERS = [
        "hive", "ambari-qa", "oozie", "hbase", "hcat", "mapred",
        "hdfs", "zookeeper", "flume", "sqoop", "sqoop2",
        "hue", "yarn", "tez", "storm", "falcon", "kafka", "knox", "ams",
        "hadoop", "spark", "accumulo", "atlas", "mahout", "ranger", "kms", "zeppelin"
      ]
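
    A short sketch of how the existingUsers check can work against this list; the per-user fields (name/homeDir/status) are an assumption for illustration:

    import pwd

    def check_existing_users(default_users):
        # Walk the local password database and keep accounts from DEFAULT_USERS.
        existing = []
        for entry in pwd.getpwall():
            if entry.pw_name in default_users:
                existing.append({
                    "name": entry.pw_name,       # account name
                    "homeDir": entry.pw_dir,     # home directory
                    "status": "Available",
                })
        return existing

    print(check_existing_users(DEFAULT_USERS))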
    

    To be continued...
