
k8s cluster monitoring: cadvisor/exporter + prometheus

Author: Maxwell_Dncey | Published 2019-10-28 17:08

    Monitoring a k8s cluster

    1. Install cadvisor/exporter + prometheus + grafana

    1.1 Set up NFS

    # NFS server (ubuntu):
    apt-get install nfs-kernel-server

    # Create the shared directory /data/pvdata
    mkdir /data/pvdata

    # Hand the directory to the NFS anonymous user
    # centos:
    chown nfsnobody:nfsnobody /data/pvdata
    # ubuntu:
    chown nobody:nogroup /data/pvdata

    # Export it; fill in the client IP range you want to allow
    vim /etc/exports
    /data/pvdata xxx.xxx.xxx.0/24(rw,async,all_squash)

    exportfs -rv
    # should print: exporting xxx.xxx.xxx.0/24:/data/pvdata

    # NFS client: must be installed on the node where the prometheus pod runs
    apt-get update
    apt-get install nfs-common

    # Test the mount from another node:
    mkdir /kubernetes
    mount nfs-server-ip:/data/pvdata /kubernetes
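
    If the mount fails, first confirm the export is actually visible from the
    client (showmount comes with nfs-common):

    showmount -e nfs-server-ip
    # should list: /data/pvdata xxx.xxx.xxx.0/24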
    

    1.2 Configure prometheus

    mkdir /data/k8s/yaml/kube-system/prometheus
    cd /data/k8s/yaml/kube-system/prometheus/
    
    # Download the yaml manifests from the kubernetes GitHub repo
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/prometheus-rbac.yaml
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/prometheus-configmap.yaml
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/prometheus-service.yaml
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/prometheus-statefulset.yaml
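    Note that these paths track the master branch, which moves over time; for
    reproducibility you can pin a release branch instead (a variant, assuming
    the release-1.16 branch, which carries the same addon files):

    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/release-1.16/cluster/addons/prometheus/prometheus-rbac.yaml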
    

    1.2.1 Edit prometheus-statefulset.yaml

    # Delete the last 10 lines of the file:
    
        volumeClaimTemplates:
         - metadata:
             name: prometheus-data
           spec:
             storageClassName: standard
             accessModes:
               - ReadWriteOnce
             resources:
               requests:
                 storage: "16Gi"
             
    # Add these 3 lines to the pod's volumes: list instead, so the data lives
    # on your NFS-backed PVC (see the sketch below):
               - name: prometheus-data
                 persistentVolumeClaim:  
                   claimName: prometheus-data
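
    For context, the pod's volumes: section then looks roughly like this (a
    sketch; the config-volume entry already exists in the addon manifest):

        volumes:
          - name: config-volume
            configMap:
              name: prometheus-config
          - name: prometheus-data
            persistentVolumeClaim:
              claimName: prometheus-data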
    

    1.2.2 Create the PV/PVC

    # Create the directory that will hold prometheus data
    mkdir /data/pvdata/prometheus
    # note: on ubuntu use nobody instead of nfsnobody
    chown nfsnobody. /data/pvdata/prometheus
    
    cat > prometheus-pvc-data.yaml << EOF
    apiVersion: v1
    kind: PersistentVolume
    metadata:
      name: prometheus-data
    spec:
      storageClassName: prometheus-data
      capacity: 
        storage: 10Gi  
      accessModes: 
        - ReadWriteOnce  
      persistentVolumeReclaimPolicy: Recycle 
      nfs:
        path: /data/pvdata/prometheus
        server: nfs-server-ip  # your NFS server IP
    
    ---
    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: prometheus-data
      namespace: kube-system  
    spec:
      accessModes:
        - ReadWriteOnce 
      resources:
        requests:
          storage: 10Gi 
      storageClassName: prometheus-data
    EOF
    

    1.2.3 Switch the Service to NodePort and update the prometheus image

    # In prometheus-service.yaml, change the service type:
      type: NodePort

    # In prometheus-statefulset.yaml:
    # note: the default cpu/memory settings are sized for about 10 nodes / 30 pods
    # change the prometheus image to the latest version, v2.13.0
    # add to args: --storage.tsdb.retention.time=<how long to keep data>
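
    After these edits, the container section of prometheus-statefulset.yaml
    should look roughly like this (a sketch based on the addon manifest; 15d is
    an example retention value):

        containers:
          - name: prometheus-server
            image: prom/prometheus:v2.13.0
            args:
              - --config.file=/etc/config/prometheus.yml
              - --storage.tsdb.path=/data
              - --storage.tsdb.retention.time=15d   # example: keep 15 days of data
              - --web.enable-lifecycle              # enables POST /-/reload (used in 1.6.7)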
    

    1.2.4 Install and configure prometheus

    # Apply the yaml files
    kubectl apply -f prometheus-rbac.yaml 
    kubectl apply -f prometheus-configmap.yaml 
    kubectl apply -f prometheus-pvc-data.yaml
    kubectl apply -f prometheus-service.yaml
    kubectl apply -f prometheus-statefulset.yaml
    
    # Check that everything came up
    kubectl get pods -n kube-system |grep prometheus
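
    If the pod stays in Pending, storage is the usual suspect; check that the
    PVC from 1.2.2 actually bound (STATUS should be Bound):

    kubectl get pvc prometheus-data -n kube-system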
    
    # Find which node the prometheus pod is running on
    kubectl get pods -n kube-system  -o wide |grep prometheus
    
    # Get the prometheus NodePort
    kubectl get service -n kube-system
    prometheus           NodePort    xxx.xxx.xxx.xxx   <none>        9090:32809/TCP           5d20h
    # Browse to NodeIP:NodePort (32809 here) to reach the prometheus UI
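
    You can also confirm prometheus answers over the NodePort via its built-in
    health endpoint (NodeIP and 32809 as above):

    curl -s http://NodeIP:32809/-/healthy
    # should print: Prometheus is Healthy.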
    

    1.3 Install node-exporter

    # Download the node-exporter yaml files
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/node-exporter-ds.yml
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/node-exporter-service.yaml
    
    # Apply node-exporter
    kubectl apply -f node-exporter-service.yaml
    kubectl apply -f node-exporter-ds.yml 
    

    1.4 Deploy kube-state-metrics

    # Download the yaml files
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/kube-state-metrics-service.yaml
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/kube-state-metrics-rbac.yaml
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/kube-state-metrics-deployment.yaml
    
    # Apply the yaml files
    kubectl apply -f kube-state-metrics-service.yaml
    kubectl apply -f kube-state-metrics-rbac.yaml
    kubectl apply -f kube-state-metrics-deployment.yaml
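
    A quick sanity check after applying: node-exporter runs as a DaemonSet, so
    expect one pod per node, plus a single kube-state-metrics pod:

    kubectl get pods -n kube-system -o wide | grep -E 'node-exporter|kube-state-metrics'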
    

    1.5 Deploy grafana

    1.5.1 Create the data directory

    mkdir /data/pvdata/prometheus-grafana
    ubuntu:
      chown nobody. /data/pvdata/prometheus-grafana
    centos:
      chown nfsnobody. /data/pvdata/prometheus-grafana
    

    1.5.2 Create the grafana PVC

    cat > grafana-pvc.yaml << EOF
    apiVersion: v1
    kind: PersistentVolume
    metadata:
      name: prometheus-grafana
    spec:
      storageClassName: prometheus-grafana
      capacity: 
        storage: 1Gi  
      accessModes: 
        - ReadWriteOnce  
      persistentVolumeReclaimPolicy: Recycle 
      nfs:
        path: /data/pvdata/prometheus-grafana
        server: nfs-server-ip  # your NFS server IP
    
    ---
    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: prometheus-grafana
      namespace: kube-system  
    spec:
      accessModes:
        - ReadWriteOnce 
      resources:
        requests:
          storage: 1Gi 
      storageClassName: prometheus-grafana
    EOF
    

    1.5.3 grafana-deployment.yaml

    # Adjust the cpu/memory requests and limits below as needed
    
    cat > grafana-deployment.yaml << EOF
    apiVersion: extensions/v1beta1
    kind: Deployment
    metadata:
      name: grafana
      namespace: kube-system
      labels:
        app: grafana
    spec:
      revisionHistoryLimit: 10
      template:
        metadata:
          labels:
            app: grafana
            component: prometheus
        spec:
         #nodeSelector:
         #  kubernetes.io/hostname: <node-name>   # optionally pin grafana to one node
          containers:
          - name: grafana
            env:
            - name: GF_SECURITY_ADMIN_USER
              value: admin
            - name: GF_SECURITY_ADMIN_PASSWORD
              value: admin
            image: grafana/grafana:6.4.3  
            imagePullPolicy: IfNotPresent
            ports:
            - containerPort: 3000
              name: grafana
            readinessProbe:
              failureThreshold: 10
              httpGet:
                path: /api/health
                port: 3000
                scheme: HTTP
              initialDelaySeconds: 30
              periodSeconds: 10
              successThreshold: 1
              timeoutSeconds: 30
            livenessProbe:
              failureThreshold: 3
              httpGet:
                path: /api/health
                port: 3000
                scheme: HTTP
              periodSeconds: 10
              successThreshold: 1
              timeoutSeconds: 1
            resources:
              limits:
                cpu: 100m
                memory: 256Mi
              requests:
                cpu: 100m
                memory: 256Mi
            volumeMounts:
            - mountPath: /var/lib/grafana
              subPath: grafana
              name: grafana-volumes
          volumes:
          - name: grafana-volumes
            persistentVolumeClaim:
              claimName: prometheus-grafana
              
    ---
    # ------------------- APP Service ------------------- #
    
    kind: Service
    apiVersion: v1
    metadata:
      labels:
        app: grafana
      name: grafana
      namespace: kube-system
    spec:
      #type: ClusterIP
      type: NodePort
      ports:
        - port: 80
          targetPort: 3000
      selector:
        app: grafana
    EOF
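
    Note: apiVersion: extensions/v1beta1 for Deployment was removed in
    Kubernetes 1.16, so on 1.16+ clusters the manifest above needs two changes,
    sketched here (apps/v1 also requires an explicit selector):

    apiVersion: apps/v1
    kind: Deployment
    ...
    spec:
      selector:
        matchLabels:
          app: grafana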
    

    1.5.4 Apply the manifests and find the service IP and port

    kubectl apply -f grafana-pvc.yaml
    kubectl apply -f grafana-deployment.yaml
    
    # Check the service and its NodePort
    kubectl get service -n kube-system
    grafana              NodePort    xxx.xxx.xxx.xxx    <none>        80:31920/TCP             3d23h
    
    kubectl get pods -n kube-system -o wide 
    
    Find the node the grafana pod is scheduled on, then browse to NodeIP:31920 (the NodePort obtained above) to reach grafana.
    

    Log in with the default credentials admin/admin. Then, under the Data Sources
    settings, add a data source: choose Prometheus and fill in the prometheus
    service's IP and port. Finally, import dashboard template 10000 to get a
    ready-made monitoring view.
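
    For the data source URL you can also use the in-cluster DNS name of the
    prometheus service from section 1.2 instead of NodeIP:NodePort (assuming the
    addon's default service name and namespace):

    http://prometheus.kube-system.svc.cluster.local:9090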

    1.6 k8s alerting

    1.6.1 Download the alertmanager yaml files

    # Tip: keep alertmanager on the same node as prometheus, so you don't have to install the NFS client on yet another node.
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/alertmanager-pvc.yaml
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/alertmanager-service.yaml
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/alertmanager-deployment.yaml
    curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/alertmanager-configmap.yaml
    
    1.6.2 Create the directory for alertmanager data

    mkdir /data/pvdata/prometheus-alertmanager
    chown nfsnobody. /data/pvdata/prometheus-alertmanager   # use nobody on ubuntu
    

    1.6.3 Create alertmanager-pvc.yaml

    cat > alertmanager-pvc.yaml << EOF
    apiVersion: v1
    kind: PersistentVolume
    metadata:
      name: prometheus-alertmanager
    spec:
      storageClassName: prometheus-alertmanager
      capacity: 
        storage: 1Gi  
      accessModes: 
        - ReadWriteOnce  
      persistentVolumeReclaimPolicy: Recycle 
      nfs:
        path: /data/pvdata/prometheus-alertmanager
        server: nfs-server-ip  # your NFS server IP
    
    ---
    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: prometheus-alertmanager
      namespace: kube-system  
    spec:
      accessModes:
        - ReadWriteOnce 
      resources:
        requests:
          storage: 1Gi 
      storageClassName: prometheus-alertmanager
    EOF
    

    1.6.4 Edit alertmanager-deployment.yaml

    # Change the claimName at the end of the file to the PVC created above:
            - name: storage-volume
              persistentVolumeClaim:
                claimName: prometheus-alertmanager

    1.6.5 Edit alertmanager-service.yaml

    # Set spec.type to NodePort so the service is reachable via any node's IP
    # and the mapped port:
    
    apiVersion: v1
    kind: Service
    metadata:
      name: alertmanager
      namespace: kube-system
      labels:
        kubernetes.io/cluster-service: "true"
        addonmanager.kubernetes.io/mode: Reconcile
        kubernetes.io/name: "Alertmanager"
    spec:
      ports:
        - name: http
          port: 80
          protocol: TCP
          targetPort: 9093
      selector:
        k8s-app: alertmanager
      type: NodePort
    

    1.6.6 Deploy alertmanager

    kubectl apply -f alertmanager-pvc.yaml
    kubectl apply -f alertmanager-configmap.yaml
    kubectl apply -f alertmanager-service.yaml
    kubectl apply -f alertmanager-deployment.yaml
    

    1.6.7 Create alerting rules

    
    kubectl edit configmaps prometheus-config -n kube-system 
    
    # Add under prometheus.yml: |
        global:
          scrape_interval: 5s        # how often targets are scraped
          evaluation_interval: 5s    # how often alerting rules are evaluated

        alerting:
          alertmanagers:
          - static_configs:
            - targets: ["<alertmanager-ip>:<port>"]   # address the alertmanager service listens on
        rule_files:
        - "/etc/config/rules.yml"
        
        
    # Then create the alert rules by adding this at the bottom of the configmap:
      rules.yml: |
        groups:
        - name: monitor
          rules:
          - alert: InstanceDown
            expr: up == 0
            for: 1m
            labels:
              team: kube-system
            annotations:
              summary: "Instance {{ $labels.instance }} down"
              description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
    
    # Reload the configuration (the prometheus NodeIP:NodePort from 1.2.4;
    # requires prometheus to run with --web.enable-lifecycle):
    curl -X POST http://NodeIP:NodePort/-/reload
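
    Before reloading, you can optionally validate the rules with promtool (it
    ships with prometheus); a sketch that first extracts rules.yml from the
    configmap to a local file:

    kubectl get configmap prometheus-config -n kube-system \
      -o jsonpath='{.data.rules\.yml}' > rules.yml
    promtool check rules rules.yml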
    

    1.6.8 Configure email alerts

    # Rewrite alertmanager-configmap.yaml:
    cat > alertmanager-configmap.yaml << EOF
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: alertmanager-config
      namespace: kube-system
      labels:
        kubernetes.io/cluster-service: "true"
        addonmanager.kubernetes.io/mode: EnsureExists
    data:
      alertmanager.yml: |
        global:
          resolve_timeout: 3m  # an alert is marked resolved if it stays silent this long
          smtp_smarthost: 'smtp.163.com:25'
          smtp_from: 'USERNAME@163.com'
          smtp_auth_username: 'USERNAME@163.com'
          smtp_auth_password: 'PASSWORD'
          smtp_require_tls: false 
        
        route:
          group_by: ['example']
          group_wait: 60s
          group_interval: 60s
          repeat_interval: 12h
          receiver: 'webhook'
        
        receivers:
        - name: 'webhook'
          webhook_configs:
          # fill in your webhook URL
          - url: 'web_hook_url'
            # whether to also notify when the alert resolves
            send_resolved: false
          email_configs:
          - to: 'xxxx@qq.com'
            send_resolved: false
    EOF
    
    kubectl delete configmaps -n kube-system alertmanager-config 
    kubectl apply  -f alertmanager-configmap.yaml 
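
    Optionally, validate the alertmanager config before applying it with amtool
    (shipped alongside alertmanager), after saving the alertmanager.yml block to
    a local file:

    amtool check-config alertmanager.yml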
    

