美文网首页
pod容器启动慢分析

pod容器启动慢分析

作者: wwq2020 | 来源:发表于2023-05-18 19:11 被阅读0次

    背景

    同事创建的pod从volume attach到start container需要很久,但是从event上没看出啥异常

    日志查看

    从kubelet日志中看到第一条异常日志

    {"ts":1684116807548.4512,"caller":"reconciler/reconciler.go:207","msg":"operationExecutor.VerifyControllerAttachedVolume failed for volume \"pvc-b5664de8-f778-44a3-8f86-5070d5c0f024\" (UniqueName: \"kubernetes.io/csi/kubesphere-rook-ceph.rbd.csi.ceph.com^0001-0014--ceph-0000000000000014-051274f6-f097-11ed-b513-c20eb2fc7bd6\") pod \"7215410480483457836-1-0\" (UID: \"323b9765-4313-4235-bf74-b8465e17b4bd\")  volume not marked in-use","v":5,"pod":{"name":"7215410480483457836-1-0","namespace":"1638169965330698240"}}
    

    最后一条异常日志

    {"ts":1684116837923.4392,"caller":"reconciler/reconciler.go:207","msg":"operationExecutor.VerifyControllerAttachedVolume failed for volume \"pvc-b5664de8-f778-44a3-8f86-5070d5c0f024\" (UniqueName: \"kubernetes.io/csi/kubesphere-rook-ceph.rbd.csi.ceph.com^0001-0014-ceph-0000000000000014-051274f6-f097-11ed-b513-c20eb2fc7bd6\") pod \"7215410480483457836-1-0\" (UID: \"323b9765-4313-4235-bf74-b8465e17b4bd\")  volume not marked in-use","v":5,"pod":{"name":"7215410480483457836-1-0","namespace":"1638169965330698240"}}
    

    后续第一条正常日志

    {"ts":1684116838024.7468,"caller":"reconciler/reconciler.go:212","msg":"Starting operationExecutor.VerifyControllerAttachedVolume for volume \"pvc-b5664de8-f778-44a3-8f86-5070d5c0f024\" (UniqueName: \"kubernetes.io/csi/kubesphere-rook-ceph.rbd.csi.ceph.com^0001-0014-ceph-0000000000000014-051274f6-f097-11ed-b513-c20eb2fc7bd6\") pod \"7215410480483457836-1-0\" (UID: \"323b9765-4313-4235-bf74-b8465e17b4bd\") ","v":5,"pod":{"name":"7215410480483457836-1-0","namespace":"1638169965330698240"}}
    

    分析

    kubelet收到pod创建事件后将pod的volumes添加到desiredStateOfWorld,
    kubelet的volumemanager reconciler会定期处理desiredStateOfWorld中待mount的volume
    这次的场景参数是controllerAttachDetachEnabled且PluginIsAttachable,当遇到待mount的volume没有被设置ReportedInUse,不会开始进行相应的检查attach和mount操作
    而待mount的volume的ReportedInUse状态是kubelet根据nodeStatusUpdateFrequency周期性设置的

    相关代码

    kubelet volumemanager reconciler

    pkg/kubelet/volumemanager/reconciler/reconciler.go中(只截取了相关代码)
    func (rc *reconciler) mountAttachVolumes() {
        for _, volumeToMount := range rc.desiredStateOfWorld.GetVolumesToMount() {
            volMounted, devicePath, err := rc.actualStateOfWorld.PodExistsInVolume(volumeToMount.PodName, volumeToMount.VolumeName)
            volumeToMount.DevicePath = devicePath
            if cache.IsVolumeNotAttachedError(err) {
                if rc.controllerAttachDetachEnabled || !volumeToMount.PluginIsAttachable {
                    if volumeToMount.PluginIsAttachable && !volumeToMount.ReportedInUse {
                        klog.V(5).InfoS(volumeToMount.GenerateMsgDetailed("operationExecutor.VerifyControllerAttachedVolume failed", " volume not marked in-use"), "pod", klog.KObj(volumeToMount.Pod))
                        continue
                    }
                } 
            } 
        }
    }
    

    kubelet

    pkg/kubelet/kubelet_node_status.go中(只截取了相关代码)

    func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) {
            go wait.JitterUntil(kl.syncNodeStatus, kl.nodeStatusUpdateFrequency, 0.04, true, wait.NeverStop)
    }
    func (kl *Kubelet) syncNodeStatus() {
        kl.syncNodeStatusMux.Lock()
        defer kl.syncNodeStatusMux.Unlock()
    
        if kl.kubeClient == nil || kl.heartbeatClient == nil {
            return
        }
        if kl.registerNode {
            // This will exit immediately if it doesn't need to do anything.
            kl.registerWithAPIServer()
        }
        if err := kl.updateNodeStatus(); err != nil {
            klog.ErrorS(err, "Unable to update node status")
        }
    }
    func (kl *Kubelet) updateNodeStatus() error {
            if err := kl.tryUpdateNodeStatus(i); err != nil {
                if i > 0 && kl.onRepeatedHeartbeatFailure != nil {
                    kl.onRepeatedHeartbeatFailure()
                }
                klog.ErrorS(err, "Error updating node status, will retry")
            }
        }
        return fmt.Errorf("update node status exceeds retry count")
    }
    func (kl *Kubelet) tryUpdateNodeStatus(tryNumber int) error {
        kl.volumeManager.MarkVolumesAsReportedInUse(updatedNode.Status.VolumesInUse)
    }
    

    总结

    也就是volume处理时间是0-nodeStatusUpdateFrequency范围内,假如nodeStatusUpdateFrequency为30秒,则0-30秒内都是可能的

    解决方案

    调低nodeStatusUpdateFrequency
    但是调低过多会频繁访问apiserver,增加apiserver和etcd的压力

    将csidriver中的attachRequired设置为false,如下


    image.png

    相关文章

      网友评论

          本文标题:pod容器启动慢分析

          本文链接:https://www.haomeiwen.com/subject/fuqgsdtx.html