美文网首页
vhost-net 3 -- 网卡多队列

vhost-net 3 -- 网卡多队列

作者: 苏苏林 | 来源:发表于2022-11-17 00:19 被阅读0次

    虚拟机配置接口多队列

    为虚拟机接口配置多队列可以提高虚拟机网卡的收发性能。
    如下,我们为vm的一个网卡配置了4队列。

    # virsh dumpxml 5a6a67e65b2d43c6850dc8998a6d51f1
    ......
        <interface type='bridge'>
          <mac address='fa:b9:b3:7e:17:00'/>
          <source bridge='br_zsn0_31'/>
          <target dev='vnic285.0'/>
          <model type='virtio'/>
          <driver name='vhost' txmode='iothread' ioeventfd='on' event_idx='off' queues='4' rx_queue_size='256' tx_queue_size='256'/>
          <mtu size='1500'/>
          <alias name='net0'/>
          <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/>
        </interface>
    ......
    
    虚拟机启动后,可以看到无论是宿主机上的tap口还是vm的网卡都已经是4队列了(参见图:tap口多队列、vm接口多队列)。同时,内核为这个vm启动了四个vhost线程,也就是说,每个接口的每个队列对应一个vhost线程(参见图:多队列创建的多线程)。

    vhost-net多队列的相关实现

    包含几部分
    tap设备多队列

    https://www.jianshu.com/p/53b3199c9a92 中介绍过tap设备驱动,tap设备一般的使用方式如下:

    /*
     * open_tun() - open the tun/tap clone device and bind it to the
     * interface named by @dev ("tunN" or "tapN").
     *
     * @dev:    requested device name; its "tun"/"tap" prefix selects the mode
     * @actual: out buffer receiving the name the kernel actually assigned
     * @size:   capacity of @actual
     *
     * Returns the open fd. Each open() of /dev/net/tun creates one queue
     * (a tun_file in the kernel); TUNSETIFF binds that queue to the device.
     */
    int open_tun (const char *dev, char *actual, int size)  
    {  
      struct ifreq ifr;  
      int fd;  
      char *device = "/dev/net/tun";  
    
      if ((fd = open (device, O_RDWR)) < 0) // step 1: open the char device, creating the descriptor (one queue)  
          msg (M_ERR, "Cannot open TUN/TAP dev %s", device);  
          // NOTE(review): execution falls through on error here and below —
          // presumably msg(M_ERR, ...) aborts; confirm in its definition.
    
      memset (&ifr, 0, sizeof (ifr));  
      ifr.ifr_flags = IFF_NO_PI;  
    
      // The first three characters of @dev choose TUN (L3) vs TAP (L2) mode.
      if (!strncmp (dev, "tun", 3)) {    
          ifr.ifr_flags |= IFF_TUN;  
      } else if (!strncmp (dev, "tap", 3)) {  
          ifr.ifr_flags |= IFF_TAP;  
      }  else {  
          msg (M_FATAL, "I don't recognize device %s as a TUN or TAP device",dev);  
      } 
    
      if (strlen (dev) > 3)      /* unit number specified? */  
          strncpy (ifr.ifr_name, dev, IFNAMSIZ);  
          // NOTE(review): strncpy() leaves ifr_name unterminated when
          // strlen(dev) >= IFNAMSIZ; the kernel treats the field as a fixed
          // IFNAMSIZ buffer, but verify callers never pass longer names.
      if (ioctl (fd, TUNSETIFF, (void *) &ifr) < 0) // step 2: create/bind the virtual NIC  
          msg (M_ERR, "Cannot ioctl TUNSETIFF %s", dev);  
    
      set_nonblock (fd);  
      msg (M_INFO, "TUN/TAP device %s opened", ifr.ifr_name);  
      strncpynt (actual, ifr.ifr_name, size);  // report the kernel-chosen name back to the caller  
      return fd;  
    } 
    

    包含两个步骤:
    1) 打开tun字符设备,返回一个文件句柄。在内核创建了tun_file结构,它就是队列的一个抽象;
    2)为tun设备设置虚拟网卡。是真正的创建tun设备,内核创建了net_device 和其私有数据 tun_struct,并将队列(tun_file)绑定到设备上。

    tun_struct 代表着一个tun/tap设备,定义中包含一个tun_file数组,即代表着设备的多个队列。tun_attach函数负责绑定 tun_file 和 tun_struct,每绑定一个队列,tun_struct的numqueues加一。所以,每执行一次类似上面的 open_tun 流程——open一次"/dev/net/tun",并通过TUNSETIFF绑定到相同名称的设备上——tun/tap设备就会多一个队列。
    从定义上看(3.10内核),tun设备最多支持8队列。

    /* Default cap on tun/tap queues in the 3.10 kernel (feeds MAX_TAP_QUEUES). */
    #define DEFAULT_MAX_NUM_RSS_QUEUES  (8)
    
    /* One tun/tap net device (netdev private data); excerpt — "......" marks
     * fields elided by the article. */
    struct tun_struct {
        struct tun_file __rcu   *tfiles[MAX_TAP_QUEUES];  /* attached queues; one tun_file per open fd */
        unsigned int            numqueues;                /* count of currently attached queues */
    ......
    };
    
    
    
    /*
     * tun_set_iff() - TUNSETIFF handler (kernel 3.10 excerpt; "......" marks
     * elided code). If a device with the requested name already exists, only
     * attach this fd's queue (tfile) to it; otherwise allocate a new tun/tap
     * net_device (multiqueue-capable when IFF_MULTI_QUEUE is set) and attach
     * the first queue.
     */
    static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
    {
        struct tun_struct *tun;
        struct tun_file *tfile = file->private_data;
        struct net_device *dev;
        int err;
    
        if (tfile->detached)
            return -EINVAL;
    
        dev = __dev_get_by_name(net, ifr->ifr_name);
        if (dev) {
            // Device already exists: just bind the new queue to it.
            if (ifr->ifr_flags & IFF_TUN_EXCL)
                return -EBUSY;
            // Requested mode (TUN vs TAP) must match the existing device.
            if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
                tun = netdev_priv(dev);
            else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
                tun = netdev_priv(dev);
            else
                return -EINVAL;
    
            // Caller's multiqueue request must agree with how the device was created.
            if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
                !!(tun->flags & TUN_TAP_MQ))
                return -EINVAL;
    
            if (tun_not_capable(tun))
                return -EPERM;
            err = security_tun_dev_open(tun->security);
            if (err < 0)
                return err;
    
            err = tun_attach(tun, file);
            if (err < 0)
                return err;
    
            if (tun->flags & TUN_TAP_MQ &&
                (tun->numqueues + tun->numdisabled > 1)) {
                /* One or more queue has already been attached, no need
                 * to initialize the device again.
                 */
                return 0;
            }
        }
        else {
            char *name;
            unsigned long flags = 0;
            // Multiqueue devices reserve MAX_TAP_QUEUES tx/rx queues up front.
            int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
                     MAX_TAP_QUEUES : 1;
    
            if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;
            err = security_tun_dev_create();
            if (err < 0)
                return err;
    
            /* Set dev type */
            if (ifr->ifr_flags & IFF_TUN) {
                /* TUN device */
                flags |= TUN_TUN_DEV;
                name = "tun%d";
            } else if (ifr->ifr_flags & IFF_TAP) {
                /* TAP device */
                flags |= TUN_TAP_DEV;
                name = "tap%d";
            } else
                return -EINVAL;
    
            if (*ifr->ifr_name)
                name = ifr->ifr_name;
            // Create the tun net_device (tun_struct is its private data).
            dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
                           tun_setup, queues, queues);
    
            ......
            // Bind this fd's queue to the tun device.
            err = tun_attach(tun, file);
            if (err < 0)
                goto err_free_dev;
    
            ......
    }
    
    /*
     * tun_attach() - bind one queue (the tun_file behind an open
     * /dev/net/tun fd) to a tun/tap device.
     *
     * @tun:  the device to attach to
     * @file: open clone-device file; file->private_data is the tun_file
     *
     * Returns 0 on success or a negative errno. On success the tfile is
     * recorded in tun->tfiles[] and tun->numqueues is incremented — this is
     * the step that gives a device multiple queues.
     */
    static int tun_attach(struct tun_struct *tun, struct file *file)
    {
        struct tun_file *tfile = file->private_data;
        int err;
    
        err = security_tun_dev_attach(tfile->socket.sk, tun->security);
        if (err < 0)
            goto out;
    
        /* A queue may be attached only once, unless it was detached first. */
        err = -EINVAL;
        if (rtnl_dereference(tfile->tun) && !tfile->detached)
            goto out;
    
        /* Single-queue devices accept exactly one queue. */
        err = -EBUSY;
        if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1)
            goto out;
    
        /* Enforce the MAX_TAP_QUEUES cap (8 on this kernel). */
        err = -E2BIG;
        if (!tfile->detached &&
            tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
            goto out;
    
        err = 0;
    
        /* Re-attach the filter to persist device */
        if (tun->filter_attached == true) {
            err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
            /* sk_attach_filter() returns 0 on success, negative on failure;
             * bail out only on failure. (The original excerpt read
             * "if (!err) goto out;", which would skip the queue binding below
             * on *success* and proceed with it on failure.) */
            if (err)
                goto out;
        }
        /* Publish the queue: slot it into the device's tfiles[] array. */
        tfile->queue_index = tun->numqueues;
        rcu_assign_pointer(tfile->tun, tun);
        rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
        tun->numqueues++;
    
        if (tfile->detached)
            tun_enable_queue(tfile);
        else
            sock_hold(&tfile->sk);
    
        tun_set_real_num_queues(tun);
    
        /* device is allowed to go away first, so no need to hold extra
         * refcnt.
         */
    
    out:
        return err;
    }
    
    vhost设置多队列

    vhost设备设置多队列其实和tap很类似。

    qemu 每open(“/dev/vhost-net”, O_RDWR)一次,就会调用vhost_net_open创建一个vhost设备,对应一个vhost_net 数据结构,对应一个队列。如下,vhost_net包含一个vhost_net_virtqueue数组(vqs[VHOST_NET_VQ_MAX]),对应一对发送和接收队列,所以如果要为虚拟机的一个接口配置4队列,需要open 四次“/dev/vhost-net”。

    /* One vhost-net device instance — created per open() of /dev/vhost-net.
     * Per the surrounding article, each instance backs one queue pair of the
     * guest NIC, so an N-queue interface uses N of these. */
    struct vhost_net {
        struct vhost_dev dev;                              /* generic vhost device state */
        struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];  /* virtqueues — presumably one TX + one RX; verify VHOST_NET_VQ_MAX */
        struct vhost_poll poll[VHOST_NET_VQ_MAX];          /* pollers watching the backing tap socket, one per vq */
        /* Number of TX recently submitted.
         * Protected by tx vq lock. */
        unsigned tx_packets;
        /* Number of times zerocopy TX recently failed.
         * Protected by tx vq lock. */
        unsigned tx_zcopy_err;
        /* Flush in progress. Protected by tx vq lock. */
        bool tx_flush;
    };
    

    然后qemu通过 VHOST_NET_SET_BACKEND,设置vhost设备和tap设备的关系,其实是设置 vhost设备和 tap设备队列的关系,因为传递下来的参数是tun_file对应的socket文件的句柄。而tun_file如上文所述,是一个队列的抽象。

    VHOST_NET_SET_BACKEND完成了vhost_net(vhost设备)和tap socket的绑定,vhost_net.vq->private_data 设置为了tap socket。也就是将vhost_net和tap队列绑定了。

    所以虚拟机网卡支持多队列,就需要走多次VHOST_NET_SET_BACKEND流程,将多个vhost_net和多个tap队列绑定。

    /*
     * vhost_net_ioctl() - ioctl entry point for an open /dev/vhost-net fd
     * (excerpt; "......" marks the other cases elided by the article).
     * VHOST_NET_SET_BACKEND copies a vhost_vring_file {index, fd} from
     * userspace and binds this vhost_net's virtqueue to the tap queue's
     * socket identified by fd.
     */
    static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
                    unsigned long arg)
    {
        struct vhost_net *n = f->private_data;
        void __user *argp = (void __user *)arg;
        u64 __user *featurep = argp;
        struct vhost_vring_file backend;
        u64 features;
        int r;
    
        switch (ioctl) {
        case VHOST_NET_SET_BACKEND:
            if (copy_from_user(&backend, argp, sizeof backend))
                return -EFAULT;
            return vhost_net_set_backend(n, backend.index, backend.fd);
    
    ......
    }
    
    
    /*
     * vhost_net_set_backend() - bind virtqueue @index of this vhost device
     * to the socket behind @fd — for a tap backend, the tun_file's socket,
     * i.e. one tap queue (excerpt; "......" marks elided code).
     *
     * The binding itself is storing the socket in vq->private_data; the
     * vhost worker then polls/reads/writes that socket.
     *
     * NOTE(review): vq and nvq are assigned from @index in the elided
     * section — this excerpt alone does not show their initialization.
     */
    static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
    {
        struct socket *sock, *oldsock;
        struct vhost_virtqueue *vq;
        struct vhost_net_virtqueue *nvq;
        struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL;
        int r;
    ......
        // Resolve the userspace fd to its socket (the tap queue).
        sock = get_socket(fd);
        if (IS_ERR(sock)) {
            r = PTR_ERR(sock);
            goto err_vq;
        }
    
        /* start polling new socket */
        oldsock = rcu_dereference_protected(vq->private_data,
                            lockdep_is_held(&vq->mutex));
        if (sock != oldsock) {
            ubufs = vhost_net_ubuf_alloc(vq,
                             sock && vhost_sock_zcopy(sock));
            if (IS_ERR(ubufs)) {
                r = PTR_ERR(ubufs);
                goto err_ubufs;
            }
    
            // Swap the backend: stop polling the old socket, publish the new one.
            vhost_net_disable_vq(n, vq);
            rcu_assign_pointer(vq->private_data, sock);
            r = vhost_init_used(vq);
            if (r)
                goto err_used;
            r = vhost_net_enable_vq(n, vq);
            if (r)
                goto err_used;
    
    ......
    }
    
    

    多队列情况下,为每个队列创建一个vhost内核线程。
    qemu通过 VHOST_SET_OWNER 为每个vhost_net(vhost设备)创建一个vhost内核线程,详情可以参考本系列的其他文章。所以为虚拟机设置多队列,创建了多个vhost设备(vhost_net),自然为每个vhost设备做一次VHOST_SET_OWNER操作,在内核创建出队列数个vhost线程。

    /*
     * vhost_net_set_owner() - VHOST_SET_OWNER handler: claim this vhost
     * device for the calling process via vhost_dev_set_owner() (which, per
     * the surrounding article, also creates the device's vhost kernel
     * thread). Fails with -EBUSY if the device already has an owner.
     */
    static long vhost_net_set_owner(struct vhost_net *n)
    {
        int ret;
    
        mutex_lock(&n->dev.mutex);
        if (vhost_dev_has_owner(&n->dev)) {
            /* Someone already owns this device. */
            ret = -EBUSY;
        } else {
            ret = vhost_net_set_ubuf_info(n);
            if (!ret) {
                ret = vhost_dev_set_owner(&n->dev);
                if (ret)
                    vhost_net_clear_ubuf_info(n);
                /* Flush runs whether or not set_owner succeeded. */
                vhost_net_flush(n);
            }
        }
        mutex_unlock(&n->dev.mutex);
        return ret;
    }
    

    相关文章

      网友评论

          本文标题:vhost-net 3 -- 网卡多队列

          本文链接:https://www.haomeiwen.com/subject/crjuxdtx.html