
The veth virtual NIC

Author: 分享放大价值 | Published 2020-09-22 21:55

    veth is a virtual network device provided by the kernel, and it always comes in pairs. Data transmitted on one end is received on the other, much like a network cable. How is this implemented? Let's take a look.

    veth creation

    A pair can be created with the ip command:

    [root@localhost ~]# ip link add vetha type veth peer name vethb
    

    On the kernel side, the veth module must be loaded beforehand. The module itself does something very simple: it registers an rtnl_link_ops.

    [root@localhost ~]# modprobe veth
    [root@localhost ~]# lsmod | grep veth
    veth                   13410  0
    
    //veth.ko initialization
    #define DRV_NAME    "veth"
    static struct rtnl_link_ops veth_link_ops = {
        .kind       = DRV_NAME,
        .priv_size  = sizeof(struct veth_priv),
        .setup      = veth_setup,
        .validate   = veth_validate,
        .newlink    = veth_newlink,
        .dellink    = veth_dellink,
        .policy     = veth_policy,
        .maxtype    = VETH_INFO_MAX,
    };
    
    /*
     * init/fini
     */
    
    static __init int veth_init(void)
    {
        return rtnl_link_register(&veth_link_ops);
    }
    int rtnl_link_register(struct rtnl_link_ops *ops)
    {
        int err;
    
        rtnl_lock();
        err = __rtnl_link_register(ops);
        rtnl_unlock();
        return err;
    }
    int __rtnl_link_register(struct rtnl_link_ops *ops)
    {
        if (rtnl_link_ops_get(ops->kind))
            return -EEXIST;
    
        /* The check for setup is here because if ops
         * does not have that filled up, it is not possible
         * to use the ops for creating device. So do not
         * fill up dellink as well. That disables rtnl_dellink.
         */
        if (ops->setup && !ops->dellink)
            ops->dellink = unregister_netdevice_queue;
    
        list_add_tail(&ops->list, &link_ops);
        return 0;
    }
    

    When a veth pair is created with ip link add vetha type veth peer name vethb, the kernel calls rtnl_newlink, which looks up the rtnl_link_ops matching the supplied type and then invokes the ops' newlink callback to create the veth peer; each end is stored in the other end's private data.

    rtnl_newlink
        if (linkinfo[IFLA_INFO_KIND]) {
            nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
            ops = rtnl_link_ops_get(kind);
    
        struct net_device *dev;
        dev = rtnl_create_link(dest_net, ifname, name_assign_type, ops, tb);
        
        if (ops->newlink) {
            err = ops->newlink(net, dev, tb, data);
    
    static int veth_newlink(struct net *src_net, struct net_device *dev,
                 struct nlattr *tb[], struct nlattr *data[])
    
        struct net_device *peer;
        peer = rtnl_create_link(net, ifname, name_assign_type,
                    &veth_link_ops, tbp);
    
        register_netdevice(peer);
        register_netdevice(dev);
    
    /*
     * tie the devices together
     */
    //This is the key step: peer is stored in dev's priv and dev in peer's
    //priv, binding the two virtual devices together.
        priv = netdev_priv(dev);
        rcu_assign_pointer(priv->peer, peer);
    
        priv = netdev_priv(peer);
        rcu_assign_pointer(priv->peer, dev);
    
    static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
        //fetch the peer device
        struct veth_priv *priv = netdev_priv(dev);
        rcv = rcu_dereference(priv->peer);
        //by the time dev_forward_skb is called, the skb has been handed to the
        //peer device rcv, so transmit on dev is effectively receive on the peer
        if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
            struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
            u64_stats_update_begin(&stats->syncp);
            stats->bytes += length;
            stats->packets++;
            u64_stats_update_end(&stats->syncp);
        } else {
    drop:
            atomic64_inc(&priv->dropped);
        }
        rcu_read_unlock();
        return NETDEV_TX_OK;
    }
    //if the packet is forwardable, netif_rx_internal pushes it into the host network stack
    int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
    {
        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
    }
    int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
    {
        if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
            if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
                atomic_long_inc(&dev->rx_dropped);
                kfree_skb(skb);
                return NET_RX_DROP;
            }
        }
    
        if (unlikely(!is_skb_forwardable(dev, skb))) {
            atomic_long_inc(&dev->rx_dropped);
            kfree_skb(skb);
            return NET_RX_DROP;
        }
    
        skb_scrub_packet(skb, true);
        skb->protocol = eth_type_trans(skb, dev);
        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
    
        return 0;
    }
    //set skb->pkt_type according to the destination MAC
    __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
    {
        unsigned short _service_access_point;
        const unsigned short *sap;
        const struct ethhdr *eth;
    
        skb->dev = dev;
        skb_reset_mac_header(skb);
        skb_pull_inline(skb, ETH_HLEN);
        eth = eth_hdr(skb);
    
        if (unlikely(is_multicast_ether_addr(eth->h_dest))) {
            if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
                skb->pkt_type = PACKET_BROADCAST;
            else
                skb->pkt_type = PACKET_MULTICAST;
        }
        else if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
                               dev->dev_addr)))
            skb->pkt_type = PACKET_OTHERHOST;
      ....
    }
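
    Back in userspace, the "transmit on one end, receive on the other" behavior is easy to confirm: the TX counters of one end should track the RX counters of its peer. A quick check, assuming the pair created above:

    //TX packets/bytes on vetha should match RX packets/bytes on vethb
    [root@localhost ~]# ip -s link show vetha
    [root@localhost ~]# ip -s link show vethb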
    

    veth usage

    I have seen veth pairs used in two ways:

    a. One end in the root namespace and the other end in another namespace, connecting the two namespaces. In k8s, CNIs such as calico, cilium and ovs are implemented this way.
    b. The two ends attached to two different bridges, connecting the bridges.

    The second case is straightforward (a minimal sketch is shown below); the rest of this article focuses on several problems encountered in the first case.
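
    A sketch of case b, connecting two bridges with a veth pair; br0, br1, veth0 and veth1 are assumed names and not part of the experiment that follows:

    [root@localhost ~]# ip link add br0 type bridge
    [root@localhost ~]# ip link add br1 type bridge
    [root@localhost ~]# ip link add veth0 type veth peer name veth1
    [root@localhost ~]# ip link set veth0 master br0
    [root@localhost ~]# ip link set veth1 master br1
    [root@localhost ~]# ip link set br0 up && ip link set br1 up
    [root@localhost ~]# ip link set veth0 up && ip link set veth1 up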

    The experiment goes as follows:
    create a veth pair, vetha and vethb;
    create a namespace named test;
    move vetha into namespace test;
    bring both vetha and vethb up;
    assign IP 1.1.1.2 to vetha.

    [root@localhost ~]# ip link add vetha type veth peer name vethb
    [root@localhost ~]# ip link set dev vethb up
    [root@localhost ~]# ip netns add test
    [root@localhost ~]# ip link set dev vetha netns test
    [root@localhost ~]# ip netns exec test ip link set dev vetha up
    [root@localhost ~]# ip netns exec test ip address add dev vetha 1.1.1.2/24
    
    [root@localhost ~]# ip netns exec test ip a
    1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN qlen 1
        link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    30: vetha@if29: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP qlen 1000
        link/ether 96:7f:a6:ea:93:23 brd ff:ff:ff:ff:ff:ff link-netnsid 0
        inet 1.1.1.2/24 scope global vetha
           valid_lft forever preferred_lft forever
        inet6 fe80::74ef:e3ff:fe5d:2db0/64 scope link
           valid_lft forever preferred_lft forever
    
    [root@localhost ~]# ip netns exec test ip r
    1.1.1.0/24 dev vetha proto kernel scope link src 1.1.1.2
    
    [root@localhost ~]# ip netns exec test arp -n
    Address                  HWtype  HWaddress           Flags Mask            Iface
    1.1.1.4                          (incomplete)                              vetha
    
    [root@localhost ~]# ifconfig vethb
    vethb: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500
            inet6 fe80::947f:a6ff:feea:9321  prefixlen 64  scopeid 0x20<link>
            ether 96:7f:a6:ea:93:21  txqueuelen 1000  (Ethernet)
            RX packets 208784  bytes 12836250 (12.2 MiB)
            RX errors 0  dropped 0  overruns 0  frame 0
            TX packets 1143  bytes 62469 (61.0 KiB)
            TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0
    

    Problem 1: in the test namespace, configure a static ARP entry and ping a nonexistent address, 1.1.1.4.

    [root@localhost ~]# ip netns exec test arp -s 1.1.1.4 00:00:00:00:00:01
    [root@localhost ~]# ip netns exec test arp -n
    Address                  HWtype  HWaddress           Flags Mask            Iface
    1.1.1.4                  ether   00:00:00:00:00:01   CM                    vetha
    

    After the ICMP packet arrives at vethb it should traverse the host stack, match the host's default route, and leave via em1. In practice, though, the ICMP packets can be captured on vethb while em1 sees nothing.

    [root@localhost ~]# ip r
    default via 10.164.129.1 dev em1 proto static metric 100
    10.10.10.0/24 dev gre10 proto kernel scope link src 10.10.10.1
    10.164.129.0/24 dev em1 proto kernel scope link src 10.164.129.16 metric 100
    
    [root@localhost ~]# tcpdump -vne -i vethb icmp
    tcpdump: listening on vethb, link-type EN10MB (Ethernet), capture size 262144 bytes
    08:04:49.386991 f6:d4:d2:de:20:be > 00:00:00:00:00:01, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 48297, offset 0, flags [DF], proto ICMP (1), length 84)
        1.1.1.2 > 1.1.1.4: ICMP echo request, id 30782, seq 21, length 64
    08:04:50.386985 f6:d4:d2:de:20:be > 00:00:00:00:00:01, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 48427, offset 0, flags [DF], proto ICMP (1), length 84)
        1.1.1.2 > 1.1.1.4: ICMP echo request, id 30782, seq 22, length 64
    ^C
    2 packets captured
    2 packets received by filter
    0 packets dropped by kernel
    [root@localhost ~]# tcpdump -vne -i em1 icmp
    tcpdump: listening on em1, link-type EN10MB (Ethernet), capture size 262144 bytes
    ^C
    0 packets captured
    0 packets received by filter
    0 packets dropped by kernel
    

    The reason: the ICMP packet's destination MAC is 00:00:00:00:00:01. When vethb receives the packet, eth_type_trans assigns skb->pkt_type based on the destination MAC; since that MAC is not vethb's, skb->pkt_type is set to PACKET_OTHERHOST.

    veth_xmit ->dev_forward_skb -> __dev_forward_skb -> eth_type_trans
        if (unlikely(is_multicast_ether_addr(eth->h_dest))) {
            if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
                skb->pkt_type = PACKET_BROADCAST;
            else
                skb->pkt_type = PACKET_MULTICAST;
        }
        else if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
                               dev->dev_addr)))
            skb->pkt_type = PACKET_OTHERHOST;
    

    The packet is then pushed into the host stack, where ip_rcv drops it outright if skb->pkt_type == PACKET_OTHERHOST. What makes this painful is that the drop is not recorded in any statistics counter.
    It seems the MAC for 1.1.1.4 in the test namespace needs to be set to vethb's MAC address.

    netif_rx_internal -> enqueue_to_backlog -> process_backlog ->__netif_receive_skb -> __netif_receive_skb_core -> ip_rcv
    int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
    {
        const struct iphdr *iph;
        u32 len;
    
        /* When the interface is in promisc. mode, drop all the crap
         * that it receives, do not try to analyse it.
         */
        if (skb->pkt_type == PACKET_OTHERHOST)
            goto drop;
    drop:
        kfree_skb(skb);
    out:
        return NET_RX_DROP;
    }
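
    Although this drop increments no counter, generic drop-tracing can still catch it. For example, on kernels exposing the skb:kfree_skb tracepoint, perf can record where skbs are freed while the ping is running (a sketch; the perf tool must be installed):

    [root@localhost ~]# perf record -g -e skb:kfree_skb -a -- sleep 10
    [root@localhost ~]# perf report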
    

    Problem 2: as shown below, the MAC for 1.1.1.4 has been changed to vethb's MAC, but the problem persists: vethb sees the packets while em1 still does not.

    [root@localhost ~]# ip link show dev vethb
    33: vethb@if34: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT qlen 1000
        link/ether 96:ec:6a:a8:67:ed brd ff:ff:ff:ff:ff:ff link-netnsid 0
    [root@localhost ~]# ip netns exec test arp -d 1.1.1.4
    [root@localhost ~]# ip netns exec test arp -s 1.1.1.4 96:ec:6a:a8:67:ed
    [root@localhost ~]# ip netns exec test arp -n
    Address                  HWtype  HWaddress           Flags Mask            Iface
    1.1.1.4                  ether   96:ec:6a:a8:67:ed   CM                    vetha
    
    [root@localhost ~]# ip link show dev vethb
    33: vethb@if34: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT qlen 1000
        link/ether 96:ec:6a:a8:67:ed brd ff:ff:ff:ff:ff:ff link-netnsid 0
    [root@localhost ~]# tcpdump -vne -i vethb icmp
    tcpdump: listening on vethb, link-type EN10MB (Ethernet), capture size 262144 bytes
    08:15:21.495979 f6:d4:d2:de:20:be > 96:ec:6a:a8:67:ed, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 23666, offset 0, flags [DF], proto ICMP (1), length 84)
        1.1.1.2 > 1.1.1.4: ICMP echo request, id 31770, seq 32, length 64
    ^C
    1 packet captured
    1 packet received by filter
    0 packets dropped by kernel
    [root@localhost ~]# tcpdump -vne -i em1 icmp
    tcpdump: listening on em1, link-type EN10MB (Ethernet), capture size 262144 bytes
    ^C
    0 packets captured
    0 packets received by filter
    0 packets dropped by kernel
    

    Problem 3: this turns out to be yet another issue. When ip_route_input_noref performs the route lookup, the default route does match, but the lookup still fails because forwarding is not enabled on vethb.

    ip_route_input_noref -> ip_route_input_slow
        fl4.flowi4_oif = 0;
        fl4.flowi4_iif = dev->ifindex;
        fl4.flowi4_mark = skb->mark;
        fl4.flowi4_tos = tos;
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.daddr = daddr;
        fl4.saddr = saddr;
        err = fib_lookup(net, &fl4, &res);
    
        if (!IN_DEV_FORWARD(in_dev)) {
            err = -EHOSTUNREACH;
            goto no_route;
        }
    no_route:
        RT_CACHE_STAT_INC(in_no_route);
        res.type = RTN_UNREACHABLE;
        res.fi = NULL;
    

    The in_no_route drop counter can be viewed with the following command:

    cat /proc/net/stat/rt_cache | awk -F " " '{print $5}'
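
    Note that the fields in /proc/net/stat/rt_cache are printed in hexadecimal, with a header line followed by one line per CPU, so summing them as decimal takes slightly more work. A sketch, assuming GNU awk for strtonum:

    awk 'NR > 1 {sum += strtonum("0x" $5)} END {print sum}' /proc/net/stat/rt_cache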
    

    Next, enable forwarding on vethb and try again:

    [root@localhost ~]# echo 1 > /proc/sys/net/ipv4/conf/vethb/forwarding
    [root@localhost ~]# cat /proc/sys/net/ipv4/conf/vethb/forwarding
    1
    
    Note: if /proc/sys/net/ipv4/conf/all/forwarding is enabled, forwarding is enabled by default on newly created interfaces.
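
    The same knob can also be set through sysctl, which makes it easy to persist in /etc/sysctl.conf:

    [root@localhost ~]# sysctl -w net.ipv4.conf.vethb.forwarding=1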
    

    Problem 4: pinging again still fails, this time due to a failed reverse path check. fib_validate_source looks up the routing table using the packet's source IP as the destination; only the default route matches, and since the default route's output interface is not the packet's input interface, the check fails. The idea is that the same device should both send and receive a given flow, which is known as symmetric routing.

    ip_route_input_slow -> __mkroute_input
    /* Ignore rp_filter for packets protected by IPsec. */
        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                      in_dev->dev, in_dev, &itag);
        if (err < 0) {
            ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                         saddr);
    
            goto cleanup;
        }
    
    int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
                u8 tos, int oif, struct net_device *dev,
                struct in_device *idev, u32 *itag)
    {
        int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
        //reverse path filter switch; if 0, the check is skipped
        if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
            IN_DEV_ACCEPT_LOCAL(idev) &&
            (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
            *itag = 0;
            return 0;
        }
        return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
    }
    
    static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
                     u8 tos, int oif, struct net_device *dev,
                     int rpf, struct in_device *idev, u32 *itag)
    {
        int ret, no_addr;
        struct fib_result res;
        struct flowi4 fl4;
        struct net *net;
        bool dev_match;
    
        fl4.flowi4_oif = 0;
        fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
        fl4.daddr = src;
        fl4.saddr = dst;
        fl4.flowi4_tos = tos;
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
    
        no_addr = idev->ifa_list == NULL;
    
        fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
    
        net = dev_net(dev);
        if (fib_lookup(net, &fl4, &res))
            goto last_resort;
        if (res.type != RTN_UNICAST &&
            (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
            goto e_inval;
        if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
            (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
            goto last_resort;
        fib_combine_itag(itag, &res);
        dev_match = false;
    
    #ifdef CONFIG_IP_ROUTE_MULTIPATH
        for (ret = 0; ret < res.fi->fib_nhs; ret++) {
            struct fib_nh *nh = &res.fi->fib_nh[ret];
    
            if (nh->nh_dev == dev) {
                dev_match = true;
                break;
            }
        }
    #else
        //succeeds only if the route's output device equals the input device;
        //otherwise the reverse path check fails
        if (FIB_RES_DEV(res) == dev)
            dev_match = true;
    #endif
        if (dev_match) {
            ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
            return ret;
        }
        if (no_addr)
            goto last_resort;
        if (rpf == 1)
            goto e_rpf;
        fl4.flowi4_oif = dev->ifindex;
    
        ret = 0;
        if (fib_lookup(net, &fl4, &res) == 0) {
            if (res.type == RTN_UNICAST)
                ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
        }
        return ret;
    
    last_resort:
        if (rpf)
            goto e_rpf;
        *itag = 0;
        return 0;
    
    e_inval:
        return -EINVAL;
    e_rpf:
        return -EXDEV;
    }
    

    The in_martian_src drop counter can be viewed with the following command:

    [root@localhost ~]# cat /proc/net/stat/rt_cache | awk -F " " '{print $8}'
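
    Besides the counter, enabling log_martians makes the kernel log a message for each such dropped packet, which is handy when debugging:

    [root@localhost ~]# echo 1 > /proc/sys/net/ipv4/conf/all/log_martians
    [root@localhost ~]# dmesg | tail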
    

    There are two ways to solve this problem:

    a. add a symmetric route
    b. disable the reverse path check

    a. Add a symmetric route. After adding the following route in the root namespace (pointing 1.1.1.2 back at vethb), the ICMP packets go out em1:

    [root@localhost ~]# ip route add 1.1.1.2 dev vethb
    [root@localhost ~]# ip r
    default via 10.164.129.1 dev em1 proto static metric 100
    1.1.1.2 dev vethb scope link
    10.10.10.0/24 dev gre10 proto kernel scope link src 10.10.10.1
    10.164.129.0/24 dev em1 proto kernel scope link src 10.164.129.16 metric 100
    169.254.0.0/16 dev provisioning_nw scope link metric 1016
    169.254.0.0/16 dev idrac_nw scope link metric 1017
    192.168.0.0/24 dev provisioning_nw proto kernel scope link src 192.168.0.253
    192.168.10.0/24 dev idrac_nw proto kernel scope link src 192.168.10.13
    192.168.122.0/24 dev virbr0 proto kernel scope link src 192.168.122.1
    [root@localhost ~]# tcpdump -vne -i em1 icmp or arp
    tcpdump: listening on em1, link-type EN10MB (Ethernet), capture size 262144 bytes
    08:56:10.509045 90:b1:1c:55:37:1e > 00:00:0c:07:ac:02, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 63, id 28970, offset 0, flags [DF], proto ICMP (1), length 84)
        1.1.1.2 > 1.1.1.4: ICMP echo request, id 1009, seq 1664, length 64
    08:56:11.509051 90:b1:1c:55:37:1e > 00:00:0c:07:ac:02, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 63, id 29200, offset 0, flags [DF], proto ICMP (1), length 84)
        1.1.1.2 > 1.1.1.4: ICMP echo request, id 1009, seq 1665, length 64
    

    b. Disable the reverse path check.
    The effective rp_filter is the maximum of the per-device value and the all value, so both must be turned off:

    #define IN_DEV_RPFILTER(in_dev)     IN_DEV_MAXCONF((in_dev), RP_FILTER)
    
    #define IN_DEV_MAXCONF(in_dev, attr) \
        (max(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), attr), \
             IN_DEV_CONF_GET((in_dev), attr)))
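
    Both values can be checked (before and after changing them) with sysctl:

    [root@localhost ~]# sysctl net.ipv4.conf.all.rp_filter net.ipv4.conf.vethb.rp_filter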
    
    [root@localhost ~]# ip route del 1.1.1.2 dev vethb
    [root@localhost ~]# echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter
    [root@localhost ~]# echo 0 > /proc/sys/net/ipv4/conf/vethb/rp_filter
    
    [root@localhost ~]# tcpdump -vne -i em1 icmp or arp
    tcpdump: listening on em1, link-type EN10MB (Ethernet), capture size 262144 bytes
    09:01:22.555047 90:b1:1c:55:37:1e > 00:00:0c:07:ac:02, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 63, id 58344, offset 0, flags [DF], proto ICMP (1), length 84)
        1.1.1.2 > 1.1.1.4: ICMP echo request, id 1009, seq 1976, length 64
    09:01:23.555046 90:b1:1c:55:37:1e > 00:00:0c:07:ac:02, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 63, id 58481, offset 0, flags [DF], proto ICMP (1), length 84)
        1.1.1.2 > 1.1.1.4: ICMP echo request, id 1009, seq 1977, length 64
    

    proxy_arp

    At this point the ICMP packets finally make it out, but the MAC address for 1.1.1.4 in the test namespace was set by hand, which is inflexible. The device's proxy_arp feature can take care of this instead.

    //enable proxy_arp
    [root@localhost ~]# echo 1 >  /proc/sys/net/ipv4/conf/vethb/proxy_arp
    
    [root@localhost ~]# taskset -c 3 ip netns exec test ping 1.1.1.4
    PING 1.1.1.4 (1.1.1.4) 56(84) bytes of data.
    ^C
    --- 1.1.1.4 ping statistics ---
    1 packets transmitted, 0 received, 100% packet loss, time 0ms
    
    //vethb's MAC address has been learned
    [root@localhost ~]# ip netns exec test arp -n
    Address                  HWtype  HWaddress           Flags Mask            Iface
    1.1.1.4                  ether   96:ec:6a:a8:67:ed   C                     vetha
    

    However, ARP processing also involves a route lookup, the reverse path check, and so on, so all of the problems above apply to ARP as well; the settings described earlier fix them.

    arp_process
        if (arp->ar_op == htons(ARPOP_REQUEST) &&
            ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
            rt = skb_rtable(skb);
            addr_type = rt->rt_type;
    
            if (addr_type == RTN_LOCAL) {
            ...
          } else if (IN_DEV_FORWARD(in_dev)) {
                if (addr_type == RTN_UNICAST  &&
                    (arp_fwd_proxy(in_dev, dev, rt) ||
                     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
                     (rt->dst.dev != dev &&
                      pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
                    n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
                    if (n)
                        neigh_release(n);
    
                    if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
                        skb->pkt_type == PACKET_HOST ||
                        NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
                        arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
                             dev, tip, sha, dev->dev_addr,
                             sha);
                    } else {
                        pneigh_enqueue(&arp_tbl,
                                   in_dev->arp_parms, skb);
                        return 0;
                    }
                    goto out;
                }
    

    Summary

    a. If one end of a veth pair is in another namespace and the other end is in the root namespace without being attached to a bridge, then a unicast packet sent through the veth from the test namespace must carry the MAC of the veth's peer device as its destination. This can be configured statically in the test namespace, or handled with proxy ARP.
    b. To get past the reverse path check, either disable it or add a symmetric route.
    c. The device's forwarding feature must be enabled.

    The commands involved are listed below; in fact, the calico CNI in k8s boils down to essentially these few settings.

    echo 1 >  /proc/sys/net/ipv4/conf/vethb/proxy_arp
    echo 1 > /proc/sys/net/ipv4/conf/vethb/forwarding
    echo 0 > /proc/sys/net/ipv4/conf/vethb/rp_filter
    ip route add 1.1.1.2 dev vethb
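
    Putting it all together, the whole setup from this article can be reproduced with the following sketch (em1 and its default route are assumed to exist, as in the captures above):

    ip link add vetha type veth peer name vethb
    ip link set dev vethb up
    ip netns add test
    ip link set dev vetha netns test
    ip netns exec test ip link set dev vetha up
    ip netns exec test ip address add dev vetha 1.1.1.2/24
    echo 1 > /proc/sys/net/ipv4/conf/vethb/proxy_arp
    echo 1 > /proc/sys/net/ipv4/conf/vethb/forwarding
    # rp_filter is max(all, device), so both must be 0
    echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter
    echo 0 > /proc/sys/net/ipv4/conf/vethb/rp_filter
    ip route add 1.1.1.2 dev vethb
    ip netns exec test ping 1.1.1.4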
    
