美文网首页
接口重设net namespace后的报文收发

接口重设net namespace后的报文收发

作者: 苏苏林 | 来源:发表于2021-01-05 16:09 被阅读0次

    linux网络虚拟化经常会用到network namespace,将一些创建完成的虚拟接口加入到另一个namespace实现网络隔离。
    既然接口已经属于另一个namespace,为什么报文仍然能够在创建接口的ns和切换后的ns之间收发呢?这是因为接口无论怎么切换netns都会在创建接口所在的netns中留下一些痕迹,将创建接口所在netns和接口关联起来。
    拿最简单的ip gre口举例。

    ip gre接口是一个三层的ip tunnel接口,外层dst ip通常是本地物理口的ip地址,即协议栈认为报文是送往本机的,于是做上层协议(gre)处理,调用ipgre_rcv,最终调用__ipgre_rcv,其中ip_tunnel_lookup函数就是用来查找报文应该送往哪个gre接口的查询函数。我们可以看到,gre接口数据都是从net_generic(net, ipgre_net_id) 返回的ip_tunnel_net 中查询的,而这里的net取自收到报文的接口(dev_net(skb->dev)),也就是收包接口所在的netns。

    
    /*
     * ipgre_rcv - hand a received GRE packet to the matching tunnel table.
     * @skb:     received packet
     * @tpi:     parsed tunnel header info (protocol, flags, key)
     * @hdr_len: length of the tunnel header, forwarded to __ipgre_rcv()
     *
     * The per-namespace tunnel table is taken from the namespace of the
     * device the packet arrived on (dev_net(skb->dev)).
     *
     * Returns whatever __ipgre_rcv() returns for the last lookup attempted.
     */
    static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
                 int hdr_len)
    {
        struct net *net = dev_net(skb->dev);
        int ret;
    
        /* Anything that is not ETH_P_TEB is looked up in the ipgre table only. */
        if (tpi->proto != htons(ETH_P_TEB))
            return __ipgre_rcv(skb, tpi, net_generic(net, ipgre_net_id),
                       hdr_len, false);
    
        /* ETH_P_TEB: try the gre_tap table first. */
        ret = __ipgre_rcv(skb, tpi, net_generic(net, gre_tap_net_id),
                  hdr_len, false);
        if (ret != PACKET_NEXT)
            return ret;
    
        /* ipgre tunnels in collect metadata mode should receive
         * also ETH_P_TEB traffic, so retry against the ipgre table
         * with raw_proto set.
         */
        return __ipgre_rcv(skb, tpi, net_generic(net, ipgre_net_id),
                   hdr_len, true);
    }
    
    /*
     * __ipgre_rcv - look up the tunnel for a GRE packet and deliver it.
     * @skb:       received packet
     * @tpi:       parsed tunnel header info
     * @itn:       per-namespace tunnel table to search
     * @hdr_len:   tunnel header length to pull from @skb
     * @raw_proto: forwarded to __iptunnel_pull_header()
     *
     * Returns PACKET_NEXT when no tunnel in @itn matches, PACKET_RCVD when
     * the packet was passed to ip_tunnel_rcv() or was dropped because the
     * header pull failed (the skb is freed in that case), and PACKET_REJECT
     * when the collect_md metadata dst could not be obtained (the skb is not
     * freed here on that path).
     */
    static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
                   struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
    {
        const struct iphdr *outer = ip_hdr(skb);
        struct metadata_dst *md_dst = NULL;
        struct ip_tunnel *t;
    
        /* Match on ingress ifindex, GRE flags, outer addresses and key. */
        t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                     outer->saddr, outer->daddr, tpi->key);
        if (!t)
            return PACKET_NEXT;
    
        if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
                       raw_proto, false) < 0) {
            kfree_skb(skb);
            return PACKET_RCVD;
        }
    
        if (t->dev->type == ARPHRD_NONE)
            skb_reset_mac_header(skb);
        else
            skb_pop_mac_header(skb);
    
        if (t->collect_md) {
            __be16 flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
            __be64 tun_id = key32_to_tunnel_id(tpi->key);
    
            md_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
            if (!md_dst)
                return PACKET_REJECT;
        }
    
        ip_tunnel_rcv(t, skb, tpi, md_dst, log_ecn_error);
        return PACKET_RCVD;
    }
    
    

    在每个namespace中都有一个存放每namespace data的地方,net->gen,一些虚拟接口在创建的时候,都会将自己的私有数据存放在其中。我们可以搜一下net_generic 函数,查询哪些接口会在net->gen中挂载数据。

    /*
     * net_generic - fetch a per-namespace private data pointer.
     * @net: network namespace to read from
     * @id:  1-based slot id; the value stored at index id - 1 is returned
     *
     * The net->gen array is dereferenced inside an RCU read-side section.
     * No bounds or NULL checking is performed on @id here.
     */
    static inline void *net_generic(const struct net *net, int id)
    {
        void *data;
    
        rcu_read_lock();
        data = rcu_dereference(net->gen)->ptr[id - 1];
        rcu_read_unlock();
    
        return data;
    }
    #endif
    

    可以看到 ip gre在net->gen中存储的是 ip_tunnel_net 结构,其哈希表tunnels上挂着各个 ip_tunnel 结构,gre的封装信息都在 ip_tunnel 中。
    vxlan接口存放的是 vxlan_net 结构,其中挂着 vxlan_dev 和 vxlan_sock,vxlan接口信息和vxlan的udp sock信息都存在其中。

    /*
     * Per-network-namespace state for an IP tunnel type; this is the object
     * that net_generic(net, <tunnel net id>) returns in ipgre_rcv() and
     * ip_tunnel_newlink().
     */
    struct ip_tunnel_net {
        struct net_device *fb_tunnel_dev;  /* NOTE(review): presumably the fallback tunnel device - confirm */
        struct hlist_head tunnels[IP_TNL_HASH_SIZE];  /* hash buckets; per the original note, entries are struct ip_tunnel */
        struct ip_tunnel __rcu *collect_md_tun;  /* RCU pointer; ip_tunnel_newlink() returns -EEXIST for a collect_md tunnel when this is already set */
    };
    
    /* per-network namespace private data for this module */
    struct vxlan_net {
        struct list_head  vxlan_list;          /* per the original note, a list of struct vxlan_dev */
        struct hlist_head sock_list[PORT_HASH_SIZE];  /* hash buckets; per the original note, entries are struct vxlan_sock */
        spinlock_t    sock_lock;               /* NOTE(review): presumably protects sock_list - confirm */
    };
    

    我们在创建接口的时候,在net中做了两个操作:
    1、调用 register_netdevice 做设备初始化,它再调用list_netdevice,将dev加入到net的dev_base_head、dev_name_head、dev_index_head三个链表上;
    2、调用 ip_tunnel_add,将dev的私有数据(ip_tunnel)加入到 net的gen中(net_generic);
    注意这里 nt->net = net 的操作,保存了原始的netns。

    
    /*
     * ip_tunnel_newlink - register a new IP tunnel device and add it to the
     * per-namespace tunnel table.
     * @dev: tunnel net_device being created
     * @tb:  netlink attributes; IFLA_ADDRESS and IFLA_MTU are consulted
     * @p:   tunnel parameters, copied into the tunnel's private data
     *
     * The namespace the device lives in at creation time is recorded in
     * nt->net, and the tunnel is added to that namespace's table (looked up
     * with net_generic()).
     *
     * Returns 0 on success, -EEXIST if an equivalent tunnel (or, for
     * collect_md tunnels, any collect_md tunnel) already exists, or the error
     * returned by register_netdevice().
     */
    int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                  struct ip_tunnel_parm *p)
    {
        struct ip_tunnel *nt = netdev_priv(dev);
        struct net *net = dev_net(dev);
        struct ip_tunnel_net *itn = net_generic(net, nt->ip_tnl_net_id);
        int mtu;
        int err;
    
        /* Refuse duplicates before touching any state. */
        if (nt->collect_md) {
            if (rtnl_dereference(itn->collect_md_tun))
                return -EEXIST;
        } else if (ip_tunnel_find(itn, p, dev->type)) {
            return -EEXIST;
        }
    
        /* Remember the original (creating) namespace in the ip_tunnel. */
        nt->net = net;
        nt->parms = *p;
    
        err = register_netdevice(dev);
        if (err)
            return err;
    
        /* Ethernet-type tunnels get a random MAC unless one was supplied. */
        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
            eth_hw_addr_random(dev);
    
        /* Only apply the computed MTU when the user did not request one. */
        mtu = ip_tunnel_bind_dev(dev);
        if (!tb[IFLA_MTU])
            dev->mtu = mtu;
    
        ip_tunnel_add(itn, nt);
        return 0;
    }
    EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
    
    

    然后在设置接口netns的时候,调用 dev_change_net_namespace 函数,这个函数中,在修改了netns之后(dev_net_set),只调用了list_netdevice 重新挂载dev到net的设备链表中,而没有设置net->gen,这说明,只有创建接口的原始netns才会有private data保存在net->gen,其它netns是没有的。
    类似的,vxlan接口,它的udp socket、vxlan_dev都留在创建它的netns,会在源netns接收到vxlan的udp报文,解封udp后,解析为vxlan报文,通过sock关联到vxlan_dev和dev可以找到对应的vxlan接口。
    在源netns找到对应的接口,调用接口的接收处理函数,完成解封装等操作,后面如果再入协议栈,就是接口当前所在的netns的协议栈了,比如路由、netfilter、neighbor等等。

    /*
     * dev_change_net_namespace - move a registered device to another netns.
     * @dev: device to move
     * @net: destination network namespace
     * @pat: optional name pattern handed to dev_get_valid_name() when
     *       dev->name is already taken in @net; if NULL, a name clash
     *       fails the move
     *
     * Must be called with RTNL held (see ASSERT_RTNL below).
     *
     * Returns 0 on success or when @dev is already in @net, -EINVAL when the
     * device has NETIF_F_NETNS_LOCAL set or is not in NETREG_REGISTERED
     * state, and -EEXIST when no usable name is available in @net.
     *
     * As written, the function unlists the device from the old namespace,
     * switches dev's namespace with dev_net_set() and re-lists it; it does
     * not itself reference any net_generic() per-namespace data.
     */
    int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
    {
        int err;
    
        ASSERT_RTNL();
    
        /* Don't allow namespace local devices to be moved. */
        err = -EINVAL;
        if (dev->features & NETIF_F_NETNS_LOCAL)
            goto out;
    
        /* Ensure the device has been registered */
        if (dev->reg_state != NETREG_REGISTERED)
            goto out;
    
        /* Get out if there is nothing to do */
        err = 0;
        if (net_eq(dev_net(dev), net))
            goto out;
    
        /* Pick the destination device name, and ensure
         * we can use it in the destination network namespace.
         */
        err = -EEXIST;
        if (__dev_get_by_name(net, dev->name)) {
            /* We get here if we can't use the current device name */
            if (!pat)
                goto out;
            if (dev_get_valid_name(net, dev, pat) < 0)
                goto out;
        }
    
        /*
         * And now a mini version of register_netdevice unregister_netdevice.
         * There is no "goto out" past this point, so the move always runs
         * to completion.
         */
    
        /* If device is running close it first. */
        dev_close(dev);
    
        /* And unlink it from device chain */
        err = -ENODEV;  /* overwritten below before the function returns */
        unlist_netdevice(dev);
    
        synchronize_net();
    
        /* Shutdown queueing discipline. */
        dev_shutdown(dev);
    
        /* Notify protocols, that we are about to destroy
           this device. They should clean all the things.
    
           Note that dev->reg_state stays at NETREG_REGISTERED.
           This is wanted because this way 8021q and macvlan know
           the device is just moving and can keep their slaves up.
        */
        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
        rcu_barrier();
        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
    
        /*
         *  Flush the unicast and multicast chains
         */
        dev_uc_flush(dev);
        dev_mc_flush(dev);
    
        /* Send a netdev-removed uevent to the old namespace */
        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
        netdev_adjacent_del_links(dev);
    
        /* Actually switch the network namespace */
        dev_net_set(dev, net);
    
        /* If there is an ifindex conflict assign a new one */
        if (__dev_get_by_index(net, dev->ifindex))
            dev->ifindex = dev_new_index(net);
    
        /* Send a netdev-add uevent to the new namespace */
        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
        netdev_adjacent_add_links(dev);
    
        /* Fixup kobjects; a failure only triggers a warning, the move continues */
        err = device_rename(&dev->dev, dev->name);
        WARN_ON(err);
    
        /* Add the device back in the hashes */
        list_netdevice(dev);
    
        /* Notify protocols, that a new device appeared. */
        call_netdevice_notifiers(NETDEV_REGISTER, dev);
    
        /*
         *  Prevent userspace races by waiting until the network
         *  device is fully setup before sending notifications.
         */
        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
    
        synchronize_net();
        err = 0;
    out:
        return err;
    }
    EXPORT_SYMBOL_GPL(dev_change_net_namespace);
    
    

    上面是接收流程,如果是从netns中出来的流量,怎么进入源netns的呢?
    gre接口的发送函数是 ipgre_xmit,最终调用ip_tunnel_xmit,如下我们可以看到,在完成 gre头和外层ip头的封装后,重新查询路由的处理:
    // rt = ip_route_output_key(tunnel->net, &fl4);
    使用的netns是ip_tunnel中存储的netns,在上面创建gre的函数中可知,这个netns是原始的netns(创建gre的netns),也就是说外层ip的路由是查询的原始netns的路由。
    再后面调用iptunnel_xmit发送的时候,最后一个参数 !net_eq(tunnel->net, dev_net(dev)) 为true(接口当前所在的netns与原始netns不同),因此也会做skb的netns切换:
    // iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));

    
    /*
     * ip_tunnel_xmit - encapsulate and transmit a packet through an IP tunnel.
     * @skb:        packet to send
     * @dev:        tunnel device the packet is being sent on
     * @tnl_params: outer IP header template (daddr, saddr, tos, ttl, frag_off)
     * @protocol:   outer IP protocol number
     *
     * Chooses the outer destination, TOS, TTL and DF bit, finds a route in
     * tunnel->net (the namespace stored in the tunnel's private data, not
     * dev_net(dev)), and hands the packet to iptunnel_xmit(). The last
     * argument to iptunnel_xmit() is true when tunnel->net differs from the
     * device's current namespace.
     *
     * Returns nothing. On every error path the skb is freed here and a
     * device statistics counter is incremented.
     */
    void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                const struct iphdr *tnl_params, u8 protocol)
    {
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *inner_iph;
        struct flowi4 fl4;
        u8     tos, ttl;
        __be16 df;
        struct rtable *rt;      /* Route to the other host */
        unsigned int max_headroom;  /* The extra header space needed */
        __be32 dst;
        bool connected;
    
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        /* "connected" gates use of the cached route further down. */
        connected = (tunnel->parms.iph.daddr != 0);
    
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
    
        dst = tnl_params->daddr;
        if (dst == 0) {
            /* NBMA tunnel: no fixed remote, derive it from the packet */
    
            if (!skb_dst(skb)) {
                dev->stats.tx_fifo_errors++;
                goto tx_error;
            }
    
            if (skb->protocol == htons(ETH_P_IP)) {
                /* IPv4 payload: use the next hop of the inner route */
                rt = skb_rtable(skb);
                dst = rt_nexthop(rt, inner_iph->daddr);
            }
    #if IS_ENABLED(CONFIG_IPV6)
            else if (skb->protocol == htons(ETH_P_IPV6)) {
                const struct in6_addr *addr6;
                struct neighbour *neigh;
                bool do_tx_error_icmp;
                int addr_type;
    
                neigh = dst_neigh_lookup(skb_dst(skb),
                             &ipv6_hdr(skb)->daddr);
                if (!neigh)
                    goto tx_error;
    
                addr6 = (const struct in6_addr *)&neigh->primary_key;
                addr_type = ipv6_addr_type(addr6);
    
                /* Fall back to the packet's own destination address */
                if (addr_type == IPV6_ADDR_ANY) {
                    addr6 = &ipv6_hdr(skb)->daddr;
                    addr_type = ipv6_addr_type(addr6);
                }
    
                /* Only a v4-compatible address yields an outer IPv4 dst */
                if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                    do_tx_error_icmp = true;
                else {
                    do_tx_error_icmp = false;
                    dst = addr6->s6_addr32[3];
                }
                /* Drop the neighbour reference before any error exit */
                neigh_release(neigh);
                if (do_tx_error_icmp)
                    goto tx_error_icmp;
            }
    #endif
            else
                goto tx_error;
    
            connected = false;
        }
    
        tos = tnl_params->tos;
        if (tos & 0x1) {
            /* Low bit set: take TOS from the inner header for IP/IPv6 */
            tos &= ~0x1;
            if (skb->protocol == htons(ETH_P_IP)) {
                tos = inner_iph->tos;
                connected = false;
            } else if (skb->protocol == htons(ETH_P_IPV6)) {
                tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                connected = false;
            }
        }
    
        init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
                 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
        /* Original author's note: the source IP is taken from the dst cache
         * or from the route lookup (address of the egress interface).
         */
        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
            goto tx_error;
    
        rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
                 NULL;
    
        if (!rt) {
            /* Route lookup happens in tunnel->net, not in dev_net(dev) */
            rt = ip_route_output_key(tunnel->net, &fl4);
    
            if (IS_ERR(rt)) {
                dev->stats.tx_carrier_errors++;
                goto tx_error;
            }
            if (connected)
                dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                          fl4.saddr);
        }
    
        /* The route points back at this tunnel device: count it and drop */
        if (rt->dst.dev == dev) {
            ip_rt_put(rt);
            dev->stats.collisions++;
            goto tx_error;
        }
    
        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
            ip_rt_put(rt);
            goto tx_error;
        }
    
        if (tunnel->err_count > 0) {
            /* Within IPTUNNEL_ERR_TIMEO of err_time: signal link failure */
            if (time_before(jiffies,
                    tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                tunnel->err_count--;
    
                dst_link_failure(skb);
            } else
                tunnel->err_count = 0;
        }
    
        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
            /* No configured TTL: take it from the inner header or the route */
            if (skb->protocol == htons(ETH_P_IP))
                ttl = inner_iph->ttl;
    #if IS_ENABLED(CONFIG_IPV6)
            else if (skb->protocol == htons(ETH_P_IPV6))
                ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
    #endif
            else
                ttl = ip4_dst_hoplimit(&rt->dst);
        }
    
        df = tnl_params->frag_off;
        /* Add the inner DF bit unless the tunnel is set to ignore it */
        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
            df |= (inner_iph->frag_off&htons(IP_DF));
    
        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        /* needed_headroom is only ever raised here, never lowered */
        if (max_headroom > dev->needed_headroom)
            dev->needed_headroom = max_headroom;
    
        if (skb_cow_head(skb, dev->needed_headroom)) {
            ip_rt_put(rt);
            dev->stats.tx_dropped++;
            kfree_skb(skb);
            return;
        }
    
        /* Last argument is true when tunnel->net differs from the device's
         * current namespace.
         */
        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
                  df, !net_eq(tunnel->net, dev_net(dev)));
        return;
    
    #if IS_ENABLED(CONFIG_IPV6)
    tx_error_icmp:
        dst_link_failure(skb);
    #endif
    tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
    }
    EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
    
    

    相关文章

      网友评论

          本文标题:接口重设net namespace后的报文收发

          本文链接:https://www.haomeiwen.com/subject/orpioktx.html