美文网首页
接口重设net namespace后的报文收发

接口重设net namespace后的报文收发

作者: 苏苏林 | 来源:发表于2021-01-05 16:09 被阅读0次

linux网络虚拟化经常会用到network namespace,将一些创建完成的虚拟接口加入到另一个namespace实现网络隔离。
既然接口已经属于另一个namespace,为什么报文仍然能够在创建接口的ns和切换后的ns之间收发呢?这是因为接口无论怎么切换netns都会在创建接口所在的netns中留下一些痕迹,将创建接口所在netns和接口关联起来。
拿最简单的ip gre口举例。

ip gre接口是一个三层的ip tunnel接口,外层dst ip通常是本地物理口的ip地址,即协议栈认为报文是送往本机的,于是做上层协议(gre)处理,调用ipgre_rcv,最终调用__ipgre_rcv,其中ip_tunnel_lookup函数就是用来查找应该送往哪个gre接口的查询函数,我们可以看到gre接口数据都是从net_generic(net, ipgre_net_id) 返回的ip_tunnel_net 中查询的。


/*
 * Receive entry for GRE packets: choose the per-namespace tunnel table
 * of the namespace owning the receiving device (skb->dev) and try to
 * deliver the packet through __ipgre_rcv().
 *
 * ETH_P_TEB payloads are first matched against the gretap table
 * (gre_tap_net_id); everything else goes to the ipgre table
 * (ipgre_net_id).  Returns whatever __ipgre_rcv() reports.
 */
static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
             int hdr_len)
{
    struct net *rx_net = dev_net(skb->dev);
    const bool is_teb = (tpi->proto == htons(ETH_P_TEB));
    int verdict;

    verdict = __ipgre_rcv(skb, tpi,
                  net_generic(rx_net, is_teb ? gre_tap_net_id
                                 : ipgre_net_id),
                  hdr_len, false);
    if (verdict != PACKET_NEXT || !is_teb)
        return verdict;

    /* ipgre tunnels in collect metadata mode should receive
     * also ETH_P_TEB traffic, so retry against the ipgre table
     * with raw_proto set.
     */
    return __ipgre_rcv(skb, tpi, net_generic(rx_net, ipgre_net_id),
               hdr_len, true);
}

/*
 * Look up the GRE tunnel matching this packet in @itn and, if one is
 * found, strip the tunnel header and pass the skb to ip_tunnel_rcv().
 *
 * Returns:
 *   PACKET_NEXT   - no tunnel in @itn matched; skb untouched.
 *   PACKET_RCVD   - packet was consumed (delivered, or freed because
 *                   pulling the header failed).
 *   PACKET_REJECT - collect_md tunnel but ip_tun_rx_dst() returned NULL.
 */
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
               struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
    const struct iphdr *outer = ip_hdr(skb);
    struct metadata_dst *md_dst = NULL;
    struct ip_tunnel *t;

    /* Match on ingress ifindex, GRE flags, outer addresses and key. */
    t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                 outer->saddr, outer->daddr, tpi->key);
    if (!t)
        return PACKET_NEXT;

    if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
                   raw_proto, false) < 0) {
        kfree_skb(skb);
        return PACKET_RCVD;
    }

    if (t->dev->type == ARPHRD_NONE)
        skb_reset_mac_header(skb);
    else
        skb_pop_mac_header(skb);

    if (t->collect_md) {
        __be16 md_flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
        __be64 md_id = key32_to_tunnel_id(tpi->key);

        md_dst = ip_tun_rx_dst(skb, md_flags, md_id, 0);
        if (!md_dst)
            return PACKET_REJECT;
    }

    ip_tunnel_rcv(t, skb, tpi, md_dst, log_ecn_error);
    return PACKET_RCVD;
}

在每个namespace中都有一个存放per-namespace数据的地方,即net->gen,一些虚拟接口在创建的时候,都会将自己的私有数据存放在其中。我们可以搜一下net_generic 函数,查询哪些接口会在net->gen中挂载数据。

/*
 * Fetch the per-namespace private pointer stored under @id in @net.
 *
 * The net->gen array is read under rcu_read_lock(); @id is 1-based
 * with respect to the ptr[] array (slot id - 1 is returned).
 */
static inline void *net_generic(const struct net *net, int id)
{
    struct net_generic *gen_tbl;
    void *priv;

    rcu_read_lock();
    gen_tbl = rcu_dereference(net->gen);
    priv = gen_tbl->ptr[id - 1];
    rcu_read_unlock();

    return priv;
}
#endif

可以看到 ip gre在其中存储的是 ip_tunnel_net 结构,其 tunnels 哈希表中挂着各个 ip_tunnel,gre的封装信息都在 ip_tunnel 中。
vxlan接口存放的是 vxlan_net 结构,其中挂着 vxlan_dev 和 vxlan_sock,vxlan接口信息和vxlan的udp sock信息都存在其中。

/*
 * Per-network-namespace state of one IP tunnel type.  It is the object
 * returned by net_generic(net, <tunnel net id>) in the receive and
 * newlink paths.
 */
struct ip_tunnel_net {
    /* presumably the fallback tunnel device of this namespace - TODO confirm */
    struct net_device *fb_tunnel_dev;
    /* hash buckets; the entries are struct ip_tunnel */
    struct hlist_head tunnels[IP_TNL_HASH_SIZE];  // ip_tunnel
    /* ip_tunnel_newlink() returns -EEXIST for a collect_md tunnel when this is already set */
    struct ip_tunnel __rcu *collect_md_tun;
};

/* per-network namespace private data for this module */
struct vxlan_net {
    /* list whose entries are struct vxlan_dev */
    struct list_head  vxlan_list;          // vxlan_dev
    /* hash buckets whose entries are struct vxlan_sock */
    struct hlist_head sock_list[PORT_HASH_SIZE];  // vxlan_sock
    /* presumably protects sock_list - TODO confirm against vxlan.c */
    spinlock_t    sock_lock;
};

我们在创建接口的时候,在net中做了两个操作:
1、调用 register_netdevice 做设备初始化,它再调用list_netdevice,将dev加入到net的dev_base_head、dev_name_head、dev_index_head三个链表上;
2、调用 ip_tunnel_add,将dev的私有数据(ip_tunnel)加入到 net的gen中(net_generic);
注意这里 nt->net = net 的操作,保存了原始的netns。


/*
 * Create a new IP tunnel device from netlink parameters.
 *
 * @dev: the not-yet-registered tunnel net_device
 * @tb:  netlink attributes; IFLA_ADDRESS and IFLA_MTU are consulted
 * @p:   tunnel parameters, copied into the tunnel's private data
 *
 * Returns 0 on success, -EEXIST if an equivalent tunnel (or, for
 * collect_md, any collect_md tunnel) already exists in the namespace,
 * or the error reported by register_netdevice().
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
              struct ip_tunnel_parm *p)
{
    struct net *net = dev_net(dev);
    struct ip_tunnel *nt = netdev_priv(dev);
    struct ip_tunnel_net *itn = net_generic(net, nt->ip_tnl_net_id);
    int bound_mtu;
    int err;

    if (nt->collect_md) {
        if (rtnl_dereference(itn->collect_md_tun))
            return -EEXIST;
    } else if (ip_tunnel_find(itn, p, dev->type)) {
        return -EEXIST;
    }

    /* Remember the namespace the tunnel is created in; this is the
     * "original" netns stored in the ip_tunnel.
     */
    nt->net = net;
    nt->parms = *p;

    err = register_netdevice(dev);
    if (err)
        return err;

    /* Ethernet-type tunnels without an explicit address get a random MAC. */
    if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
        eth_hw_addr_random(dev);

    /* Always bind; only adopt the computed MTU when none was requested. */
    bound_mtu = ip_tunnel_bind_dev(dev);
    if (!tb[IFLA_MTU])
        dev->mtu = bound_mtu;

    /* Publish the tunnel in the creating namespace's tunnel table. */
    ip_tunnel_add(itn, nt);
    return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

然后在设置接口netns的时候,调用 dev_change_net_namespace 函数,这个函数中,在修改了netns之后(dev_net_set),只调用了list_netdevice 重新挂载dev到net的设备链表中,而没有设置net->gen,这说明,在创建接口的原始netns才会有private data保存在net->gen,其它netns是没有的。
类似的,vxlan接口,它的udp socket、vxlan_dev都留在创建它的netns,会在源netns接收到vxlan的udp报文,解封udp后,解析为vxlan报文,通过sock关联到vxlan_dev和dev可以找到对应的vxlan接口。
在源netns找到对应的接口,调用接口的接收处理函数,完成解封装等操作,后面如果再入协议栈,就是接口当前所在的netns的协议栈了,比如路由、netfilter、neighbor等等。

/*
 * Move a registered network device into another network namespace.
 *
 * @dev: device to move
 * @net: destination namespace
 * @pat: optional name pattern used when dev->name is already taken in
 *       @net; if NULL a name clash is an error
 *
 * Must be called with the RTNL lock held (ASSERT_RTNL()).
 *
 * Returns 0 on success (also when @dev already lives in @net),
 * -EINVAL if the device is netns-local or not in NETREG_REGISTERED
 * state, -EEXIST if no usable name can be found in @net.
 *
 * Within this function the device is only removed from and re-added to
 * the namespaces' device lists (unlist_netdevice()/list_netdevice());
 * nothing here touches net->gen directly.
 */
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
    int err;

    ASSERT_RTNL();

    /* Don't allow namespace local devices to be moved. */
    err = -EINVAL;
    if (dev->features & NETIF_F_NETNS_LOCAL)
        goto out;

    /* Ensure the device has been registered */
    if (dev->reg_state != NETREG_REGISTERED)
        goto out;

    /* Get out if there is nothing to do */
    err = 0;
    if (net_eq(dev_net(dev), net))
        goto out;

    /* Pick the destination device name, and ensure
     * we can use it in the destination network namespace.
     */
    err = -EEXIST;
    if (__dev_get_by_name(net, dev->name)) {
        /* We get here if we can't use the current device name */
        if (!pat)
            goto out;
        if (dev_get_valid_name(net, dev, pat) < 0)
            goto out;
    }

    /*
     * And now a mini version of register_netdevice unregister_netdevice.
     */

    /* If device is running close it first. */
    dev_close(dev);

    /* And unlink it from device chain */
    err = -ENODEV;
    unlist_netdevice(dev);

    synchronize_net();

    /* Shutdown queueing discipline. */
    dev_shutdown(dev);

    /* Notify protocols, that we are about to destroy
       this device. They should clean all the things.

       Note that dev->reg_state stays at NETREG_REGISTERED.
       This is wanted because this way 8021q and macvlan know
       the device is just moving and can keep their slaves up.
    */
    call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
    rcu_barrier();
    call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
    rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);

    /*
     *  Flush the unicast and multicast chains
     */
    dev_uc_flush(dev);
    dev_mc_flush(dev);

    /* Send a netdev-removed uevent to the old namespace */
    kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
    netdev_adjacent_del_links(dev);

    /* Actually switch the network namespace */
    dev_net_set(dev, net);

    /* If there is an ifindex conflict assign a new one */
    if (__dev_get_by_index(net, dev->ifindex))
        dev->ifindex = dev_new_index(net);

    /* Send a netdev-add uevent to the new namespace */
    kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
    netdev_adjacent_add_links(dev);

    /* Fixup kobjects */
    err = device_rename(&dev->dev, dev->name);
    WARN_ON(err);

    /* Add the device back in the hashes (now those of the new namespace,
     * since dev_net_set() has already run)
     */
    list_netdevice(dev);

    /* Notify protocols, that a new device appeared. */
    call_netdevice_notifiers(NETDEV_REGISTER, dev);

    /*
     *  Prevent userspace races by waiting until the network
     *  device is fully setup before sending notifications.
     */
    rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

    synchronize_net();
    /* A device_rename() failure is only warned about above; the move
     * itself is reported as successful.
     */
    err = 0;
out:
    return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);

上面是接收流程,如果是从netns中出来的流量,怎么进入源netns的呢?
gre接口的发送函数是 ipgre_xmit,最终调用ip_tunnel_xmit,如下我们可以看到,在完成 gre头和外层ip头的封装后,重新查询路由的处理:
// rt = ip_route_output_key(tunnel->net, &fl4);
使用的netns是ip_tunnel中存储的netns,在上面创建gre的函数中可知,这个netns是原始的netns(创建gre的netns),也就是说外层ip的路由是查询的原始netns的路由。
再后面调用iptunnel_xmit发送的时候,由于 !net_eq(tunnel->net, dev_net(dev)) == true,也会做skb的netns切换:
// iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));


/*
 * Transmit path for IP tunnels: work out the outer destination, TOS,
 * TTL and DF bit, find a route for the outer header and hand the skb
 * to iptunnel_xmit().
 *
 * @skb:        packet to encapsulate
 * @dev:        the tunnel device the packet is being sent on
 * @tnl_params: template for the outer IP header (daddr, saddr, tos,
 *              ttl, frag_off are read here)
 * @protocol:   protocol value passed to the flow setup and to
 *              iptunnel_xmit(); ip_tunnel_encap() may modify it
 *
 * Returns nothing.  On every error path the skb is freed and a counter
 * in dev->stats is incremented.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
            const struct iphdr *tnl_params, u8 protocol)
{
    struct ip_tunnel *tunnel = netdev_priv(dev);
    const struct iphdr *inner_iph;
    struct flowi4 fl4;
    u8     tos, ttl;
    __be16 df;
    struct rtable *rt;      /* Route to the other host */
    unsigned int max_headroom;  /* The extra header space needed */
    __be32 dst;
    bool connected;

    inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
    /* "connected" = tunnel has a configured remote address; only then
     * is the cached route (tunnel->dst_cache) consulted further down.
     */
    connected = (tunnel->parms.iph.daddr != 0);

    memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

    dst = tnl_params->daddr;
    if (dst == 0) {
        /* NBMA tunnel */

        /* No fixed remote: derive the outer destination from the
         * inner packet's own route/neighbour information.
         */
        if (!skb_dst(skb)) {
            dev->stats.tx_fifo_errors++;
            goto tx_error;
        }

        if (skb->protocol == htons(ETH_P_IP)) {
            rt = skb_rtable(skb);
            dst = rt_nexthop(rt, inner_iph->daddr);
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
            const struct in6_addr *addr6;
            struct neighbour *neigh;
            bool do_tx_error_icmp;
            int addr_type;

            neigh = dst_neigh_lookup(skb_dst(skb),
                         &ipv6_hdr(skb)->daddr);
            if (!neigh)
                goto tx_error;

            addr6 = (const struct in6_addr *)&neigh->primary_key;
            addr_type = ipv6_addr_type(addr6);

            if (addr_type == IPV6_ADDR_ANY) {
                addr6 = &ipv6_hdr(skb)->daddr;
                addr_type = ipv6_addr_type(addr6);
            }

            /* Only an IPv4-compatible IPv6 address yields an IPv4
             * outer destination (its last 32 bits).
             */
            if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                do_tx_error_icmp = true;
            else {
                do_tx_error_icmp = false;
                dst = addr6->s6_addr32[3];
            }
            neigh_release(neigh);
            if (do_tx_error_icmp)
                goto tx_error_icmp;
        }
#endif
        else
            goto tx_error;

        connected = false;
    }

    tos = tnl_params->tos;
    /* Bit 0 set in the configured TOS: take the TOS / DS field from
     * the inner header instead.  The route then depends on the packet,
     * so the cached route is not used.
     */
    if (tos & 0x1) {
        tos &= ~0x1;
        if (skb->protocol == htons(ETH_P_IP)) {
            tos = inner_iph->tos;
            connected = false;
        } else if (skb->protocol == htons(ETH_P_IPV6)) {
            tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
            connected = false;
        }
    }

    init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
             tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
    // Regarding the source IP: it is taken from the dst cache or from
    // the route lookup (the IP address of the egress interface).
    if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
        goto tx_error;

    rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
             NULL;

    if (!rt) {
        /* The outer route is looked up in tunnel->net, which
         * ip_tunnel_newlink() set to the namespace the tunnel was
         * created in, not in dev_net(dev).
         */
        rt = ip_route_output_key(tunnel->net, &fl4);

        if (IS_ERR(rt)) {
            dev->stats.tx_carrier_errors++;
            goto tx_error;
        }
        if (connected)
            dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                      fl4.saddr);
    }

    /* The outer route points back at this tunnel device: drop rather
     * than loop.
     */
    if (rt->dst.dev == dev) {
        ip_rt_put(rt);
        dev->stats.collisions++;
        goto tx_error;
    }

    if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
        ip_rt_put(rt);
        goto tx_error;
    }

    /* While recent errors are recorded (within IPTUNNEL_ERR_TIMEO of
     * err_time), consume one per packet and signal link failure for
     * this skb; otherwise reset the counter.
     */
    if (tunnel->err_count > 0) {
        if (time_before(jiffies,
                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
            tunnel->err_count--;

            dst_link_failure(skb);
        } else
            tunnel->err_count = 0;
    }

    tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
    ttl = tnl_params->ttl;
    /* Configured TTL of 0: inherit from the inner header, or fall back
     * to the route's hop limit for non-IP payloads.
     */
    if (ttl == 0) {
        if (skb->protocol == htons(ETH_P_IP))
            ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6))
            ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
        else
            ttl = ip4_dst_hoplimit(&rt->dst);
    }

    df = tnl_params->frag_off;
    /* Propagate the inner IPv4 DF bit unless the tunnel ignores it. */
    if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
        df |= (inner_iph->frag_off&htons(IP_DF));

    /* Grow (never shrink) the device's headroom requirement to fit the
     * link-layer header, outer IP header and any extra encapsulation.
     */
    max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
            + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
    if (max_headroom > dev->needed_headroom)
        dev->needed_headroom = max_headroom;

    if (skb_cow_head(skb, dev->needed_headroom)) {
        ip_rt_put(rt);
        dev->stats.tx_dropped++;
        kfree_skb(skb);
        return;
    }

    /* The last argument is true when the device currently lives in a
     * namespace other than the one stored in tunnel->net.
     */
    iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
              df, !net_eq(tunnel->net, dev_net(dev)));
    return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
    dst_link_failure(skb);
#endif
tx_error:
    dev->stats.tx_errors++;
    kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

相关文章

  • 接口重设net namespace后的报文收发

    linux网络虚拟化经常会用到network namespace,将一些创建完成的虚拟接口加入到另一个namesp...

  • net namespace

    ip netns {list | add delete | exec | identify | pids | m...

  • docker 容器技术的原理

    三大件 namespace pid namespace 进程隔离 net namespace ip, 路由,端口等...

  • Mybatis入门实现CURD

    Mapper字段笔记 namespace namespace的包名要和UserMapper/UserDao接口报名...

  • eNSP模拟实验-OSPF被动接口配置

    OSPF被动接口也称抑制接口,成为被动接口后,将不会接受和发送OSPF报文。可以配置被动接口来禁止此接口接受和发送...

  • 3、CRUD

    3.1:namespace 配置文件中namespace中的名称为对应Mapper接口或者Dao接口的完整包名,必...

  • SoapUI使用方法-01发送http请求

    知识传送门: SoapUI模拟服务端自定义响应报文进行接口测:http://blog.csdn.net/russ4...

  • Springboot使用了ResponseBodyAdvice处

    为了统一接口响应的报文,现实了ResponseBodyAdvice接口,通过这个接口的实现类来统一处理报文 然而在...

  • DPDK 收发包流程

    本文整理下之前的学习笔记,基于DPDK17.11版本源码,主要分析一下收发包流程。 使用DPDK的APP收发报文流...

  • 接口测试

    接口测试的原理 测试人员借助工具模拟客户端向服务器端发送请求报文,服务器端接收请求报文后,对相应的报文做出处理并向...

网友评论

      本文标题:接口重设net namespace后的报文收发

      本文链接:https://www.haomeiwen.com/subject/orpioktx.html