接口重设net namespace后的报文收发

作者: 苏苏林 | 来源:发表于2021-01-05 16:09 被阅读0次

接口重设net namespace后的报文收发
net namespace
docker 容器技术的原理
Mybatis入门实现CURD
eNSP模拟实验-OSPF被动接口配置
3、CRUD
SoapUI使用方法-01发送http请求
Springboot使用了ResponseBodyAdvice处
DPDK 收发包流程
接口测试

linux网络虚拟化经常会用到network namespace，将一些创建完成的虚拟接口加入到另一个namespace实现网络隔离。
既然接口已经属于另一个namespace，为什么报文仍然能够在创建接口的ns和切换后的ns之间收发呢？这是因为接口无论怎么切换netns都会在创建接口所在的netns中留下一些痕迹，将创建接口所在netns和接口关联起来。
拿最简单的ip gre口举例。

ip gre接口是一个三层的ip tunnel接口，外层dst ip通常是本地物理口的ip地址，即协议栈认为报文是送往本机的，会做上层协议（gre）处理：调用ipgre_rcv，最终调用__ipgre_rcv。其中ip_tunnel_lookup函数就是用来查找报文应该送往哪个gre接口的查询函数，我们可以看到gre接口数据都是从net_generic(net, ipgre_net_id) 返回的ip_tunnel_net 中查询的。


/*
 * GRE receive entry point.
 *
 * The per-namespace tunnel table is looked up in the namespace of the
 * device the packet arrived on (dev_net(skb->dev)).  ETH_P_TEB frames are
 * matched against the gretap table first; if no gretap tunnel claims the
 * packet (PACKET_NEXT) it is retried against the ipgre table in raw-proto
 * mode, so that collect-metadata ipgre tunnels also see ETH_P_TEB traffic.
 *
 * Returns the verdict produced by __ipgre_rcv().
 */
static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
             int hdr_len)
{
    struct net *net = dev_net(skb->dev);
    int verdict;

    /* First pass: pick the table that matches the inner protocol. */
    verdict = __ipgre_rcv(skb, tpi,
                  net_generic(net,
                      tpi->proto == htons(ETH_P_TEB) ?
                      gre_tap_net_id : ipgre_net_id),
                  hdr_len, false);

    /* Anything but an unclaimed ETH_P_TEB packet is final. */
    if (verdict != PACKET_NEXT || tpi->proto != htons(ETH_P_TEB))
        return verdict;

    /* Second pass: offer the ETH_P_TEB frame to the ipgre table. */
    return __ipgre_rcv(skb, tpi, net_generic(net, ipgre_net_id),
               hdr_len, true);
}

/*
 * Try to deliver a GRE packet to a tunnel found in @itn.
 *
 * @skb:       received packet; its outer IP header is used for the lookup
 * @tpi:       parsed GRE header info (flags, key, inner protocol)
 * @itn:       per-namespace tunnel table to search
 * @hdr_len:   number of header bytes to pull before handing the packet on
 * @raw_proto: forwarded unchanged to __iptunnel_pull_header()
 *
 * Returns PACKET_NEXT when no tunnel in @itn matches, PACKET_REJECT when
 * the collect-metadata dst cannot be obtained, and PACKET_RCVD otherwise
 * (including the case where pulling the header fails and the skb is freed).
 */
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
               struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
    const struct iphdr *outer_iph = ip_hdr(skb);
    struct metadata_dst *md_dst = NULL;
    struct ip_tunnel *tunnel;

    tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                  outer_iph->saddr, outer_iph->daddr, tpi->key);
    if (!tunnel)
        return PACKET_NEXT;

    /* A packet we cannot decapsulate is consumed here, not passed on. */
    if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
                   raw_proto, false) < 0) {
        kfree_skb(skb);
        return PACKET_RCVD;
    }

    if (tunnel->dev->type == ARPHRD_NONE)
        skb_reset_mac_header(skb);
    else
        skb_pop_mac_header(skb);

    if (tunnel->collect_md) {
        __be16 md_flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
        __be64 md_tun_id = key32_to_tunnel_id(tpi->key);

        md_dst = ip_tun_rx_dst(skb, md_flags, md_tun_id, 0);
        if (!md_dst)
            return PACKET_REJECT;
    }

    ip_tunnel_rcv(tunnel, skb, tpi, md_dst, log_ecn_error);
    return PACKET_RCVD;
}

在每个namespace中都有一个存放per-namespace数据的地方，即net->gen，一些虚拟接口在创建的时候，都会将自己的私有数据存放在其中。我们可以搜一下net_generic 函数，查询哪些接口会在net->gen中挂载数据。

/*
 * Fetch the per-namespace private pointer stored under @id in @net.
 *
 * Ids are 1-based: id N lives in slot N - 1 of net->gen->ptr[].  The gen
 * array is read under rcu_read_lock().
 */
static inline void *net_generic(const struct net *net, int id)
{
    void *entry;

    rcu_read_lock();
    entry = rcu_dereference(net->gen)->ptr[id - 1];
    rcu_read_unlock();

    return entry;
}
#endif

可以看到 ip gre在其中存储的是 ip_tunnel_net 结构，各个gre接口的 ip_tunnel（gre的封装信息都在其中）挂在它的 tunnels 哈希表上。
vxlan接口存放的是 vxlan_net 结构，其中挂着 vxlan_dev 和 vxlan_sock，即vxlan接口信息和vxlan的udp sock信息都存在其中。

/*
 * Per-network-namespace state for one IP tunnel type.  This is the object
 * the receive path obtains from net_generic(net, <tunnel net id>) and then
 * searches for a matching tunnel.
 */
struct ip_tunnel_net {
    /* NOTE(review): presumably the fallback tunnel device - confirm */
    struct net_device *fb_tunnel_dev;
    struct hlist_head tunnels[IP_TNL_HASH_SIZE];  /* hash buckets holding struct ip_tunnel entries */
    /* RCU-managed pointer; ip_tunnel_newlink() refuses a new collect_md tunnel when it is set */
    struct ip_tunnel __rcu *collect_md_tun;
};

/*
 * Per-network-namespace private data for the vxlan module.  It keeps the
 * vxlan devices and the vxlan sockets that belong to the namespace.
 */
struct vxlan_net {
    struct list_head  vxlan_list;          /* list of struct vxlan_dev */
    struct hlist_head sock_list[PORT_HASH_SIZE];  /* hash buckets of struct vxlan_sock */
    spinlock_t    sock_lock;               /* NOTE(review): presumably protects sock_list - confirm */
};

我们在创建接口的时候，在net中做了两个操作:
1、调用 register_netdevice 做设备初始化，它再调用list_netdevice，将dev加入到net的dev_base_head、dev_name_head、dev_index_head三个链表上；
2、调用 ip_tunnel_add，将dev的私有数据（ip_tunnel）加入到 net的gen中（net_generic）；
注意这里 nt->net = net 的操作，保存了原始的netns。


/*
 * Create a new IP tunnel device.
 *
 * @dev: freshly allocated tunnel net_device; its private area is the
 *       struct ip_tunnel
 * @tb:  netlink attributes; IFLA_ADDRESS and IFLA_MTU are consulted
 * @p:   tunnel parameters copied into the tunnel
 *
 * The namespace the device lives in at creation time is recorded in
 * nt->net and the tunnel is inserted into that namespace's per-netns
 * tunnel table, after the device has been registered.
 *
 * Returns 0 on success, -EEXIST if an equivalent tunnel (or, for
 * collect_md tunnels, any collect_md tunnel) already exists, or the error
 * from register_netdevice().
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
              struct ip_tunnel_parm *p)
{
    struct net *net = dev_net(dev);
    struct ip_tunnel *nt = netdev_priv(dev);
    struct ip_tunnel_net *itn = net_generic(net, nt->ip_tnl_net_id);
    int mtu;
    int err;

    /* Reject duplicates before touching any state. */
    if (nt->collect_md) {
        if (rtnl_dereference(itn->collect_md_tun))
            return -EEXIST;
    } else if (ip_tunnel_find(itn, p, dev->type)) {
        return -EEXIST;
    }

    /* Remember the creating namespace; it is kept in the tunnel itself. */
    nt->net = net;
    nt->parms = *p;

    err = register_netdevice(dev);
    if (err)
        return err;

    /* Ethernet-type tunnels get a random MAC unless one was supplied. */
    if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
        eth_hw_addr_random(dev);

    /* Use the computed MTU only when the caller did not ask for one. */
    mtu = ip_tunnel_bind_dev(dev);
    if (!tb[IFLA_MTU])
        dev->mtu = mtu;

    /* Finally publish the tunnel in the per-netns table. */
    ip_tunnel_add(itn, nt);
    return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

然后在设置接口netns的时候，调用 dev_change_net_namespace 函数，这个函数中，在修改了netns之后（dev_net_set），只调用了list_netdevice 重新挂载dev到net的设备链表中，而没有设置net->gen，这说明，只有创建接口的原始netns才会有private data保存在net->gen，其它netns是没有的。
类似的，vxlan接口的udp socket、vxlan_dev都留在创建它的netns中：源netns接收到vxlan的udp报文，解封udp后解析为vxlan报文，再通过sock关联到vxlan_dev和dev，就可以找到对应的vxlan接口。
在源netns找到对应的接口，调用接口的接收处理函数，完成解封装等操作，后面如果再入协议栈，就是接口当前所在的netns的协议栈了，比如路由、netfilter、neighbor等等。

/*
 * Move a registered network device into another network namespace.
 *
 * @dev: device to move
 * @net: destination network namespace
 * @pat: name pattern passed to dev_get_valid_name() if dev->name is
 *       already taken in @net; may be NULL, in which case a name clash
 *       is an error
 *
 * Must be called with the RTNL lock held (see ASSERT_RTNL()).
 *
 * Returns 0 on success or when the device is already in @net, -EINVAL if
 * the device has NETIF_F_NETNS_LOCAL set or is not in NETREG_REGISTERED
 * state, and -EEXIST if no usable name is available in @net.
 *
 * The function unlinks the device from the old namespace's device lists,
 * switches dev's namespace with dev_net_set() and relinks it with
 * list_netdevice().  This function itself never references net->gen.
 */
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
    int err;

    ASSERT_RTNL();

    /* Don't allow namespace local devices to be moved. */
    err = -EINVAL;
    if (dev->features & NETIF_F_NETNS_LOCAL)
        goto out;

    /* Ensure the device has been registered */
    if (dev->reg_state != NETREG_REGISTERED)
        goto out;

    /* Get out if there is nothing to do */
    err = 0;
    if (net_eq(dev_net(dev), net))
        goto out;

    /* Pick the destination device name, and ensure
     * we can use it in the destination network namespace.
     */
    err = -EEXIST;
    if (__dev_get_by_name(net, dev->name)) {
        /* We get here if we can't use the current device name */
        if (!pat)
            goto out;
        if (dev_get_valid_name(net, dev, pat) < 0)
            goto out;
    }

    /*
     * And now a mini version of register_netdevice unregister_netdevice.
     */

    /* If device is running close it first. */
    dev_close(dev);

    /* And unlink it from device chain */
    /* Note: this -ENODEV is never returned; err is overwritten below
     * before the next point at which the function can exit.
     */
    err = -ENODEV;
    unlist_netdevice(dev);

    synchronize_net();

    /* Shutdown queueing discipline. */
    dev_shutdown(dev);

    /* Notify protocols, that we are about to destroy
       this device. They should clean all the things.

       Note that dev->reg_state stays at NETREG_REGISTERED.
       This is wanted because this way 8021q and macvlan know
       the device is just moving and can keep their slaves up.
    */
    call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
    rcu_barrier();
    call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
    rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);

    /*
     *  Flush the unicast and multicast chains
     */
    dev_uc_flush(dev);
    dev_mc_flush(dev);

    /* Send a netdev-removed uevent to the old namespace */
    kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
    netdev_adjacent_del_links(dev);

    /* Actually switch the network namespace */
    dev_net_set(dev, net);

    /* If there is an ifindex conflict assign a new one */
    if (__dev_get_by_index(net, dev->ifindex))
        dev->ifindex = dev_new_index(net);

    /* Send a netdev-add uevent to the new namespace */
    kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
    netdev_adjacent_add_links(dev);

    /* Fixup kobjects */
    /* A rename failure only triggers a warning; err is reset to 0 below. */
    err = device_rename(&dev->dev, dev->name);
    WARN_ON(err);

    /* Add the device back in the hashes */
    list_netdevice(dev);

    /* Notify protocols, that a new device appeared. */
    call_netdevice_notifiers(NETDEV_REGISTER, dev);

    /*
     *  Prevent userspace races by waiting until the network
     *  device is fully setup before sending notifications.
     */
    rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

    synchronize_net();
    err = 0;
out:
    return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);

上面是接收流程，如果是从netns中出来的流量，怎么进入源netns的呢？
gre接口的发送函数是 ipgre_xmit，最终调用ip_tunnel_xmit，如下我们可以看到，在完成 gre头和外层ip头的封装后，重新查询路由的处理：
// rt = ip_route_output_key(tunnel->net, &fl4);
使用的netns是ip_tunnel中存储的netns，在上面创建gre的函数中可知，这个netns是原始的netns（创建gre的netns），也就是说外层ip的路由是查询的原始netns的路由。
再后面调用iptunnel_xmit发送的时候，最后一个参数 !net_eq(tunnel->net, dev_net(dev)) 为true（接口当前所在的netns与创建时的netns不同），因此也会做skb的netns切换：
// iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));


/*
 * Encapsulate and transmit a packet through an IP tunnel device.
 *
 * @skb:        packet to send; its inner network header is inspected for
 *              TOS/TTL/DF inheritance
 * @dev:        the tunnel device; its private area is the struct ip_tunnel
 * @tnl_params: outer IP header template (daddr, saddr, tos, ttl and
 *              frag_off are read from it)
 * @protocol:   outer IP protocol number; may be rewritten by
 *              ip_tunnel_encap()
 *
 * The route for the outer header is looked up in tunnel->net, not in
 * dev_net(dev), and iptunnel_xmit() is told whether those two namespaces
 * differ.
 *
 * On every error path the skb is freed and a device statistics counter is
 * bumped; the function returns nothing.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
            const struct iphdr *tnl_params, u8 protocol)
{
    struct ip_tunnel *tunnel = netdev_priv(dev);
    const struct iphdr *inner_iph;
    struct flowi4 fl4;
    u8     tos, ttl;
    __be16 df;
    struct rtable *rt;      /* Route to the other host */
    unsigned int max_headroom;  /* The extra header space needed */
    __be32 dst;
    bool connected;

    inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
    /* "connected" gates use of the tunnel's dst cache further down. */
    connected = (tunnel->parms.iph.daddr != 0);

    memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

    dst = tnl_params->daddr;
    if (dst == 0) {
        /* NBMA tunnel */
        /* No fixed remote: derive the outer destination from the
         * packet's own route (IPv4) or neighbour entry (IPv6).
         */

        if (!skb_dst(skb)) {
            dev->stats.tx_fifo_errors++;
            goto tx_error;
        }

        if (skb->protocol == htons(ETH_P_IP)) {
            rt = skb_rtable(skb);
            dst = rt_nexthop(rt, inner_iph->daddr);
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
            const struct in6_addr *addr6;
            struct neighbour *neigh;
            bool do_tx_error_icmp;
            int addr_type;

            neigh = dst_neigh_lookup(skb_dst(skb),
                         &ipv6_hdr(skb)->daddr);
            if (!neigh)
                goto tx_error;

            addr6 = (const struct in6_addr *)&neigh->primary_key;
            addr_type = ipv6_addr_type(addr6);

            if (addr_type == IPV6_ADDR_ANY) {
                addr6 = &ipv6_hdr(skb)->daddr;
                addr_type = ipv6_addr_type(addr6);
            }

            /* Only IPv4-compatible addresses yield an IPv4 endpoint. */
            if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                do_tx_error_icmp = true;
            else {
                do_tx_error_icmp = false;
                dst = addr6->s6_addr32[3];
            }
            /* Drop the neighbour reference before any error exit. */
            neigh_release(neigh);
            if (do_tx_error_icmp)
                goto tx_error_icmp;
        }
#endif
        else
            goto tx_error;

        connected = false;
    }

    tos = tnl_params->tos;
    /* Low bit of the configured TOS means "inherit from inner header". */
    if (tos & 0x1) {
        tos &= ~0x1;
        if (skb->protocol == htons(ETH_P_IP)) {
            tos = inner_iph->tos;
            connected = false;
        } else if (skb->protocol == htons(ETH_P_IPV6)) {
            tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
            connected = false;
        }
    }

    init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
             tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
    /* Source IP: taken from the dst cache or from the route lookup
     * (the address of the egress interface).
     */
    if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
        goto tx_error;

    rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
             NULL;

    if (!rt) {
        /* Outer route is resolved in tunnel->net, i.e. the namespace
         * stored in the tunnel, not the one the device is in now.
         */
        rt = ip_route_output_key(tunnel->net, &fl4);

        if (IS_ERR(rt)) {
            dev->stats.tx_carrier_errors++;
            goto tx_error;
        }
        if (connected)
            dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                      fl4.saddr);
    }

    /* Routing the outer packet back into this tunnel would loop. */
    if (rt->dst.dev == dev) {
        ip_rt_put(rt);
        dev->stats.collisions++;
        goto tx_error;
    }

    if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
        ip_rt_put(rt);
        goto tx_error;
    }

    if (tunnel->err_count > 0) {
        if (time_before(jiffies,
                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
            tunnel->err_count--;

            dst_link_failure(skb);
        } else
            tunnel->err_count = 0;
    }

    tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
    ttl = tnl_params->ttl;
    /* TTL 0 in the template means inherit from the inner header, or
     * fall back to the route's hop limit for other protocols.
     */
    if (ttl == 0) {
        if (skb->protocol == htons(ETH_P_IP))
            ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6))
            ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
        else
            ttl = ip4_dst_hoplimit(&rt->dst);
    }

    df = tnl_params->frag_off;
    if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
        df |= (inner_iph->frag_off&htons(IP_DF));

    max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
            + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
    /* needed_headroom only ever grows here. */
    if (max_headroom > dev->needed_headroom)
        dev->needed_headroom = max_headroom;

    if (skb_cow_head(skb, dev->needed_headroom)) {
        ip_rt_put(rt);
        dev->stats.tx_dropped++;
        kfree_skb(skb);
        return;
    }

    /* Last argument is true when the tunnel's stored namespace differs
     * from the namespace the device currently belongs to.
     */
    iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
              df, !net_eq(tunnel->net, dev_net(dev)));
    return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
    dst_link_failure(skb);
#endif
tx_error:
    dev->stats.tx_errors++;
    kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

接口重设net namespace后的报文收发
linux网络虚拟化经常会用到network namespace，将一些创建完成的虚拟接口加入到另一个namesp...
net namespace
ip netns {list | add delete | exec | identify | pids | m...
docker 容器技术的原理
三大件 namespace pid namespace 进程隔离 net namespace ip, 路由，端口等...
Mybatis入门实现CURD
Mapper字段笔记 namespace namespace的包名要和UserMapper/UserDao接口报名...
eNSP模拟实验-OSPF被动接口配置
OSPF被动接口也称抑制接口，成为被动接口后，将不会接受和发送OSPF报文。可以配置被动接口来禁止此接口接受和发送...
3、CRUD
3.1：namespace 配置文件中namespace中的名称为对应Mapper接口或者Dao接口的完整包名,必...
SoapUI使用方法-01发送http请求
知识传送门： SoapUI模拟服务端自定义响应报文进行接口测:http://blog.csdn.net/russ4...
Springboot使用了ResponseBodyAdvice处
为了统一接口响应的报文，现实了ResponseBodyAdvice接口，通过这个接口的实现类来统一处理报文然而在...
DPDK 收发包流程
本文整理下之前的学习笔记，基于DPDK17.11版本源码，主要分析一下收发包流程。使用DPDK的APP收发报文流...
接口测试
接口测试的原理测试人员借助工具模拟客户端向服务器端发送请求报文，服务器端接收请求报文后，对相应的报文做出处理并向...

接口重设net namespace后的报文收发

相关文章

接口重设net namespace后的报文收发

net namespace

docker 容器技术的原理

Mybatis入门实现CURD

eNSP模拟实验-OSPF被动接口配置

3、CRUD

SoapUI使用方法-01发送http请求

Springboot使用了ResponseBodyAdvice处

DPDK 收发包流程

接口测试

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读