linux网络虚拟化经常会用到network namespace,将一些创建完成的虚拟接口加入到另一个namespace实现网络隔离。
既然接口已经属于另一个namespace,为什么报文仍然能够在创建接口的ns和切换后的ns之间收发呢?这是因为接口无论怎么切换netns都会在创建接口所在的netns中留下一些痕迹,将创建接口所在netns和接口关联起来。
拿最简单的ip gre口举例。
ip gre接口是一个三层的ip tunnel接口,外层dst ip通常是本地物理口的ip地址,即协议认为报文是送往本机的,做上次协议(gre)处理,调用ipgre_rcv,最终调用__ipgre_rcv,其中ip_tunnel_lookup函数就是用来查找应该送往哪个gre接口的查询函数,我们可以看到gre接口数据都是从net_generic(net, ipgre_net_id) 返回的ip_tunnel_net 中查询的。
static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
int hdr_len)
{
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn;
int res;
if (tpi->proto == htons(ETH_P_TEB))
itn = net_generic(net, gre_tap_net_id);
else
itn = net_generic(net, ipgre_net_id);
res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
/* ipgre tunnels in collect metadata mode should receive
* also ETH_P_TEB traffic.
*/
itn = net_generic(net, ipgre_net_id);
res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
}
return res;
}
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
struct metadata_dst *tun_dst = NULL;
const struct iphdr *iph;
struct ip_tunnel *tunnel;
iph = ip_hdr(skb);
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
raw_proto, false) < 0)
goto drop;
if (tunnel->dev->type != ARPHRD_NONE)
skb_pop_mac_header(skb);
else
skb_reset_mac_header(skb);
if (tunnel->collect_md) {
__be16 flags;
__be64 tun_id;
flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
if (!tun_dst)
return PACKET_REJECT;
}
ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
return PACKET_RCVD;
}
return PACKET_NEXT;
drop:
kfree_skb(skb);
return PACKET_RCVD;
}
在每个namespace中都有一个存放每namespace data的地方,net->gen,一些虚拟接口在创建的时候,都会将自己的私有数据存放在其中。我们可以搜一下net_generic 函数查询那些接口会在net->gen中挂载数据。
static inline void *net_generic(const struct net *net, int id)
{
struct net_generic *ng;
void *ptr;
rcu_read_lock();
ng = rcu_dereference(net->gen);
ptr = ng->ptr[id - 1];
rcu_read_unlock();
return ptr;
}
#endif
可以看到 ip gre在其中存储的是 ip_tunnel 结构,gre的封装信息都在其中。
vxlan接口存放了 vxlan_dev 和 vxlan_sock结构,vxlan接口信息和vxlan的udp sock信息都存在其中。
struct ip_tunnel_net {
struct net_device *fb_tunnel_dev;
struct hlist_head tunnels[IP_TNL_HASH_SIZE]; // ip_tunnel
struct ip_tunnel __rcu *collect_md_tun;
};
/* per-network namespace private data for this module */
struct vxlan_net {
struct list_head vxlan_list; // vxlan_dev
struct hlist_head sock_list[PORT_HASH_SIZE]; // vxlan_sock
spinlock_t sock_lock;
};
我们在创建接口的时候,在net中做了两个操作:
1、调用 register_netdevice 做设备初始化,它再调用list_netdevice,将dev加入到net的dev_base_head、dev_name_head、dev_index_head三个链表上;
2、调用 ip_tunnel_add,将dev的私有数据(ip_tunnel)加入到 net的gen中(net_generic);
注意这里 nt->net = net 的操作,保存了原始的netns。
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
struct ip_tunnel_parm *p)
{
struct ip_tunnel *nt;
struct net *net = dev_net(dev);
struct ip_tunnel_net *itn;
int mtu;
int err;
nt = netdev_priv(dev);
itn = net_generic(net, nt->ip_tnl_net_id);
if (nt->collect_md) {
if (rtnl_dereference(itn->collect_md_tun))
return -EEXIST;
} else {
if (ip_tunnel_find(itn, p, dev->type))
return -EEXIST;
}
nt->net = net; //========== ip_tunnel中保存的原始的net
nt->parms = *p;
err = register_netdevice(dev);
if (err)
goto out;
if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
eth_hw_addr_random(dev);
mtu = ip_tunnel_bind_dev(dev);
if (!tb[IFLA_MTU])
dev->mtu = mtu;
ip_tunnel_add(itn, nt);
out:
return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
然后再设置接口netns的时候,调用 dev_change_net_namespace 函数,这个函数中,再修改了netns之后(dev_net_set),只调用了list_netdevice 重新挂载dev到net的设备链表中,而没有设置net->gen,这说明,在创建接口的原始netns才会有private data保存在net->gen,其它netns是没有的。
类似的,vxlan接口,它的udp socket、vxlan_dev在都留在创建它的netns,会在源netns接收到vxlan的udp报文,解封udp后,解析为vxlan报文,通过sock关联到vxlan_dev和dev可以找到对应的vxlan接口。
在源netns找到对应的接口,调用接口的接收处理函数,完成解封装等操作,后面如果再入协议栈,就是接口当前所在的netns的协议栈了,比如路由、netfilter、neighbor等等。
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
int err;
ASSERT_RTNL();
/* Don't allow namespace local devices to be moved. */
err = -EINVAL;
if (dev->features & NETIF_F_NETNS_LOCAL)
goto out;
/* Ensure the device has been registrered */
if (dev->reg_state != NETREG_REGISTERED)
goto out;
/* Get out if there is nothing todo */
err = 0;
if (net_eq(dev_net(dev), net))
goto out;
/* Pick the destination device name, and ensure
* we can use it in the destination network namespace.
*/
err = -EEXIST;
if (__dev_get_by_name(net, dev->name)) {
/* We get here if we can't use the current device name */
if (!pat)
goto out;
if (dev_get_valid_name(net, dev, pat) < 0)
goto out;
}
/*
* And now a mini version of register_netdevice unregister_netdevice.
*/
/* If device is running close it first. */
dev_close(dev);
/* And unlink it from device chain */
err = -ENODEV;
unlist_netdevice(dev);
synchronize_net();
/* Shutdown queueing discipline. */
dev_shutdown(dev);
/* Notify protocols, that we are about to destroy
this device. They should clean all the things.
Note that dev->reg_state stays at NETREG_REGISTERED.
This is wanted because this way 8021q and macvlan know
the device is just moving and can keep their slaves up.
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
rcu_barrier();
call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
/*
* Flush the unicast and multicast chains
*/
dev_uc_flush(dev);
dev_mc_flush(dev);
/* Send a netdev-removed uevent to the old namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
netdev_adjacent_del_links(dev);
/* Actually switch the network namespace */
dev_net_set(dev, net);
/* If there is an ifindex conflict assign a new one */
if (__dev_get_by_index(net, dev->ifindex))
dev->ifindex = dev_new_index(net);
/* Send a netdev-add uevent to the new namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
netdev_adjacent_add_links(dev);
/* Fixup kobjects */
err = device_rename(&dev->dev, dev->name);
WARN_ON(err);
/* Add the device back in the hashes */
list_netdevice(dev);
/* Notify protocols, that a new device appeared. */
call_netdevice_notifiers(NETDEV_REGISTER, dev);
/*
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
synchronize_net();
err = 0;
out:
return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
上面是接收流程,如果是从netns中出来的流量,怎么进入源netns的呢?
gre接口的发送函数是 ipgre_xmit,最终调用ip_tunnel_xmit,如下我们可以看到,在完成 gre头和外层ip头的封装后,重新查询路由的处理:
// rt = ip_route_output_key(tunnel->net, &fl4);
使用的netns是ip_tunnel中存储的netns,在上面创建gre的函数中可知,这个netns是原始的netns(创建gre的netns),也就是说外层ip的路由是查询的原始netns的路由。
再后面调用iptunnel_xmit发送的时候,!net_eq(tunnel->net, dev_net(dev)==true也会做skb的netns切换:
// iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, u8 protocol)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *inner_iph;
struct flowi4 fl4;
u8 tos, ttl;
__be16 df;
struct rtable *rt; /* Route to the other host */
unsigned int max_headroom; /* The extra header space needed */
__be32 dst;
bool connected;
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
connected = (tunnel->parms.iph.daddr != 0);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
dst = tnl_params->daddr;
if (dst == 0) {
/* NBMA tunnel */
if (!skb_dst(skb)) {
dev->stats.tx_fifo_errors++;
goto tx_error;
}
if (skb->protocol == htons(ETH_P_IP)) {
rt = skb_rtable(skb);
dst = rt_nexthop(rt, inner_iph->daddr);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (skb->protocol == htons(ETH_P_IPV6)) {
const struct in6_addr *addr6;
struct neighbour *neigh;
bool do_tx_error_icmp;
int addr_type;
neigh = dst_neigh_lookup(skb_dst(skb),
&ipv6_hdr(skb)->daddr);
if (!neigh)
goto tx_error;
addr6 = (const struct in6_addr *)&neigh->primary_key;
addr_type = ipv6_addr_type(addr6);
if (addr_type == IPV6_ADDR_ANY) {
addr6 = &ipv6_hdr(skb)->daddr;
addr_type = ipv6_addr_type(addr6);
}
if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
do_tx_error_icmp = true;
else {
do_tx_error_icmp = false;
dst = addr6->s6_addr32[3];
}
neigh_release(neigh);
if (do_tx_error_icmp)
goto tx_error_icmp;
}
#endif
else
goto tx_error;
connected = false;
}
tos = tnl_params->tos;
if (tos & 0x1) {
tos &= ~0x1;
if (skb->protocol == htons(ETH_P_IP)) {
tos = inner_iph->tos;
connected = false;
} else if (skb->protocol == htons(ETH_P_IPV6)) {
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
connected = false;
}
}
init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
// 关于 src ip, 从cache 或者路由 (出接口ip addr)中取
if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
goto tx_error;
rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
NULL;
if (!rt) {
rt = ip_route_output_key(tunnel->net, &fl4);
if (IS_ERR(rt)) {
dev->stats.tx_carrier_errors++;
goto tx_error;
}
if (connected)
dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
fl4.saddr);
}
if (rt->dst.dev == dev) {
ip_rt_put(rt);
dev->stats.collisions++;
goto tx_error;
}
if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
ip_rt_put(rt);
goto tx_error;
}
if (tunnel->err_count > 0) {
if (time_before(jiffies,
tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
dst_link_failure(skb);
} else
tunnel->err_count = 0;
}
tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
ttl = tnl_params->ttl;
if (ttl == 0) {
if (skb->protocol == htons(ETH_P_IP))
ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
else if (skb->protocol == htons(ETH_P_IPV6))
ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
else
ttl = ip4_dst_hoplimit(&rt->dst);
}
df = tnl_params->frag_off;
if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
df |= (inner_iph->frag_off&htons(IP_DF));
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
if (max_headroom > dev->needed_headroom)
dev->needed_headroom = max_headroom;
if (skb_cow_head(skb, dev->needed_headroom)) {
ip_rt_put(rt);
dev->stats.tx_dropped++;
kfree_skb(skb);
return;
}
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
df, !net_eq(tunnel->net, dev_net(dev)));
return;
#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
dst_link_failure(skb);
#endif
tx_error:
dev->stats.tx_errors++;
kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
网友评论