美文网首页
linux bridge - mac转发

linux bridge - mac转发

作者: 分享放大价值 | 来源:发表于2020-10-28 23:37 被阅读0次

    linux bridge在虚拟化场景和docker中用的比较多,之前也知道它的原理,基本上就是类似二层交换机,根据mac地址和vid转发。但是对于vlan的处理网上的文档比较少,所以这次就看一下源码,分析下不配置vlan时如何转发,vlan又如何生效。

    不配置vlan时,bridge纯靠mac转发,可通过如下两个命令之一查看mac转发表

    //此命令只显示单播转发表,比较符合硬件交换机的显示规范,
    //匹配到mac的,从port转发出去(可通过brctl showstp br1查看端
    //口号和端口的对应关系)
    root@node2:~# brctl showmacs br1
    port no mac addr        is local?   ageing timer
      2 12:27:96:8c:f4:58   yes        0.00
      2 12:27:96:8c:f4:58   yes        0.00
      1 66:e6:6f:a8:d4:97   yes        0.00
      1 66:e6:6f:a8:d4:97   yes        0.00
    
    //通过此命令可显示所有的单播和组播表项
    root@node2:~# bridge fdb show br br1
    33:33:00:00:00:01 dev br1 self permanent
    66:e6:6f:a8:d4:97 dev vetha master br1 permanent
    66:e6:6f:a8:d4:97 dev vetha vlan 1 master br1 permanent
    33:33:00:00:00:01 dev vetha self permanent
    01:00:5e:00:00:01 dev vetha self permanent
    12:27:96:8c:f4:58 dev vethx master br1 permanent
    12:27:96:8c:f4:58 dev vethx vlan 1 master br1 permanent
    33:33:00:00:00:01 dev vethx self permanent
    01:00:5e:00:00:01 dev vethx self permanent
    
    

    这篇文档就先介绍不使能vlan的情况,主要分为下面几个部分
    a. kernel端bridge module的初始化都做了哪些事
    b. 添加网桥时,命令行和kernel端代码流程
    c. 给网桥添加端口时,命令行和kernel端代码流程
    d. 从端口收到报文后,内部是如何转发的

    广播/组播/未知单播报文flood到所有端口。
    查找到转发表项的已知单播报文,发送到此表项的出端口。
    广播/组播/已知单播并且dst为local的报文,或者网桥设备使能了混杂模式,这几种情况都需要通过网桥设备将报文上送本机协议栈处理。
    

    e. 从网桥br发出去的报文如何转发

    广播/组播/未知单播报文,flood到所有端口。
    能查找到转发表项的单播报文,从表项的出端口发送出去。
    

    bridge还有如下几个注意的地方

    
    单播flood: 控制单播报文是否从此端口发送一份,有两种设置方式,
    a. bridge link set dev vnet1 flood on
    b. echo 1 > /sys/class/net/br1/brif/vnet1/unicast_flood
    
    hairpin模式:控制从某端口收到的广播/组播/未知单播报文,是否允许再从该接收端口发出。已知单播正常转发。
    a. bridge link set dev vnet1 hairpin on
    b. echo 1 > /sys/class/net/br1/brif/vnet1/hairpin_mode
    
    网桥设备down后,所有端口状态都会变成 disabled, 导致网桥不会正确转发。
    vetha (1)
     port id                8001                    state                  disabled
     designated root        8000.3adce07c2043       path cost                  2
     designated bridge      8000.3adce07c2043       message age timer          0.00
     designated port        8001                    forward delay timer        0.00
     designated cost           0                    hold timer                 0.00
     flags
    

    bridge netfilter框架,可使用ebtables设置和查看


    image.png

    1. module初始化流程

    #module初始化流程
    module_init(br_init)
    static int __init br_init(void)
        static const struct stp_proto br_stp_proto = {
            .rcv    = br_stp_rcv,
        };
        //注册stp协议处理函数,防止环路产生,此文不看stp部分
        stp_proto_register(&br_stp_proto);
        
        //初始化fdb表项用到的cache
        br_fdb_init();
            static struct kmem_cache *br_fdb_cache __read_mostly;
            br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
                         sizeof(struct net_bridge_fdb_entry),0,
                         SLAB_HWCACHE_ALIGN, NULL);
                         
            static u32 fdb_salt __read_mostly;
            get_random_bytes(&fdb_salt, sizeof(fdb_salt));
            
        static struct pernet_operations br_net_ops = {
            .exit   = br_net_exit,
        };
        //注册pernet操作,只提供了exit,所以namespace初始化时无操作
        register_pernet_subsys(&br_net_ops);
        
        static struct notifier_block br_device_notifier = {
            .notifier_call = br_device_event
        };
        //注册网络设备事件处理函数
        register_netdevice_notifier(&br_device_notifier);
        
        br_netlink_init();
            br_mdb_init();
                rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, NULL);
                rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, NULL);
                rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, NULL);
                
            static struct rtnl_af_ops br_af_ops = {
                .family         = AF_BRIDGE,
                .get_link_af_size   = br_get_link_af_size,
            };
            rtnl_af_register(&br_af_ops);
                list_add_tail(&ops->list, &rtnl_af_ops);
                
            struct rtnl_link_ops br_link_ops __read_mostly = {
                .kind           = "bridge",
                .priv_size      = sizeof(struct net_bridge),
                .setup          = br_dev_setup,
                .maxtype        = IFLA_BRPORT_MAX,
                .policy         = br_policy,
                .validate       = br_validate,
                .newlink        = br_dev_newlink,
                .changelink     = br_changelink,
                .dellink        = br_dev_delete,
                .get_size       = br_get_size,
                .fill_info      = br_fill_info,
                .slave_maxtype      = IFLA_BRPORT_MAX,
                .slave_policy       = br_port_policy,
                .slave_changelink   = br_port_slave_changelink,
                .get_slave_size     = br_port_get_slave_size,
                .fill_slave_info    = br_port_fill_slave_info,
            };
            rtnl_link_register(&br_link_ops);
                __rtnl_link_register(ops);
                    list_add_tail(&ops->list, &link_ops);
        //注册hook函数到br_ioctl_hook,添加网桥时调用br_ioctl_hook
        brioctl_set(br_ioctl_deviceless_stub);
            static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);
            br_ioctl_hook = hook;
    

    2. 创建/删除桥流程

    通过strace brctl命令,可知创建/删除桥是通过socket的ioctl调用到kernel端

    //添加桥
    root@node2:~# strace brctl addbr br1
    execve("/usr/sbin/brctl", ["brctl", "addbr", "br1"], 0x7fffd27c39a0 /* 22 vars */) = 0
    socket(AF_UNIX, SOCK_STREAM, 0)         = 3
    ioctl(3, SIOCBRADDBR, "br1")            = 0
    
    //删除桥
    root@node2:~# strace brctl delbr br1
    execve("/usr/sbin/brctl", ["brctl", "delbr", "br1"], 0x7fff18eceaa0 /* 22 vars */) = 0
    socket(AF_UNIX, SOCK_STREAM, 0)         = 3
    ioctl(3, SIOCBRDELBR, "br1")            = 0
    
    #kernel端代码,ioctl最终会调用 sock_ioctl
    static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
            switch (cmd) {
            case SIOCGIFBR:
            case SIOCSIFBR:
            case SIOCBRADDBR:
            case SIOCBRDELBR:
                err = -ENOPKG;
                if (!br_ioctl_hook)
                    request_module("bridge");
    
                mutex_lock(&br_ioctl_mutex);
                //调用之前注册的 br_ioctl_deviceless_stub
                if (br_ioctl_hook)
                    err = br_ioctl_hook(net, cmd, argp); //br_ioctl_deviceless_stub
                mutex_unlock(&br_ioctl_mutex);
                break;
                
    /*
     * Handle bridge ioctls that are not bound to an existing bridge device.
     *
     * SIOCGIFBR / SIOCSIFBR are delegated to old_deviceless().
     * SIOCBRADDBR / SIOCBRDELBR copy an IFNAMSIZ-byte name from user space
     * and create or delete the bridge with that name.
     *
     * Returns -EPERM when the caller lacks CAP_NET_ADMIN in net->user_ns,
     * -EFAULT when the name cannot be copied from user space, -EOPNOTSUPP
     * for any other command, otherwise the result of the delegated call.
     */
    int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
    {
        char name[IFNAMSIZ];

        if (cmd == SIOCGIFBR || cmd == SIOCSIFBR)
            return old_deviceless(net, uarg);

        if (cmd != SIOCBRADDBR && cmd != SIOCBRDELBR)
            return -EOPNOTSUPP;

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
            return -EPERM;

        if (copy_from_user(name, uarg, IFNAMSIZ))
            return -EFAULT;

        /* User space may hand over an unterminated buffer; force termination. */
        name[IFNAMSIZ - 1] = 0;

        if (cmd == SIOCBRDELBR)
            return br_del_bridge(net, name);

        return br_add_bridge(net, name);
    }
    
    int br_add_bridge(struct net *net, const char *name)
        struct net_device *dev;
        dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN, br_dev_setup);
            alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)
                alloc_size = sizeof(struct net_device);
                struct net_device *p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
                struct net_device *dev = PTR_ALIGN(p, NETDEV_ALIGN);
                dev_addr_init(dev)
                dev_mc_init(dev);
                dev_uc_init(dev);
    
                dev_net_set(dev, &init_net);
    
                dev->gso_max_size = GSO_MAX_SIZE;
                dev->gso_max_segs = GSO_MAX_SEGS;
                dev->gso_min_segs = 0;
    
                dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
                setup(dev); //br_dev_setup
                    struct net_bridge *br = netdev_priv(dev);
                    eth_hw_addr_random(dev);
                        dev->addr_assign_type = NET_ADDR_RANDOM;
                        eth_random_addr(dev->dev_addr);
                    ether_setup(dev);
                        dev->header_ops     = &eth_header_ops;
                        dev->type       = ARPHRD_ETHER;
                        dev->hard_header_len    = ETH_HLEN;
                        dev->mtu        = ETH_DATA_LEN;
                        dev->addr_len       = ETH_ALEN;
                        dev->tx_queue_len   = 1000; /* Ethernet wants good queues */
                        dev->flags      = IFF_BROADCAST|IFF_MULTICAST;
                        dev->priv_flags     |= IFF_TX_SKB_SHARING;
                        memset(dev->broadcast, 0xFF, ETH_ALEN);
                    dev->netdev_ops = &br_netdev_ops;
                    dev->destructor = br_dev_free;
                    dev->ethtool_ops = &br_ethtool_ops;
                    SET_NETDEV_DEVTYPE(dev, &br_type);
                    dev->tx_queue_len = 0;
                    dev->priv_flags = IFF_EBRIDGE;
    
                    dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL |
                            NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
                    dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
                               NETIF_F_HW_VLAN_STAG_TX;
                    dev->vlan_features = COMMON_FEATURES;
    
                    br->dev = dev;
                    spin_lock_init(&br->lock);
                    INIT_LIST_HEAD(&br->port_list);
                    spin_lock_init(&br->hash_lock);
    
                    br->bridge_id.prio[0] = 0x80;
                    br->bridge_id.prio[1] = 0x00;
                    /* Reserved Ethernet Addresses per IEEE 802.1Q */
                    static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) =
                    { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
                    ether_addr_copy(br->group_addr, eth_reserved_addr_base);
    
                    br->stp_enabled = BR_NO_STP;
                    br->group_fwd_mask = BR_GROUPFWD_DEFAULT;
                    br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
    
                    br->designated_root = br->bridge_id;
                    br->bridge_max_age = br->max_age = 20 * HZ;
                    br->bridge_hello_time = br->hello_time = 2 * HZ;
                    br->bridge_forward_delay = br->forward_delay = 15 * HZ;
                    br->ageing_time = 300 * HZ;
    
                    br_netfilter_rtable_init(br);
                        struct rtable *rt = &br->fake_rtable;
                        atomic_set(&rt->dst.__refcnt, 1);
                        rt->dst.dev = br->dev;
                        rt->dst.path = &rt->dst;
                        dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
                        rt->dst.flags   = DST_NOXFRM | DST_FAKE_RTABLE;
                        rt->dst.ops = &fake_dst_ops;
                    br_stp_timer_init(br);
                        setup_timer(&br->hello_timer, br_hello_timer_expired, (unsigned long) br);
                        setup_timer(&br->tcn_timer, br_tcn_timer_expired, (unsigned long) br);
                        setup_timer(&br->topology_change_timer,br_topology_change_timer_expired,(unsigned long) br);
                        setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br);
                    br_multicast_init(br);
        
                dev->num_tx_queues = txqs;
                dev->real_num_tx_queues = txqs;
                netif_alloc_netdev_queues(dev)
                dev->num_rx_queues = rxqs;
                dev->real_num_rx_queues = rxqs;
                netif_alloc_rx_queues(dev)
                strcpy(dev->name, name);
                dev->name_assign_type = name_assign_type;
                dev->group = INIT_NETDEV_GROUP;
        dev_net_set(dev, net);
        dev->rtnl_link_ops = &br_link_ops;
        register_netdev(dev);
            register_netdevice(dev);
                dev->netdev_ops->ndo_init(dev);//br_dev_init
                    struct net_bridge *br = netdev_priv(dev);
                    br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
                    //vlan相关初始化
                    br_vlan_init(br);
                        //支持的vlan协议,可以通过(/sys/class/net/br1/bridge/vlan_protocol)修改
                        br->vlan_proto = htons(ETH_P_8021Q);
                        //默认 pvid 为 1
                        br->default_pvid = 1;
                        //将vid 1和网桥mac添加到fdb中
                        br_vlan_add(br, 1, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED);
                            struct net_port_vlans *pv = NULL;
                            pv = rtnl_dereference(br->vlan_info);
                            if (pv)
                                return __vlan_add(pv, vid, flags);
                            pv = kzalloc(sizeof(*pv), GFP_KERNEL);
                            pv->parent.br = br;
                            __vlan_add(pv, vid, flags);
                                if (v->port_idx) {
                                    p = v->parent.port;
                                    br = p->br;
                                    dev = p->dev;
                                } else {//取出网桥和网桥设备
                                    br = v->parent.br;
                                    dev = br->dev;
                                }
                                if (p) {
                                    vlan_vid_add(dev, br->vlan_proto, vid);
                                        vlan_info = rtnl_dereference(dev->vlan_info);
                                        vid_info = vlan_vid_info_get(vlan_info, proto, vid);
                                        if (!vid_info) {
                                            __vlan_vid_add(vlan_info, proto, vid, &vid_info);
                                                vid_info = vlan_vid_info_alloc(proto, vid);
                                                //如果硬件支持vlan filter,则设置到硬件
                                                if (vlan_hw_filter_capable(dev, vid_info)) {
                                                    ops->ndo_vlan_rx_add_vid(dev, proto, vid);
                                                list_add(&vid_info->list, &vlan_info->vid_list);
                                                vlan_info->nr_vids++;
                                                *pvid_info = vid_info;
                                        vid_info->refcount++;
                                //插入fdb表项
                                br_fdb_insert(br, p, dev->dev_addr, vid);
                                //设置到 vlan_bitmap 中
                                set_bit(vid, v->vlan_bitmap);
                                v->num_vlans++;
                                __vlan_add_flags(v, vid, flags);
                            rcu_assign_pointer(br->vlan_info, pv);
    

    3. 添加/删除接口流程

    #添加接口
    root@node2:~# strace brctl addif br1 vetha
    execve("/usr/sbin/brctl", ["brctl", "addif", "br1", "vetha"], 0x7fff20137ba8 /* 22 vars */) = 0
    socket(AF_UNIX, SOCK_STREAM, 0)         = 3
    access("/proc/net", R_OK)               = 0
    access("/proc/net/unix", R_OK)          = 0
    socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 4
    ioctl(4, SIOCGIFINDEX, {ifr_name="vetha", }) = 0
    close(4)                                = 0
    ioctl(3, SIOCBRADDIF)                   = 0
    
    #删除接口
    root@node2:~# strace brctl delif br1 vetha
    execve("/usr/sbin/brctl", ["brctl", "delif", "br1", "vetha"], 0x7ffe8db2f1a8 /* 22 vars */) = 0
    socket(AF_UNIX, SOCK_STREAM, 0)         = 3
    socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 4
    ioctl(4, SIOCGIFINDEX, {ifr_name="vetha", }) = 0
    close(4)                                = 0
    ioctl(3, SIOCBRDELIF)                   = 0
    
    static const struct net_device_ops br_netdev_ops = {
        .ndo_do_ioctl        = br_dev_ioctl,
        ...
        .ndo_fix_features        = br_fix_features,
        .ndo_fdb_add         = br_fdb_add,
        .ndo_fdb_del         = br_fdb_delete,
        .ndo_fdb_dump        = br_fdb_dump,
        .ndo_bridge_getlink  = br_getlink,
        .ndo_bridge_setlink  = br_setlink,
        .ndo_bridge_dellink  = br_dellink,
    };
    
    #kernel端代码,添加接口
    /*
     * ioctl entry point of a bridge net_device.
     *
     * SIOCDEVPRIVATE is passed to old_dev_ioctl(); SIOCBRADDIF and
     * SIOCBRDELIF add or remove the interface whose index is carried in
     * rq->ifr_ifindex. Every other command is logged and rejected with
     * -EOPNOTSUPP.
     */
    int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
    {
        struct net_bridge *br = netdev_priv(dev);

        if (cmd == SIOCDEVPRIVATE)
            return old_dev_ioctl(dev, rq, cmd);

        if (cmd == SIOCBRADDIF || cmd == SIOCBRDELIF) {
            /* Third argument is non-zero for "add", zero for "delete". */
            return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
        }

        br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd);
        return -EOPNOTSUPP;
    }
    /*
     * Add the interface with index ifindex to bridge br (isadd != 0) or
     * remove it from the bridge (isadd == 0).
     *
     * Returns -EPERM when the caller lacks CAP_NET_ADMIN in the bridge's
     * network namespace, -EINVAL when no device has that index in the
     * namespace, otherwise the result of br_add_if() / br_del_if().
     *
     * Called with RTNL.
     */
    static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
    {
        struct net *net = dev_net(br->dev);
        struct net_device *port_dev;

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
            return -EPERM;

        port_dev = __dev_get_by_index(net, ifindex);
        if (!port_dev)
            return -EINVAL;

        return isadd ? br_add_if(br, port_dev) : br_del_if(br, port_dev);
    }
    int br_add_if(struct net_bridge *br, struct net_device *dev)
        int err = 0;
        bool changed_addr;
    
        /* Don't allow bridging non-ethernet like devices */
        if ((dev->flags & IFF_LOOPBACK) ||
            dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN ||
            !is_valid_ether_addr(dev->dev_addr))
            return -EINVAL;
        //bridge接口不能加入另一个bridge
        /* No bridging of bridges */
        if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
            return -ELOOP;
        //加入bridge的接口不能加入另一个bridge,即一个接口不能同时加到两个bridge
        /* Device is already being bridged */
        if (br_port_exists(dev)) //#define br_port_exists(dev) (dev->priv_flags & IFF_BRIDGE_PORT)
            return -EBUSY;
    
        /* No bridging devices that dislike that (e.g. wireless) */
        if (dev->priv_flags & IFF_DONT_BRIDGE)
            return -EOPNOTSUPP;
    
        struct net_bridge_port *p;
        p = new_nbp(br, dev);
            //找到最小可用的端口号。0保留不用,最大端口号为1<<10
            index = find_portno(br);
            p = kzalloc(sizeof(*p), GFP_KERNEL);
            p->br = br;
            dev_hold(dev);
            p->dev = dev;
            p->path_cost = port_cost(dev);
            p->priority = 0x8000 >> BR_PORT_BITS;
            //保存端口号
            p->port_no = index;
            p->flags = BR_LEARNING | BR_FLOOD;
            br_init_port(p);
                //优先级左移10位或上port_no作为端口id(port_id)
                p->port_id = br_make_port_id(p->priority, p->port_no);
                    return ((u16)priority << BR_PORT_BITS) | (port_no & ((1<<BR_PORT_BITS)-1));
                br_become_designated_port(p);
                    struct net_bridge *br;
                    br = p->br;
                    p->designated_root = br->designated_root;
                    p->designated_cost = br->root_path_cost;
                    p->designated_bridge = br->bridge_id;
                    p->designated_port = p->port_id;
                //初始状态为 BR_STATE_BLOCKING
                br_set_state(p, BR_STATE_BLOCKING);
                    p->state = state;
                p->topology_change_ack = 0;
                p->config_pending = 0;
            //设置状态为 BR_STATE_DISABLED
            br_set_state(p, BR_STATE_DISABLED);
            br_stp_port_timer_init(p);
                setup_timer(&p->message_age_timer, br_message_age_timer_expired, (unsigned long) p);
                setup_timer(&p->forward_delay_timer, br_forward_delay_timer_expired, (unsigned long) p);
                setup_timer(&p->hold_timer, br_hold_timer_expired, (unsigned long) p);
            br_multicast_add_port(p);
        call_netdevice_notifiers(NETDEV_JOIN, dev);
        //使能allmulti,使接口接收所有组播报文
        dev_set_allmulti(dev, 1);
            __dev_set_allmulti(dev, inc, true);
                dev->flags |= IFF_ALLMULTI;
                dev->allmulti += inc;
                dev_change_rx_flags(dev, IFF_ALLMULTI);
                dev_set_rx_mode(dev);
        kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj), SYSFS_BRIDGE_PORT_ATTR);
        //将接口信息添加到 sys 文件系统中:/sys/class/net/br1/brif/vnet1(桥br1上的接口vnet1)
        br_sysfs_addif(p);
        br_netpoll_enable(p);
        //注册br_handle_frame到协议栈入口处
        netdev_rx_handler_register(dev, br_handle_frame, p);
        //设置flag IFF_BRIDGE_PORT,表示此接口已经加入桥
        dev->priv_flags |= IFF_BRIDGE_PORT;
        netdev_master_upper_dev_link(dev, br->dev);
        //关闭 lro 功能
        dev_disable_lro(dev);
        //将接口加入桥的端口链表 br->port_list
        list_add_rcu(&p->list, &br->port_list);
        nbp_update_port_count(br);
            list_for_each_entry(p, &br->port_list, list) {
                //#define BR_AUTO_MASK (BR_FLOOD | BR_LEARNING)
                //#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
                //上面初始化时,p->flags = BR_LEARNING | BR_FLOOD,所以此处成立,cnt加1
                if (br_auto_port(p))
                    cnt++;
            }
            if (br->auto_cnt != cnt) {
                br->auto_cnt = cnt;
                br_manage_promisc(br);
                    //如果bridge接口使能了混杂模式或者bridge接口没有使能vlan filter,则设置桥上所有接口使能混杂模式
                    if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br))
                        set_all = true;
                            list_for_each_entry(p, &br->port_list, list) {
                    if (set_all) {
                        br_port_set_promisc(p);
                            //使能接口混杂模式
                            dev_set_promiscuity(p->dev, 1);
                            //将fdb中静态表项从接口的单播地址列表删除
                            br_fdb_unsync_static(p->br, p);
                                for (i = 0; i < BR_HASH_SIZE; i++) {
                                    hlist_for_each_entry_rcu(fdb, &br->hash[i], hlist) {
                                        /* We only care for static entries */
                                        if (!fdb->is_static)
                                            continue;
                                        dev_uc_del(p->dev, fdb->addr.addr);
                                    }
                                }
                            p->flags |= BR_PROMISC;
                    } else {
                        if (br->auto_cnt == 0 ||
                            (br->auto_cnt == 1 && br_auto_port(p)))
                            br_port_clear_promisc(p);
                                //如果接口已经不是混杂模式则返回
                                //或者接口不支持单播过滤,此时也返回,不用关闭混杂模式,因为不支持单播过滤的接口
                                //最终都会使能混杂模式
                                if (!br_promisc_port(p) || !(p->dev->priv_flags & IFF_UNICAST_FLT))
                                    return;
                                br_fdb_sync_static(p->br, p);
                                    struct net_bridge_fdb_entry *fdb, *tmp;
                                    //将fdb中静态表项添加到接口的单播地址列表
                                    for (i = 0; i < BR_HASH_SIZE; i++) {
                                        hlist_for_each_entry(fdb, &br->hash[i], hlist) {
                                            /* We only care for static entries */
                                            if (!fdb->is_static)
                                                continue;
    
                                            err = dev_uc_add(p->dev, fdb->addr.addr);
                                            if (err)
                                                goto rollback;
                                        }
                                    }
                                dev_set_promiscuity(p->dev, -1);
                        else
                            br_port_set_promisc(p);
                    }
            }
        netdev_update_features(br->dev);
        //给fdb插入一个表项,vid为 0
        br_fdb_insert(br, p, dev->dev_addr, 0)
            fdb_insert(br, source, addr, vid);
                struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
                struct net_bridge_fdb_entry *fdb;
                //根据mac地址和vid查找是否已经存在
                fdb = fdb_find(head, addr, vid);
                if (fdb) {
                    /* it is okay to have multiple ports with same
                     * address, just use the first one.
                     */
                    //已经存在相同的mac,如果已存在的也是local是允许的。
                    //使用已存在的即可。这样正在添加的接口就不能根据fdb转发了
                    if (fdb->is_local)
                        return 0;
                    br_warn(br, "adding interface %s with same address "
                           "as a received packet\n",
                           source ? source->dev->name : br->dev->name);
                    //如果已经存在的fdb表项不是local的,则删除这个fdb,创建一个新的静态fdb
                    fdb_delete(br, fdb);
                }
                fdb = fdb_create(head, source, addr, vid);
                    struct net_bridge_fdb_entry *fdb;
                    fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
                    if (fdb) {
                        memcpy(fdb->addr.addr, addr, ETH_ALEN);
                        //source作为fdb表项的出接口
                        fdb->dst = source;
                        fdb->vlan_id = vid;
                        fdb->is_local = 0;
                        fdb->is_static = 0;
                        fdb->added_by_user = 0;
                        fdb->updated = fdb->used = jiffies;
                        //将fdb添加到链表
                        hlist_add_head_rcu(&fdb->hlist, head);
                    }
                    return fdb;
                fdb->is_local = fdb->is_static = 1;
                fdb_add_hw(br, addr);
                    //将此接口地址添加到bridge中不是混杂模式的接口上
                    list_for_each_entry(p, &br->port_list, list) {
                        if (!br_promisc_port(p)) {
                            err = dev_uc_add(p->dev, addr);
                            if (err)
                                goto undo;
                        }
                    }
        nbp_vlan_init(p)
            //default_pvid默认为1
            p->br->default_pvid ? nbp_vlan_add(p, p->br->default_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED) : 0;
                struct net_port_vlans *pv = NULL;
                pv = rtnl_dereference(port->vlan_info);
                if (pv)
                    return __vlan_add(pv, vid, flags);
                pv = kzalloc(sizeof(*pv), GFP_KERNEL);
                pv->port_idx = port->port_no;
                pv->parent.port = port;
                __vlan_add(pv, vid, flags);
                    br_fdb_insert(br, p, dev->dev_addr, vid);
                        set_bit(vid, v->vlan_bitmap);
                        v->num_vlans++;
                        __vlan_add_flags(v, vid, flags);
                            if (flags & BRIDGE_VLAN_INFO_PVID)
                                __vlan_add_pvid(v, vid);
                            else
                                __vlan_delete_pvid(v, vid);
    
                            if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
                                set_bit(vid, v->untagged_bitmap);
                            else
                                clear_bit(vid, v->untagged_bitmap);
                rcu_assign_pointer(port->vlan_info, pv);
        changed_addr = br_stp_recalculate_bridge_id(br);
        if (netif_running(dev) && netif_oper_up(dev) &&
            (br->dev->flags & IFF_UP))
            br_stp_enable_port(p);
                br_init_port(p);
                    p->port_id = br_make_port_id(p->priority, p->port_no);
                    br_become_designated_port(p);
                    br_set_state(p, BR_STATE_BLOCKING);
                    p->topology_change_ack = 0;
                    p->config_pending = 0;
                br_port_state_selection(p->br);
                br_log_state(p);
                    br_info(p->br, "port %u(%s) entered %s state\n",(unsigned int) p->port_no, p->dev->name, br_port_state_names[p->state]);
        if (changed_addr)
            call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
        //将桥上所有接口mtu的最小值设置到bridge接口上
        dev_set_mtu(br->dev, br_min_mtu(br));
        kobject_uevent(&p->kobj, KOBJ_ADD);
    

    4. 接收报文处理流程

    在协议栈入口函数 __netif_receive_skb_core 调用添加接口时注册的回调函数 br_handle_frame
    rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
        struct net_bridge_port *p;
        struct sk_buff *skb = *pskb;
        const unsigned char *dest = eth_hdr(skb)->h_dest;
        br_should_route_hook_t *rhook;
        
        //不处理loopback报文
        if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
            return RX_HANDLER_PASS;
        //如果源mac地址为全0,或者为组播地址,则drop此报文
        if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
            goto drop;
        //取出net_bridge_port结构
        p = br_port_get_rcu(skb->dev); //rcu_dereference(dev->rx_handler_data);
        //如果目的mac是链路本地保留地址(01-80-C2-00-00-0X),则需要特殊处理此种报文
        if (unlikely(is_link_local_ether_addr(dest))) {
            /*
             * See IEEE 802.1D Table 7-10 Reserved addresses
             *
             * Assignment               Value
             * Bridge Group Address     01-80-C2-00-00-00
             * (MAC Control) 802.3      01-80-C2-00-00-01
             * (Link Aggregation) 802.3 01-80-C2-00-00-02
             * 802.1X PAE address       01-80-C2-00-00-03
             *
             * 802.1AB LLDP         01-80-C2-00-00-0E
             *
             * Others reserved for future standardization
             */
            switch (dest[5]) {
            case 0x00:  /* Bridge Group Address */
                /* If STP is turned off,
                   then must forward to keep loop detection */
                if (p->br->stp_enabled == BR_NO_STP ||
                    fwd_mask & (1u << dest[5]))
                    goto forward;
                break;
    
            case 0x01:  /* IEEE MAC (Pause) */
                goto drop;
    
            default:
                /* Allow selective forwarding for most other protocols */
                fwd_mask |= p->br->group_fwd_mask;
                if (fwd_mask & (1u << dest[5]))
                    goto forward;
            }
    
            /* Deliver packet to local host only */
            if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
                    NULL, br_handle_local_finish)) {
                return RX_HANDLER_CONSUMED; /* consumed by filter */
            } else {
                *pskb = skb;
                return RX_HANDLER_PASS; /* continue processing */
            }
        }
    forward:
        switch (p->state) {
        case BR_STATE_FORWARDING:
            //如果支持 broute
            rhook = rcu_dereference(br_should_route_hook); //ebt_broute
            if (rhook) {
                if ((*rhook)(skb)) {
                    *pskb = skb;
                    return RX_HANDLER_PASS;
                }
                dest = eth_hdr(skb)->h_dest;
            }
            /* fall through */
        case BR_STATE_LEARNING:
            //如果报文目的mac是br接口的mac,则设置 PACKET_HOST
            if (ether_addr_equal(p->br->dev->dev_addr, dest))
                skb->pkt_type = PACKET_HOST;
            //netfilter处理
            NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
                br_handle_frame_finish);
            break;
        default:
    drop:
            kfree_skb(skb);
        }
        return RX_HANDLER_CONSUMED;
        
    /*
     * br_handle_frame_finish - decide what to do with a frame received on a
     * bridge port once the NF_BR_PRE_ROUTING hook has run.
     *
     * Summary of the logic below:
     *  - Broadcast, multicast and unknown-unicast frames are flooded to the
     *    bridge ports.
     *  - Known-unicast frames (fdb entry found, not local) are sent out of the
     *    egress port recorded in that entry.
     *  - Broadcast frames, multicast frames (depending on the checks below),
     *    unicast frames whose fdb entry is local, and any frame when the
     *    bridge device is in promiscuous mode are also passed up to the local
     *    protocol stack through the bridge device (br_pass_frame_up).
     *
     * Returns the result of br_pass_frame_up() when the frame is passed up,
     * 0 otherwise.
     *
     * NOTE: this listing is annotated. The more deeply indented lines that
     * follow the calls to br_forward() and br_flood_forward() are the bodies
     * of the callees pasted inline for illustration; they are not part of this
     * function and use the callees' own parameter names (to, skb0, prev,
     * __packet_hook, ...).
     */
    /* note: already called with rcu_read_lock */
    int br_handle_frame_finish(struct sk_buff *skb)
    {
        const unsigned char *dest = eth_hdr(skb)->h_dest;
        struct net_bridge_port *p = br_port_get_rcu(skb->dev);
        struct net_bridge *br;
        struct net_bridge_fdb_entry *dst;
        struct net_bridge_mdb_entry *mdst;
        struct sk_buff *skb2;
        bool unicast = true;
        u16 vid = 0;
    
        /* No bridge port behind this device, or the port is disabled: drop. */
        if (!p || p->state == BR_STATE_DISABLED)
            goto drop;
    
        /* Ingress check; also fills in vid. On failure we leave via out
         * without freeing skb here. */
        if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid))
            goto out;
            
        /* insert into forwarding database after filtering to avoid spoofing */
        br = p->br;
        /* Update the fdb entry for the source MAC (per the original note, a
         * new entry is created when none exists yet). */
        if (p->flags & BR_LEARNING)
            br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
        /* Multicast (non-broadcast) frames go through br_multicast_rcv();
         * a non-zero return drops the frame. */
        if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) &&
            br_multicast_rcv(br, p, skb, vid))
            goto drop;
    
        /* A port in the learning state only learns; the frame is dropped. */
        if (p->state == BR_STATE_LEARNING)
            goto drop;
        /* Save the bridge device in the skb control block. */
        BR_INPUT_SKB_CB(skb)->brdev = br->dev;
    
        /* The packet skb2 goes to the local host (NULL to skip). */
        /* A non-NULL skb2 means the frame must be passed up to the local stack. */
        skb2 = NULL;
        /* Bridge device in promiscuous mode: set skb2 = skb so the frame is
         * passed up to the local stack. */
        if (br->dev->flags & IFF_PROMISC)
            skb2 = skb;
    
        dst = NULL;
        /* Broadcast frame: also set skb2 = skb so it is passed up locally. */
        if (is_broadcast_ether_addr(dest)) {
            skb2 = skb;
            unicast = false;
        } else if (is_multicast_ether_addr(dest)) {
            /* Multicast frame handling. */
            mdst = br_mdb_get(br, skb, vid);
            if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
                br_multicast_querier_exists(br, eth_hdr(skb))) {
                if ((mdst && mdst->mglist) ||
                    br_multicast_is_router(br))
                    skb2 = skb;
                br_multicast_forward(mdst, skb, skb2);
                /* Forwarding was handled by br_multicast_forward(); clear
                 * skb so the flood/forward step below is skipped. */
                skb = NULL;
                if (!skb2)
                    goto out;
            } else
                skb2 = skb;
            unicast = false;
            br->dev->stats.multicast++;
        /* Look up the fdb by MAC and vid; if the entry is local, set
         * skb2 = skb so the frame is passed up to the local stack. */
        } else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) {
            skb2 = skb;
            /* Do not forward the packet since it's local. */
            skb = NULL;
        }
        /* skb is still non-NULL here in these cases:
         * a. broadcast frame
         * b. multicast frame that did not take the br_multicast_forward() path
         * c. unicast frame, dst found and dst is not local
         * d. unicast frame, dst not found (unknown unicast)
         */
        if (skb) {
            if (dst) {
                dst->used = jiffies;
                /* Unicast frame whose dst was found and is not local.
                 * If the bridge device is not in promiscuous mode, skb2 is
                 * NULL at this point. */
                br_forward(dst->dst, skb, skb2);
                    /* (inline expansion of br_forward, for illustration) */
                    if (should_deliver(to, skb)) {
                        if (skb0)
                            deliver_clone(to, skb, __br_forward);
                        else
                            __br_forward(to, skb);
                        return;
                    }
            } else
                /* Broadcast, multicast, and unicast with no dst found. */
                br_flood_forward(br, skb, skb2, unicast);
                    /* (inline expansion of br_flood_forward / br_flood, for illustration) */
                    br_flood(br, skb, skb2, __br_forward, unicast);
                        /* Walk every port on the bridge; each port that
                         * qualifies gets a copy of the frame. */
                        list_for_each_entry_rcu(p, &br->port_list, list) {
                            /* Do not flood unicast traffic to ports that turn it off */
                            /* i.e. a unicast frame is only sent to ports that
                             * have BR_FLOOD set. */
                            if (unicast && !(p->flags & BR_FLOOD))
                                continue;
                            prev = maybe_deliver(prev, p, skb, __packet_hook);
                                /* (inline expansion of maybe_deliver) */
                                if (!should_deliver(p, skb))
                                    /* (inline expansion of should_deliver) A port
                                     * qualifies when all three hold:
                                     *  - it is not the receiving port, or it has
                                     *    hairpin mode enabled (relevant when it is
                                     *    the receiving port);
                                     *  - br_allowed_egress() accepts the frame (per
                                     *    the original note: it passes vlan filtering
                                     *    or vlan filtering is off);
                                     *  - the port state is BR_STATE_FORWARDING.
                                     */
                                    return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
                                        br_allowed_egress(p->br, nbp_get_vlan_info(p), skb) &&
                                        p->state == BR_STATE_FORWARDING;
                                    return prev;
    
                                if (!prev)
                                    goto out;
    
                                err = deliver_clone(prev, skb, __packet_hook);
                                if (err)
                                    return ERR_PTR(err);
                            out:
                                return p;
                            if (IS_ERR(prev))
                                goto out;
                        }
    
                        if (!prev)
                            goto out;
    
                        if (skb0)
                            deliver_clone(prev, skb, __packet_hook);
                        else
                            __packet_hook(prev, skb);
                        return;     
        }
        /* skb2 is non-NULL in these cases (see the assignments above):
         *  - the bridge device is in promiscuous mode
         *  - broadcast frames, and multicast frames where skb2 was set
         *  - unicast frames whose dst was found and is local
         */
        if (skb2)
            return br_pass_frame_up(skb2);
    
    out:
        return 0;
    drop:
        kfree_skb(skb);
        goto out;
    }
    
    /*
     * __br_forward - send a frame out of the given egress port.
     * @to:  egress bridge port
     * @skb: frame to forward
     *
     * The frame is freed and nothing is sent when skb_warn_if_lro() flags it.
     * Otherwise it is run through br_handle_vlan() for the egress port (we stop
     * if that returns NULL), retargeted at the egress device, and handed to the
     * NF_BR_FORWARD hook with br_forward_finish as the continuation.
     */
    static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
    {
        struct net_device *ingress;

        if (skb_warn_if_lro(skb)) {
            kfree_skb(skb);
            return;
        }

        skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
        if (skb) {
            /* Remember the receiving device for the hook, then swap the
             * skb's device for the egress port's device. */
            ingress = skb->dev;
            skb->dev = to->dev;
            skb_forward_csum(skb);

            NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, ingress, skb->dev,
                br_forward_finish);
        }
    }
    /*
     * br_forward_finish - continuation used after the forward/local-out hooks.
     * @skb: frame whose dev is already the egress device
     *
     * Runs the NF_BR_POST_ROUTING hook with br_dev_queue_push_xmit as the
     * continuation and returns the hook's result.
     */
    int br_forward_finish(struct sk_buff *skb)
    {
        int verdict;

        verdict = NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev,
                  br_dev_queue_push_xmit);
        return verdict;
    }
    /*
     * br_dev_queue_push_xmit - final step of the bridge output path.
     * @skb: frame to transmit on skb->dev
     *
     * The frame is freed when nf_bridge_maybe_copy_header() returns non-zero or
     * is_skb_forwardable() rejects it; otherwise the Ethernet header is pushed
     * back and the frame is queued on the device. Always returns 0.
     */
    int br_dev_queue_push_xmit(struct sk_buff *skb)
    {
        /* ip_fragment doesn't copy the MAC header */
        if (!nf_bridge_maybe_copy_header(skb) &&
            is_skb_forwardable(skb->dev, skb)) {
            skb_push(skb, ETH_HLEN);
            br_drop_fake_rtable(skb);
            /* Send the frame out through the network device. */
            dev_queue_xmit(skb);
            return 0;
        }

        kfree_skb(skb);
        return 0;
    }
    
    /*
     * br_pass_frame_up - hand a frame to the local protocol stack through the
     * bridge device.
     * @skb: frame to deliver; the bridge device is taken from its control block
     *
     * Updates the bridge's per-CPU rx counters, applies the bridge's own egress
     * check (skipped in promiscuous mode), retargets the frame at the bridge
     * device and runs the NF_BR_LOCAL_IN hook with netif_receive_skb as the
     * continuation.
     *
     * Returns NET_RX_DROP when the frame is rejected or br_handle_vlan() returns
     * NULL, otherwise the result of the hook.
     */
    static int br_pass_frame_up(struct sk_buff *skb)
    {
        struct net_device *bridge_dev = BR_INPUT_SKB_CB(skb)->brdev;
        struct net_bridge *br = netdev_priv(bridge_dev);
        struct pcpu_sw_netstats *rx_stats = this_cpu_ptr(br->stats);
        struct net_port_vlans *vlans;
        struct net_device *ingress;

        u64_stats_update_begin(&rx_stats->syncp);
        rx_stats->rx_packets++;
        rx_stats->rx_bytes += skb->len;
        u64_stats_update_end(&rx_stats->syncp);

        /* Bridge is just like any other port.  Make sure the
         * packet is allowed except in promisc mode when someone
         * may be running packet capture.
         */
        vlans = br_get_vlan_info(br);
        if (!(bridge_dev->flags & IFF_PROMISC)) {
            if (!br_allowed_egress(br, vlans, skb)) {
                kfree_skb(skb);
                return NET_RX_DROP;
            }
        }

        /* Swap the skb's device for the bridge device. Per the article's
         * analysis, the bridge device has no br_handle_frame rx handler
         * registered, so netif_receive_skb can deliver the frame to the
         * protocol stack. */
        ingress = skb->dev;
        skb->dev = bridge_dev;
        skb = br_handle_vlan(br, vlans, skb);
        if (!skb)
            return NET_RX_DROP;

        return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, ingress, NULL,
                   netif_receive_skb);
    }
    

    5. 网桥设备发送报文流程

    处理比较简单,广播/组播/未知单播报文,flood到所有端口。
    能查找到fdb表项的单播报文,从表项的出端口发送出去。
    /*
     * br_dev_xmit - transmit path of the bridge device itself.
     * @skb: frame handed to the bridge device for transmission
     * @dev: the bridge net_device
     *
     * Broadcast and unknown-unicast frames are flooded via br_flood_deliver();
     * multicast frames go through the multicast database checks; a unicast frame
     * with an fdb entry is delivered to that entry's port via br_deliver().
     * Runs under rcu_read_lock() and returns NETDEV_TX_OK on every path shown.
     *
     * NOTE: this listing is annotated. The function's braces are omitted, and
     * the more deeply indented lines after br_flood_deliver() and br_deliver()
     * are callee bodies pasted inline for illustration, not part of this
     * function.
     */
    netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
        struct net_bridge *br = netdev_priv(dev);
        const unsigned char *dest = skb->data;
        struct net_bridge_fdb_entry *dst;
        struct net_bridge_mdb_entry *mdst;
        struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
        u16 vid = 0;
    
        rcu_read_lock();
    #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) {
            br_nf_pre_routing_finish_bridge_slow(skb);
            rcu_read_unlock();
            return NETDEV_TX_OK;
        }
    #endif
    
        /* Account the frame in the bridge's per-CPU tx counters. */
        u64_stats_update_begin(&brstats->syncp);
        brstats->tx_packets++;
        brstats->tx_bytes += skb->len;
        u64_stats_update_end(&brstats->syncp);
        /* Save the bridge device dev in the skb control block. */
        BR_INPUT_SKB_CB(skb)->brdev = dev;
    
        skb_reset_mac_header(skb);
        skb_pull(skb, ETH_HLEN);
        /* Ingress check against the bridge's own vlan info (per the original
         * note: passes when the vlan filter accepts the frame or vlan filtering
         * is off); on failure jump to out. */
        if (!br_allowed_ingress(br, br_get_vlan_info(br), skb, &vid))
            goto out;
    
        if (is_broadcast_ether_addr(dest))
            /* Broadcast frame: send to all ports on the bridge. */
            br_flood_deliver(br, skb, false);
                /* (inline expansion of br_flood_deliver, for illustration) */
                br_flood(br, skb, NULL, __br_deliver, unicast);
        else if (is_multicast_ether_addr(dest)) {
            /* Multicast handling; not analysed in detail in this article. */
            if (unlikely(netpoll_tx_running(dev))) {
                br_flood_deliver(br, skb, false);
                goto out;
            }
            if (br_multicast_rcv(br, NULL, skb, vid)) {
                kfree_skb(skb);
                goto out;
            }
    
            mdst = br_mdb_get(br, skb, vid);
            if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
                br_multicast_querier_exists(br, eth_hdr(skb)))
                br_multicast_deliver(mdst, skb);
            else
                br_flood_deliver(br, skb, false);
        } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL)
            br_deliver(dst->dst, skb);
                /* (inline expansion of br_deliver / __br_deliver, for illustration)
                 * fdb entry found: after the netfilter hooks the frame is
                 * finally sent out of the NIC via dev_queue_xmit. */
                if (to && should_deliver(to, skb)) {
                    __br_deliver(to, skb);
                        skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
                        skb->dev = to->dev;
                        NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, br_forward_finish);
                            NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, br_dev_queue_push_xmit);
                                dev_queue_xmit(skb);
        else
            /* Unknown unicast: flood with unicast = true. */
            br_flood_deliver(br, skb, true);
    
    out:
        rcu_read_unlock();
        return NETDEV_TX_OK;
    

    相关文章

      网友评论

          本文标题:linux bridge - mac转发

          本文链接:https://www.haomeiwen.com/subject/xhwqvktx.html