美文网首页
dpvs学习笔记: 4 nat 完整流程

dpvs学习笔记: 4 nat 完整流程

作者: 董泽润 | 来源:发表于2018-10-31 16:25 被阅读123次

    Nat 用途很广,家里的宽带就是这种模式,将局域网的私有地址转换成公网地址。没有 dr 二层的限制,但是 nat 也有缺点,需要配置路由或是指定为 real server 的网关,同时也会有性能扩展问题。


    nat模式

    对于进入的流量,实际上做的是 dnat, 将目标 ip 由 lb ip 换成真正的 rs ip, 此时后端 rs 是能拿到 client ip 的。返回的流量做 snat, 将源地址换成 lb ip.

    三层处理 ipv4_rcv

    数据接收和上文都是一样的,直接看 ipv4_rcv

    INET_HOOK(INET_HOOK_PRE_ROUTING, mbuf, port, NULL, ipv4_rcv_fin);
    

    INET_HOOK_PRE_ROUTING 上注册了两个函数:dp_vs_pre_routing 和 dp_vs_in。由于 nat 不做 syn_proxy,所以直接看 dp_vs_in

    /*
     * dp_vs_in - excerpt of the hook that handles packets for dpvs
     * connections (the article registers it on INET_HOOK_PRE_ROUTING).
     * Lines marked "......" are elided from the original source.
     *
     * Visible flow: look up the L4 protocol, look up an existing
     * connection, schedule a new one if none exists, run the protocol
     * state transition, then transmit inbound or outbound.
     *
     * Returns INET_ACCEPT when no protocol handler is found, INET_DROP
     * when conn_lookup sets the drop flag, the verdict from conn_sched
     * when scheduling fails, otherwise the result of xmit_inbound() or
     * xmit_outbound().
     */
    static int dp_vs_in(void *priv, struct rte_mbuf *mbuf, 
                        const struct inet_hook_state *state)
    {
        struct dp_vs_iphdr iph;
        struct dp_vs_proto *prot;
        struct dp_vs_conn *conn;
        int dir, af, verdict, err, related;
        bool drop = false;
        eth_type_t etype = mbuf->packet_type; /* FIXME: use other field ? */
        /* NOTE(review): mbuf is already dereferenced in the initializer
         * above, before this assert gets to check it. */
        assert(mbuf && state);
       ......
        /* iph is presumably filled in by the elided code above - confirm */
        prot = dp_vs_proto_lookup(iph.proto);
        if (unlikely(!prot))
            return INET_ACCEPT;
        /* packet belongs to existing connection ? */
        conn = prot->conn_lookup(prot, &iph, mbuf, &dir, false, &drop);
    
        if (unlikely(drop)) {
            RTE_LOG(DEBUG, IPVS, "%s: deny ip try to visit.\n", __func__);
            return INET_DROP;
        }
        /* not found: call conn_sched to set up a connection to a real server */
        if (unlikely(!conn)) {
            /* try schedule RS and create new connection */
            if (prot->conn_sched(prot, &iph, mbuf, &conn, &verdict) != EDPVS_OK) {
                /* RTE_LOG(DEBUG, IPVS, "%s: fail to schedule.\n", __func__); */
                /* verdict is passed to conn_sched by address; presumably
                 * it is set there on failure - confirm */
                return verdict;
            }
    
            /* only SNAT triggers connection by inside-outside traffic. */
            if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT)
                dir = DPVS_CONN_DIR_OUTBOUND;
            else
                dir = DPVS_CONN_DIR_INBOUND;
        }
       ......
        /* state transition is optional per protocol; a failure is only
         * logged and the packet is still transmitted below */
        if (prot->state_trans) {
            err = prot->state_trans(prot, conn, mbuf, dir);
            if (err != EDPVS_OK)
                /* NOTE(review): unlike the DEBUG log above, this message
                 * has no trailing "\n" */
                RTE_LOG(WARNING, IPVS, "%s: fail to trans state.", __func__);
        }
        conn->old_state = conn->state;
    
        /* holding the conn, need a "put" later. */
        if (dir == DPVS_CONN_DIR_INBOUND)
            return xmit_inbound(mbuf, prot, conn);
        else
            return xmit_outbound(mbuf, prot, conn);
    }
    

    忽略部分源码,一共五步操作

    1. dp_vs_proto_lookup 获取四层处理协议,以 tcp 为例
    2. conn_lookup 在流表中查找连接,有时叫 session 也可以
    3. conn_sched 如果不存在 conn,那么一定是新来的请求,调度
    4. state_trans 状态转移
    5. xmit_inbound 或是 xmit_outbound 根据不同方向的流量将数据写回网卡

    新请求绑定 nat 回调

    上文介绍 dr 时,讲到 conn_sched 会根据一定算法选择后端 rs 建立连接。最重要的一步操作就是 conn_bind_dest

        switch (dest->fwdmode) {
        case DPVS_FWD_MODE_NAT:
            conn->packet_xmit = dp_vs_xmit_nat;
            conn->packet_out_xmit = dp_vs_out_xmit_nat;
            break;
        case DPVS_FWD_MODE_TUNNEL:
            conn->packet_xmit = dp_vs_xmit_tunnel;
            break;
        case DPVS_FWD_MODE_DR:
            conn->packet_xmit = dp_vs_xmit_dr;
            break;
        case DPVS_FWD_MODE_FNAT:
            conn->packet_xmit = dp_vs_xmit_fnat;
            conn->packet_out_xmit = dp_vs_out_xmit_fnat;
            break;
        case DPVS_FWD_MODE_SNAT:
            conn->packet_xmit = dp_vs_xmit_snat;
            conn->packet_out_xmit = dp_vs_out_xmit_snat;
            break;
        default:
            return EDPVS_NOTSUPP;
        }
    

    可以看到当前 dpvs 支持 nat, tunnel, dr, fullnat, snat.

    进入流量处理 dp_vs_xmit_nat

    int dp_vs_xmit_nat(struct dp_vs_proto *proto,
                       struct dp_vs_conn *conn,
                       struct rte_mbuf *mbuf)
    {
        struct flow4 fl4;
        struct ipv4_hdr *iph = ip4_hdr(mbuf);
        struct route_entry *rt;
        int err, mtu;
    
        if (!fast_xmit_close && !(conn->flags & DPVS_CONN_F_NOFASTXMIT)) {
            dp_vs_save_xmit_info(mbuf, proto, conn);
            if (!dp_vs_fast_xmit_nat(proto, conn, mbuf)) {
                return EDPVS_OK;
            }
        }
    
        /*
         * drop old route. just for safe, because
         * NAT is PREROUTING, should not have route.
         */
        if (unlikely(mbuf->userdata != NULL)) {
            RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\n",
                    __func__, mbuf->userdata);
            route4_put((struct route_entry*)mbuf->userdata);
        }
    
        memset(&fl4, 0, sizeof(struct flow4));
        fl4.daddr = conn->daddr.in;
        fl4.saddr = conn->caddr.in;
        fl4.tos = iph->type_of_service;
        rt = route4_output(&fl4);
        if (!rt) {
            err = EDPVS_NOROUTE;
            goto errout;
        }
    

    这里最重要的就是 route4_output 查找路由

    
        dp_vs_conn_cache_rt(conn, rt, true);
    
        mtu = rt->mtu;
        if (mbuf->pkt_len > mtu
                && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) {
            RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__);
            err = EDPVS_FRAG;
            goto errout;
        }
    
        mbuf->userdata = rt;
    

    将查到的路由赋给 mbuf->userdata,供后面的发送流程使用

        /* after route lookup and before translation */
        if (xmit_ttl) {
            if (unlikely(iph->time_to_live <= 1)) {
                icmp_send(mbuf, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
                err = EDPVS_DROP;
                goto errout;
            }
    
            iph->time_to_live--;
        }
    
        /* L3 translation before l4 re-csum */
        iph->hdr_checksum = 0;
        iph->dst_addr = conn->daddr.in.s_addr;
    

    注意这里 iph->dst_addr = conn->daddr.in.s_addr 将目标地址换成了后端 rs 地址

        /* L4 NAT translation */
        if (proto->fnat_in_handler) {
            err = proto->nat_in_handler(proto, conn, mbuf);
            if (err != EDPVS_OK)
                goto errout;
        }
    

    L4 nat 处理。由于是 tcp 协议,查看 dp_vs_proto_tcp 变量得知这里会调用 tcp_snat_in_handler。另外注意,上面的代码判空时检查的是 fnat_in_handler,实际调用的却是 nat_in_handler,两者并不一致,阅读时需要留意。

        if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) {
            iph->hdr_checksum = 0;
        } else {
            ip4_send_csum(iph);
        }
    
        return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output);
    
    errout:
        if (rt)
            route4_put(rt);
        rte_pktmbuf_free(mbuf);
        return err;
    }
    

    这里会依次执行 INET_HOOK_LOCAL_OUT 链上注册的回调;查看源码,这条链上没有注册任何回调,所以最后直接调用 ipv4_output

    进入流量处理 tcp_snat_in_handler

    static int tcp_snat_in_handler(struct dp_vs_proto *proto,
                                   struct dp_vs_conn *conn, struct rte_mbuf *mbuf)
    {
        struct tcphdr *th;
        int ip4hlen = ip4_hdrlen(mbuf);
        struct netif_port *dev = NULL;
        struct route_entry *rt = mbuf->userdata;
    
        if (mbuf_may_pull(mbuf, ip4hlen + sizeof(*th)) != 0)
            return EDPVS_INVPKT;
    
        th = tcp_hdr(mbuf);
        if (unlikely(!th))
            return EDPVS_INVPKT;
    
        if (mbuf_may_pull(mbuf, ip4hlen + (th->doff<<2)) != 0)
            return EDPVS_INVPKT;
    
        /* L4 translation */
        th->dest = conn->dport;
    

    注意这里 th->dest = conn->dport 将目标端口换成了 rs port

        /* L4 re-checksum */
        if (rt && rt->port)
            dev = rt->port;
    
        /* leverage HW TX TCP csum offload if possible */
        if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) {
            mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - ip4hlen;
            mbuf->l3_len = ip4hlen;
            mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4);
            th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags);
        } else {
            if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)
                return EDPVS_INVPKT;
            tcp4_send_csum(ip4_hdr(mbuf), th);
        }
    
        return EDPVS_OK;
    }
    

    因为修改了数据包内容,所以 checksum 也要重新计算

    进入流量处理 ipv4_output

    /*
     * ipv4_output - run an outgoing IPv4 packet through the
     * INET_HOOK_POST_ROUTING chain, with ipv4_output_fin as the final step.
     *
     * mbuf->userdata must already hold the route entry (enforced with
     * assert); rt->port is handed to the hook as its fourth argument,
     * presumably the output device - confirm against INET_HOOK.
     * Returns whatever INET_HOOK() yields.
     */
    int ipv4_output(struct rte_mbuf *mbuf)
    {
        struct route_entry *rt = mbuf->userdata;
        assert(rt);
    
        /* presumably accounts pkt_len in the "out" IPv4 counters - confirm */
        IP4_UPD_PO_STATS(out, mbuf->pkt_len);
    
        return INET_HOOK(INET_HOOK_POST_ROUTING, mbuf,
                NULL, rt->port, ipv4_output_fin);
    }
    

    查看源码并没有 INET_HOOK_POST_ROUTING 回调,所以直接调用 ipv4_output_fin

    /*
     * ipv4_output_fin - last step before ipv4_output_fin2: fragment the
     * packet when it is longer than the route MTU, otherwise pass it on
     * unchanged.
     *
     * Returns the result of ipv4_fragment() or ipv4_output_fin2().
     *
     * NOTE(review): rt is read from mbuf->userdata and dereferenced without
     * a NULL check here; the assert(rt) is in ipv4_output.
     */
    static int ipv4_output_fin(struct rte_mbuf *mbuf)
    {
        struct route_entry *rt = mbuf->userdata;
    
        /* too long for the route MTU: ipv4_fragment is given
         * ipv4_output_fin2, presumably to send each fragment - confirm */
        if (mbuf->pkt_len > rt->mtu)
            return ipv4_fragment(mbuf, rt->mtu, ipv4_output_fin2);
    
        return ipv4_output_fin2(mbuf);
    }
    

    如果包长度大于 mtu,那么要分片发送,正常走 ipv4_output_fin2 逻辑,最后调用 neigh_resolve_output 发送数据到网卡。

    返回流量处理 dp_vs_out_xmit_nat

    int dp_vs_out_xmit_nat(struct dp_vs_proto *proto,
                       struct dp_vs_conn *conn,
                       struct rte_mbuf *mbuf)
    {
        struct flow4 fl4;
        struct ipv4_hdr *iph = ip4_hdr(mbuf);
        struct route_entry *rt;
        int err, mtu;
       ...
        /* L3 translation before l4 re-csum */
        iph->hdr_checksum = 0;
        iph->src_addr = conn->vaddr.in.s_addr;
    

    这里省略部分代码,最重要的就是 iph->src_addr = conn->vaddr.in.s_addr 设置源地址为 lb ip.

        /* L4 NAT translation */
        if (proto->fnat_in_handler) {
            err = proto->nat_out_handler(proto, conn, mbuf);
            if (err != EDPVS_OK)
                goto errout;
        }
    
        if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) {
            iph->hdr_checksum = 0;
        } else {
            ip4_send_csum(iph);
        }
    
        return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output);
    }
    

    调用 nat_out_handler 处理数据,查看源码可知,tcp 协议对应的回调是 tcp_snat_out_handler 函数。同样需要注意,上面判空时检查的是 fnat_in_handler,而实际调用的是 nat_out_handler。

    /*
     * tcp_snat_out_handler - L4 part of the NAT return path for TCP
     * (excerpt; the declarations of th, rt, dev and ip4hlen are elided
     * by "...").
     *
     * Rewrites the TCP source port to conn->vport and then refreshes the
     * TCP checksum, using NIC TX offload when the output port advertises
     * it and a software checksum otherwise.
     *
     * Returns EDPVS_OK on success, or EDPVS_INVPKT when the whole packet
     * cannot be pulled for the software checksum.
     */
    static int tcp_snat_out_handler(struct dp_vs_proto *proto,
                                    struct dp_vs_conn *conn, struct rte_mbuf *mbuf)
    {
       ...
        /* L4 translation */
        th->source = conn->vport;
    
        /* L4 re-checksum */
        /* dev is only set when a route with a port is attached */
        if (rt && rt->port)
            dev = rt->port;
    
        /* leverage HW TX TCP csum offload if possible */
        if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) {
            /* describe the header layout and request IP + TCP checksum
             * offload; th->check is seeded with rte_ipv4_phdr_cksum()
             * (presumably the pseudo-header sum the NIC completes) */
            mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - ip4hlen;
            mbuf->l3_len = ip4hlen;
            mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4);
            th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags);
        } else {
            /* software path: the full packet must be pullable before
             * tcp4_send_csum() is called on it */
            if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)
                return EDPVS_INVPKT;
            tcp4_send_csum(ip4_hdr(mbuf), th);
        }
    
        return EDPVS_OK;
    }
    

    省略部分源码,这里最重要的就是 th->source = conn->vport 设置源端口为 lb port。
    在 nat 的最后也是调用 ipv4_output 将数据写回网卡,完成返回流量的转发。

    小结

    由于有上一篇的存在,所以本文代码较少,可以看到 nat 实现还是很简洁明了的。

    相关文章

      网友评论

          本文标题:dpvs学习笔记: 4 nat 完整流程

          本文链接:https://www.haomeiwen.com/subject/gvnotqtx.html