美文网首页
ovs conntrack及nat

ovs conntrack及nat

作者: 分享放大价值 | 来源:发表于2021-04-25 16:40 被阅读0次

    本文分析ovs中对conntrack的支持,分为命令行解析ct action及ovs-vswitchd端对conntrack和nat的处理流程。

    根据datapath不同,实现也不一样,但是原理都类似。对于kernel datapath来说,使用kernel的conntrack来实现,对于userspace datapath来说,ovs本身来实现,可参考 lib/conntrack.c 文件。

    conntrack相关字段

    匹配域

    可参考 http://openvswitch.org/support/dist-docs/ovs-fields.7.txt

    1. ct_state 连接状态,可能的值如下
    new 通过ct action指定报文经过conntrack模块处理,不一定有commit。通常是数据流的第一个数据包
    est 表示conntrack模块看到了报文双向数据流,一定是在commit 的conntrack后
    rel 表示和已经存在的conntrack相关,比如icmp不可达消息或者ftp的数据流
    rpl 表示反方向的报文
    inv 无效的,表示conntrack模块没有正确识别到报文,比如L3/L4 protocol handler没有加载,或者L3/L4 protocol handler认为报文错误
    trk 表示报文经过了conntrack模块处理,如果这个flag不设置,其他flag都不能被设置
    snat 表示报文经过了snat,源ip或者port
    dnat 表示报文经过了dnat,目的ip或者port
    

    这些flag得结合"+"或者"-"来使用,"+"表示必须匹配,"-"表示必须不匹配。可以同时指定多个flag,比如 ct_state=+trk+new。

    数据包经过ct模块处理了就会设置状态 trk。什么叫经过ct模块处理?流表的action指定了ct,并且报文通过了协议验证。
    pkt->md.ct_state = CS_TRACKED

    什么是 commit?只有ct的action有了commit,才会在内存中建立connection

    2. ct_zone zone用来隔离连接跟踪表项,可以通过ct zone action来设置
    3. ct_mark 32位的值,可以通过 ct exec(set_field: 1->ct_mark)来设置。报文第一次匹配后,通过此action设置ct_mark到报文的metadata,重新注入datapath时,用来匹配流表指定的ct_mark。
    4. ct_label 128位的值,可以通过 ct exec(set_field: 1->ct_label)来设置,用法和ct_mark类似
    5. ct_nw_src / ct_ipv6_src 用来匹配conntrack表项原始方向的源ip
    6. ct_nw_dst / ct_ipv6_dst 用来匹配conntrack表项原始方向的目的ip
    7. ct_nw_proto 用来匹配conntrack表项原始方向的协议类型
    8. ct_tp_src 用来匹配conntrack表项原始方向的源端口号
    9. ct_tp_dst 用来匹配conntrack表项原始方向的目的端口号

    上面的匹配域和struct flow中的以下字段一一对应,用来匹配流表

    struct flow {
        ...
        uint8_t ct_state;           /* Connection tracking state. */
        uint8_t ct_nw_proto;        /* CT orig tuple IP protocol. */
        uint16_t ct_zone;           /* Connection tracking zone. */
        uint32_t ct_mark;           /* Connection mark.*/
        ovs_be32 ct_nw_src;         /* CT orig tuple IPv4 source address. */
        ovs_be32 ct_nw_dst;         /* CT orig tuple IPv4 destination address. */
        struct in6_addr ct_ipv6_src; /* CT orig tuple IPv6 source address. */
        struct in6_addr ct_ipv6_dst; /* CT orig tuple IPv6 destination address. */
        ovs_be16 ct_tp_src;         /* CT original tuple source port/ICMP type. */
        ovs_be16 ct_tp_dst;         /* CT original tuple dst port/ICMP code. */
        ...
    }
    

    动作

    ovs通过ct action实现conntrack,格式如下,ct会将报文送到conntrack模块进行处理
    ct([argument][,argument…])

    ct支持下面的参数

    commit 只有执行了commit,才会在conntrack模块创建conntrack表项
    force 强制删除已存在的conntrack表项
    table 跳转到指定的table执行
    zone 设置zone,隔离conntrack
    exec 执行其他action,目前只支持设置ct_mark和ct_label,比如exec(set_field: 1->ct_mark)
    alg=<ftp/tftp> 指定alg类型,目前只支持ftp和tftp
    nat 指定nat类型(src或dst)以及转换后的ip和port范围,比如nat(src=10.1.1.240-10.2.2.2:2222-3333)
    

    流表例子

    #添加nat表项
    ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, in_port=veth_l0, actions=ct(commit,nat(src=10.1.1.240-10.2.2.2:2222-3333))"
    
    //在一个ct里指定多次nat,只有最后一个nat生效,可参考do_xlate_actions中,ctx->ct_nat_action = ofpact_get_NAT(a)只有一个ctx->ct_nat_action 
    ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, actions=ct(commit,nat(src=10.1.1.240-10.2.2.2:2222-3333), nat(dst=10.1.1.240-10.2.2.2:2222-3333)), veth_r0"
    
    //可以通过指定多个ct,实现fullnat,即同时转换源目的ip。
    //但是这两个ct必须指定不同的zone,否则只有第一个ct生效。因为在 handle_nat 中,判断只有zone不一样才会进行后续的nat操作
    //错误方式,指定了src和dst nat,但是zone相同,只有前面的snat生效
    ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, actions=ct(commit,nat(src=10.1.1.240-10.2.2.2:2222-3333)), ct(commit,nat(dst=10.1.1.240-10.2.2.2:2222-3333)), veth_r0"
    
    //正确方式,使用不同zone,指定fullnat
    ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, actions=ct(commit,zone=100, nat(src=10.1.1.240-10.2.2.2:2222-3333)), ct(commit, zone=200, nat(dst=10.1.1.240-10.2.2.2:2222-3333)), veth_r0"
    

    源码分析

    命令行解析ct参数

    比如下面这条流表,通过ct_state匹配没经过conntrack处理的报文,一般刚被ovs接收的报文都能匹配到,执行的action是ct,其参数为commit和nat,表示需要创建conntrack表项,同时对报文做snat。
    ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, in_port=veth_l0, actions=ct(commit,nat(src=10.1.1.240-10.2.2.2:2222-3333))"

    重点分析下命令行解析ct action的代码。
    先看下面两个结构体,struct ofpact_conntrack用来保存ct后面的参数,并使用另一个结构体struct ofpact_nat专门保存ct的nat信息。

    /* OFPACT_NAT.
     *
     * Used for NXAST_NAT.
     *
     * Holds the parsed arguments of a "nat(...)" argument nested in a "ct"
     * action: the NAT flags plus an optional address/port range. */
    struct ofpact_nat {
        struct ofpact ofpact; // action header; its type is OFPACT_NAT
        uint8_t range_af; /* AF_UNSPEC, AF_INET, or AF_INET6 */
        uint16_t flags;  /* NX_NAT_F_* */
        struct {
            /* NOTE(review): presumably the L4 port range (min..max) -- the
             * datapath side copies OVS_NAT_ATTR_PROTO_MIN/MAX into
             * min_port/max_port; confirm byte order against the encoder. */
            struct {
                uint16_t min;
                uint16_t max;
            } proto;
            /* Address range (min..max); presumably 'range_af' selects which
             * union member is valid -- TODO confirm. */
            union {
                struct {
                    ovs_be32 min;
                    ovs_be32 max;
                } ipv4;
                struct {
                    struct in6_addr min;
                    struct in6_addr max;
                } ipv6;
            } addr;
        } range;
    };
    
    /* OFPACT_CT.
     *
     * Used for NXAST_CT.
     *
     * Holds the parsed arguments of a "ct(...)" action.  Nested actions (the
     * "nat" and "exec" arguments) are stored directly after this header in
     * 'actions'. */
    struct ofpact_conntrack {
        OFPACT_PADDED_MEMBERS(
            struct ofpact ofpact; // e.g. as dumped in gdb: {ofpact = {type = OFPACT_CT, raw = 255 '\377', len = 32}
            uint16_t flags; // NX_CT_F_COMMIT and NX_CT_F_FORCE
            uint16_t zone_imm; // zone given as an immediate number
            struct mf_subfield zone_src; // zone given as a field reference (parse_CT fills it when the value is not a number)
            uint16_t alg; // ALG (connection helper) type
            uint8_t recirc_table; // table to jump to; NX_CT_RECIRC_NONE when "table" is not given
        );
        /* Nested actions, e.g. the NAT info: struct ofpact_nat {type = OFPACT_NAT, raw = 255 '\377', len = 48}.
         * NOTE(review): a zero-length array is a compiler extension; the C99
         * spelling would be a flexible array member -- left as quoted. */
        struct ofpact actions[0];
    };
    

    解析命令行参数 ct 指定的action,保存到 struct ofpact_conntrack,如果同时指定了nat,则nat信息保存在struct ofpact_nat,位置在 struct ofpact_conntrack->actions,最后会更新 struct ofpact_conntrack->ofpact.len 为总长度

    /* Parses 'arg', the text inside "ct(...)", into a struct ofpact_conntrack
     * appended to 'ofpacts'.  Nested "nat(...)" and "exec(...)" arguments are
     * parsed into further actions placed right after the CT header, and the CT
     * action is finished with ofpact_finish_CT() once all arguments have been
     * consumed.
     *
     * Recognized keys: "commit", "force", "table", "zone", "alg", "nat", "exec".
     *
     * Returns NULL on success, otherwise an error string (such strings are
     * released with free(), as done below for the discarded str_to_u16() error).
     * Every error path leaves the argument loop through 'break', so the CT
     * action is always finished and the 'ct_offset' bytes pulled at entry are
     * always pushed back before returning. */
    static char * OVS_WARN_UNUSED_RESULT
    parse_CT(char *arg, const struct ofputil_port_map *port_map,
             struct ofpbuf *ofpacts, enum ofputil_protocol *usable_protocols)
    {
        /* Pull off whatever is already in 'ofpacts'; pushed back before return. */
        const size_t ct_offset = ofpacts_pull(ofpacts);
        struct ofpact_conntrack *oc;
        char *error = NULL;
        char *key, *value;

        /* ofpact_put_CT() is generated by the OFPACT() macro (ofpact_put_##ENUM)
         * in include/openvswitch/ofp-actions.h; it appends an OFPACT_CT action. */
        oc = ofpact_put_CT(ofpacts);
        oc->flags = 0;
        oc->recirc_table = NX_CT_RECIRC_NONE;
        while (ofputil_parse_key_value(&arg, &key, &value)) {
            if (!strcmp(key, "commit")) {
                oc->flags |= NX_CT_F_COMMIT;
            } else if (!strcmp(key, "force")) {
                oc->flags |= NX_CT_F_FORCE;
            } else if (!strcmp(key, "table")) {
                error = str_to_u8(value, "recirc_table", &oc->recirc_table);
                /* NX_CT_RECIRC_NONE is reserved to mean "no recirculation". */
                if (!error && oc->recirc_table == NX_CT_RECIRC_NONE) {
                    error = xasprintf("invalid table %#"PRIx8, oc->recirc_table);
                }
            } else if (!strcmp(key, "zone")) {
                error = str_to_u16(value, "zone", &oc->zone_imm);

                if (error) {
                    /* Not a plain number: retry as a field reference.  A
                     * failure here is reported through the common
                     * 'if (error) break;' below instead of an early return,
                     * so the cleanup after the loop is not skipped. */
                    free(error);
                    error = mf_parse_subfield(&oc->zone_src, value);
                }
            } else if (!strcmp(key, "alg")) {
                error = str_to_connhelper(value, &oc->alg);
            } else if (!strcmp(key, "nat")) {
                const size_t nat_offset = ofpacts_pull(ofpacts);
                /* Parse the nat arguments into an OFPACT_NAT action. */
                error = parse_NAT(value, port_map, ofpacts, usable_protocols);
                /* Update CT action pointer and length. */
                ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
                oc = ofpacts->header;
            } else if (!strcmp(key, "exec")) {
                /* Hide existing actions from ofpacts_parse_copy(), so the
                 * nesting can be handled transparently. */
                enum ofputil_protocol usable_protocols2;
                const size_t exec_offset = ofpacts_pull(ofpacts);

                /* Initializes 'usable_protocol2', fold it back to
                 * '*usable_protocols' afterwards, so that we do not lose
                 * restrictions already in there. */
                /* Parses the exec argument, e.g. the set_field in
                 * ct(commit,exec(set_field:1->ct_mark)): ct_mark after "->" is
                 * the key and the 1 before it is the value (handled by
                 * parse_SET_FIELD). */
                error = ofpacts_parse_copy(value, port_map, ofpacts, &usable_protocols2, false, OFPACT_CT);
                *usable_protocols &= usable_protocols2;
                ofpacts->header = ofpbuf_push_uninit(ofpacts, exec_offset);
                oc = ofpacts->header;
            } else {
                error = xasprintf("invalid argument to \"ct\" action: `%s'", key);
            }
            if (error) {
                break;
            }
        }
        if (!error && oc->flags & NX_CT_F_FORCE && !(oc->flags & NX_CT_F_COMMIT)) {
            error = xasprintf("\"force\" flag requires \"commit\" flag.");
        }
        /* Update struct ofpact_conntrack->ofpact.len so that it also covers the
         * nested nat/exec actions. */
        ofpact_finish_CT(ofpacts, &oc);
        ofpbuf_push_uninit(ofpacts, ct_offset);
        return error;
    }
    
    /* Parses 'arg', the text inside "nat(...)", into a struct ofpact_nat
     * appended to 'ofpacts'.
     *
     * Recognized keys: "src" and "dst" (each followed by a range handed to
     * str_to_nat_range()), plus the flags "persistent", "hash" and "random".
     *
     * Returns NULL on success or an error string when a key is unknown, a range
     * fails to parse, or the combination of keys is rejected by the checks at
     * the end.  'port_map' and 'usable_protocols' are unused. */
    static char * OVS_WARN_UNUSED_RESULT
    parse_NAT(char *arg,
              const struct ofputil_port_map *port_map OVS_UNUSED,
              struct ofpbuf *ofpacts,
              enum ofputil_protocol *usable_protocols OVS_UNUSED)
    {
        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
        char *name, *val;

        nat->range_af = AF_UNSPEC;
        nat->flags = 0;

        while (ofputil_parse_key_value(&arg, &name, &val)) {
            char *err = NULL;

            if (!strcmp(name, "src")) {
                nat->flags |= NX_NAT_F_SRC;
                err = str_to_nat_range(val, nat);
            } else if (!strcmp(name, "dst")) {
                nat->flags |= NX_NAT_F_DST;
                err = str_to_nat_range(val, nat);
            } else if (!strcmp(name, "persistent")) {
                nat->flags |= NX_NAT_F_PERSISTENT;
            } else if (!strcmp(name, "hash")) {
                nat->flags |= NX_NAT_F_PROTO_HASH;
            } else if (!strcmp(name, "random")) {
                nat->flags |= NX_NAT_F_PROTO_RANDOM;
            } else {
                err = xasprintf("invalid key \"%s\" in \"nat\" argument",
                                name);
            }

            if (err) {
                return err;
            }
        }

        /* Validate the combination of keys that was collected. */
        const int src_set = (nat->flags & NX_NAT_F_SRC) != 0;
        const int dst_set = (nat->flags & NX_NAT_F_DST) != 0;

        if (src_set && dst_set) {
            return xasprintf("May only specify one of \"src\" or \"dst\".");
        }
        if (!src_set && !dst_set) {
            /* Without a direction, neither flags nor a range make sense. */
            if (nat->flags) {
                return xasprintf("Flags allowed only with \"src\" or \"dst\".");
            }
            if (nat->range_af != AF_UNSPEC) {
                return xasprintf("Range allowed only with \"src\" or \"dst\".");
            }
        }
        if ((nat->flags & NX_NAT_F_PROTO_HASH)
            && (nat->flags & NX_NAT_F_PROTO_RANDOM)) {
            return xasprintf("Both \"hash\" and \"random\" are not allowed.");
        }

        return NULL;
    }
    

    解析成功后,可能的格式如下,其中struct ofpact_conntrack后面紧跟着其他嵌套的action,struct ofpact_conntrack->ofpact.len指定了ct参数总长度,包含nat和set_field的长度。struct ofpact_conntrack结构肯定是在前面,struct ofpact_nat和struct ofpact_set_field根据命令行指定的顺序而定,可以指定多次。

    struct ofpact_conntrack(OFPACT_CT) + struct ofpact_nat(OFPACT_NAT) + struct ofpact_set_field(OFPACT_SET_FIELD)
    

    辅助函数
    在上面解析代码中,有一些函数定义需要经过宏展开后才能看到,比如ofpact_put_CT,ofpact_finish_CT和ofpact_put_NAT
    这些函数都是在头文件./include/openvswitch/ofp-actions.h中定义的。宏OFPACT定义了五个函数,用来根据action类型进行操作。

    /* For each action listed in OFPACTS, OFPACT(ENUM, STRUCT, MEMBER, NAME)
     * generates:
     *
     *   - a build-time assertion that 'ofpact' is the first member of
     *     struct STRUCT;
     *   - OFPACT_<ENUM>_SIZE: offsetof(struct STRUCT, MEMBER) when that offset
     *     is nonzero, otherwise OFPACT_ALIGN(sizeof(struct STRUCT));
     *   - ofpact_get_<ENUM>(): asserts the action type and casts a generic
     *     struct ofpact pointer to struct STRUCT;
     *   - ofpact_get_<ENUM>_nullable(): the same, but also accepts NULL;
     *   - ofpact_put_<ENUM>(): calls ofpact_put() with OFPACT_<ENUM> and
     *     OFPACT_<ENUM>_SIZE and returns the result as struct STRUCT;
     *   - ofpact_init_<ENUM>(): calls ofpact_init() on an existing struct STRUCT;
     *   - ofpact_finish_<ENUM>(): asserts the type, calls ofpact_finish() and
     *     stores the returned pointer back through 'ofpactp'.
     *
     * NAME is not used by this definition of the macro. */
    #define OFPACT(ENUM, STRUCT, MEMBER, NAME)                              \
        BUILD_ASSERT_DECL(offsetof(struct STRUCT, ofpact) == 0);            \
                                                                            \
        enum { OFPACT_##ENUM##_SIZE                                         \
               = (offsetof(struct STRUCT, MEMBER)                           \
                  ? offsetof(struct STRUCT, MEMBER)                         \
                  : OFPACT_ALIGN(sizeof(struct STRUCT))) };                 \
                                                                            \
        static inline struct STRUCT *                                       \
        ofpact_get_##ENUM(const struct ofpact *ofpact)                      \
        {                                                                   \
            ovs_assert(ofpact->type == OFPACT_##ENUM);                      \
            return ALIGNED_CAST(struct STRUCT *, ofpact);                   \
        }                                                                   \
                                                                            \
        static inline struct STRUCT *                                       \
        ofpact_get_##ENUM##_nullable(const struct ofpact *ofpact)           \
        {                                                                   \
            ovs_assert(!ofpact || ofpact->type == OFPACT_##ENUM);           \
            return ALIGNED_CAST(struct STRUCT *, ofpact);                   \
        }                                                                   \
                                                                            \
        static inline struct STRUCT *                                       \
        ofpact_put_##ENUM(struct ofpbuf *ofpacts)                           \
        {                                                                   \
            return (struct STRUCT *) ofpact_put(ofpacts, OFPACT_##ENUM,     \
                                                OFPACT_##ENUM##_SIZE);      \
        }                                                                   \
                                                                            \
        static inline void                                                  \
        ofpact_init_##ENUM(struct STRUCT *ofpact)                           \
        {                                                                   \
            ofpact_init(&ofpact->ofpact, OFPACT_##ENUM,                     \
                        OFPACT_##ENUM##_SIZE);                              \
        }                                                                   \
                                                                            \
        static inline void                                                  \
        ofpact_finish_##ENUM(struct ofpbuf *ofpbuf, struct STRUCT **ofpactp) \
        {                                                                   \
            struct ofpact *ofpact = &(*ofpactp)->ofpact;                    \
            ovs_assert(ofpact->type == OFPACT_##ENUM);                      \
            *ofpactp = (struct STRUCT *) ofpact_finish(ofpbuf, ofpact);     \
        }
    OFPACTS
    #undef OFPACT
    
    OFPACTS 为如下的宏定义:
       ENUM                    STRUCT              MEMBER  NAME
    #define OFPACTS                                                         \
        /* Output. */                                                       \
        OFPACT(OUTPUT,          ofpact_output,      ofpact, "output")       \
        ...
        /* Header changes. */                                               \
        OFPACT(SET_FIELD,       ofpact_set_field,   ofpact, "set_field")    \
        ...
        OFPACT(CT,              ofpact_conntrack,   ofpact, "ct")           \
        OFPACT(CT_CLEAR,        ofpact_null,        ofpact, "ct_clear")     \
        OFPACT(NAT,             ofpact_nat,         ofpact, "nat")          \
    

    比如对于CT action来说,宏展开后为

        enum { OFPACT_CT_SIZE                                         \
               = (offsetof(struct ofpact_conntrack, ofpact)                           \
                  ? offsetof(struct ofpact_conntrack, ofpact)                         \
                  : OFPACT_ALIGN(sizeof(struct ofpact_conntrack))) };                 \
                                                                            \
        static inline struct ofpact_conntrack *                                       \
        ofpact_get_CT(const struct ofpact *ofpact)                      \
        {                                                                   \
            ovs_assert(ofpact->type == OFPACT_CT);                      \
            return ALIGNED_CAST(struct ofpact_conntrack *, ofpact);                   \
        }                                                                   \
                                                                            \
        static inline struct ofpact_conntrack *                                       \
        ofpact_get_CT_nullable(const struct ofpact *ofpact)           \
        {                                                                   \
            ovs_assert(!ofpact || ofpact->type == OFPACT_CT);           \
            return ALIGNED_CAST(struct ofpact_conntrack *, ofpact);                   \
        }                                                                   \
                                                                            \
        static inline struct ofpact_conntrack *                                       \
        ofpact_put_CT(struct ofpbuf *ofpacts)                           \
        {                                                                   \
            return (struct ofpact_conntrack *) ofpact_put(ofpacts, OFPACT_CT,     \
                                                OFPACT_CT_SIZE);      \
        }                                                                   \
                                                                            \
        static inline void                                                  \
        ofpact_init_CT(struct ofpact_conntrack *ofpact)                           \
        {                                                                   \
            ofpact_init(&ofpact->ofpact, OFPACT_CT,                     \
                        OFPACT_CT_SIZE);                              \
        }                                                                   \
                                                                            \
        static inline void                                                  \
        ofpact_finish_CT(struct ofpbuf *ofpbuf, struct ofpact_conntrack **ofpactp) \
        {                                                                   \
            struct ofpact *ofpact = &(*ofpactp)->ofpact;                    \
            ovs_assert(ofpact->type == OFPACT_CT);                      \
            *ofpactp = (struct ofpact_conntrack *) ofpact_finish(ofpbuf, ofpact);     \
        }
    

    ovs-vswitchd端处理

    ovs-vswitchd接收到命令行添加流表消息并解析后,添加到本地flow table中,等待匹配报文。

    slowpath解析ct action
    ovs接收到报文后,查找fastpath失败,继续slowpath查找,如果匹配到的流表的action为ct,处理流程如下

    /* Call trace (pseudo-code, not compilable C): how do_xlate_actions reaches
     * the ct handling. */
    do_xlate_actions
    const struct ofpact *a;
    OFPACT_FOR_EACH (a, ofpacts, ofpacts_len)
        switch (a->type)
        // the action is CT
        case OFPACT_CT:
            // ofpact_get_CT yields the struct ofpact_conntrack together with the actions nested after it
            compose_conntrack_action(ctx, ofpact_get_CT(a));
    

    将 struct ofpact_conntrack 结构中action信息转换到datapath能识别的action结构odp_actions中。

    /* Flattened call trace (pseudo-code, not compilable C): the bodies of the
     * functions called by compose_conntrack_action are shown indented under
     * their call sites, with parts elided.  It follows how one OpenFlow ct
     * action becomes the datapath actions OVS_ACTION_ATTR_CT and, when a
     * recirculation table is set, OVS_ACTION_ATTR_RECIRC. */
    static void
    compose_conntrack_action(struct xlate_ctx *ctx, struct ofpact_conntrack *ofc)
        // calls do_xlate_actions again on the nested actions to handle nat and ct_mark/ct_label
        do_xlate_actions(ofc->actions, ofpact_ct_get_action_len(ofc), ctx);
            // record the nat action in ctx->ct_nat_action; if nat was given several times only the last one takes effect
            case OFPACT_NAT:
                /* This will be processed by compose_conntrack_action(). */
                ctx->ct_nat_action = ofpact_get_NAT(a);
                break;
    
            // apply the ct_mark or ct_label set_field, storing it in flow->ct_mark / flow->ct_label
            case OFPACT_SET_FIELD:
                set_field = ofpact_get_SET_FIELD(a);
                mf = set_field->field;
    
                /* Set the field only if the packet actually has it. */
                if (mf_are_prereqs_ok(mf, flow, wc)) {
                    mf_mask_field_masked(mf, ofpact_set_field_mask(set_field), wc);
                    mf_set_flow_value_masked(mf, set_field->value,
                                             ofpact_set_field_mask(set_field),
                                             flow);
        if (ofc->zone_src.field) {
            zone = mf_get_subfield(&ofc->zone_src, &ctx->xin->flow);
        } else {
            zone = ofc->zone_imm;
        }
    
        // emit the first datapath action, OVS_ACTION_ATTR_CT
        // start of the OVS_ACTION_ATTR_CT nest
        ct_offset = nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_CT);
        if (ofc->flags & NX_CT_F_COMMIT) {
            nl_msg_put_flag(ctx->odp_actions, ofc->flags & NX_CT_F_FORCE ? OVS_CT_ATTR_FORCE_COMMIT : OVS_CT_ATTR_COMMIT);
            if (ctx->xbridge->support.ct_eventmask) {
                nl_msg_put_u32(ctx->odp_actions, OVS_CT_ATTR_EVENTMASK, OVS_CT_EVENTMASK_DEFAULT);
            }
        }
    
        nl_msg_put_u16(ctx->odp_actions, OVS_CT_ATTR_ZONE, zone);
        put_ct_mark(&ctx->xin->flow, ctx->odp_actions, ctx->wc);
            if (wc->masks.ct_mark) {
                struct {
                    uint32_t key;
                    uint32_t mask;
                } *odp_ct_mark;
    
                odp_ct_mark = nl_msg_put_unspec_uninit(odp_actions, OVS_CT_ATTR_MARK, sizeof(*odp_ct_mark));
                odp_ct_mark->key = flow->ct_mark & wc->masks.ct_mark;
                odp_ct_mark->mask = wc->masks.ct_mark;
            }
        put_ct_label(&ctx->xin->flow, ctx->odp_actions, ctx->wc);
    
        put_ct_helper(ctx, ctx->odp_actions, ofc);
            if (ofc->alg) {
                switch(ofc->alg) {
                case IPPORT_FTP:
                    nl_msg_put_string(odp_actions, OVS_CT_ATTR_HELPER, "ftp");
                    break;
                case IPPORT_TFTP:
                    nl_msg_put_string(odp_actions, OVS_CT_ATTR_HELPER, "tftp");
                    break;
                default:
                    xlate_report_error(ctx, "cannot serialize ct_helper %d", ofc->alg);
                    break;
                }
            }
    
        put_ct_nat(ctx);
            struct ofpact_nat *ofn = ctx->ct_nat_action;
            nat_offset = nl_msg_start_nested(ctx->odp_actions, OVS_CT_ATTR_NAT);
            if (ofn->flags & NX_NAT_F_SRC || ofn->flags & NX_NAT_F_DST) {
                nl_msg_put_flag(ctx->odp_actions, ofn->flags & NX_NAT_F_SRC
                                ? OVS_NAT_ATTR_SRC : OVS_NAT_ATTR_DST);
                if (ofn->flags & NX_NAT_F_PERSISTENT) {
                    nl_msg_put_flag(ctx->odp_actions, OVS_NAT_ATTR_PERSISTENT);
                }
                if (ofn->flags & NX_NAT_F_PROTO_HASH) {
                    nl_msg_put_flag(ctx->odp_actions, OVS_NAT_ATTR_PROTO_HASH);
                } else if (ofn->flags & NX_NAT_F_PROTO_RANDOM) {
                    nl_msg_put_flag(ctx->odp_actions, OVS_NAT_ATTR_PROTO_RANDOM);
                }
                ...
            }
            nl_msg_end_nested(ctx->odp_actions, nat_offset);
    
        ctx->ct_nat_action = NULL;
        // end of the OVS_ACTION_ATTR_CT nest
        nl_msg_end_nested(ctx->odp_actions, ct_offset);
    
        // if ct(table=x) was configured, a second datapath action OVS_ACTION_ATTR_RECIRC is needed
        // recirc_table holds the table id to continue in, e.g. actions=ct(table=0)
        // the value NX_CT_RECIRC_NONE means no recirculation is needed
        if (ofc->recirc_table == NX_CT_RECIRC_NONE) {
            /* If we do not recirculate as part of this action, hide the results of
             * connection tracking from subsequent recirculations. */
            ctx->conntracked = false;
        } else {
            /* Use ct_* fields from datapath during recirculation upcall. */
            ctx->conntracked = true;
            compose_recirculate_and_fork(ctx, ofc->recirc_table);
                uint32_t recirc_id;
                ctx->freezing = true;
                recirc_id = finish_freezing__(ctx, table);
                    struct frozen_state state = {
                        // the table id to jump to, i.e. recirc_table
                        .table_id = table,
                        .ofproto_uuid = ctx->xbridge->ofproto->uuid,
                        .stack = ctx->stack.data,
                        .stack_size = ctx->stack.size,
                        .mirrors = ctx->mirrors,
                        .conntracked = ctx->conntracked,
                        .xport_uuid = ctx->xin->xport_uuid,
                        .ofpacts = ctx->frozen_actions.data,
                        .ofpacts_len = ctx->frozen_actions.size,
                        .action_set = ctx->action_set.data,
                        .action_set_len = ctx->action_set.size,
                    };
                    frozen_metadata_from_flow(&state.metadata, &ctx->xin->flow);
    
                    // obtain a recirc_id and append it to odp_actions as one of the datapath actions
                    id = recirc_alloc_id_ctx(&state);
                        uint32_t hash = frozen_state_hash(state);
                        struct recirc_id_node *node = recirc_ref_equal(state, hash);
                        node = recirc_alloc_id__(state, hash);
                            struct recirc_id_node *node = xzalloc(sizeof *node);
                            node->hash = hash;
                            ovs_refcount_init(&node->refcount);
                            frozen_state_clone(CONST_CAST(struct frozen_state *, &node->state), state);
                            cmap_insert(&id_map, &node->id_node, node->id);
                            cmap_insert(&metadata_map, &node->metadata_node, node->hash);
                            return node;
                        node->id;
                    nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, id);
            ctx->conntracked = false;
        }
    

    通过上面的解析,openflow中的ct action会被转换成datapath使用的action,可能包含两种:OVS_ACTION_ATTR_CT和OVS_ACTION_ATTR_RECIRC。前者又包含了commit(OVS_CT_ATTR_COMMIT或OVS_CT_ATTR_FORCE_COMMIT)、zone(OVS_CT_ATTR_ZONE)、ct_mark(OVS_CT_ATTR_MARK)、ct_label(OVS_CT_ATTR_LABELS)、helper(OVS_CT_ATTR_HELPER)和nat(OVS_CT_ATTR_NAT)等信息;后者仅仅包含了recirc_id,报文重新注入datapath后通过它查找到table id,从而跳转到指定table继续执行。

    fastpath执行ct action
    在将上面获取到的action添加到datapath后,还需要立即对触发slowpath的报文执行action。

    packet_batch_per_flow_execute
        actions = dp_netdev_flow_get_actions(flow);
        dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow, actions->actions, actions->size, now);
            odp_execute_actions
                //遍历执行匹配流表的所有 actions
                NL_ATTR_FOR_EACH_UNSAFE (a, left, actions, actions_len)
                    int type = nl_attr_type(a);
                    //dp_execute_cb
                    dp_execute_action(dp, batch, a, may_steal);
                        //执行 ct action
                        case OVS_ACTION_ATTR_CT: {
                            NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a), nl_attr_get_size(a)) {
                                enum ovs_ct_attr sub_type = nl_attr_type(b);
                                switch(sub_type) {
                                case OVS_CT_ATTR_FORCE_COMMIT:
                                    force = true;
                                    /* fall through. */
                                case OVS_CT_ATTR_COMMIT:
                                    commit = true;
                                    break;
                                case OVS_CT_ATTR_ZONE:
                                    zone = nl_attr_get_u16(b);
                                    break;
                                case OVS_CT_ATTR_HELPER:
                                    helper = nl_attr_get_string(b);
                                    break;
                                case OVS_CT_ATTR_MARK:
                                    setmark = nl_attr_get(b);
                                    break;
                                case OVS_CT_ATTR_LABELS:
                                    setlabel = nl_attr_get(b);
                                    break;
                                case OVS_CT_ATTR_EVENTMASK:
                                    /* Silently ignored, as userspace datapath does not generate
                                     * netlink events. */
                                    break;
                                case OVS_CT_ATTR_NAT: {
                                    const struct nlattr *b_nest;
                                    unsigned int left_nest;
                                    bool ip_min_specified = false;
                                    bool proto_num_min_specified = false;
                                    bool ip_max_specified = false;
                                    bool proto_num_max_specified = false;
                                    memset(&nat_action_info, 0, sizeof nat_action_info);
                                    nat_action_info_ref = &nat_action_info;
    
                                    NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
                                        enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
    
                                        switch (sub_type_nest) {
                                        case OVS_NAT_ATTR_SRC:
                                        case OVS_NAT_ATTR_DST:
                                            nat_config = true;
                                            nat_action_info.nat_action |=
                                                ((sub_type_nest == OVS_NAT_ATTR_SRC)
                                                    ? NAT_ACTION_SRC : NAT_ACTION_DST);
                                            break;
                                        case OVS_NAT_ATTR_IP_MIN:
                                            memcpy(&nat_action_info.min_addr,
                                                   nl_attr_get(b_nest),
                                                   nl_attr_get_size(b_nest));
                                            ip_min_specified = true;
                                            break;
                                        case OVS_NAT_ATTR_IP_MAX:
                                            memcpy(&nat_action_info.max_addr,
                                                   nl_attr_get(b_nest),
                                                   nl_attr_get_size(b_nest));
                                            ip_max_specified = true;
                                            break;
                                        case OVS_NAT_ATTR_PROTO_MIN:
                                            nat_action_info.min_port =
                                                nl_attr_get_u16(b_nest);
                                            proto_num_min_specified = true;
                                            break;
                                        case OVS_NAT_ATTR_PROTO_MAX:
                                            nat_action_info.max_port =
                                                nl_attr_get_u16(b_nest);
                                            proto_num_max_specified = true;
                                            break;
                                        //persistent,hash和random在 userspace datapath中没用到
                                        case OVS_NAT_ATTR_PERSISTENT:
                                        case OVS_NAT_ATTR_PROTO_HASH:
                                        case OVS_NAT_ATTR_PROTO_RANDOM:
                                            break;
                                        case OVS_NAT_ATTR_UNSPEC:
                                        case __OVS_NAT_ATTR_MAX:
                                            OVS_NOT_REACHED();
                                        }
                                    }
    
                                    if (ip_min_specified && !ip_max_specified) {
                                        nat_action_info.max_addr = nat_action_info.min_addr;
                                    }
                                    if (proto_num_min_specified && !proto_num_max_specified) {
                                        nat_action_info.max_port = nat_action_info.min_port;
                                    }
                                    if (proto_num_min_specified || proto_num_max_specified) {
                                        if (nat_action_info.nat_action & NAT_ACTION_SRC) {
                                            nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
                                        } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
                                            nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
                                        }
                                    }
                                    break;
                                }
                                }
                            }
    
                            /* We won't be able to function properly in this case, hence
                             * complain loudly. */
                            if (nat_config && !commit) {
                                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
                                VLOG_WARN_RL(&rl, "NAT specified without commit.");
                            }
                            //struct dp_netdev 是全局的,所以 dp->conntrack 也是全局的,多个pmd共享dp->conntrack
                            struct dp_netdev *dp = pmd->dp;
                            conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force, commit, zone, setmark, setlabel, helper, nat_action_info_ref);
                        //跳转到其他table执行
                        case OVS_ACTION_ATTR_RECIRC:
                            if (*depth < MAX_RECIRC_DEPTH) {
                                struct dp_packet_batch recirc_pkts;
    
                                if (!may_steal) {
                                   dp_packet_batch_clone(&recirc_pkts, packets_);
                                   packets_ = &recirc_pkts;
                                }
    
                                struct dp_packet *packet;
                                DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
                                    //获取 recirc_id
                                    packet->md.recirc_id = nl_attr_get_u32(a);
                                }
    
                                (*depth)++;
                                //重新进入 datapath 处理流程:带着新的 recirc_id 再次查找 datapath 流表(emc / dpcls),未命中时才会 upcall 到 slowpath 查找 openflow 流表
                                dp_netdev_recirculate(pmd, packets_);
                                    dp_netdev_input__(pmd, packets, true, 0);
                                        emc_processing
                                        fast_path_processing
                                (*depth)--;
    
                                return;
                            }
    

    清除conntrack表项

    创建 datapath 时,conntrack_init 会启动一个名为 ct_clean 的专用线程(入口函数为 clean_thread_main),周期性地清除过期的 conntrack 表项

    create_dp_netdev
        struct dp_netdev *dp;
    
        dp = xzalloc(sizeof *dp);
        conntrack_init(&dp->conntrack);
            ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
            
    /* Entry point of the conntrack cleanup thread.
     *
     * 'f_' is the 'struct conntrack *' handed to the thread at creation.
     * Each iteration calls conntrack_clean() with the current time and
     * schedules the next wake-up from the value it returns, then blocks
     * until either that timer fires or the 'clean_thread_exit' latch is set.
     * The loop ends once the latch is set.  Always returns NULL. */
    static void *
    clean_thread_main(void *f_)
    {
        struct conntrack *ct = f_;

        for (;;) {
            if (latch_is_set(&ct->clean_thread_exit)) {
                break;
            }

            long long now = time_msec();
            long long next_wake = conntrack_clean(ct, now);
            long long deadline;

            if (next_wake >= now) {
                /* Never wake earlier than CT_CLEAN_INTERVAL from now. */
                deadline = MAX(next_wake, now + CT_CLEAN_INTERVAL);
            } else {
                /* The requested wake-up time is already in the past. */
                deadline = now + CT_CLEAN_MIN_INTERVAL;
            }

            poll_timer_wait_until(deadline);
            /* Also wake up as soon as an exit is requested. */
            latch_wait(&ct->clean_thread_exit);
            poll_block();
        }

        return NULL;
    }
    

    对于ct action来说,conntrack_execute是主要处理函数

    /* Annotated call trace of conntrack_execute(), the handler for the ct
     * action.  This listing is not compilable C as written: the bodies of
     * callees (conn_key_extract, process_one, hash_to_bucket, conn_key_lookup,
     * conn_not_found, valid_new, new_conn, nat_select_range_tuple, nat_packet,
     * write_ct_md) are pasted, further indented, directly beneath the line
     * that calls them.
     *
     * For each packet the trace shows: extract a key and hash into 'ctx';
     * if extraction fails, mark the packet CS_INVALID and move on; otherwise
     * run process_one(), which looks up (or, via conn_not_found(), creates)
     * the connection, writes the conntrack metadata into the packet and
     * applies 'setmark' / 'setlabel' when a connection exists. */
    int
    conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
                      ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
                      const uint32_t *setmark,
                      const struct ovs_key_ct_labels *setlabel,
                      const char *helper,
                      const struct nat_action_info_t *nat_action_info)
        for (size_t i = 0; i < cnt; i++) {
            // Extract the packet's fields from pkts[i] into ctx->key and check whether the packet is valid
            if (!conn_key_extract(ct, pkts[i], dl_type, &ctx, zone))
                ctx->key.zone = zone;
                ctx->key.dl_type = dl_type;
                extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, !hwol_good_l3_csum);
                    key->src.addr.ipv4 = ip->ip_src;
                    key->dst.addr.ipv4 = ip->ip_dst;
                    key->nw_proto = ip->ip_proto;
                extract_l4(&ctx->key, l4, tail - l4, &ctx->icmp_related, l3, !hwol_good_l4_csum);
                // Compute the hash value
                ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
            {
                // If the packet is not valid, set CS_INVALID and continue with the next packet
                pkts[i]->md.ct_state = CS_INVALID;
                write_ct_md(pkts[i], zone, NULL, NULL, NULL);
                continue;
            }
            // Process a valid packet
            process_one(ct, pkts[i], &ctx, zone, force, commit, now, setmark, setlabel, nat_action_info, helper);
                struct conn *conn;
                // Derive a hash bucket index from the hash value
                unsigned bucket = hash_to_bucket(ctx->hash);
                    #define CONNTRACK_BUCKETS_SHIFT 8
                    #define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)
                    // Number of hash buckets: 1 << 8 = 256
                    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
    
                // Look up the conn by ctx->key; if the packet matches the reverse key (reply direction), set the reply flag
                conn_key_lookup(&ct->buckets[bucket], ctx, now);
                    uint32_t hash = ctx->hash;
                    struct conn *conn;
                    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
                        if (!conn_key_cmp(&conn->key, &ctx->key)
                                && !conn_expired(conn, now)) {
                            ctx->conn = conn;
                            ctx->reply = false;
                            break;
                        }
                        if (!conn_key_cmp(&conn->rev_key, &ctx->key)
                                && !conn_expired(conn, now)) {
                            ctx->conn = conn;
                            ctx->reply = true;
                            break;
                        }
                    }
                conn = ctx->conn;
    
                /* Delete found entry if in wrong direction. 'force' implies commit. */
                if (conn && force && ctx->reply) {
                    conn_clean(ct, conn, &ct->buckets[bucket]);
                    conn = NULL;
                }
    
                bool create_new_conn = false;
                struct conn conn_for_un_nat_copy;
                conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
                bool ftp_ctl = is_ftp_ctl(pkt);
    
                if (OVS_LIKELY(conn)) {
                    if (ftp_ctl) {
                        /* Keep sequence tracking in sync with the source of the
                         * sequence skew. */
                        if (ctx->reply != conn->seq_skew_dir) {
                            handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
                                           !!nat_action_info);
                            create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                                bucket);
                        } else {
                            create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                                bucket);
                            handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
                                           !!nat_action_info);
                        }
                    } else {
                        create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                            bucket);
                    }
                    if (nat_action_info && !create_new_conn) {
                        handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
                    }
    
                }else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
                                           nat_action_info)) {
                    create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                        bucket);
                } else {
                    if (ctx->icmp_related) {
                        /* An icmp related conn should always be found; no new
                           connection is created based on an icmp related packet. */
                        pkt->md.ct_state = CS_INVALID;
                    } else {
                        create_new_conn = true;
                    }
                }
    
                if (OVS_UNLIKELY(create_new_conn)) {
                    conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info, &conn_for_un_nat_copy, helper, alg_exp);
                        unsigned bucket = hash_to_bucket(ctx->hash);
                        struct conn *nc = NULL;
                        
                        // The L4 protocol's valid_new() callback decides whether the packet is valid
                        if (!valid_new(pkt, &ctx->key))
                            return l4_protos[key->nw_proto]->valid_new(pkt);
                        {
                            pkt->md.ct_state = CS_INVALID;
                            return nc;
                        }
    
                        // Set CS_NEW
                        pkt->md.ct_state = CS_NEW;
    
                        // The conn is added to the hash table only when commit is set
                        if (commit) {
                            // Check whether the maximum number of conn entries has been reached
                            unsigned int n_conn_limit;
                            atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
                            if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
                                COVERAGE_INC(conntrack_full);
                                return nc;
                            }
    
                            // Create the new entry
                            nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
                                struct conn *newconn;
                                //tcp_new_conn
                                newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
                                newconn->key = *key;
                                return newconn;
    
                            ctx->conn = nc;
                            nc->rev_key = nc->key;
                            // Reverse the key
                            conn_key_reverse(&nc->rev_key);
    
                            if (nat_action_info) {
                                nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
                                if (alg_exp) {
                                } else {
                                    *conn_for_un_nat_copy = *nc;
                                    ct_rwlock_wrlock(&ct->resources_lock);
                                    // Pick a suitable ip and port according to the nat configuration
                                    bool nat_res = nat_select_range_tuple(ct, nc, conn_for_un_nat_copy);
                                        bool new_insert = nat_conn_keys_insert(&ct->nat_conn_keys, nat_conn, ct->hash_basis);
                                            // Insert the nat conn into nat_conn_keys
                                            hmap_insert(nat_conn_keys, &nat_conn_key->node, nat_conn_key_hash);
                                    if (!nat_res) {
                                        goto nat_res_exhaustion;
                                    }
                                    /* Update nc with nat adjustments made to
                                     * conn_for_un_nat_copy by nat_select_range_tuple(). */
                                    *nc = *conn_for_un_nat_copy;
                                    ct_rwlock_unlock(&ct->resources_lock);
                                }
                                // Tag the copy with conn_type CT_CONN_TYPE_UN_NAT and clear its nat_info / alg
                                // (NOTE(review): presumably this copy is the entry used to un-NAT return traffic - confirm in lib/conntrack.c)
                                conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
                                conn_for_un_nat_copy->nat_info = NULL;
                                conn_for_un_nat_copy->alg = NULL;
                                // Apply the nat translation to the packet
                                nat_packet(pkt, nc, ctx->icmp_related);
                                    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
                                        pkt->md.ct_state |= CS_SRC_NAT;
                                        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
                                            struct ip_header *nh = dp_packet_l3(pkt);
                                            packet_set_ipv4_addr(pkt, &nh->ip_src, conn->rev_key.dst.addr.ipv4_aligned);
                                        }
                                        if (!related) {
                                            pat_packet(pkt, conn);
                                        }
                                    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
                                        pkt->md.ct_state |= CS_DST_NAT;
                                    }
                            }
                            // Insert the newly created entry into the hash table
                            hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
                            // Increment the entry count
                            atomic_count_inc(&ct->n_conn);
                        }
                        return nc;
                }
    
                write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
                    pkt->md.ct_state |= CS_TRACKED;
                    pkt->md.ct_zone = zone;
                    pkt->md.ct_mark = conn ? conn->mark : 0;
                    pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;
    
                    pkt->md.ct_orig_tuple_ipv6 = false;
                    if (key) {
                        if (key->dl_type == htons(ETH_TYPE_IP)) {
                            // ct_orig_tuple is filled from 'key': addresses, ports (ICMP type/code for ICMP) and protocol
                            pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
                                key->src.addr.ipv4_aligned,
                                key->dst.addr.ipv4_aligned,
                                key->nw_proto != IPPROTO_ICMP
                                ? key->src.port : htons(key->src.icmp_type),
                                key->nw_proto != IPPROTO_ICMP
                                ? key->dst.port : htons(key->src.icmp_code),
                                key->nw_proto,
                            };
                        }
                    }
    
                if (conn && setmark) {
                    set_mark(pkt, conn, setmark[0], setmark[1]);
                }
    
                if (conn && setlabel) {
                    set_label(pkt, conn, &setlabel[0], &setlabel[1]);
                }
        }
    }
    

    参考

    https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/
    https://zhuanlan.zhihu.com/p/25089778

    相关文章

      网友评论

          本文标题:ovs conntrack及nat

          本文链接:https://www.haomeiwen.com/subject/bxvocltx.html