dpvs study notes: 8. RS scheduling algorithms

Author: 董泽润 | Published 2018-11-13 15:16

    There are usually multiple backend RSes (real servers), and traffic is balanced across them by a scheduling algorithm. At program startup, dp_vs_init calls dp_vs_sched_init to register all schedulers globally.

    int dp_vs_sched_init(void)
    {
        INIT_LIST_HEAD(&dp_vs_schedulers);
        rte_rwlock_init(&__dp_vs_sched_lock);
        /* register each supported scheduler */
        dp_vs_rr_init();
        dp_vs_wrr_init();
        dp_vs_wlc_init();
        dp_vs_conhash_init();
    
        return EDPVS_OK;
    }
    

    As you can see, dpvs currently supports four scheduling algorithms: rr, wrr, wlc and conhash.
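    Each dp_vs_xxx_init presumably does little more than link a scheduler descriptor into the global dp_vs_schedulers list. A minimal sketch of what dp_vs_rr_init plausibly looks like, assuming the registration helper is named register_dp_vs_scheduler (the descriptor fields are taken from the wlc one shown further below):

    /* sketch only: register_dp_vs_scheduler is my assumed name for the
     * helper that links the descriptor into dp_vs_schedulers under
     * __dp_vs_sched_lock */
    static struct dp_vs_scheduler dp_vs_rr_scheduler = {
        .name         = "rr",
        .n_list       = LIST_HEAD_INIT(dp_vs_rr_scheduler.n_list),
        .init_service = dp_vs_rr_init_svc,
        .schedule     = dp_vs_rr_schedule,
    };

    int dp_vs_rr_init(void)
    {
        return register_dp_vs_scheduler(&dp_vs_rr_scheduler);
    }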

    Scheduling entry point

    When a client's first packet arrives, dp_vs_schedule picks an RS and the connection is then established:

    dest = svc->scheduler->schedule(svc, mbuf);
    

    This is the scheduling entry point. Which algorithm is used is decided when the service is configured at initialization time: dp_vs_bind_scheduler binds the chosen scheduler to the service and calls its init_service function pointer to set up per-service state.
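    For orientation, a sketch of what dp_vs_bind_scheduler plausibly does, modeled on the kernel IPVS ip_vs_bind_scheduler (error handling trimmed):

    /* sketch: attach a scheduler to a service and let it set up its
     * per-service state (svc->sched_data) via init_service */
    static int dp_vs_bind_scheduler(struct dp_vs_service *svc,
                                    struct dp_vs_scheduler *scheduler)
    {
        svc->scheduler = scheduler;
        if (scheduler->init_service)
            return scheduler->init_service(svc);
        return EDPVS_OK;
    }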

    Round-robin (rr)

    This is the simplest algorithm, so let's see how it is implemented, starting with the init_service function:

    static int dp_vs_rr_init_svc(struct dp_vs_service *svc)
    {
        svc->sched_data = &svc->dests;
        return EDPVS_OK;
    }
    

    Very simple: the backend RS list dests is assigned directly to sched_data. Now the schedule entry:

    static struct dp_vs_dest *dp_vs_rr_schedule(struct dp_vs_service *svc,
                            const struct rte_mbuf *mbuf)
    {
        struct list_head *p, *q;
        struct dp_vs_dest *dest;
    
        rte_rwlock_write_lock(&svc->sched_lock);
    
        p = (struct list_head *)svc->sched_data;
        p = p->next;
        q = p;
    
        do {
            /* skip list head */
            if (q == &svc->dests) {
                q = q->next;
                continue;
            }
    
            dest = list_entry(q, struct dp_vs_dest, n_list);
            if (!(dest->flags & DPVS_DEST_F_OVERLOAD) &&
                (dest->flags & DPVS_DEST_F_AVAILABLE) &&
                rte_atomic16_read(&dest->weight) > 0)
                /* HIT */
                goto out;
            q = q->next;
        } while (q != p);
        rte_rwlock_write_unlock(&svc->sched_lock);
    
        return NULL;
    
    out:
        svc->sched_data = q;
        rte_rwlock_write_unlock(&svc->sched_lock);
    
        return dest;
    }
    

    The idea is equally plain: walk the sched_data list, return the first available dest, and remember its position; the next scheduling decision starts from the element right after it.

    Weighted round-robin (wrr)

    Round-robin with weights. Again, start with the init_service function:

    static int dp_vs_wrr_init_svc(struct dp_vs_service *svc)
    {
        struct dp_vs_wrr_mark *mark;
    
        /*
         *    Allocate the mark variable for WRR scheduling
         */
        mark = rte_zmalloc("wrr_mark", sizeof(struct dp_vs_wrr_mark), RTE_CACHE_LINE_SIZE);
        if (mark == NULL) {
            return EDPVS_NOMEM;
        }
        mark->cl = &svc->dests;
        mark->cw = 0;
        mark->mw = dp_vs_wrr_max_weight(svc);
        mark->di = dp_vs_wrr_gcd_weight(svc);
        svc->sched_data = mark;
    
        return EDPVS_OK;
    }
    
    struct dp_vs_wrr_mark {
        struct list_head *cl;   /* current list head */
        int cw;         /* current weight */
        int mw;         /* maximum weight */
        int di;         /* decreasing interval */
    };
    

    A dp_vs_wrr_mark structure is allocated, initialized, and stored in svc->sched_data. dp_vs_wrr_max_weight finds the maximum weight among the backend RSes, and dp_vs_wrr_gcd_weight computes the greatest common divisor of all the weights; this gcd is the step by which the current weight is decreased. For example, with weights of 100, 20 and 50, the gcd is 10, so the "decreasing interval" is 10.
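    For reference, a sketch of the gcd helper, modeled on the kernel IPVS implementation (the dpvs version should be equivalent; dp_vs_wrr_max_weight is just a scan for the largest positive weight):

    /* greatest common divisor of two positive integers (Euclid) */
    static int gcd(int a, int b)
    {
        int c;

        while ((c = a % b)) {
            a = b;
            b = c;
        }
        return b;
    }

    /* gcd of all positive RS weights; fall back to 1 if there are none */
    static int dp_vs_wrr_gcd_weight(struct dp_vs_service *svc)
    {
        struct dp_vs_dest *dest;
        int weight, g = 0;

        list_for_each_entry(dest, &svc->dests, n_list) {
            weight = rte_atomic16_read(&dest->weight);
            if (weight > 0)
                g = g ? gcd(weight, g) : weight;
        }
        return g ? g : 1;
    }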

    static struct dp_vs_dest *dp_vs_wrr_schedule(struct dp_vs_service *svc,
                             const struct rte_mbuf *mbuf)
    {
        struct dp_vs_dest *dest;
        struct dp_vs_wrr_mark *mark = svc->sched_data;
        struct list_head *p;
    
        /*
         * This loop will always terminate, because mark->cw in (0, max_weight]
         * and at least one server has its weight equal to max_weight.
         */
        rte_rwlock_write_lock(&svc->sched_lock);
        p = mark->cl;
        while (1) {
            if (mark->cl == &svc->dests) {
                /* it is at the head of the destination list */
    
                if (mark->cl == mark->cl->next) {
                    /* no dest entry */
                    dest = NULL;
                    goto out;
                }
    
                mark->cl = svc->dests.next;
                mark->cw -= mark->di;
                if (mark->cw <= 0) {
                    mark->cw = mark->mw;
                    /*
                     * Still zero, which means no available servers.
                     */
                    if (mark->cw == 0) {
                        mark->cl = &svc->dests;
                        dest = NULL;
                        goto out;
                    }
                }
            } else
                mark->cl = mark->cl->next;
    
            if (mark->cl != &svc->dests) {
                /* not at the head of the list */
                dest = list_entry(mark->cl, struct dp_vs_dest, n_list);
                if (!(dest->flags & DPVS_DEST_F_OVERLOAD) &&
                    (dest->flags & DPVS_DEST_F_AVAILABLE) &&
                    rte_atomic16_read(&dest->weight) >= mark->cw) {
                    /* got it */
                    break;
                }
            }
    
            if (mark->cl == p && mark->cw == mark->di) {
                /* back to the start, and no dest is found.
                   It is only possible when all dests are OVERLOADED */
                dest = NULL;
                goto out;
            }
        }
    
    out:
        rte_rwlock_write_unlock(&svc->sched_lock);
    
        return dest;
    }
    
    1. On the first pass, mark->cl == &svc->dests, i.e. the list head. cw is decreased by one step di (and reset to mw once it drops to zero or below), and mark->cl is advanced to the first real element, svc->dests.next. If that dest is available and its weight is >= cw, it is returned as the RS.
    2. When mark->cl is not the list head, it simply advances to the next element: mark->cl = mark->cl->next. If that element is available and its weight is >= cw, it is returned as the RS.
    3. If the weight of mark->cl is less than cw, the while loop just moves on to the next element.
      The principle is now clear; as for the cost of the while loop, see the simulation and the bound worked out right below.
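    To see the resulting schedule, here is a standalone toy simulation (my own code, not from dpvs) of the same loop for three servers A, B and C with weights 4, 3 and 2, so mw = 4 and di = gcd = 1:

    #include <stdio.h>

    int main(void)
    {
        int w[] = { 4, 3, 2 };        /* weights of servers A, B, C */
        int n = 3, mw = 4, di = 1;    /* max weight, gcd step */
        int i = -1, cw = 0;           /* current index, current weight */

        for (int picks = 0; picks < 9; picks++) {
            for (;;) {
                i = (i + 1) % n;
                if (i == 0) {         /* wrapped past the list head */
                    cw -= di;
                    if (cw <= 0)
                        cw = mw;
                }
                if (w[i] >= cw)
                    break;            /* this server gets the pick */
            }
            printf("%c ", 'A' + i);
        }
        printf("\n");                 /* prints: A A B A B C A B C */
        return 0;
    }

    Over one cycle of nine picks, A is chosen 4 times, B 3 times and C 2 times, matching the 4:3:2 weights, with the picks interleaved rather than bunched per server. As for the loop's cost: cw only steps down from mw to di in increments of di, and at most n entries are scanned between two decrements, so a single call is bounded by O(n * mw / di) iterations in the worst case, and is normally far cheaper.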

    Weighted least-connection (wlc)

    static struct dp_vs_scheduler dp_vs_wlc_scheduler = {
        .name = "wlc",
        .n_list = LIST_HEAD_INIT(dp_vs_wlc_scheduler.n_list),
        .schedule = dp_vs_wlc_schedule,
    };
    

    This scheduler has no init_service function, so we go straight to dp_vs_wlc_schedule:

    static struct dp_vs_dest *dp_vs_wlc_schedule(struct dp_vs_service *svc,
                       const struct rte_mbuf *mbuf)
    {
        struct dp_vs_dest *dest, *least;
        unsigned int loh, doh;
    
        /*
         * We calculate the load of each dest server as follows:
         *                (dest overhead) / dest->weight
         *
         * The server with weight=0 is quiesced and will not receive any
         * new connections.
         */
    
        list_for_each_entry(dest, &svc->dests, n_list) {
            if (!(dest->flags & DPVS_DEST_F_OVERLOAD) &&
                (dest->flags & DPVS_DEST_F_AVAILABLE) &&
                rte_atomic16_read(&dest->weight) > 0) {
                least = dest;
                loh = dp_vs_wlc_dest_overhead(least);
                goto nextstage;
            }
        }
        return NULL;
    
        /*
         *    Find the destination with the least load.
         */
    nextstage:
        list_for_each_entry_continue(dest, &svc->dests, n_list) {
            if (dest->flags & DPVS_DEST_F_OVERLOAD)
                continue;
            doh = dp_vs_wlc_dest_overhead(dest);
            if (loh * rte_atomic16_read(&dest->weight) >
                doh * rte_atomic16_read(&least->weight)) {
                least = dest;
                loh = doh;
            }
        }
    
        return least;
    }
    
    static inline unsigned int dp_vs_wlc_dest_overhead(struct dp_vs_dest *dest)
    {
        return (rte_atomic32_read(&dest->actconns) << 8) +
               rte_atomic32_read(&dest->inactconns);
    }
    
    1. First, the scoring formula for each RS: (dest overhead) / dest->weight, load divided by weight. The load is computed by dp_vs_wlc_dest_overhead as a weighted count of connections in which an active connection counts 256 times as much as an inactive one. The lower the score, the more likely the RS is to be chosen.
    2. The first list_for_each_entry finds the first usable RS and computes its score loh. The second loop walks the remaining RSes and compares scores: the test loh * weight(dest) > doh * weight(least) is just loh / weight(least) > doh / weight(dest) cross-multiplied to avoid division. The RS with the lowest score ends up in least and is returned to the caller; see the worked example below.
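    A small numeric illustration of that comparison (all numbers made up):

    #include <stdio.h>

    /* hypothetical dest, for illustration only */
    struct dest { int weight, actconns, inactconns; };

    static unsigned int overhead(const struct dest *d)
    {
        return ((unsigned int)d->actconns << 8) + d->inactconns;
    }

    int main(void)
    {
        struct dest least = { .weight = 2, .actconns = 10 };  /* loh = 2560 */
        struct dest other = { .weight = 1, .actconns = 4  };  /* doh = 1024 */

        /* per-weight load: 2560 / 2 = 1280 vs 1024 / 1 = 1024, so `other`
         * carries less load per unit of weight; cross-multiplied this is
         * 2560 * 1 > 1024 * 2, exactly the test in dp_vs_wlc_schedule */
        if (overhead(&least) * other.weight > overhead(&other) * least.weight)
            printf("other becomes the new least\n");
        return 0;
    }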

    Consistent hashing (conhash)

    This algorithm seems to be less commonly used: requests that hash to the same value are pinned to a fixed backend RS. First, init_service:

    static int dp_vs_conhash_init_svc(struct dp_vs_service *svc)
    {
        svc->sched_data = conhash_init(NULL);
        if (!svc->sched_data) {
            RTE_LOG(ERR, SERVICE, "%s: conhash init failed!\n", __func__);
            return EDPVS_NOMEM;
        }
        dp_vs_conhash_assign(svc);
    
        return EDPVS_OK;
    }
    

    Interestingly, conhash_init is implemented on top of a red-black tree; dp_vs_conhash_assign is then called to hash the backend RSes into this rbtree.

    static int
    dp_vs_conhash_assign(struct dp_vs_service *svc)
    {
        struct dp_vs_dest *dest;
        struct node_s *p_node;
        int weight = 0;
        char str[40];
    
        list_for_each_entry(dest, &svc->dests, n_list) {
            weight = rte_atomic16_read(&dest->weight);
            if (weight > 0) {
                p_node = rte_zmalloc("p_node", sizeof(struct node_s), RTE_CACHE_LINE_SIZE);
                if (p_node == NULL) {
                    return EDPVS_NOMEM;
                }

                rte_atomic32_inc(&dest->refcnt);
                p_node->data = dest;

                snprintf(str, sizeof(str), "%u%d", dest->addr.in.s_addr, dest->port);

                conhash_set_node(p_node, str, weight*REPLICA);
                conhash_add_node(svc->sched_data, p_node);
            }
        }
        return EDPVS_OK;
        return EDPVS_OK;
    }
    
    1. Only an RS whose weight is greater than 0 is usable.
    2. A node is generated for each RS and added to the tree. Each node is replicated into weight * REPLICA virtual nodes for consistent hashing, i.e. weight x 160 virtual nodes per RS; the toy ring below illustrates the idea.
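    To make the virtual-node idea concrete, here is a toy consistent-hash ring (my own sketch; the real dpvs version keeps the virtual nodes in an rbtree and uses its own hash, whereas a sorted array and FNV-1a are used here):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct vnode { uint32_t hash; const char *rs; };

    static uint32_t hash_str(const char *s)   /* FNV-1a, illustration only */
    {
        uint32_t h = 2166136261u;
        while (*s) { h ^= (uint8_t)*s++; h *= 16777619u; }
        return h;
    }

    static int cmp(const void *a, const void *b)
    {
        uint32_t x = ((const struct vnode *)a)->hash;
        uint32_t y = ((const struct vnode *)b)->hash;
        return x < y ? -1 : x > y;
    }

    int main(void)
    {
        /* weight 2 gets twice as many virtual nodes as weight 1 */
        struct { const char *rs; int weight; } dests[] = {
            { "10.0.0.1", 2 }, { "10.0.0.2", 1 },
        };
        enum { REPLICA = 160 };
        struct vnode ring[3 * REPLICA];
        char buf[64];
        int n = 0;

        for (int i = 0; i < 2; i++)
            for (int r = 0; r < dests[i].weight * REPLICA; r++) {
                snprintf(buf, sizeof(buf), "%s-%d", dests[i].rs, r);
                ring[n].hash = hash_str(buf);
                ring[n++].rs = dests[i].rs;
            }
        qsort(ring, n, sizeof(ring[0]), cmp);

        /* lookup: first virtual node clockwise from the key's hash */
        uint32_t key = hash_str("192.0.2.7");   /* e.g. a client source IP */
        int i = 0;
        while (i < n && ring[i].hash < key)
            i++;
        printf("client -> %s\n", ring[i % n].rs);
        return 0;
    }

    Now the schedule function: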
    static struct dp_vs_dest *
    dp_vs_conhash_schedule(struct dp_vs_service *svc, const struct rte_mbuf *mbuf)
    {
        struct dp_vs_dest *dest;
    
        dest = dp_vs_conhash_get(svc, (struct conhash_s *)svc->sched_data, mbuf);
    
        if (!dest
            || !(dest->flags & DPVS_DEST_F_AVAILABLE)
            || rte_atomic16_read(&dest->weight) <= 0
            || is_overloaded(dest)) {
    
            return NULL;
        }
        else
            return dest;
    }
    

    The schedule function is also straightforward: it hands the mbuf to dp_vs_conhash_get, which returns the dest of the nearest virtual node.

    static inline struct dp_vs_dest *
    dp_vs_conhash_get(struct dp_vs_service *svc, struct conhash_s *conhash,
                      const struct rte_mbuf *mbuf)
    {
        char str[40] = {0};
        uint64_t quic_cid;
        uint32_t sip;
        const struct node_s *node;
    
        if (svc->flags & DP_VS_SVC_F_QID_HASH) {
            if (svc->proto != IPPROTO_UDP) {
                RTE_LOG(ERR, IPVS, "QUIC cid hash scheduler should only be set in UDP service.\n");
                return NULL;
            }
            /* try to get CID for hash target first, then source IP. */
            if (EDPVS_OK == get_quic_hash_target(mbuf, &quic_cid)) {
                snprintf(str, sizeof(str), "%lu", quic_cid);
            } else if (EDPVS_OK == get_sip_hash_target(mbuf, &sip)) {
                snprintf(str, sizeof(str), "%u", sip);
            } else {
                return NULL;
            }
    
        } else if (svc->flags & DP_VS_SVC_F_SIP_HASH) {
            if (EDPVS_OK == get_sip_hash_target(mbuf, &sip)) {
                snprintf(str, sizeof(str), "%u", sip);
            } else {
                return NULL;
            }
    
        } else {
            RTE_LOG(ERR, IPVS, "%s: invalid hash target.\n", __func__);
            return NULL;
        }
    
        node = conhash_lookup(conhash, str);
        return node == NULL? NULL: node->data;
    }
    

    As you can see, for a UDP service with the QUIC CID hash flag set, get_quic_hash_target derives the hash key from the QUIC connection ID, falling back to the source IP when no CID is available. With the source-IP hash flag, get_sip_hash_target builds the key from the source IP address; any other configuration is rejected.
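    For completeness, a rough sketch of what extracting the source IP could look like (my own illustration using DPDK's rte_ipv4_hdr; the real get_sip_hash_target also handles IPv6 and validates the packet before touching it):

    #include <rte_ip.h>
    #include <rte_mbuf.h>

    /* sketch: assumes the mbuf data pointer sits at the IPv4 header */
    static int sip_hash_target_sketch(const struct rte_mbuf *mbuf, uint32_t *sip)
    {
        const struct rte_ipv4_hdr *iph =
            rte_pktmbuf_mtod(mbuf, const struct rte_ipv4_hdr *);

        *sip = iph->src_addr;   /* network byte order, fine as a hash key */
        return 0;               /* EDPVS_OK in dpvs terms */
    }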

    Summary

    This is fairly common ground and nothing difficult; a quick read-through should be enough.
