美文网首页
Redis深度历险 - dict的rehash

Redis深度历险 - dict的rehash

作者: 突击手平头哥 | 来源:发表于2021-04-17 15:49 被阅读0次

Redis深度历险 - dict的rehash

字典是Redis中非常重要的一个数据结构,本质就是一个hash表;那么这个hash表是怎么解决冲突的呢?是怎么进行rehash的呢?

dict结构体

相关代码在: dict.c、dict.h中

/* This is our hash table structure. Every dictionary has two of this as we
 * implement incremental rehashing, for the old to the new table. */
typedef struct dictht {
    dictEntry **table;              //使用的是数组来存储数据
    unsigned long size;             //数组空间大小
    unsigned long sizemask;         //数组空间大小-1
    unsigned long used;             //已使用空间
} dictht;

typedef struct dict {
    dictType *type;                     //这里并不是什么类型,二十hash表使用的hash、比较、销毁等函数
    void *privdata;
    dictht ht[2];                       //hash表结构体
    long rehashidx;                     //rehash标记
    unsigned long iterators; 
} dict;

hash冲突处理

/* Add an element to the target hash table */
static int dictAdd(dict *ht, void *key, void *val) {
    int index;
    dictEntry *entry;

    /* Get the index of the new element, or -1 if
     * the element already exists. */
    if ((index = _dictKeyIndex(ht, key)) == -1)
        return DICT_ERR;

    /* Allocates the memory and stores key */
    entry = malloc(sizeof(*entry));             //创建一个entry(一项即将插入的数据)
    entry->next = ht->table[index];             //ht->table中之前在这个位置上的旧entry放在新entry的next上
    ht->table[index] = entry;                   //将新entry放在ht->table这个位置上

    /* Set the hash entry fields. */
    dictSetHashKey(ht, entry, key);
    dictSetHashVal(ht, entry, val);
    ht->used++;
    return DICT_OK;
}

typedef struct dictEntry {
    void *key;
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    struct dictEntry *next;             //链表
} dictEntry;

  dictEntry是字典中用来存放数据的结构体,ht->table就是用来存储entry的数组;

  dictEntry实现成了链表的形式,以链表法来解决hash冲突的问题,ht->table[index]指向的就是一个链表开头

  在插入一个新的元素时,就相当于在ht->table[index]这个链表的头部进行插入

rehash处理

rehash的条件

/* Expand the hash table if needed */
static int _dictExpandIfNeeded(dict *d)
{
    /* Incremental rehashing already in progress. Return. */
    if (dictIsRehashing(d)) return DICT_OK;

    /* If the hash table is empty expand it to the initial size. */
    if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);

    /* If we reached the 1:1 ratio, and we are allowed to resize the hash
     * table (global setting) or we should avoid it but the ratio between
     * elements/buckets is over the "safe" threshold, we resize doubling
     * the number of buckets. */
    if (d->ht[0].used >= d->ht[0].size &&
        (dict_can_resize ||
         d->ht[0].used/d->ht[0].size > dict_force_resize_ratio))
    {
        return dictExpand(d, d->ht[0].used*2);
    }
    return DICT_OK;
}

static int dict_can_resize = 1;
static unsigned int dict_force_resize_ratio = 5;

  此函数是在插入元素时判断是否需要扩充空间,扩充空间后就需要reahsh操作;判断条件为如下之一即可:

  • 元素个数大于hash表数组长度,并且允许重新设置大小(该变量与备份等操作有关)
  • 元素个数达到hash表数组长度的5倍,则强制进行hash操作(hash表使用链表法解决冲突,元素个数是可以大于数组长度的)

reahsh的处理

/* Expand or create the hash table */
int dictExpand(dict *d, unsigned long size)
{
    /* the size is invalid if it is smaller than the number of
     * elements already inside the hash table */
    if (dictIsRehashing(d) || d->ht[0].used > size)
        return DICT_ERR;

    dictht n; /* the new hash table */
    unsigned long realsize = _dictNextPower(size);

    /* Rehashing to the same table size is not useful. */
    if (realsize == d->ht[0].size) return DICT_ERR;

    /* Allocate the new hash table and initialize all pointers to NULL */
    n.size = realsize;
    n.sizemask = realsize-1;
    n.table = zcalloc(realsize*sizeof(dictEntry*));
    n.used = 0;

    /* Is this the first initialization? If so it's not really a rehashing
     * we just set the first hash table so that it can accept keys. */
    if (d->ht[0].table == NULL) {
        d->ht[0] = n;
        return DICT_OK;
    }

    /* Prepare a second hash table for incremental rehashing */
    d->ht[1] = n;
    d->rehashidx = 0;                   //打标志,表示开始reahsh操作
    return DICT_OK;
}

  如上可知,在扩容hash表空间时仅仅设置了一个表示,而没有进行任何rehash操作;实际的rehash操作是在后续的操作中进行的

dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing)
{
    long index;
    dictEntry *entry;
    dictht *ht;

    if (dictIsRehashing(d)) _dictRehashStep(d);
...................
    
#define dictIsRehashing(d) ((d)->rehashidx != -1)
...................
    
static void _dictRehashStep(dict *d) {
    if (d->iterators == 0) dictRehash(d,1);
}
....................
int dictRehash(dict *d, int n) {
    int empty_visits = n*10; 
    if (!dictIsRehashing(d)) return 0;

    while(n-- && d->ht[0].used != 0) {
        dictEntry *de, *nextde;

        /* Note that rehashidx can't overflow as we are sure there are more
         * elements because ht[0].used != 0 */
        assert(d->ht[0].size > (unsigned long)d->rehashidx);
        while(d->ht[0].table[d->rehashidx] == NULL) {               //rehashidx指代即将进行rehash的数据下标
            d->rehashidx++;
            if (--empty_visits == 0) return 1;                      //从rehashidx开始查找一个有数据的位置进行rehash,最多10次
        }
        de = d->ht[0].table[d->rehashidx];
        /* Move all the keys in this bucket from the old to the new hash HT */
        while(de) {                                                 //每一个数据都是一个链表,全部需要进行rehash操作
            uint64_t h;

            nextde = de->next;
            /* Get the index in the new hash table */
            h = dictHashKey(d, de->key) & d->ht[1].sizemask;
            de->next = d->ht[1].table[h];
            d->ht[1].table[h] = de;                                 //rehash的数据放到d->ht[1]中
            d->ht[0].used--;
            d->ht[1].used++;
            de = nextde;
        }
        d->ht[0].table[d->rehashidx] = NULL;
        d->rehashidx++;
    }

    /* Check if we already rehashed the whole table... */
    if (d->ht[0].used == 0) {                                       //如果rehash完毕,释放空间清楚标记、切换d->ht[0]和d->ht[1]
        zfree(d->ht[0].table);
        d->ht[0] = d->ht[1];
        _dictReset(&d->ht[1]);
        d->rehashidx = -1;
        return 0;
    }

    /* More to rehash... */
    return 1;
}

  rehashidxht->table的下标,表示从改下标开始进行rehash操作,每次reahsh数据都需要处理该位置的整个链表;还记得dictht ht[2];吗?两个hash表一个用来存储实际数据,另外一个是在hash过程中配合存储数据的。

rehash的效率

  在扩容时没有进行rehash操作,具体的rehash操作分散在每一次的添加、修改等操作中;时间复杂度始终控制在O(1)级别。

相关文章

网友评论

      本文标题:Redis深度历险 - dict的rehash

      本文链接:https://www.haomeiwen.com/subject/sbaulltx.html