美文网首页
Python源码学习笔记 5 字典对象

Python源码学习笔记 5 字典对象

作者: openex | 来源:发表于2018-04-04 16:52 被阅读0次

    Python中对于字典的实现是根据key进行hash生成散列表,算法为“开放定址法”

    1.PyDictEntry(K, V对)

    字典中每一个kv对,实际上就是一个entry对象
    entry的状态存在3种状态 Active, Unused, Dummy
    其含义显而易见,值得注意的是Dummy状态,该状态实际上是因为散列表的“开放定址法”缘故,某entry恰巧在碰撞链中时,它不可删除,以免找不到真正的key而直接返回查询失败。

    [dictobject.h]
    typedef struct {
        Py_ssize_t me_hash;      /* cached hash code of me_key */
        PyObject *me_key;
        PyObject *me_value;
    } PyDictEntry;
    
    entry

    2. PyDictObject对象结构(关联容器dict)

    由源码可知,该对象是一个定长对象,初始时会分配一个8个entry的数组ma_smalltable

    #define PyDict_MINSIZE 8
    typedef struct _dictobject PyDictObject;
    struct _dictobject {
        PyObject_HEAD
        Py_ssize_t ma_fill;  //entry个数: Active + Dummy
        Py_ssize_t ma_used;  //entry个数: Active 
        Py_ssize_t ma_mask; //ma_table能容纳元素数,搜索时用以对hash值做与操作
        PyDictEntry *ma_table; //entry超过8个时会分配较大数组,指针指向该数组
        PyDictEntry *(*ma_lookup)(PyDictObject *mp, PyObject *key, long hash);//查询函数
        PyDictEntry ma_smalltable[PyDict_MINSIZE];//默认存在的小entry数组避免频繁分配内存
    };
    

    3.PyDictObject对象的创建

    typedef PyDictEntry dictentry;
    typedef PyDictObject dictobject;
    
    /* 将ma_table指向ma_smalltable 并初始化ma_mask */
    #define INIT_NONZERO_DICT_SLOTS(mp) do {          \
        (mp)->ma_table = (mp)->ma_smalltable;     \
        (mp)->ma_mask = PyDict_MINSIZE - 1;       \
        } while(0)
    
    /* 将ma_smalltable清零,重置ma_used和ma_fill并调用INIT_NONZERO_DICT_SLOTS */
    #define EMPTY_TO_MINSIZE(mp) do {               \
        memset((mp)->ma_smalltable, 0, sizeof((mp)->ma_smalltable));    \
        (mp)->ma_used = (mp)->ma_fill = 0;        \
        INIT_NONZERO_DICT_SLOTS(mp);                \
        } while(0)
    
    PyObject *
    PyDict_New(void)
    {
        register PyDictObject *mp;
        if (dummy == NULL) { /* Auto-initialize dummy */
            dummy = PyString_FromString("<dummy key>");
            if (dummy == NULL)
                return NULL;
    #ifdef SHOW_CONVERSION_COUNTS
            Py_AtExit(show_counts);
    #endif
    #ifdef SHOW_ALLOC_COUNT
            Py_AtExit(show_alloc);
    #endif
    #ifdef SHOW_TRACK_COUNT
            Py_AtExit(show_track);
    #endif
        }
        if (numfree) { /* 判断dict缓冲池是否可用 */
            mp = free_list[--numfree];
            assert (mp != NULL);
            assert (Py_TYPE(mp) == &PyDict_Type);
            _Py_NewReference((PyObject *)mp);
            if (mp->ma_fill) { /* 检查ma_fill判断是否需要EMPTY_TO_MINSIZE */
                EMPTY_TO_MINSIZE(mp);
            } else {
                /* 否则至少需要进行INIT_NONZERO_DICT_SLOTS操作 */
                INIT_NONZERO_DICT_SLOTS(mp);
            }
            assert (mp->ma_used == 0);
            assert (mp->ma_table == mp->ma_smalltable);
            assert (mp->ma_mask == PyDict_MINSIZE - 1);
    #ifdef SHOW_ALLOC_COUNT
            count_reuse++;
    #endif
        /* 缓冲池不可用时,进行内存分配操作 */
        } else { 
            mp = PyObject_GC_New(PyDictObject, &PyDict_Type);
            if (mp == NULL)
                return NULL;
            EMPTY_TO_MINSIZE(mp);
    #ifdef SHOW_ALLOC_COUNT
            count_alloc++;
    #endif
        }
        mp->ma_lookup = lookdict_string;
    #ifdef SHOW_TRACK_COUNT
        count_untracked++;
    #endif
    #ifdef SHOW_CONVERSION_COUNTS
        ++created;
    #endif
        return (PyObject *)mp;
    }
    

    4.entry的搜索

    大致流程如下

    首先寻找第一个entry:
    • 通过hash & mask获取索引i,在ma_table[i]处取出该entry对象
    • 根据该entry的key产生2种可能:
    • ep->key==NULL 搜索失败返回该entry
    • ep->key == key 搜索成功返回该entry
    • 如果me_key == dummy,令freeslot = ep
    • 检查active态的entry,判断hash是否相同,若相同则继续比较key值是否相同
    • 失败的话继续寻找下一个散列位置,这样迭代下去
    static PyDictEntry *
    lookdict(PyDictObject *mp, PyObject *key, register long hash)
    {
        register size_t i;
        register size_t perturb;
        register PyDictEntry *freeslot;
        register size_t mask = (size_t)mp->ma_mask;
        PyDictEntry *ep0 = mp->ma_table;
        register PyDictEntry *ep;
        register int cmp;
        PyObject *startkey;
    
        i = (size_t)hash & mask; //与运算防止溢出
        ep = &ep0[i];
        if (ep->me_key == NULL || ep->me_key == key)//搜索成功(引用相同)或搜索失败(unused)
            return ep;
    
        if (ep->me_key == dummy) //key为dummy,置位freeslot,便于服用内存
            freeslot = ep;
        else {
            /* active态的查询 */
            if (ep->me_hash == hash) {//先判断hash
                startkey = ep->me_key;
                Py_INCREF(startkey);
                cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);//再判断key值是否相同
                Py_DECREF(startkey);
                if (cmp < 0)
                    return NULL;
                if (ep0 == mp->ma_table && ep->me_key == startkey) {
                    if (cmp > 0)
                        return ep;
                }
                else {
                    /* The compare did major nasty stuff to the
                     * dict:  start over.
                     * XXX A clever adversary could prevent this
                     * XXX from terminating.
                     */
                    return lookdict(mp, key, hash);
                }
            }
            freeslot = NULL;
        }
    
        /* In the loop, me_key == dummy is by far (factor of 100s) the
           least likely outcome, so test for that last. */
        for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
            i = (i << 2) + i + perturb + 1;//重定位下一位置
            ep = &ep0[i & mask];
            if (ep->me_key == NULL)
                return freeslot == NULL ? ep : freeslot;
            if (ep->me_key == key)
                return ep;
            if (ep->me_hash == hash && ep->me_key != dummy) {
                startkey = ep->me_key;
                Py_INCREF(startkey);
                cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
                Py_DECREF(startkey);
                if (cmp < 0)
                    return NULL;
                if (ep0 == mp->ma_table && ep->me_key == startkey) {
                    if (cmp > 0)
                        return ep;
                }
                else {
                    /* The compare did major nasty stuff to the
                     * dict:  start over.
                     * XXX A clever adversary could prevent this
                     * XXX from terminating.
                     */
                    return lookdict(mp, key, hash);
                }
            }
            else if (ep->me_key == dummy && freeslot == NULL)
                freeslot = ep;
        }
        assert(0);          /* NOT REACHED */
        return 0;
    }
    

    lookdict_string默认搜索方法

    [dictobject.c]
    static dictentry* lookdict_string(dictobject *mp, PyObject *key, register long hash)
    {
        register int i;
        register unsigned int perturb;
        register dictentry *freeslot;
        register unsigned int mask = mp->ma_mask;
        dictentry *ep0 = mp->ma_table;
        register dictentry *ep;
      
        if (!PyString_CheckExact(key)) { //判断key是否为string类型,若非则返回传统搜索方式
            mp->ma_lookup = lookdict;
            return lookdict(mp, key, hash);
        }
    
        i = hash & mask;
        ep = &ep0[i];
    
    
        if (ep->me_key == NULL || ep->me_key == key)
            return ep;
    
        if (ep->me_key == dummy)
            freeslot = ep;
        else 
        {
            //string默认策略主要不同在此处,判断函数较为轻量
            if (ep->me_hash == hash && _PyString_Eq(ep->me_key, key)) 
            {
                return ep;
            }
            freeslot = NULL;
        }
    
        //搜索第二阶段:遍历冲突链,检查每一个entry
        for (perturb = hash; ; perturb >>= PERTURB_SHIFT) 
        {
            i = (i << 2) + i + perturb + 1;
            ep = &ep0[i & mask];
            if (ep->me_key == NULL)
                return freeslot == NULL ? ep : freeslot;
            if (ep->me_key == key
                || (ep->me_hash == hash && ep->me_key != dummy &&
                      _PyString_Eq(ep->me_key, key)))
                return ep;
            if (ep->me_key == dummy && freeslot == NULL)
                freeslot = ep;
        }
    }
    

    5.元素插入

    insertdict函数:
    该函数关心ma_lookup返回的对象类型,决定插入的策略

    [dictobject.c]
    static void 
    insertdict(register dictobject *mp, PyObject *key, long hash, PyObject *value)
    {
        PyObject *old_value;
        register dictentry *ep;
        
        ep = mp->ma_lookup(mp, key, hash);
        /*搜索成功*/
        if (ep->me_value != NULL) {
            old_value = ep->me_value;
            ep->me_value = value;
            Py_DECREF(old_value); 
            Py_DECREF(key);
        }
        /* 搜索失败,返回的值可能是unused或dummy*/
        else {
            if (ep->me_key == NULL) //为unused时ma_fill++
                mp->ma_fill++;
            else  //否则为dummy
                Py_DECREF(ep->me_key);
            ep->me_key = key;
            ep->me_hash = hash;
            ep->me_value = value;
            mp->ma_used++;
        }
    }
    

    PyDict_SetItem函数:
    insertdict函数被该函数调用,该函数主要关心取得hash值

    [dictobject.c]
    int PyDict_SetItem(register PyObject *op, PyObject *key, PyObject *value)
    {
        register dictobject *mp;
        register long hash;
        register Py_ssize_t n_used;
    
        mp = (dictobject *)op;
        //[1]:计算hash值
        if (PyString_CheckExact(key)) {
            hash = ((PyStringObject *)key)->ob_shash;
            if (hash == -1)
                hash = PyObject_Hash(key);
        }
        else {
            hash = PyObject_Hash(key);
            if (hash == -1)
                return -1;
        }
        //[2]:插入(key, value)元素对
        n_used = mp->ma_used;
        insertdict(mp, key, hash, value);
        
        //[3]:必要时调整dict的内存空间,实际上为判断装填率是否大于2/3
        if (!(mp->ma_used > n_used && mp->ma_fill*3 >= (mp->ma_mask+1)*2))
            return 0;
        return dictresize(mp, mp->ma_used * (mp->ma_used>50000 ? 2 : 4));
    }
    

    dictresize函数
    该函数关心字典列表的调整,根据需求使用ma_smalltable或重新分配新的空间,旧空间的Acttive entry依次插入新空间中,dummy趁机释放掉

    [dictobject.c]
    static int dictresize(dictobject *mp, int minused)
    {
        Py_ssize_t newsize;
        dictentry *oldtable, *newtable, *ep;
        Py_ssize_t i;
        int is_oldtable_malloced;
        dictentry small_copy[PyDict_MINSIZE];
        //[1]:确定新的table的大小
        for(newsize = PyDict_MINSIZE; newsize <= minused && newsize > 0; newsize <<= 1)
            ;
        oldtable = mp->ma_table;
        is_oldtable_malloced = (oldtable != mp->ma_smalltable);
    
        //[2]: 新的table可以使用mp->ma_smalltable
        if (newsize == PyDict_MINSIZE) {
            newtable = mp->ma_smalltable;
            if (newtable == oldtable) {
                if (mp->ma_fill == mp->ma_used) {
                    //没有任何Dummy态entry,直接返回
                    return 0;
                }
                //将旧table拷贝,进行备份
                memcpy(small_copy, oldtable, sizeof(small_copy));
                oldtable = small_copy;
            }
        }
        //[3]: 新的table不能使用mp->ma_smalltable,需要在系统堆上申请
        else {
            newtable = PyMem_NEW(dictentry, newsize);
        }
    
        //[4]:设置新table
        mp->ma_table = newtable;
        mp->ma_mask = newsize - 1;
        memset(newtable, 0, sizeof(dictentry) * newsize);
        mp->ma_used = 0;
        i = mp->ma_fill;
        mp->ma_fill = 0;
    
        //[5]:处理旧table中的entry:
        //    1、Active态entry,搬移到新table中
        //    2、Dummy态entry,调整key的引用计数,丢弃该entry
        for (ep = oldtable; i > 0; ep++) {
            if (ep->me_value != NULL) { /* active entry */
                --i;
                insertdict(mp, ep->me_key, ep->me_hash, ep->me_value);
            }
            else if (ep->me_key != NULL) {  /* dummy entry */
                --i;
                assert(ep->me_key == dummy);
                Py_DECREF(ep->me_key);
            }
        }
        //[6]:必要时释放旧table所维护的内存空间
        if (is_oldtable_malloced)
            PyMem_DEL(oldtable);
        return 0;
    }
    
    

    6.删除元素

    [dictobject.c]
    int PyDict_DelItem(PyObject *op, PyObject *key)
    {
        register dictobject *mp;
        register long hash;
        register dictentry *ep;
        PyObject *old_value, *old_key;
        //[1]:获得hash值
        if (!PyString_CheckExact(key) ||
            (hash = ((PyStringObject *) key)->ob_shash) == -1) {
            hash = PyObject_Hash(key);
            if (hash == -1)
                return -1;
        }
        //[2]:搜索entry
        mp = (dictobject *)op;
        ep = (mp->ma_lookup)(mp, key, hash);
        if (ep->me_value == NULL) { //搜索失败,entry不存在
            return -1;
        }
        //[3]:删除entry所维护的元素,将entry的状态转为dummy态
        old_key = ep->me_key;
        ep->me_key = dummy;
        old_value = ep->me_value;
        ep->me_value = NULL;
        mp->ma_used--;
        Py_DECREF(old_value);
        Py_DECREF(old_key);
        return 0;
    }
     
    

    7.字典缓冲池

    与列表对象相似,也是待删除该字典对象时尝试加入到缓冲区。加入前进行一系列的清理动作。

    [dictobject.c]
    static void dict_dealloc(register dictobject *mp)
    {
        register dictentry *ep;
        Py_ssize_t fill = mp->ma_fill;
        //[1]:调整dict中对象的引用计数
        for (ep = mp->ma_table; fill > 0; ep++) {
            if (ep->me_key) {
                --fill;
                Py_DECREF(ep->me_key);
                Py_XDECREF(ep->me_value);
            }
        }
        //[2] :释放从系统堆中申请的内存空间
        if (mp->ma_table != mp->ma_smalltable)
            PyMem_DEL(mp->ma_table);
        //[3] :将被销毁的PyDictObject对象放入缓冲池
        if (num_free_dicts < MAXFREEDICTS && mp->ob_type == &PyDict_Type)
            free_dicts[num_free_dicts++] = mp;
        else 
            mp->ob_type->tp_free((PyObject *)mp);
    }
    
    

    相关文章

      网友评论

          本文标题:Python源码学习笔记 5 字典对象

          本文链接:https://www.haomeiwen.com/subject/mixlhftx.html