美文网首页
大师兄的Python源码学习笔记(七): Set对象

大师兄的Python源码学习笔记(七): Set对象

作者: superkmi | 来源:发表于2021-03-08 10:51 被阅读0次

大师兄的Python源码学习笔记(六): Dict对象
大师兄的Python源码学习笔记(八): Tuple对象

一、关于Set

  • Python中的Set(集合)是一个无序的不重复元素序列,甚至可以把Set看作是只有keydict
  • Set在Python中用PySetObject实现。
1. Set结构的存储
  • Set通过结构setentry保存数据值,类似Dict中的entry
Include\setobject.h

typedef struct {
    PyObject *key;
    Py_hash_t hash;             /* Cached hash code of the key */
} setentry;
  • 其中key是保存的数据。
  • hash是保存数据的哈希值。
  • 与entry类似,setentry在三种状态中切换:
状态 条件
Unused key==NULL和hash==0。
Active key==dummy和hash==-1。
Dummy key!=NULL和key!=dummy和hash!=-1
2. PySetObject对象
  • PysetObject对象的结构如下:
Include\setobject.h

#define PySet_MINSIZE 8
... ...
typedef struct {
    PyObject_HEAD

    Py_ssize_t fill;            /* Number active and dummy entries*/
    Py_ssize_t used;            /* Number active entries */

    /* The table contains mask + 1 slots, and that's a power of 2.
     * We store the mask instead of the size because the mask is more
     * frequently needed.
     */
    Py_ssize_t mask;

    /* The table points to a fixed-size smalltable for small tables
     * or to additional malloc'ed memory for bigger tables.
     * The table pointer is never NULL which saves us from repeated
     * runtime null-tests.
     */
    setentry *table;
    Py_hash_t hash;             /* Only used by frozenset objects */
    Py_ssize_t finger;          /* Search finger for pop() */

    setentry smalltable[PySet_MINSIZE];
    PyObject *weakreflist;      /* List of weak references */
} PySetObject;
  • fill是已经使用的和空entry的总数量。
  • used是状态为active的entry数量。
  • mask是hash的掩码。
  • table是指向保存数据的指针,分为两种情况:如果是smalltable,那么就指向这个固定大小的数组首地址;如set容量大于PySet_MINSIZE,则指向一个通过malloc分配的内存地址。
  • hash是哈希值,只能用在不可变集合(frozenset)。
  • 当创建一个set对象时,如果容量低于PySet_MINSIZE(定义为8个)时,存放在smalltable数组中。
3. Set的类型对象
  • Set对应的类型对象为PySet_Type
Objects\setobject.c

PyTypeObject PySet_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "set",                              /* tp_name */
    sizeof(PySetObject),                /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)set_dealloc,            /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    (reprfunc)set_repr,                 /* tp_repr */
    &set_as_number,                     /* tp_as_number */
    &set_as_sequence,                   /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    PyObject_HashNotImplemented,        /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
        Py_TPFLAGS_BASETYPE,            /* tp_flags */
    set_doc,                            /* tp_doc */
    (traverseproc)set_traverse,         /* tp_traverse */
    (inquiry)set_clear_internal,        /* tp_clear */
    (richcmpfunc)set_richcompare,       /* tp_richcompare */
    offsetof(PySetObject, weakreflist), /* tp_weaklistoffset */
    (getiterfunc)set_iter,              /* tp_iter */
    0,                                  /* tp_iternext */
    set_methods,                        /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    0,                                  /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    (initproc)set_init,                 /* tp_init */
    PyType_GenericAlloc,                /* tp_alloc */
    set_new,                            /* tp_new */
    PyObject_GC_Del,                    /* tp_free */
};
  • frozenset对应的对象类型为PyFrozenSet_Type
Objects\setobject.c

PyTypeObject PyFrozenSet_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "frozenset",                        /* tp_name */
    sizeof(PySetObject),                /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)set_dealloc,            /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    (reprfunc)set_repr,                 /* tp_repr */
    &frozenset_as_number,               /* tp_as_number */
    &set_as_sequence,                   /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    frozenset_hash,                     /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
        Py_TPFLAGS_BASETYPE,            /* tp_flags */
    frozenset_doc,                      /* tp_doc */
    (traverseproc)set_traverse,         /* tp_traverse */
    (inquiry)set_clear_internal,        /* tp_clear */
    (richcmpfunc)set_richcompare,       /* tp_richcompare */
    offsetof(PySetObject, weakreflist), /* tp_weaklistoffset */
    (getiterfunc)set_iter,              /* tp_iter */
    0,                                  /* tp_iternext */
    frozenset_methods,                  /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    0,                                  /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    PyType_GenericAlloc,                /* tp_alloc */
    frozenset_new,                      /* tp_new */
    PyObject_GC_Del,                    /* tp_free */
};

二、创建PySetObject对象

  • 创建PySetObject对象时,会调用PySet_New函数。
Objects\setobject.c

PyObject *
PySet_New(PyObject *iterable)
{
    return make_new_set(&PySet_Type, iterable);
}
  • PySet_New将一个可迭代对象和类型对象传入make_new_set
Objects\setobject.c

static PyObject *
make_new_set(PyTypeObject *type, PyObject *iterable)
{
    PySetObject *so;

    so = (PySetObject *)type->tp_alloc(type, 0);
    if (so == NULL)
        return NULL;

    so->fill = 0;
    so->used = 0;
    so->mask = PySet_MINSIZE - 1;
    so->table = so->smalltable;
    so->hash = -1;
    so->finger = 0;
    so->weakreflist = NULL;

    if (iterable != NULL) {
        if (set_update_internal(so, iterable)) {
            Py_DECREF(so);
            return NULL;
        }
    }

    return (PyObject *)so;
}
  • make_new_set完成了PySetObject的初始化并返回一个对象指针。
  • 这里值得注意的是当传入的可迭代对象存在时,会将可迭代对象的元素存入到PySetObject中。
 if (iterable != NULL) {
        if (set_update_internal(so, iterable)) {
            Py_DECREF(so);
            return NULL;
        }
    }

三、插入元素

  • 插入元素会首先调用PySet_Add
Objects\setobject.c

int
PySet_Add(PyObject *anyset, PyObject *key)
{
    if (!PySet_Check(anyset) &&
        (!PyFrozenSet_Check(anyset) || Py_REFCNT(anyset) != 1)) {
        PyErr_BadInternalCall();
        return -1;
    }
    return set_add_key((PySetObject *)anyset, key);
}
  • 在检查传入对象是否是PySetObject后,调用set_add_key
Objects\setobject.c

static int
set_add_key(PySetObject *so, PyObject *key)
{
    Py_hash_t hash;

    if (!PyUnicode_CheckExact(key) ||
        (hash = ((PyASCIIObject *) key)->hash) == -1) {
        hash = PyObject_Hash(key);
        if (hash == -1)
            return -1;
    }
    return set_add_entry(so, key, hash);
}
  • 在检查key的合法性后,调用set_add_entry
Objects\setobject.c

static int
set_add_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
{
    setentry *table;
    setentry *freeslot;
    setentry *entry;
    size_t perturb;
    size_t mask;
    size_t i;                       /* Unsigned for defined overflow behavior */
    size_t j;
    int cmp;

    /* Pre-increment is necessary to prevent arbitrary code in the rich
       comparison from deallocating the key just before the insertion. */
    Py_INCREF(key);

  restart:

    mask = so->mask;
    i = (size_t)hash & mask;

    entry = &so->table[i];
    if (entry->key == NULL)
        goto found_unused;

    freeslot = NULL;
    perturb = hash;

    while (1) {
        if (entry->hash == hash) {
            PyObject *startkey = entry->key;
            /* startkey cannot be a dummy because the dummy hash field is -1 */
            assert(startkey != dummy);
            if (startkey == key)
                goto found_active;
            if (PyUnicode_CheckExact(startkey)
                && PyUnicode_CheckExact(key)
                && _PyUnicode_EQ(startkey, key))
                goto found_active;
            table = so->table;
            Py_INCREF(startkey);
            cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
            Py_DECREF(startkey);
            if (cmp > 0)                                          /* likely */
                goto found_active;
            if (cmp < 0)
                goto comparison_error;
            /* Continuing the search from the current entry only makes
               sense if the table and entry are unchanged; otherwise,
               we have to restart from the beginning */
            if (table != so->table || entry->key != startkey)
                goto restart;
            mask = so->mask;                 /* help avoid a register spill */
        }
        else if (entry->hash == -1)
            freeslot = entry;

        if (i + LINEAR_PROBES <= mask) {
            for (j = 0 ; j < LINEAR_PROBES ; j++) {
                entry++;
                if (entry->hash == 0 && entry->key == NULL)
                    goto found_unused_or_dummy;
                if (entry->hash == hash) {
                    PyObject *startkey = entry->key;
                    assert(startkey != dummy);
                    if (startkey == key)
                        goto found_active;
                    if (PyUnicode_CheckExact(startkey)
                        && PyUnicode_CheckExact(key)
                        && _PyUnicode_EQ(startkey, key))
                        goto found_active;
                    table = so->table;
                    Py_INCREF(startkey);
                    cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
                    Py_DECREF(startkey);
                    if (cmp > 0)
                        goto found_active;
                    if (cmp < 0)
                        goto comparison_error;
                    if (table != so->table || entry->key != startkey)
                        goto restart;
                    mask = so->mask;
                }
                else if (entry->hash == -1)
                    freeslot = entry;
            }
        }

        perturb >>= PERTURB_SHIFT;
        i = (i * 5 + 1 + perturb) & mask;

        entry = &so->table[i];
        if (entry->key == NULL)
            goto found_unused_or_dummy;
    }

  found_unused_or_dummy:
    if (freeslot == NULL)
        goto found_unused;
    so->used++;
    freeslot->key = key;
    freeslot->hash = hash;
    return 0;

  found_unused:
    so->fill++;
    so->used++;
    entry->key = key;
    entry->hash = hash;
    if ((size_t)so->fill*5 < mask*3)
        return 0;
    return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);

  found_active:
    Py_DECREF(key);
    return 0;

  comparison_error:
    Py_DECREF(key);
    return -1;
}
  • 如果传入的哈希值计算出的索引值为空,则直接将值存入entry汇总。
 entry = &so->table[i];
   if (entry->key == NULL)
       goto found_unused;
... ...
 found_unused:
   so->fill++;
   so->used++;
   entry->key = key;
   entry->hash = hash;
   if ((size_t)so->fill*5 < mask*3)
       return 0;
   return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
  • 如果索引值与插入值相同则不插入。
       if (entry->hash == hash) {
           PyObject *startkey = entry->key;
           /* startkey cannot be a dummy because the dummy hash field is -1 */
           assert(startkey != dummy);
           if (startkey == key)
               goto found_active;
           if (PyUnicode_CheckExact(startkey)
               && PyUnicode_CheckExact(key)
               && _PyUnicode_EQ(startkey, key))
               goto found_active;
           table = so->table;
           Py_INCREF(startkey);
           cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
           Py_DECREF(startkey);
           if (cmp > 0)                                          /* likely */
               goto found_active;
           if (cmp < 0)
               goto comparison_error;
           /* Continuing the search from the current entry only makes
              sense if the table and entry are unchanged; otherwise,
              we have to restart from the beginning */
           if (table != so->table || entry->key != startkey)
               goto restart;
           mask = so->mask;                 /* help avoid a register spill */
       }
... ...
 found_active:
   Py_DECREF(key);
   return 0;
  • 如果值不为空且值不同,则遍历从该索引往后9个位置的值,依次找到有空余位置的值,并将该值设置进去。
#define LINEAR_PROBES 9
... ...
       if (i + LINEAR_PROBES <= mask) {
           for (j = 0 ; j < LINEAR_PROBES ; j++) {
               entry++;
               if (entry->hash == 0 && entry->key == NULL)
                   goto found_unused_or_dummy;
               if (entry->hash == hash) {
                   PyObject *startkey = entry->key;
                   assert(startkey != dummy);
                   if (startkey == key)
                       goto found_active;
                   if (PyUnicode_CheckExact(startkey)
                       && PyUnicode_CheckExact(key)
                       && _PyUnicode_EQ(startkey, key))
                       goto found_active;
                   table = so->table;
                   Py_INCREF(startkey);
                   cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
                   Py_DECREF(startkey);
                   if (cmp > 0)
                       goto found_active;
                   if (cmp < 0)
                       goto comparison_error;
                   if (table != so->table || entry->key != startkey)
                       goto restart;
                   mask = so->mask;
               }
               else if (entry->hash == -1)
                   freeslot = entry;
           }
       }
       perturb >>= PERTURB_SHIFT;
       i = (i * 5 + 1 + perturb) & mask;

       entry = &so->table[i];
       if (entry->key == NULL)
           goto found_unused_or_dummy;
   }
  • 最后,如果插入值后已使用容量超过总容量的3/5,则扩充 set
  • 如果set->used>50000进行两倍扩充,否则就进行四倍扩充。
found_unused:
   so->fill++;
   so->used++;
   entry->key = key;
   entry->hash = hash;
   if ((size_t)so->fill*5 < mask*3)
       return 0;
   return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);

四、删除元素

  • 元素的删除调用set_remove函数。
Objects\setobject.c

static PyObject *
set_remove(PySetObject *so, PyObject *key)
{
    PyObject *tmpkey;
    int rv;

    rv = set_discard_key(so, key);
    if (rv < 0) {
        if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
            return NULL;
        PyErr_Clear();
        tmpkey = make_new_set(&PyFrozenSet_Type, key);
        if (tmpkey == NULL)
            return NULL;
        rv = set_discard_key(so, tmpkey);
        Py_DECREF(tmpkey);
        if (rv < 0)
            return NULL;
    }

    if (rv == DISCARD_NOTFOUND) {
        _PyErr_SetKeyError(key);
        return NULL;
    }
    Py_RETURN_NONE;
}
  • set_remove函数中首先调用的是set_discard_key函数。
Objects\setobject.c

static int
set_discard_key(PySetObject *so, PyObject *key)
{
    Py_hash_t hash;

    if (!PyUnicode_CheckExact(key) ||
        (hash = ((PyASCIIObject *) key)->hash) == -1) {
        hash = PyObject_Hash(key);
        if (hash == -1)
            return -1;
    }
    return set_discard_entry(so, key, hash);
}
  • set_discard_key函数在检查hash值后,又调用了set_discard_entry函数。
Objects\setobject.c

static int
set_discard_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
{
    setentry *entry;
    PyObject *old_key;

    entry = set_lookkey(so, key, hash);
    if (entry == NULL)
        return -1;
    if (entry->key == NULL)
        return DISCARD_NOTFOUND;
    old_key = entry->key;
    entry->key = dummy;
    entry->hash = -1;
    so->used--;
    Py_DECREF(old_key);
    return DISCARD_FOUND;
}
  • set_discard_entry函数通过set_lookkey函数查找值。
  • 如果找到值,则将该值状态设置为dummy,且used值减1。

五、调整容量

  • 从上面的代码中,可以发现调整容量调用了set_table_reseize函数。
Objects\setobject.c

static int
set_table_resize(PySetObject *so, Py_ssize_t minused)
{
    setentry *oldtable, *newtable, *entry;
    Py_ssize_t oldmask = so->mask;
    size_t newmask;
    int is_oldtable_malloced;
    setentry small_copy[PySet_MINSIZE];

    assert(minused >= 0);

    /* Find the smallest table size > minused. */
    /* XXX speed-up with intrinsics */
    size_t newsize = PySet_MINSIZE;
    while (newsize <= (size_t)minused) {
        newsize <<= 1; // The largest possible value is PY_SSIZE_T_MAX + 1.
    }

    /* Get space for a new table. */
    oldtable = so->table;
    assert(oldtable != NULL);
    is_oldtable_malloced = oldtable != so->smalltable;

    if (newsize == PySet_MINSIZE) {
        /* A large table is shrinking, or we can't get any smaller. */
        newtable = so->smalltable;
        if (newtable == oldtable) {
            if (so->fill == so->used) {
                /* No dummies, so no point doing anything. */
                return 0;
            }
            /* We're not going to resize it, but rebuild the
               table anyway to purge old dummy entries.
               Subtle:  This is *necessary* if fill==size,
               as set_lookkey needs at least one virgin slot to
               terminate failing searches.  If fill < size, it's
               merely desirable, as dummies slow searches. */
            assert(so->fill > so->used);
            memcpy(small_copy, oldtable, sizeof(small_copy));
            oldtable = small_copy;
        }
    }
    else {
        newtable = PyMem_NEW(setentry, newsize);
        if (newtable == NULL) {
            PyErr_NoMemory();
            return -1;
        }
    }

    /* Make the set empty, using the new table. */
    assert(newtable != oldtable);
    memset(newtable, 0, sizeof(setentry) * newsize);
    so->mask = newsize - 1;
    so->table = newtable;

    /* Copy the data over; this is refcount-neutral for active entries;
       dummy entries aren't copied over, of course */
    newmask = (size_t)so->mask;
    if (so->fill == so->used) {
        for (entry = oldtable; entry <= oldtable + oldmask; entry++) {
            if (entry->key != NULL) {
                set_insert_clean(newtable, newmask, entry->key, entry->hash);
            }
        }
    } else {
        so->fill = so->used;
        for (entry = oldtable; entry <= oldtable + oldmask; entry++) {
            if (entry->key != NULL && entry->key != dummy) {
                set_insert_clean(newtable, newmask, entry->key, entry->hash);
            }
        }
    }

    if (is_oldtable_malloced)
        PyMem_DEL(oldtable);
    return 0;
}
  • 基本理念就是创建一个新表替代旧表,将非dummy非空的entry重新插入新表,以实现调整set的容量。
Objects\setobject.c

static void
set_insert_clean(setentry *table, size_t mask, PyObject *key, Py_hash_t hash)
{
    setentry *entry;
    size_t perturb = hash;
    size_t i = (size_t)hash & mask;
    size_t j;

    while (1) {
        entry = &table[i];
        if (entry->key == NULL)
            goto found_null;
        if (i + LINEAR_PROBES <= mask) {
            for (j = 0; j < LINEAR_PROBES; j++) {
                entry++;
                if (entry->key == NULL)
                    goto found_null;
            }
        }
        perturb >>= PERTURB_SHIFT;
        i = (i * 5 + 1 + perturb) & mask;
    }
  found_null:
    entry->key = key;
    entry->hash = hash;
}

相关文章

网友评论

      本文标题:大师兄的Python源码学习笔记(七): Set对象

      本文链接:https://www.haomeiwen.com/subject/zrjxqltx.html