大师兄的Python源码学习笔记(六): Dict对象
大师兄的Python源码学习笔记(八): Tuple对象
一、关于Set
- Python中的Set(集合)是一个无序的不重复元素序列,甚至可以把Set看作是只有key的dict。
- Set在Python中用PySetObject实现。
1. Set结构的存储
- Set通过结构setentry保存数据值,类似Dict中的entry。
Include\setobject.h
typedef struct {
PyObject *key;
Py_hash_t hash; /* Cached hash code of the key */
} setentry;
- 其中key是保存的数据。
- hash是保存数据的哈希值。
- 与entry类似,setentry在三种状态中切换:
状态 | 条件 |
---|---|
Unused | key==NULL和hash==0。 |
Active | key==dummy和hash==-1。 |
Dummy | key!=NULL和key!=dummy和hash!=-1 |
2. PySetObject对象
- PysetObject对象的结构如下:
Include\setobject.h
#define PySet_MINSIZE 8
... ...
typedef struct {
PyObject_HEAD
Py_ssize_t fill; /* Number active and dummy entries*/
Py_ssize_t used; /* Number active entries */
/* The table contains mask + 1 slots, and that's a power of 2.
* We store the mask instead of the size because the mask is more
* frequently needed.
*/
Py_ssize_t mask;
/* The table points to a fixed-size smalltable for small tables
* or to additional malloc'ed memory for bigger tables.
* The table pointer is never NULL which saves us from repeated
* runtime null-tests.
*/
setentry *table;
Py_hash_t hash; /* Only used by frozenset objects */
Py_ssize_t finger; /* Search finger for pop() */
setentry smalltable[PySet_MINSIZE];
PyObject *weakreflist; /* List of weak references */
} PySetObject;
- fill是已经使用的和空entry的总数量。
- used是状态为active的entry数量。
- mask是hash的掩码。
- table是指向保存数据的指针,分为两种情况:如果是smalltable,那么就指向这个固定大小的数组首地址;如set容量大于PySet_MINSIZE,则指向一个通过malloc分配的内存地址。
- hash是哈希值,只能用在不可变集合(frozenset)。
- 当创建一个set对象时,如果容量低于PySet_MINSIZE(定义为8个)时,存放在smalltable数组中。
3. Set的类型对象
- Set对应的类型对象为PySet_Type。
Objects\setobject.c
PyTypeObject PySet_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"set", /* tp_name */
sizeof(PySetObject), /* tp_basicsize */
0, /* tp_itemsize */
/* methods */
(destructor)set_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
(reprfunc)set_repr, /* tp_repr */
&set_as_number, /* tp_as_number */
&set_as_sequence, /* tp_as_sequence */
0, /* tp_as_mapping */
PyObject_HashNotImplemented, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
Py_TPFLAGS_BASETYPE, /* tp_flags */
set_doc, /* tp_doc */
(traverseproc)set_traverse, /* tp_traverse */
(inquiry)set_clear_internal, /* tp_clear */
(richcmpfunc)set_richcompare, /* tp_richcompare */
offsetof(PySetObject, weakreflist), /* tp_weaklistoffset */
(getiterfunc)set_iter, /* tp_iter */
0, /* tp_iternext */
set_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)set_init, /* tp_init */
PyType_GenericAlloc, /* tp_alloc */
set_new, /* tp_new */
PyObject_GC_Del, /* tp_free */
};
- frozenset对应的对象类型为PyFrozenSet_Type。
Objects\setobject.c
PyTypeObject PyFrozenSet_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"frozenset", /* tp_name */
sizeof(PySetObject), /* tp_basicsize */
0, /* tp_itemsize */
/* methods */
(destructor)set_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
(reprfunc)set_repr, /* tp_repr */
&frozenset_as_number, /* tp_as_number */
&set_as_sequence, /* tp_as_sequence */
0, /* tp_as_mapping */
frozenset_hash, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
Py_TPFLAGS_BASETYPE, /* tp_flags */
frozenset_doc, /* tp_doc */
(traverseproc)set_traverse, /* tp_traverse */
(inquiry)set_clear_internal, /* tp_clear */
(richcmpfunc)set_richcompare, /* tp_richcompare */
offsetof(PySetObject, weakreflist), /* tp_weaklistoffset */
(getiterfunc)set_iter, /* tp_iter */
0, /* tp_iternext */
frozenset_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
PyType_GenericAlloc, /* tp_alloc */
frozenset_new, /* tp_new */
PyObject_GC_Del, /* tp_free */
};
二、创建PySetObject对象
- 创建PySetObject对象时,会调用PySet_New函数。
Objects\setobject.c
PyObject *
PySet_New(PyObject *iterable)
{
return make_new_set(&PySet_Type, iterable);
}
- 而PySet_New将一个可迭代对象和类型对象传入make_new_set。
Objects\setobject.c
static PyObject *
make_new_set(PyTypeObject *type, PyObject *iterable)
{
PySetObject *so;
so = (PySetObject *)type->tp_alloc(type, 0);
if (so == NULL)
return NULL;
so->fill = 0;
so->used = 0;
so->mask = PySet_MINSIZE - 1;
so->table = so->smalltable;
so->hash = -1;
so->finger = 0;
so->weakreflist = NULL;
if (iterable != NULL) {
if (set_update_internal(so, iterable)) {
Py_DECREF(so);
return NULL;
}
}
return (PyObject *)so;
}
- make_new_set完成了PySetObject的初始化并返回一个对象指针。
- 这里值得注意的是当传入的可迭代对象存在时,会将可迭代对象的元素存入到PySetObject中。
if (iterable != NULL) {
if (set_update_internal(so, iterable)) {
Py_DECREF(so);
return NULL;
}
}
三、插入元素
- 插入元素会首先调用PySet_Add。
Objects\setobject.c
int
PySet_Add(PyObject *anyset, PyObject *key)
{
if (!PySet_Check(anyset) &&
(!PyFrozenSet_Check(anyset) || Py_REFCNT(anyset) != 1)) {
PyErr_BadInternalCall();
return -1;
}
return set_add_key((PySetObject *)anyset, key);
}
- 在检查传入对象是否是PySetObject后,调用set_add_key。
Objects\setobject.c
static int
set_add_key(PySetObject *so, PyObject *key)
{
Py_hash_t hash;
if (!PyUnicode_CheckExact(key) ||
(hash = ((PyASCIIObject *) key)->hash) == -1) {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
return set_add_entry(so, key, hash);
}
- 在检查key的合法性后,调用set_add_entry。
Objects\setobject.c
static int
set_add_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
{
setentry *table;
setentry *freeslot;
setentry *entry;
size_t perturb;
size_t mask;
size_t i; /* Unsigned for defined overflow behavior */
size_t j;
int cmp;
/* Pre-increment is necessary to prevent arbitrary code in the rich
comparison from deallocating the key just before the insertion. */
Py_INCREF(key);
restart:
mask = so->mask;
i = (size_t)hash & mask;
entry = &so->table[i];
if (entry->key == NULL)
goto found_unused;
freeslot = NULL;
perturb = hash;
while (1) {
if (entry->hash == hash) {
PyObject *startkey = entry->key;
/* startkey cannot be a dummy because the dummy hash field is -1 */
assert(startkey != dummy);
if (startkey == key)
goto found_active;
if (PyUnicode_CheckExact(startkey)
&& PyUnicode_CheckExact(key)
&& _PyUnicode_EQ(startkey, key))
goto found_active;
table = so->table;
Py_INCREF(startkey);
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
Py_DECREF(startkey);
if (cmp > 0) /* likely */
goto found_active;
if (cmp < 0)
goto comparison_error;
/* Continuing the search from the current entry only makes
sense if the table and entry are unchanged; otherwise,
we have to restart from the beginning */
if (table != so->table || entry->key != startkey)
goto restart;
mask = so->mask; /* help avoid a register spill */
}
else if (entry->hash == -1)
freeslot = entry;
if (i + LINEAR_PROBES <= mask) {
for (j = 0 ; j < LINEAR_PROBES ; j++) {
entry++;
if (entry->hash == 0 && entry->key == NULL)
goto found_unused_or_dummy;
if (entry->hash == hash) {
PyObject *startkey = entry->key;
assert(startkey != dummy);
if (startkey == key)
goto found_active;
if (PyUnicode_CheckExact(startkey)
&& PyUnicode_CheckExact(key)
&& _PyUnicode_EQ(startkey, key))
goto found_active;
table = so->table;
Py_INCREF(startkey);
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
Py_DECREF(startkey);
if (cmp > 0)
goto found_active;
if (cmp < 0)
goto comparison_error;
if (table != so->table || entry->key != startkey)
goto restart;
mask = so->mask;
}
else if (entry->hash == -1)
freeslot = entry;
}
}
perturb >>= PERTURB_SHIFT;
i = (i * 5 + 1 + perturb) & mask;
entry = &so->table[i];
if (entry->key == NULL)
goto found_unused_or_dummy;
}
found_unused_or_dummy:
if (freeslot == NULL)
goto found_unused;
so->used++;
freeslot->key = key;
freeslot->hash = hash;
return 0;
found_unused:
so->fill++;
so->used++;
entry->key = key;
entry->hash = hash;
if ((size_t)so->fill*5 < mask*3)
return 0;
return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
found_active:
Py_DECREF(key);
return 0;
comparison_error:
Py_DECREF(key);
return -1;
}
- 如果传入的哈希值计算出的索引值为空,则直接将值存入entry汇总。
entry = &so->table[i]; if (entry->key == NULL) goto found_unused; ... ... found_unused: so->fill++; so->used++; entry->key = key; entry->hash = hash; if ((size_t)so->fill*5 < mask*3) return 0; return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
- 如果索引值与插入值相同则不插入。
if (entry->hash == hash) { PyObject *startkey = entry->key; /* startkey cannot be a dummy because the dummy hash field is -1 */ assert(startkey != dummy); if (startkey == key) goto found_active; if (PyUnicode_CheckExact(startkey) && PyUnicode_CheckExact(key) && _PyUnicode_EQ(startkey, key)) goto found_active; table = so->table; Py_INCREF(startkey); cmp = PyObject_RichCompareBool(startkey, key, Py_EQ); Py_DECREF(startkey); if (cmp > 0) /* likely */ goto found_active; if (cmp < 0) goto comparison_error; /* Continuing the search from the current entry only makes sense if the table and entry are unchanged; otherwise, we have to restart from the beginning */ if (table != so->table || entry->key != startkey) goto restart; mask = so->mask; /* help avoid a register spill */ } ... ... found_active: Py_DECREF(key); return 0;
- 如果值不为空且值不同,则遍历从该索引往后9个位置的值,依次找到有空余位置的值,并将该值设置进去。
#define LINEAR_PROBES 9 ... ... if (i + LINEAR_PROBES <= mask) { for (j = 0 ; j < LINEAR_PROBES ; j++) { entry++; if (entry->hash == 0 && entry->key == NULL) goto found_unused_or_dummy; if (entry->hash == hash) { PyObject *startkey = entry->key; assert(startkey != dummy); if (startkey == key) goto found_active; if (PyUnicode_CheckExact(startkey) && PyUnicode_CheckExact(key) && _PyUnicode_EQ(startkey, key)) goto found_active; table = so->table; Py_INCREF(startkey); cmp = PyObject_RichCompareBool(startkey, key, Py_EQ); Py_DECREF(startkey); if (cmp > 0) goto found_active; if (cmp < 0) goto comparison_error; if (table != so->table || entry->key != startkey) goto restart; mask = so->mask; } else if (entry->hash == -1) freeslot = entry; } } perturb >>= PERTURB_SHIFT; i = (i * 5 + 1 + perturb) & mask; entry = &so->table[i]; if (entry->key == NULL) goto found_unused_or_dummy; }
- 最后,如果插入值后已使用容量超过总容量的3/5,则扩充 set
- 如果set->used>50000进行两倍扩充,否则就进行四倍扩充。
found_unused: so->fill++; so->used++; entry->key = key; entry->hash = hash; if ((size_t)so->fill*5 < mask*3) return 0; return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
四、删除元素
- 元素的删除调用set_remove函数。
Objects\setobject.c
static PyObject *
set_remove(PySetObject *so, PyObject *key)
{
PyObject *tmpkey;
int rv;
rv = set_discard_key(so, key);
if (rv < 0) {
if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
return NULL;
PyErr_Clear();
tmpkey = make_new_set(&PyFrozenSet_Type, key);
if (tmpkey == NULL)
return NULL;
rv = set_discard_key(so, tmpkey);
Py_DECREF(tmpkey);
if (rv < 0)
return NULL;
}
if (rv == DISCARD_NOTFOUND) {
_PyErr_SetKeyError(key);
return NULL;
}
Py_RETURN_NONE;
}
- set_remove函数中首先调用的是set_discard_key函数。
Objects\setobject.c
static int
set_discard_key(PySetObject *so, PyObject *key)
{
Py_hash_t hash;
if (!PyUnicode_CheckExact(key) ||
(hash = ((PyASCIIObject *) key)->hash) == -1) {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
return set_discard_entry(so, key, hash);
}
- 而set_discard_key函数在检查hash值后,又调用了set_discard_entry函数。
Objects\setobject.c
static int
set_discard_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
{
setentry *entry;
PyObject *old_key;
entry = set_lookkey(so, key, hash);
if (entry == NULL)
return -1;
if (entry->key == NULL)
return DISCARD_NOTFOUND;
old_key = entry->key;
entry->key = dummy;
entry->hash = -1;
so->used--;
Py_DECREF(old_key);
return DISCARD_FOUND;
}
- set_discard_entry函数通过set_lookkey函数查找值。
- 如果找到值,则将该值状态设置为dummy,且used值减1。
五、调整容量
- 从上面的代码中,可以发现调整容量调用了set_table_reseize函数。
Objects\setobject.c
static int
set_table_resize(PySetObject *so, Py_ssize_t minused)
{
setentry *oldtable, *newtable, *entry;
Py_ssize_t oldmask = so->mask;
size_t newmask;
int is_oldtable_malloced;
setentry small_copy[PySet_MINSIZE];
assert(minused >= 0);
/* Find the smallest table size > minused. */
/* XXX speed-up with intrinsics */
size_t newsize = PySet_MINSIZE;
while (newsize <= (size_t)minused) {
newsize <<= 1; // The largest possible value is PY_SSIZE_T_MAX + 1.
}
/* Get space for a new table. */
oldtable = so->table;
assert(oldtable != NULL);
is_oldtable_malloced = oldtable != so->smalltable;
if (newsize == PySet_MINSIZE) {
/* A large table is shrinking, or we can't get any smaller. */
newtable = so->smalltable;
if (newtable == oldtable) {
if (so->fill == so->used) {
/* No dummies, so no point doing anything. */
return 0;
}
/* We're not going to resize it, but rebuild the
table anyway to purge old dummy entries.
Subtle: This is *necessary* if fill==size,
as set_lookkey needs at least one virgin slot to
terminate failing searches. If fill < size, it's
merely desirable, as dummies slow searches. */
assert(so->fill > so->used);
memcpy(small_copy, oldtable, sizeof(small_copy));
oldtable = small_copy;
}
}
else {
newtable = PyMem_NEW(setentry, newsize);
if (newtable == NULL) {
PyErr_NoMemory();
return -1;
}
}
/* Make the set empty, using the new table. */
assert(newtable != oldtable);
memset(newtable, 0, sizeof(setentry) * newsize);
so->mask = newsize - 1;
so->table = newtable;
/* Copy the data over; this is refcount-neutral for active entries;
dummy entries aren't copied over, of course */
newmask = (size_t)so->mask;
if (so->fill == so->used) {
for (entry = oldtable; entry <= oldtable + oldmask; entry++) {
if (entry->key != NULL) {
set_insert_clean(newtable, newmask, entry->key, entry->hash);
}
}
} else {
so->fill = so->used;
for (entry = oldtable; entry <= oldtable + oldmask; entry++) {
if (entry->key != NULL && entry->key != dummy) {
set_insert_clean(newtable, newmask, entry->key, entry->hash);
}
}
}
if (is_oldtable_malloced)
PyMem_DEL(oldtable);
return 0;
}
- 基本理念就是创建一个新表替代旧表,将非dummy非空的entry重新插入新表,以实现调整set的容量。
Objects\setobject.c
static void
set_insert_clean(setentry *table, size_t mask, PyObject *key, Py_hash_t hash)
{
setentry *entry;
size_t perturb = hash;
size_t i = (size_t)hash & mask;
size_t j;
while (1) {
entry = &table[i];
if (entry->key == NULL)
goto found_null;
if (i + LINEAR_PROBES <= mask) {
for (j = 0; j < LINEAR_PROBES; j++) {
entry++;
if (entry->key == NULL)
goto found_null;
}
}
perturb >>= PERTURB_SHIFT;
i = (i * 5 + 1 + perturb) & mask;
}
found_null:
entry->key = key;
entry->hash = hash;
}
网友评论