大师兄的Python源码学习笔记(三): 整数对象
大师兄的Python源码学习笔记(五): List对象
一、关于字符串对象
- Python2和Python3的字符串对象有显著差别,Python3中的文本总是unicode,而二进制数据是bytes:
Python2对应类型 Python3对应类型 str bytes unicode str
- 8-bit字符串对象在Python2中为PyStringObject, Python3后改由PyBytesObject实现。
- unicode字符串对象在Python3中由PyUnicodeObject实现。
1 PyBytesObject对象
- PyBytesObject属于变长对象(PyVarObject)中的不可变对象(immutable)。
比如"Hello"和"World!"是两个不同的PyBytesObject,他们所占的内存空间不同。
但当创建后,他们内部维护的字符串就不能改变了。
1.1 PyBytesObject的结构
include/bytesobject.h
typedef struct {
PyObject_VAR_HEAD
Py_hash_t ob_shash;
char ob_sval[1];
/* Invariants:
* ob_sval contains space for 'ob_size+1' elements.
* ob_sval[ob_size] == 0.
* ob_shash is the hash of the string or -1 if not computed yet.
*/
} PyBytesObject;
- 由以上代码可以发现对象头部是一个PyObject_VAR_HEAD,为变长对象的公共部分。
- ob_shash是这个对象存储的字节数组的哈希值,如果哈希值没有计算则为-1。
- ob_sval指向内存中的真实的位置,这段内存保存实际的字符串。
- PyObject_VAR_HEAD中还包含ob_size,记录ob_sval的可变长度大小。
- ob_sval指向的字符串必须满足
ob_sval[ob_size] = '\0'
。
1.2 计算哈希值
- PyBytesObject中的哈希值保存在ob_shash中,为了避免每一次都需要重新计算哈希值。
- 在计算哈希值时,使用以下算法:
objects/bytesobject.c
static Py_hash_t
bytes_hash(PyBytesObject *a)
{
if (a->ob_shash == -1) {
/* Can't fail */
a->ob_shash = _Py_HashBytes(a->ob_sval, Py_SIZE(a));
}
return a->ob_shash;
}
1.3 类型对象
- PyBytesObject对应的类型对象为PyBytes_Type:
objects/bytesobject.c
PyTypeObject PyBytes_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"bytes",
PyBytesObject_SIZE,
sizeof(char),
bytes_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
(reprfunc)bytes_repr, /* tp_repr */
&bytes_as_number, /* tp_as_number */
&bytes_as_sequence, /* tp_as_sequence */
&bytes_as_mapping, /* tp_as_mapping */
(hashfunc)bytes_hash, /* tp_hash */
0, /* tp_call */
bytes_str, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
&bytes_as_buffer, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Py_TPFLAGS_BYTES_SUBCLASS, /* tp_flags */
bytes_doc, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
(richcmpfunc)bytes_richcompare, /* tp_richcompare */
0, /* tp_weaklistoffset */
bytes_iter, /* tp_iter */
0, /* tp_iternext */
bytes_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
&PyBaseObject_Type, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
bytes_new, /* tp_new */
PyObject_Del, /* tp_free */
};
- 在类型中,tp_itemsize被设置为一个字节大小
sizeof(char)
,指明了变长对象元素的单位长度。
&bytes_as_number, /* tp_as_number */
&bytes_as_sequence, /* tp_as_sequence */
&bytes_as_mapping, /* tp_as_mapping */
- 此外,tp_as_number、tp_as_sequence、tp_as_mapping全都设置了,表示支持数值操作、序列操作和映射操作。
2. PyUnicodeObject对象
2.1 PyUnicodeObject对象结构
- 基本就是各种字符map的集合。
Include\unicodeobject.h
typedef struct {
PyCompactUnicodeObject _base;
union {
void *any;
Py_UCS1 *latin1;
Py_UCS2 *ucs2;
Py_UCS4 *ucs4;
} data; /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
Include\unicodeobject.h
typedef struct {
PyASCIIObject _base;
Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
* terminating \0. */
char *utf8; /* UTF-8 representation (null-terminated) */
Py_ssize_t wstr_length; /* Number of code points in wstr, possible
* surrogates count as two code points. */
} PyCompactUnicodeObject;
2.2 PyUnicodeObject类型对象
- PyUnicodeObject对应的类型为PyUnicode_Type,与PyBytes_Type类似。
object\unicodeobject.c
PyTypeObject PyUnicode_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"str", /* tp_name */
sizeof(PyUnicodeObject), /* tp_size */
0, /* tp_itemsize */
/* Slots */
(destructor)unicode_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
unicode_repr, /* tp_repr */
&unicode_as_number, /* tp_as_number */
&unicode_as_sequence, /* tp_as_sequence */
&unicode_as_mapping, /* tp_as_mapping */
(hashfunc) unicode_hash, /* tp_hash*/
0, /* tp_call*/
(reprfunc) unicode_str, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
unicode_doc, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
PyUnicode_RichCompare, /* tp_richcompare */
0, /* tp_weaklistoffset */
unicode_iter, /* tp_iter */
0, /* tp_iternext */
unicode_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
&PyBaseObject_Type, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
unicode_new, /* tp_new */
PyObject_Del, /* tp_free */
};
二、创建PyBytesObject对象
- 从C的原生字符串创建PyBytesObject有两条路径:_PyBytes_FromString和PyBytes_FromStringAndSize。
1. _PyBytes_FromString
- 此方法的参数str必须是指向一个固定长度的空终止字符串。
objects/bytesobject.c
PyObject *
PyBytes_FromString(const char *str)
{
size_t size;
PyBytesObject *op;
assert(str != NULL);
size = strlen(str);
if (size > PY_SSIZE_T_MAX - PyBytesObject_SIZE) {
PyErr_SetString(PyExc_OverflowError,
"byte string is too long");
return NULL;
}
if (size == 0 && (op = nullstring) != NULL) {
#ifdef COUNT_ALLOCS
null_strings++;
#endif
Py_INCREF(op);
return (PyObject *)op;
}
if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
#ifdef COUNT_ALLOCS
one_strings++;
#endif
Py_INCREF(op);
return (PyObject *)op;
}
/* Inline PyObject_NewVar */
op = (PyBytesObject *)PyObject_MALLOC(PyBytesObject_SIZE + size);
if (op == NULL)
return PyErr_NoMemory();
(void)PyObject_INIT_VAR(op, &PyBytes_Type, size);
op->ob_shash = -1;
memcpy(op->ob_sval, str, size+1);
/* share short strings */
if (size == 0) {
nullstring = op;
Py_INCREF(op);
} else if (size == 1) {
characters[*str & UCHAR_MAX] = op;
Py_INCREF(op);
}
return (PyObject *) op;
}
- 此方法首先判断传入指针指向的str长度,如果超出PY_SSIZE_T_MAX - PyBytesObject_SIZE长度则报错,PY_SSIZE_T_MAX的值与平台相关,在win32系统下约为2GB,64位系统约为4GB。
size_t size;
PyBytesObject *op;
assert(str != NULL);
size = strlen(str);
if (size > PY_SSIZE_T_MAX - PyBytesObject_SIZE) {
PyErr_SetString(PyExc_OverflowError,
"byte string is too long");
return NULL;
}
- 之后,会检查str是不是空串,如果是第一次在一个空字符串基础上创建PyBytesObject对象,会为这个空字符串建立一个对象,并通过intern机制共享。
if (size == 0 && (op = nullstring) != NULL) {
#ifdef COUNT_ALLOCS
null_strings++;
#endif
Py_INCREF(op);
return (PyObject *)op;
}
- 如果不是第一次,则直接引用。
if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
#ifdef COUNT_ALLOCS
one_strings++;
#endif
Py_INCREF(op);
return (PyObject *)op;
}
- 如果不是创建空字符串对象,则申请内存创建PyBytesObject对象。
op = (PyBytesObject *)PyObject_MALLOC(PyBytesObject_SIZE + size);
if (op == NULL)
return PyErr_NoMemory();
(void)PyObject_INIT_VAR(op, &PyBytes_Type, size);
op->ob_shash = -1;
memcpy(op->ob_sval, str, size+1);
/* share short strings */
if (size == 0) {
nullstring = op;
Py_INCREF(op);
} else if (size == 1) {
characters[*str & UCHAR_MAX] = op;
Py_INCREF(op);
}
return (PyObject *) op;
-
新创建的PyBytesObject布局:
2. PyBytes_FromStringAndSize
- 与PyBytes_FromString类似,但是不要求传入的str指针必须是空终止字符串,因为另一个参数size可以确定字符个数。
objects/bytesobject.c
PyObject *
PyBytes_FromStringAndSize(const char *str, Py_ssize_t size)
{
PyBytesObject *op;
if (size < 0) {
PyErr_SetString(PyExc_SystemError,
"Negative size passed to PyBytes_FromStringAndSize");
return NULL;
}
if (size == 1 && str != NULL &&
(op = characters[*str & UCHAR_MAX]) != NULL)
{
#ifdef COUNT_ALLOCS
one_strings++;
#endif
Py_INCREF(op);
return (PyObject *)op;
}
op = (PyBytesObject *)_PyBytes_FromSize(size, 0);
if (op == NULL)
return NULL;
if (str == NULL)
return (PyObject *) op;
memcpy(op->ob_sval, str, size);
/* share short strings */
if (size == 1) {
characters[*str & UCHAR_MAX] = op;
Py_INCREF(op);
}
return (PyObject *) op;
}
objects/bytesobject.c
static PyObject *
_PyBytes_FromSize(Py_ssize_t size, int use_calloc)
{
PyBytesObject *op;
assert(size >= 0);
if (size == 0 && (op = nullstring) != NULL) {
#ifdef COUNT_ALLOCS
null_strings++;
#endif
Py_INCREF(op);
return (PyObject *)op;
}
if ((size_t)size > (size_t)PY_SSIZE_T_MAX - PyBytesObject_SIZE) {
PyErr_SetString(PyExc_OverflowError,
"byte string is too large");
return NULL;
}
/* Inline PyObject_NewVar */
if (use_calloc)
op = (PyBytesObject *)PyObject_Calloc(1, PyBytesObject_SIZE + size);
else
op = (PyBytesObject *)PyObject_Malloc(PyBytesObject_SIZE + size);
if (op == NULL)
return PyErr_NoMemory();
(void)PyObject_INIT_VAR(op, &PyBytes_Type, size);
op->ob_shash = -1;
if (!use_calloc)
op->ob_sval[size] = '\0';
/* empty byte string singleton */
if (size == 0) {
nullstring = op;
Py_INCREF(op);
}
return (PyObject *) op;
}
三、字符串驻留机制
- 与数值类型一样,字符串都是一样的不可变对象,不需要用不同的对象来区分。
- 字符串驻留(intern)机制将值同样的字符串对象分享共用,避免频繁创建和销毁内存,提升效率。
- 由于与unicode字符相关,intern机制是在PyUnicodeObject对象中实现的:
objects\unicodeobject.c
void
PyUnicode_InternInPlace(PyObject **p)
{
PyObject *s = *p;
PyObject *t;
#ifdef Py_DEBUG
assert(s != NULL);
assert(_PyUnicode_CHECK(s));
#else
if (s == NULL || !PyUnicode_Check(s))
return;
#endif
/* If it's a subclass, we don't really know what putting
it in the interned dict might do. */
if (!PyUnicode_CheckExact(s))
return;
if (PyUnicode_CHECK_INTERNED(s))
return;
if (interned == NULL) {
interned = PyDict_New();
if (interned == NULL) {
PyErr_Clear(); /* Don't leave an exception */
return;
}
}
Py_ALLOW_RECURSION
t = PyDict_SetDefault(interned, s, s);
Py_END_ALLOW_RECURSION
if (t == NULL) {
PyErr_Clear();
return;
}
if (t != s) {
Py_INCREF(t);
Py_SETREF(*p, t);
return;
}
/* The two references in interned are not counted by refcnt.
The deallocator will take care of this */
Py_REFCNT(s) -= 2;
_PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
- 首先检查传入的是否是PyUnicodeObject对象:
#ifdef Py_DEBUG
assert(s != NULL);
assert(_PyUnicode_CHECK(s));
#else
if (s == NULL || !PyUnicode_Check(s))
return;
#endif
- 再检查传入的对象是否已经被intern机制处理过,如果没有处理过则返回:
if (!PyUnicode_CheckExact(s))
return;
if (PyUnicode_CHECK_INTERNED(s))
return;
if (interned == NULL) {
interned = PyDict_New();
if (interned == NULL) {
PyErr_Clear(); /* Don't leave an exception */
return;
}
}
- intern实际上指向一个包含所有unicode字符串的字典。
objects\unicodeobject.c
/* This dictionary holds all interned unicode strings. Note that references
to strings in this dictionary are *not* counted in the string's ob_refcnt.
When the interned string reaches a refcnt of 0 the string deallocation
function will delete the reference from this dictionary.
Another way to look at this is that to say that the actual reference
count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;
- 如果处理过,则将指针指向字符串地址并计数。
Py_ALLOW_RECURSION
t = PyDict_SetDefault(interned, s, s);
Py_END_ALLOW_RECURSION
if (t == NULL) {
PyErr_Clear();
return;
}
if (t != s) {
Py_INCREF(t);
Py_SETREF(*p, t);
return;
}
/* The two references in interned are not counted by refcnt.
The deallocator will take care of this */
Py_REFCNT(s) -= 2;
_PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
- 释放interned string:
objects\unicodeobject.c
void
_Py_ReleaseInternedUnicodeStrings(void)
{
PyObject *keys;
PyObject *s;
Py_ssize_t i, n;
Py_ssize_t immortal_size = 0, mortal_size = 0;
if (interned == NULL || !PyDict_Check(interned))
return;
keys = PyDict_Keys(interned);
if (keys == NULL || !PyList_Check(keys)) {
PyErr_Clear();
return;
}
/* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
detector, interned unicode strings are not forcibly deallocated;
rather, we give them their stolen references back, and then clear
and DECREF the interned dict. */
n = PyList_GET_SIZE(keys);
fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
n);
for (i = 0; i < n; i++) {
s = PyList_GET_ITEM(keys, i);
if (PyUnicode_READY(s) == -1) {
Py_UNREACHABLE();
}
switch (PyUnicode_CHECK_INTERNED(s)) {
case SSTATE_NOT_INTERNED:
/* XXX Shouldn't happen */
break;
case SSTATE_INTERNED_IMMORTAL:
Py_REFCNT(s) += 1;
immortal_size += PyUnicode_GET_LENGTH(s);
break;
case SSTATE_INTERNED_MORTAL:
Py_REFCNT(s) += 2;
mortal_size += PyUnicode_GET_LENGTH(s);
break;
default:
Py_FatalError("Inconsistent interned string state.");
}
_PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
}
fprintf(stderr, "total size of all interned strings: "
"%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
"mortal/immortal\n", mortal_size, immortal_size);
Py_DECREF(keys);
PyDict_Clear(interned);
Py_CLEAR(interned);
}
- PyUnicodeObject根据intern处理状态可分为三类:
/* Interning state. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
四、字符缓冲池
- 字符缓冲池是Python为PyBytesObject的每一个字节的字符对应的对象设计的对象池:
objects/bytesobject.c
static PyBytesObject *characters[UCHAR_MAX + 1];
- 字符缓冲池以静态变量的形式存在,Python初始化完成后,缓冲池中的所有PyBytesObject指针为空。
- 在创建PyBytesObject时,无论调用哪种方法,如果字符串实际是一个字符,则会进行如下操作:
objects/bytesobject.c
PyBytes_FromString(const char *str)
{
... ...
if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
#ifdef COUNT_ALLOCS
one_strings++;
#endif
Py_INCREF(op);
return (PyObject *)op;
}
/* Inline PyObject_NewVar */
op = (PyBytesObject *)PyObject_MALLOC(PyBytesObject_SIZE + size);
if (op == NULL)
return PyErr_NoMemory();
(void)PyObject_INIT_VAR(op, &PyBytes_Type, size);
op->ob_shash = -1;
memcpy(op->ob_sval, str, size+1);
/* share short strings */
if (size == 0) {
nullstring = op;
Py_INCREF(op);
} else if (size == 1) {
characters[*str & UCHAR_MAX] = op;
Py_INCREF(op);
}
return (PyObject *) op;
}
- 如果不是空字符,在创建PyBytesObject后,会将其添加到池子中。
- 如果字符已经在池子中,则直接返回池子中的对象。
五、字符串对象的效率问题
- 由于PyUnicodeObject是不可变对象,当在Python中使用"+"对两个字符串拼接时,实际上会调用PyUnicode_Concat,从而创建一个新的PyUnicodeObject。
- 如果用"+"拼接n个PyUnicodeObject对象,意味着需要进行n-1次内存申请和搬运工作,严重影响效率。
objects\unicodeobject.c
/* Concat to string or Unicode object giving a new Unicode object. */
PyObject *
PyUnicode_Concat(PyObject *left, PyObject *right)
{
PyObject *result;
Py_UCS4 maxchar, maxchar2;
Py_ssize_t left_len, right_len, new_len;
if (ensure_unicode(left) < 0)
return NULL;
if (!PyUnicode_Check(right)) {
PyErr_Format(PyExc_TypeError,
"can only concatenate str (not \"%.200s\") to str",
right->ob_type->tp_name);
return NULL;
}
if (PyUnicode_READY(right) < 0)
return NULL;
/* Shortcuts */
if (left == unicode_empty)
return PyUnicode_FromObject(right);
if (right == unicode_empty)
return PyUnicode_FromObject(left);
left_len = PyUnicode_GET_LENGTH(left);
right_len = PyUnicode_GET_LENGTH(right);
if (left_len > PY_SSIZE_T_MAX - right_len) {
PyErr_SetString(PyExc_OverflowError,
"strings are too large to concat");
return NULL;
}
new_len = left_len + right_len;
maxchar = PyUnicode_MAX_CHAR_VALUE(left);
maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
maxchar = Py_MAX(maxchar, maxchar2);
/* Concat the two Unicode strings */
result = PyUnicode_New(new_len, maxchar);
if (result == NULL)
return NULL;
_PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
_PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
assert(_PyUnicode_CheckConsistency(result, 1));
return result;
}
- Python官方推荐的做法是通过PyUnicodeObject的PyUnicode_Join操作来对存储在list或tuple中的一组对象进行连接操作。
- PyUnicode_Join会首先统计出在list中一共有多少个PyUnicodeObject对象,并统计这些对象维护的字符串一共多长,然后申请内存,并将list中的所有字符串拷贝到新开辟的内存空间中,只需要一次内存空间申请。
objects\unicodeobject.c
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
PyObject *res;
PyObject *fseq;
Py_ssize_t seqlen;
PyObject **items;
fseq = PySequence_Fast(seq, "can only join an iterable");
if (fseq == NULL) {
return NULL;
}
/* NOTE: the following code can't call back into Python code,
* so we are sure that fseq won't be mutated.
*/
items = PySequence_Fast_ITEMS(fseq);
seqlen = PySequence_Fast_GET_SIZE(fseq);
res = _PyUnicode_JoinArray(separator, items, seqlen);
Py_DECREF(fseq);
return res;
}
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
{
PyObject *res = NULL; /* the result */
PyObject *sep = NULL;
Py_ssize_t seplen;
PyObject *item;
Py_ssize_t sz, i, res_offset;
Py_UCS4 maxchar;
Py_UCS4 item_maxchar;
int use_memcpy;
unsigned char *res_data = NULL, *sep_data = NULL;
PyObject *last_obj;
unsigned int kind = 0;
/* If empty sequence, return u"". */
if (seqlen == 0) {
_Py_RETURN_UNICODE_EMPTY();
}
/* If singleton sequence with an exact Unicode, return that. */
last_obj = NULL;
if (seqlen == 1) {
if (PyUnicode_CheckExact(items[0])) {
res = items[0];
Py_INCREF(res);
return res;
}
seplen = 0;
maxchar = 0;
}
else {
/* Set up sep and seplen */
if (separator == NULL) {
/* fall back to a blank space separator */
sep = PyUnicode_FromOrdinal(' ');
if (!sep)
goto onError;
seplen = 1;
maxchar = 32;
}
else {
if (!PyUnicode_Check(separator)) {
PyErr_Format(PyExc_TypeError,
"separator: expected str instance,"
" %.80s found",
Py_TYPE(separator)->tp_name);
goto onError;
}
if (PyUnicode_READY(separator))
goto onError;
sep = separator;
seplen = PyUnicode_GET_LENGTH(separator);
maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
/* inc refcount to keep this code path symmetric with the
above case of a blank separator */
Py_INCREF(sep);
}
last_obj = sep;
}
/* There are at least two things to join, or else we have a subclass
* of str in the sequence.
* Do a pre-pass to figure out the total amount of space we'll
* need (sz), and see whether all argument are strings.
*/
sz = 0;
#ifdef Py_DEBUG
use_memcpy = 0;
#else
use_memcpy = 1;
#endif
for (i = 0; i < seqlen; i++) {
size_t add_sz;
item = items[i];
if (!PyUnicode_Check(item)) {
PyErr_Format(PyExc_TypeError,
"sequence item %zd: expected str instance,"
" %.80s found",
i, Py_TYPE(item)->tp_name);
goto onError;
}
if (PyUnicode_READY(item) == -1)
goto onError;
add_sz = PyUnicode_GET_LENGTH(item);
item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
maxchar = Py_MAX(maxchar, item_maxchar);
if (i != 0) {
add_sz += seplen;
}
if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
PyErr_SetString(PyExc_OverflowError,
"join() result is too long for a Python string");
goto onError;
}
sz += add_sz;
if (use_memcpy && last_obj != NULL) {
if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
use_memcpy = 0;
}
last_obj = item;
}
res = PyUnicode_New(sz, maxchar);
if (res == NULL)
goto onError;
/* Catenate everything. */
#ifdef Py_DEBUG
use_memcpy = 0;
#else
if (use_memcpy) {
res_data = PyUnicode_1BYTE_DATA(res);
kind = PyUnicode_KIND(res);
if (seplen != 0)
sep_data = PyUnicode_1BYTE_DATA(sep);
}
#endif
if (use_memcpy) {
for (i = 0; i < seqlen; ++i) {
Py_ssize_t itemlen;
item = items[i];
/* Copy item, and maybe the separator. */
if (i && seplen != 0) {
memcpy(res_data,
sep_data,
kind * seplen);
res_data += kind * seplen;
}
itemlen = PyUnicode_GET_LENGTH(item);
if (itemlen != 0) {
memcpy(res_data,
PyUnicode_DATA(item),
kind * itemlen);
res_data += kind * itemlen;
}
}
assert(res_data == PyUnicode_1BYTE_DATA(res)
+ kind * PyUnicode_GET_LENGTH(res));
}
else {
for (i = 0, res_offset = 0; i < seqlen; ++i) {
Py_ssize_t itemlen;
item = items[i];
/* Copy item, and maybe the separator. */
if (i && seplen != 0) {
_PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
res_offset += seplen;
}
itemlen = PyUnicode_GET_LENGTH(item);
if (itemlen != 0) {
_PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
res_offset += itemlen;
}
}
assert(res_offset == PyUnicode_GET_LENGTH(res));
}
Py_XDECREF(sep);
assert(_PyUnicode_CheckConsistency(res, 1));
return res;
onError:
Py_XDECREF(sep);
Py_XDECREF(res);
return NULL;
}
网友评论