美文网首页
Python 字符串对象

Python 字符串对象

作者: 阿布吃de饭 | 来源:发表于2017-06-08 14:48 被阅读59次

    字符串对象 PyStringObject

    PyStringObject是对字符串对象的实现,其具有可变长度的内存空间。即,不同字符串的PyStringObject对象其内存长度可能不同,例如“ni”,“hao”这两个字符串就具有不同的内存长度。同时,PyStringObject又是一个值不可变对象,即一旦创建后,其值一直将保持创建时的值。

    定义

    [stringobject.h]
    /*
     * Layout of a string object.
     *
     * The object is variable-sized: the creation functions allocate
     * PyStringObject_SIZE + length bytes, so the character data extends
     * past the declared one-element ob_sval array.
     */
    typedef struct {
        PyObject_VAR_HEAD // variable-size object header; its ob_size holds the string length
        long ob_shash; // cached hash value; -1 means "not computed yet"
        int ob_sstate; // interning state (set to SSTATE_NOT_INTERNED on creation)
        char ob_sval[1]; // start of the inline character buffer (an array, not a pointer); ob_sval[ob_size] == '\0'
    } PyStringObject;
    

    实际上,其实际的字符长度是由ob_size保存的,满足ob_sval[ob_size] == '\0'

    计算字符串对象的hash值如下

    [stringobject.c]
    /*
     * Return the hash of string object `a`, computing it on first use and
     * caching it in a->ob_shash (-1 marks "not computed yet").
     *
     * The empty string always hashes to 0.  For any other string the bytes
     * are folded with a multiply-and-xor loop seeded with
     * _Py_HashSecret.prefix, then mixed with the length and
     * _Py_HashSecret.suffix.  A result of -1 is remapped to -2 so that the
     * cached value can never collide with the "not computed" marker.
     *
     * The accumulator is unsigned: the multiplication is expected to wrap,
     * and wrapping a signed long is undefined behaviour in ISO C.  Unsigned
     * arithmetic wraps modulo 2^N and yields the same bit pattern on
     * two's-complement targets.
     */
    static long
    string_hash(PyStringObject *a)
    {
        Py_ssize_t remaining;
        const unsigned char *p;
        unsigned long x;

    #ifdef Py_DEBUG
        assert(_Py_HashSecret_Initialized);
    #endif
        /* Fast path: hash already cached. */
        if (a->ob_shash != -1)
            return a->ob_shash;
        remaining = Py_SIZE(a);
        /*
          We make the hash of the empty string be 0, rather than using
          (prefix ^ suffix), since this slightly obfuscates the hash secret
        */
        if (remaining == 0) {
            a->ob_shash = 0;
            return 0;
        }
        p = (const unsigned char *) a->ob_sval;
        x = (unsigned long) _Py_HashSecret.prefix;
        x ^= (unsigned long) *p << 7;
        while (--remaining >= 0)
            x = (1000003UL * x) ^ *p++;
        x ^= (unsigned long) Py_SIZE(a);
        x ^= (unsigned long) _Py_HashSecret.suffix;
        /* -1 is reserved as the "no cached hash" marker. */
        if (x == (unsigned long) -1)
            x = (unsigned long) -2;
        a->ob_shash = (long) x;
        return a->ob_shash;
    }
    

    PyStringObject 对应的类型对象为 PyString_Type

    创建PyStringObject对象

    python提供了两种方法从C中原生的字符串创建PyStringObject对象

    • 利用 PyString_FromString
    [stringobject.c]
    /*
     * Create a string object from the NUL-terminated C string `str`.
     *
     * Returns a new reference, or NULL with an exception set when the
     * string is too long or memory cannot be allocated.  The empty string
     * and one-character strings are shared: the first one created is
     * interned and remembered in `nullstring` / `characters[]`, and later
     * requests return that same object.
     */
    PyObject *
    PyString_FromString(const char *str)
    {
        PyStringObject *result;
        PyObject *handle;
        size_t length;

        assert(str != NULL);
        length = strlen(str);

        /* Reject lengths whose allocation size would exceed PY_SSIZE_T_MAX. */
        if (length > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
            PyErr_SetString(PyExc_OverflowError,
                "string is too long for a Python string");
            return NULL;
        }

        /* Fast paths: hand out an already-cached shared object. */
        switch (length) {
        case 0:
            /* The shared empty string, if it has been created before. */
            result = nullstring;
            if (result != NULL) {
    #ifdef COUNT_ALLOCS
                null_strings++;
    #endif
                Py_INCREF(result);
                return (PyObject *)result;
            }
            break;
        case 1:
            /* The shared one-character string for this byte value, if any. */
            result = characters[*str & UCHAR_MAX];
            if (result != NULL) {
    #ifdef COUNT_ALLOCS
                one_strings++;
    #endif
                Py_INCREF(result);
                return (PyObject *)result;
            }
            break;
        default:
            break;
        }

        /* Inline PyObject_NewVar */
        /* Header plus `length` bytes of character data in one allocation. */
        result = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + length);
        if (result == NULL)
            return PyErr_NoMemory();
        (void)PyObject_INIT_VAR(result, &PyString_Type, length);
        result->ob_shash = -1;
        result->ob_sstate = SSTATE_NOT_INTERNED;
        /* length + 1 so that the terminating NUL is copied as well. */
        Py_MEMCPY(result->ob_sval, str, length + 1);

        /* Longer strings are returned as-is; only short ones are shared. */
        if (length > 1)
            return (PyObject *)result;

        /* First empty / one-character string: intern it and remember it
           in the matching cache so later calls can reuse it. */
        handle = (PyObject *)result;
        PyString_InternInPlace(&handle);
        result = (PyStringObject *)handle;
        if (length == 0)
            nullstring = result;
        else
            characters[*str & UCHAR_MAX] = result;
        /* Extra reference taken after the object is stored in the cache. */
        Py_INCREF(result);
        return (PyObject *)result;
    }
    
    创建新的PyStringObject的内存分布
    • 利用 PyString_FromStringAndSize
    [stringobject.c]
    /*
     * Create a string object of `size` bytes.
     *
     * When `str` is non-NULL its first `size` bytes are copied into the new
     * object; when it is NULL the character data is left uninitialised.  In
     * both cases a terminating NUL is stored at index `size`.  `str` does
     * not need to be NUL-terminated.
     *
     * Returns a new reference, or NULL with an exception set when `size` is
     * negative, too large, or memory cannot be allocated.  The empty string
     * and one-character strings are shared through `nullstring` and
     * `characters[]`.
     */
    PyObject *
    PyString_FromStringAndSize(const char *str, Py_ssize_t size)
    {
        PyStringObject *result;
        PyObject *handle;

        if (size < 0) {
            PyErr_SetString(PyExc_SystemError,
                "Negative size passed to PyString_FromStringAndSize");
            return NULL;
        }

        if (size == 0) {
            /* Fast path: the shared empty string, once it exists. */
            result = nullstring;
            if (result != NULL) {
    #ifdef COUNT_ALLOCS
                null_strings++;
    #endif
                Py_INCREF(result);
                return (PyObject *)result;
            }
        }
        else if (size == 1 && str != NULL) {
            /* Fast path: the shared one-character string for this byte. */
            result = characters[*str & UCHAR_MAX];
            if (result != NULL) {
    #ifdef COUNT_ALLOCS
                one_strings++;
    #endif
                Py_INCREF(result);
                return (PyObject *)result;
            }
        }

        /* Reject sizes whose allocation size would exceed PY_SSIZE_T_MAX. */
        if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
            PyErr_SetString(PyExc_OverflowError, "string is too large");
            return NULL;
        }

        /* Inline PyObject_NewVar */
        result = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
        if (result == NULL)
            return PyErr_NoMemory();
        (void)PyObject_INIT_VAR(result, &PyString_Type, size);
        result->ob_shash = -1;
        result->ob_sstate = SSTATE_NOT_INTERNED;
        if (str != NULL)
            Py_MEMCPY(result->ob_sval, str, size);
        /* The source is not required to be NUL-terminated, so terminate here. */
        result->ob_sval[size] = '\0';

        /* share short strings */
        if (size == 0 || (size == 1 && str != NULL)) {
            handle = (PyObject *)result;
            PyString_InternInPlace(&handle);
            result = (PyStringObject *)handle;
            if (size == 0)
                nullstring = result;
            else
                characters[*str & UCHAR_MAX] = result;
            /* Extra reference taken after the object is stored in the cache. */
            Py_INCREF(result);
        }
        return (PyObject *)result;
    }
    

    PyString_FromString 传入的参数必须是以 NUL('\0')结尾的字符数组的指针,而 PyString_FromStringAndSize 没有这样的要求,因为通过传入的 size 参数就可以确定需要拷贝的字符个数。

    字符串对象的intern机制

    在 PyString_FromString 和 PyString_FromStringAndSize 中,当 size == 0 或 size == 1 时,都调用了函数 PyString_InternInPlace。其作用是把常用的字符串对象缓存起来,以备下次直接使用。

    [stringobject.c]
    /*
     * Intern the string referenced by *p, in place.
     *
     * p: address of a reference to a string object.  A NULL or non-string
     *    *p is a fatal error.
     *
     * If an equal string is already stored in the `interned` dictionary,
     * *p is replaced by that stored object.  Otherwise the string itself is
     * inserted into the dictionary and marked SSTATE_INTERNED_MORTAL.
     * String subclasses and already-interned strings are left untouched.
     * Failures while creating or updating the dictionary are swallowed
     * (the pending exception is cleared) and *p is left unchanged.
     */
    void
    PyString_InternInPlace(PyObject **p)
    {
        register PyStringObject *s = (PyStringObject *)(*p);
        PyObject *t;
        if (s == NULL || !PyString_Check(s))
            Py_FatalError("PyString_InternInPlace: strings only please!");
        /* If it's a string subclass, we don't really know what putting
           it in the interned dict might do. */
        // Only exact string objects that are not yet interned go any further.
        if (!PyString_CheckExact(s))
            return;
        if (PyString_CHECK_INTERNED(s))
            return;
        // Lazily create the dictionary that backs the intern mechanism.
        if (interned == NULL) {
            interned = PyDict_New();
            if (interned == NULL) {
                PyErr_Clear(); /* Don't leave an exception */
                return;
            }
        }
        // Look for an equal string that has already been interned.
        t = PyDict_GetItem(interned, (PyObject *)s);
        if (t) {
            // Found: take a reference to the stored object and make *p refer to it.
            // NOTE(review): Py_SETREF is defined elsewhere; presumably it also
            // releases the object *p referred to before -- confirm.
            Py_INCREF(t);
            Py_SETREF(*p, t);
            return;
        }
        // Not found: register s in the dictionary as both key and value.
        if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
            PyErr_Clear();
            return;
        }
        /* The two references in interned are not counted by refcnt.
           The string deallocator will take care of this */
        // Remove the two references (key and value) the dictionary just took.
        Py_REFCNT(s) -= 2;
        // Record that s is now interned.
        PyString_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
    }
    

    字符串的连接问题

    虽然,python的PyStringObject提供了+来实现字符串的连接,但是,其实现是通过创建新的PyStringObject来容纳这个和,也就是,如果要实现N个对象的连接,利用+将分配N-1次内存。
    官方推荐利用 PyStringObject 的 join 操作来连接 list 或 tuple 中的字符串,这样只需要分配一次内存。
    例如

    # Two strings to concatenate.
    a = 'abc'
    b = 'def'
    # Join the tuple's items with an empty separator; c is 'abcdef'.
    c = ''.join((a,b))
    

    参考

    《Python 源码剖析》

    相关文章

      网友评论

          本文标题:Python 字符串对象

          本文链接:https://www.haomeiwen.com/subject/tvxtqxtx.html