美文网首页python学习生信相关python爬虫
python 数据分析之Numpy(高级篇)

python 数据分析之Numpy(高级篇)

作者: Thinkando | 来源:发表于2017-10-20 17:06 被阅读117次

    1. 利用数组进行数据处理

    # -*- coding: utf-8 -*-
    import matplotlib.pyplot as plt
    import numpy as np
    import pylab
    # 1000 evenly spaced samples covering [-5, 5) with a step of 0.01.
    points = np.arange(-5, 5, 0.01)
    # The same axis is used twice, so both grids are square and xs is the
    # transpose of ys.
    xs, ys = np.meshgrid(points, points)
    print(xs)
    print(ys)
    # Euclidean distance of every grid point from the origin.
    z = np.sqrt(xs ** 2 + ys ** 2)
    print(z)
    # Render the distance field as a grayscale image with a colour scale.
    plt.imshow(z, cmap=plt.cm.gray)
    plt.colorbar()
    # Raw string: "\s" is not a valid escape sequence in a normal literal and
    # triggers a warning on current Python versions.
    plt.title(r"Image plot of $\sqrt{x^2 + y^2}$ for a grid of values")
    pylab.show()
    
    image.png

    1.1 将条件逻辑表述为数组运算

    import numpy as np
    import numpy.random as np_random
    print ('更多where的例子')
    arr = np_random.randn(4, 4)
    print (arr)
    
    [[-0.24957277 -1.00358211 -0.73089612  0.2547624 ]
    [ 0.37150224 -0.13316825 -0.60619815 -1.92511077]
    [-0.42200075 -0.48937521 -1.25945836 -0.73708106]
    [-0.52699294  1.2829272  0.5382728  -0.33665592]]
    
    print (np.where(arr > 0, 2, -2))
    
    [[-2 -2 -2  2]
    [ 2 -2 -2 -2]
    [-2 -2 -2 -2]
    [-2  2  2 -2]]
    
    print (np.where(arr > 0, 2, arr))
    
    [[-0.24957277 -1.00358211 -0.73089612  2.        ]
    [ 2.        -0.13316825 -0.60619815 -1.92511077]
    [-0.42200075 -0.48937521 -1.25945836 -0.73708106]
    [-0.52699294  2.          2.         -0.33665592]]
    
    print('where嵌套')
    cond_1 = np.array([True, False, True, True, False])
    cond_2 = np.array([False, True, False, True, False])
    # Plain-Python baseline: walk both masks in lockstep and encode each pair
    # as 0 (both set), 1 (only cond_1), 2 (only cond_2) or 3 (neither).
    result = []
    for first, second in zip(cond_1, cond_2):
        if first and second:
            code = 0
        elif first:
            code = 1
        elif second:
            code = 2
        else:
            code = 3
        result.append(code)
    print(result)
    
    [1, 2, 1, 0, 3]
    
    # np版本代码
    result = np.where(cond_1 & cond_2, 0, \
              np.where(cond_1, 1, np.where(cond_2, 2, 3)))
    print (result)
    
    [1 2 1 0 3]
    
    

    1.2 数学和统计方法

    image.png
    import numpy as np
    import numpy.random as np_random
    print ('求和,求平均')
    arr = np.random.randn(5, 4)
    print (arr)
    
    [[ 0.91340516 -0.28913556 -0.70409556  0.79581511]
    [ 0.03067817 -1.48720141 -0.12093709 -0.46505081]
    [ 1.19378156 -1.6550555  1.66422768 -0.21638596]
    [ 0.05186091 -0.26038421  1.10220866  0.15916608]
    [ 1.27671635 -1.88666043  1.97051236  0.2858806 ]]
    
    print (arr.mean())
    
    0.117967306233
    
    print (arr.sum())
    
    2.35934612466
    
    print (arr.mean(axis = 1))  # 对每一行的元素求平均
    
    [ 0.17899729 -0.51062779  0.24664195  0.26321286  0.41161222]
    print (arr.sum(0))  # 对每一列元素求和,axis可以省略。
    
    [ 3.46644215 -5.57843711  3.91191605  0.55942504]
    
    # cumsum keeps a running sum along the chosen axis:
    # - axis 0 (down the columns): a[i][j] += a[i - 1][j]
    # - axis 1 (along the rows):   a[i][j] += a[i][j - 1]
    # cumprod keeps a running product along the chosen axis:
    # - axis 0 (down the columns): a[i][j] *= a[i - 1][j]
    # - axis 1 (along the rows):   a[i][j] *= a[i][j - 1]
    print ('cumsum和cumprod函数演示')
    arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
    print (arr.cumsum(0))  # running sum down each column
    
    [[ 0  1  2]
    [ 3  5  7]
    [ 9 12 15]]
    
    print (arr.cumprod(1)) # 累乘
    
    [[  0  0  0]
    [  3  12  60]
    [  6  42 336]]
    

    1.3 布尔型数组的方法

    import numpy as np
    import numpy.random as np_random
    print ('对正数求和')
    arr = np_random.randn(100)
    
    print ((arr > 0).sum())
    
    47
    
    print ('对数组逻辑操作')
    bools = np.array([False, False, True, False])
    print (bools.any()) # 有一个为True则返回True
    
    True
    
    print (bools.all()) # 有一个为False则返回False
    
    False
    

    1.4 排序

    import numpy.random as np_random
    print ('一维数组排序')
    arr = np_random.randn(5)
    arr.sort()
    print (arr)
    
    [-0.95478899 -0.84025039 -0.81146653 -0.59203327 -0.08063988]
    
    print ('二维数组排序')
    arr = np_random.randn(3, 2)
    print (arr)
    
    [[ 0.22553826 -0.55803348]
    [-1.26086564  0.52919879]
    [-0.44668631 -0.53374782]]
    
    arr.sort(1) # 对每一行元素做排序
    print (arr)
    
    [[-0.55803348  0.22553826]
    [-1.26086564  0.52919879]
    [-0.53374782 -0.44668631]]
    
    

    1.5 去重以及其它集合运算

    image.png
    import numpy as np
    import numpy.random as np_random
    
    print ('用unique函数去重')
    names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
    print (sorted(set(names)))  # 传统Python做法
    
    ['Bob', 'Joe', 'Will']
    
    print (np.unique(names))
    
    ['Bob' 'Joe' 'Will']
    
    ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])
    print (np.unique(ints))
    
    [1 2 3 4]
    
    print ('查找数组元素是否在另一数组')
    values = np.array([6, 0, 0, 3, 2, 5, 6])
    # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0; for 1-D
    # input both return the same boolean mask.
    print (np.isin(values, [2, 3, 6]))
    
    [ True False False  True  True False  True]
    
    

    1.6 数组文件的输入输出

    import numpy as np
    print ('读取csv文件做为数组')
    arr = np.loadtxt('array_ex.txt', delimiter = ',')
    print (arr)
    
    print ('数组文件读写')
    arr = np.arange(10)
    # np.save appends the ".npy" suffix when the given name has none, which is
    # why the file is read back as 'some_array.npy'.
    np.save('some_array', arr)
    print (np.load('some_array.npy'))
    
    print ('多个数组压缩存储')
    # np.savez bundles several arrays into one archive without compressing
    # them; np.savez_compressed is the compressing variant.
    np.savez('array_archive.npz', a = arr, b = arr)
    # Loading an .npz returns a lazily-read archive that holds the file open;
    # the context manager closes it as soon as the data has been printed.
    with np.load('array_archive.npz') as arch:
        print (arch['b'])
    
    image.png

    2 线性代数

    • 常用的numpy.linalg函数 I


      image.png
      image.png
    # -*- coding: utf-8 -*-
    import numpy as np
    import numpy.random as np_random
    from numpy.linalg import inv, qr
    
    print ('矩阵乘法')
    x = np.array([[1., 2., 3.], [4., 5., 6.]])
    y = np.array([[6., 23.], [-1, 7], [8, 9]])
    print (x.dot(y))
    
    [[  28.  64.]
    [  67.  181.]]
    
    print (np.dot(x, np.ones(3)))
    
    [  6.  15.]
    
    
    x = np_random.randn(5, 5)
    print ('矩阵求逆')
    mat = x.T.dot(x)
    print (inv(mat))  # 矩阵求逆
    
    [[ 1.56337148  2.0598732  0.71214902]
    [ 2.0598732  3.70889345  1.32647939]
    [ 0.71214902  1.32647939  0.67314987]]
    
    print (mat.dot(inv(mat))) # 与逆矩阵相乘,得到单位矩阵。
    
    [[  1.00000000e+00  -1.11022302e-16  -2.77555756e-17]
    [  2.22044605e-16  1.00000000e+00  0.00000000e+00]
    [ -4.44089210e-16  0.00000000e+00  1.00000000e+00]]
    
    print ('矩阵消元')
    print (mat)
    q, r = qr(mat)
    print (q)
    print (r)
    

    3. 随机数生成

    • numpy.random 函数
    image.png
    image.png
    print ('正态分布随机数')
    # With no loc/scale given, np.random.normal samples the standard normal
    # distribution (mean 0, standard deviation 1).
    samples = np.random.normal(size=(4, 4))
    print (samples)
    
    # The previous label suggested values between 0 and 1; the 0 and 1 are the
    # mean and standard deviation, and samples can fall outside that range.
    print ('批量生成标准正态分布(均值0,标准差1)的随机数')
    N = 10
    print (np.random.normal(size = N))
    
    image.png

    4. 高级应用 数组重塑

    import numpy as np
    
    print ("将一维数组转换为二维数组")
    arr = np.arange(8)
    print (arr.reshape((4, 2)))
    
    [[0 1]
    [2 3]
    [4 5]
    [6 7]]
    
    print (arr.reshape((4, 2)).reshape((2, 4))) # 支持链式操作
    
    [[0 1 2 3]
    [4 5 6 7]]
    
    print ("维度大小自动推导")
    arr = np.arange(15)
    print (arr.reshape((5, -1)))
    
    [[ 0  1  2]
    [ 3  4  5]
    [ 6  7  8]
    [ 9 10 11]
    [12 13 14]]
    
    print ("获取维度信息并应用")
    other_arr = np.ones((3, 5))
    print (other_arr.shape)
    print (arr.reshape(other_arr.shape))
    (3, 5)
    [[ 0  1  2  3  4]
    [ 5  6  7  8  9]
    [10 11 12 13 14]]
    
    print ("高维数组拉平")
    arr = np.arange(15).reshape((5, 3))
    print (arr.ravel())
    
    [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
    

    4.1 数组的合并和拆分

    • 数组连接函数
    image.png
    # -*- coding: utf-8 -*-
    import numpy as np
    import numpy.random as np_random
    
    print ('连接两个二维数组')
    arr1 = np.array([[1, 2, 3], [4, 5, 6]])
    arr2 = np.array([[7, 8, 9], [10, 11, 12]])
    
    print (np.concatenate([arr1, arr2], axis = 0))  # 按行连接
    
    [[ 1  2  3]
    [ 4  5  6]
    [ 7  8  9]
    [10 11 12]]
    
    print (np.concatenate([arr1, arr2], axis = 1))  # 按列连接
    
    [[ 1  2  3  7  8  9]
    [ 4  5  6 10 11 12]]
    
    # 所谓堆叠,参考叠盘子。。。连接的另一种表述
    print ('垂直stack与水平stack')
    print (np.vstack((arr1, arr2))) # 垂直堆叠
    
    [[ 1  2  3]
    [ 4  5  6]
    [ 7  8  9]
    [10 11 12]]
    
    print (np.hstack((arr1, arr2))) # 水平堆叠
    
    [[ 1  2  3  7  8  9]
    [ 4  5  6 10 11 12]]
    
    print ('拆分数组')
    arr = np_random.randn(5, 5)
    print (arr)
    
    [[ 0.05528194 -1.40691043  1.25390116  0.01343898 -0.56362107]
    [-0.67196856 -2.01689483  0.530919  -0.01049739 -0.08576697]
    [-0.00469774  0.60896671  0.49864293 -1.16233372  1.09303811]
    [-1.6787031  0.1201058  0.83891825 -0.97806989  1.01268929]
    [ 0.23928598  0.50982453 -0.77814664  0.71755991 -0.66536248]]
    
    print ('水平拆分')
    first, second, third = np.split(arr, [1, 3], axis = 0)
    print ('first')
    print (first)
    
    [[ 0.05528194 -1.40691043  1.25390116  0.01343898 -0.56362107]]
    
    print ('second')
    print (second)
    
    [[-0.67196856 -2.01689483  0.530919  -0.01049739 -0.08576697]
    [-0.00469774  0.60896671  0.49864293 -1.16233372  1.09303811]]
    
    print ('third')
    print (third)
    
    [[-1.6787031  0.1201058  0.83891825 -0.97806989  1.01268929]
    [ 0.23928598  0.50982453 -0.77814664  0.71755991 -0.66536248]]
    
    print ('垂直拆分')
    first, second, third = np.split(arr, [1, 3], axis = 1)
    print ('first')
    print (first)
    
    [[ 0.05528194]
    [-0.67196856]
    [-0.00469774]
    [-1.6787031 ]
    [ 0.23928598]]
    
    print ('second')
    print (second)
    
    [[-1.40691043  1.25390116]
    [-2.01689483  0.530919  ]
    [ 0.60896671  0.49864293]
    [ 0.1201058  0.83891825]
    [ 0.50982453 -0.77814664]]
    
    print ('third')
    print (third)
    
    [[ 0.01343898 -0.56362107]
    [-0.01049739 -0.08576697]
    [-1.16233372  1.09303811]
    [-0.97806989  1.01268929]
    [ 0.71755991 -0.66536248]]
    
    # 堆叠辅助类
    arr = np.arange(6)
    arr1 = arr.reshape((3, 2))
    arr2 = np_random.randn(3, 2)
    print ('r_用于按行堆叠')
    print (np.r_[arr1, arr2])
    
    [[ 0.          1.        ]
    [ 2.          3.        ]
    [ 4.          5.        ]
    [-0.1285436  1.78124128]
    [-1.18281363 -0.5240686 ]
    [ 0.14317738  1.4884177 ]]
    
    print ('c_用于按列堆叠')
    print (np.c_[np.r_[arr1, arr2], arr])
    
    [[ 0.          1.          0.        ]
    [ 2.          3.          1.        ]
    [ 4.          5.          2.        ]
    [-0.1285436  1.78124128  3.        ]
    [-1.18281363 -0.5240686  4.        ]
    [ 0.14317738  1.4884177  5.        ]]
    
    print ('切片直接转为数组')
    print (np.c_[1:6, -10:-5])
    
    [[  1 -10]
    [  2  -9]
    [  3  -8]
    [  4  -7]
    [  5  -6]]
    
    

    4.2 元素的重复操作

    import numpy as np
    import numpy.random as np_random
    
    print ('Repeat: 按元素')
    arr = np.arange(3)
    print (arr.repeat(3))
    
    [0 0 0 1 1 1 2 2 2]
    
    print (arr.repeat([2, 3, 4])) # 3个元素,分别复制2, 3, 4次。长度要匹配!
    
    [0 0 1 1 1 2 2 2 2]
    
    print ('Repeat,指定轴')
    arr = np_random.randn(2, 2)
    print (arr)
    
    [[-0.24003754  0.87388196]
    [ 0.63967445  0.53702917]]
    
    print (arr.repeat(2, axis = 0)) # 按行repeat
    
    [[-0.24003754  0.87388196]
    [-0.24003754  0.87388196]
    [ 0.63967445  0.53702917]
    [ 0.63967445  0.53702917]]
    
    print (arr.repeat(2, axis = 1)) # 按列repeat
    
    [[-0.24003754 -0.24003754  0.87388196  0.87388196]
    [ 0.63967445  0.63967445  0.53702917  0.53702917]]
    
    print (arr.repeat(2, axis = 0)) # 按行repeat
    
    [[-0.24003754  0.87388196]
    [-0.24003754  0.87388196]
    [ 0.63967445  0.53702917]
    [ 0.63967445  0.53702917]]
    
    print ('Tile: 参考贴瓷砖')
    print (np.tile(arr, 2))
    
    [[-0.24003754  0.87388196 -0.24003754  0.87388196]
    [ 0.63967445  0.53702917  0.63967445  0.53702917]]
    
    print (np.tile(arr, (2, 3)))  # 指定每个轴的tile次数
    
    [[-0.24003754  0.87388196 -0.24003754  0.87388196 -0.24003754  0.87388196]
    [ 0.63967445  0.53702917  0.63967445  0.53702917  0.63967445  0.53702917]
    [-0.24003754  0.87388196 -0.24003754  0.87388196 -0.24003754  0.87388196]
    [ 0.63967445  0.53702917  0.63967445  0.53702917  0.63967445  0.53702917]]
    

    4.3 花式索引的等价函数

    # -*- coding: utf-8 -*-
    import numpy as np
    import numpy.random as np_random
    
    print ('Fancy Indexing例子代码')
    arr = np.arange(10) * 100
    inds = [7, 1, 2, 6]
    print (arr[inds])
    
    [700 100 200 600]
    
    print ('使用take')
    print (arr.take(inds))
    
    [700 100 200 600]
    
    print ('使用put更新内容')
    arr.put(inds, 50)
    print (arr)
    
    [  0  50  50 300 400 500  50  50 800 900]
    
    arr.put(inds, [70, 10, 20, 60])
    print (arr)
    
    [  0  10  20 300 400 500  60  70 800 900]
    
    print ('take,指定轴')
    arr = np_random.randn(2, 4)
    inds = [2, 0, 2, 1]
    print (arr)
    
    [[-0.93008115  0.61977807 -0.25993519 -1.09063511]
    [ 0.73107853 -1.25725762  0.75042044  0.89568034]]
    
    print (arr.take(inds, axis = 1))  # 按列take
    
    [[-0.25993519 -0.93008115 -0.25993519  0.61977807]
    [ 0.75042044  0.73107853  0.75042044 -1.25725762]]
    
    

    相关文章

      网友评论

        本文标题:python 数据分析之Numpy(高级篇)

        本文链接:https://www.haomeiwen.com/subject/svmvuxtx.html