Numpy&Pandas视频笔记

作者: Lion_Kiss_Deer | 来源:发表于2018-03-05 23:06 被阅读94次
    数据处理入门

    1 简介

    NumPy 的底层由 C 语言实现,运算速度快,在 TensorFlow 等应用中使用更方便。pandas 构建在 NumPy 之上,提供了带标签的 Series、DataFrame 等更高层的数据结构。

    2 安装

    3 基本属性

    import numpy as np

    # Build a 2x3 integer matrix. The two rows must be separated by a comma:
    # without it Python evaluates `[1,2,3][2,5,8]` (a list indexed by a tuple)
    # and raises TypeError before NumPy is ever called.
    array = np.array([[1, 2, 3],
                      [2, 5, 8]], dtype=int)
    print(array.ndim)   # number of dimensions -> 2
    print(array.shape)  # (rows, columns) -> (2, 3)
    print(array.size)   # total number of elements -> 6
    

    4 创建数组array

    import numpy as np

    # 3x4 matrix filled with zeros.
    zero_array = np.zeros(shape=(3, 4))
    # 2x4 integer matrix filled with ones.
    one_array = np.ones(shape=(2, 4), dtype=int)
    # 3x2 array allocated WITHOUT initialisation: its contents are arbitrary
    # (often tiny values close to 0). The result is not stored here.
    np.empty(shape=(3, 2))
    # The integers 0..11 (the stop value 12 is excluded) laid out as 2 rows of 6.
    range_array = np.arange(0, 12).reshape(2, 6)
    # 5 evenly spaced numbers over the closed interval [0, 10].
    line_arr = np.linspace(start=0, stop=10, num=5)
    

    5 基础运算

    import numpy as np
    # Two 2x2 operands: a is given literally, b holds 0..3.
    a = np.array([[10,20],
                [1,0,]])
    b = np.arange(4).reshape(2,2)
    c = a*b  # element-wise product
    c_dot = np.dot(a,b)  # matrix product
    print(c)
    print(c_dot)

    # Reductions on a 2x4 array of random floats drawn from [0, 1).
    a = np.random.random((2,4))
    print(a)
    print(np.sum(a))  # sum of all elements
    print(np.max(a,axis=1))  # maximum of each row
    print(np.min(a,axis=0))  # minimum of each column
    

    6 基础运算2

    # 14, 13, ..., 3 arranged as a 3x4 matrix.
    a = np.arange(14, 2, -1).reshape(3, 4)
    print(a)
    # Limit every value to the range [5, 9]: smaller values become 5, larger ones 9.
    print(np.clip(a, 5, 9))
    # axis=0 averages down each column, axis=1 averages across each row.
    for axis in (0, 1):
        print(np.mean(a, axis=axis))
    

    7 numpy索引

    # 1-D array holding 3..14.
    a = np.arange(3,15)
    print('a',a)
    print(a[3])  # element at index 3 -> 6
    # The same values viewed as a 3x4 matrix.
    b = a.reshape((3,4))
    print(b)
    print(b[2][1])  # row 2, column 1 (indices start at 0) -> 12
    print(b[:,2])  # every element of column 2
    print(b[0,:])  # every element of row 0
    print(b[0:2,0])  # column 0 of rows 0 and 1 (the slice end 2 is excluded)

    # Iterating with for loops
    a = np.arange(3,15).reshape((3,4))
    print(a)

    print('row:')
    # Iterating a 2-D array yields one row at a time.
    for row in a:
        print(row)

    print('column:')
    # Iterate the transpose to visit the columns.
    for column in a.T:
        print(column)

    print('flat:')
    # flatten() returns the elements as a 1-D array; .flat iterates them one by one.
    print(a.flatten())
    for item in a.flat:
        print(item)
    

    8 numpy array合并

    a = np.array((1, 1, 1))
    b = np.array((2, 2, 2))

    # Vertical stack: a and b become the two rows of a 2x3 matrix.
    c = np.vstack([a, b])
    # Horizontal stack: b is appended after a, giving 6 elements in 1-D.
    d = np.hstack([a, b])
    print(c.shape, d.shape)
    print(c)
    print(d)

    # np.newaxis inserts a length-1 dimension at the position where it appears.
    print(a[np.newaxis, :])  # shape (1, 3): a as a single row
    print(a[:, np.newaxis])  # shape (3, 1): a as a single column

    # Join any number of arrays end to end along the first axis.
    d = np.concatenate([a, b, b, a], axis=0)
    print(d)
    

    9 array分割

    # 0..11 as a 3x4 matrix.
    a = np.arange(12).reshape(3, 4)
    print(a)

    # Split along the columns (axis=1) into 2 equal pieces of 2 columns each.
    print(np.split(a, indices_or_sections=2, axis=1))
    # Split along the rows (axis=0) into 3 equal pieces of 1 row each.
    print(np.split(a, indices_or_sections=3, axis=0))
    # array_split tolerates uneven division: 4 columns into 3 pieces (2, 1, 1).
    print(np.array_split(a, indices_or_sections=3, axis=1))
    

    10 numpy copy & deep copy

    a = np.arange(4)
    print(a)
    # Plain assignment does not copy: b, c and d are just more names for the
    # same array object as a.
    b = a
    c = a
    d = a
    a[0] = 11
    print(a)
    # a, b, c and d are linked: a write through any name is seen through all.
    print(b is a)  # True
    d[1:3] = [22,33]
    print(c is a)  # True

    # Not linked: copy() creates a new array with its own data.
    b = a.copy()  # independent copy
    a[3] = 15  # does not affect b
    print(a is b)  # False
    

    11 pandas基本介绍

    import pandas as pd
    import numpy as np

    # A Series is a 1-D list of values that is displayed together with its index.
    s = pd.Series([1,3,6,np.nan,44,1])
    print(s)
    # Six consecutive daily timestamps starting at 2018-03-04.
    dates = pd.date_range('20180304',periods=6)
    print(dates)

    # A DataFrame is a labelled 2-D table (comparable to a 2-D NumPy matrix).
    # Here the row index is `dates` and the column labels are a, b, c, d.
    df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
    print(df)
    # Without explicit labels, rows and columns are numbered from 0.
    df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
    print(df1)
    # Build a frame from a dict: each key is a column name; scalar values are
    # repeated to match the 4 rows given by the list-like columns.
    df2 = pd.DataFrame({'A':1,
                       'B':pd.Series(1,index=list(range(4)),dtype='float32'),
                       'C':pd.Timestamp('20180102'),
                       'D':np.array([3]*4,dtype='int32'),
                       'E':pd.Categorical(['test','train','test','train']),
                       'F':'foo'})
    print(df2)
    print(df2.dtypes)  # data type of each column
    print(df2.index)  # row index
    print(df2.columns)  # column labels
    print(df2.values)  # the underlying values
    print(df2.describe())  # summary description
    print(df2.T)  # transpose
    print(df2.sort_index(axis=1,ascending=False))  # order the columns by label (axis=1), descending
    print(df2.sort_values(by='E'))  # order the rows by the values in column E
    

    12 pandas选择数据

    dates = pd.date_range('20180308',periods=6)
    # 6x4 frame holding 0..23, indexed by date with columns a-d.
    df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['a','b','c','d'])
    print(df[0:3],df['20180309':'20180311'])  # slicing selects ROWS: by position, then by index label
    print(df['a'],df.a)  # a single label selects a COLUMN (two equivalent spellings)
    print(df.loc['20180309'])  # select a row by its index label
    print(df.loc[:,['b','c']])  # select columns by label, keeping all rows
    print(df.iloc[3:5,1:3])  # select by integer position: rows 3-4, columns 1-2
    # print(df.ix[:3,['a','d']])  # mixed label/position selection; .ix is deprecated (removed in pandas 1.0)
    print(df[df.a>8])  # boolean mask: rows where column a is greater than 8
    

    13 pandas设置值

    # Set one cell by integer position (row 2, column 2).
    df.iloc[2, 2] = 1111
    # Set one cell by row label and column label.
    df.loc['20180309', 'a'] = 2222
    # Zero column b wherever column a exceeds 8. A single .loc call is used
    # instead of the chained form `df.b[df.a>8] = 0`: chained assignment may
    # write to a temporary object (SettingWithCopyWarning) and does not modify
    # df at all when pandas Copy-on-Write is active.
    df.loc[df.a > 8, 'b'] = 0
    

    14 如何处理丢失数据

    dates = pd.date_range('20180308', periods=6)
    df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['a', 'b', 'c', 'd'])
    # Integer columns cannot store NaN. Convert the two columns that receive
    # missing values to float explicitly instead of relying on pandas silently
    # upcasting on assignment (deprecated in recent pandas releases).
    df = df.astype({'b': float, 'c': float})
    df.iloc[0, 1] = np.nan
    df.iloc[1, 2] = np.nan
    # Drop rows with missing data. how='any' drops a row if any value is NaN;
    # how='all' drops it only when every value in the row is NaN.
    print(df.dropna(axis=0, how='any'))
    # Replace every NaN with 0.
    print(df.fillna(value=0))
    # Boolean mask of missing cells. isnull must be CALLED; printing the bare
    # attribute `df.isnull` only shows the bound method object.
    missing_mask = df.isnull()
    print(missing_mask)
    # True when at least one cell in the frame is NaN.
    print(missing_mask.values.any())
    

    15 导入导出数据

    # Import: read a CSV file into a DataFrame.
    # NOTE(review): 'filepath' looks like a placeholder; the same string is used
    # for both the input and the output below - confirm the intended paths.
    data = pd.read_csv('filepath')
    # Export: write the DataFrame to disk in pickle format.
    data.to_pickle('filepath')
    

    16 合并concatenating

    # Three 3x4 frames filled with 0, 1 and 2, all with columns a-d.
    df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
    df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
    df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

    # Vertical concatenation (axis=0) with a freshly numbered index:
    #   result = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

    # join: 'outer' (the default) keeps the union of the columns and fills the
    # gaps with NaN; 'inner' keeps only the columns present in every frame.
    df4 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['c', 'd', 'e', 'f'])
    df5 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
    res = pd.concat([df4, df5], axis=0, join='inner', ignore_index=True)
    print(res)

    # Side-by-side concatenation aligned on df4's index. The `join_axes`
    # argument was removed in pandas 1.0; reindexing the result is the
    # documented replacement.
    res2 = pd.concat([df4, df5], axis=1).reindex(df4.index)
    print(res2)

    # Appending rows. DataFrame.append was removed in pandas 2.0; pd.concat
    # gives the same result.
    res3 = pd.concat([df1, df2], ignore_index=True)
    print(res3)

    # Appending a Series as one new row: turn it into a 1-row frame first so
    # that its index labels line up with the column names.
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    res4 = pd.concat([df1, s1.to_frame().T], ignore_index=True)
    print(res4)
    

    17 合并merge

    # Join two frames on their shared 'key' column.
    left = pd.DataFrame({
        'key': ['k0', 'k1', 'k2', 'k3'],
        'A': ['b0', 'b1', 'b2', 'b3'],
        'B': ['a0', 'a1', 'a2', 'a3'],
    })
    right = pd.DataFrame({
        'key': ['k0', 'k1', 'k2', 'k3'],
        'C': ['c0', 'c1', 'c2', 'c3'],
        'D': ['d0', 'd1', 'd2', 'd3'],
    })
    res = left.merge(right, on='key')

    # Join on a pair of key columns; 'inner' keeps only key pairs found in both.
    left2 = pd.DataFrame({
        'key1': ['k0', 'k0', 'k1', 'k2'],
        'key2': ['k0', 'k1', 'k0', 'k1'],
        'A': ['b0', 'b1', 'b2', 'b3'],
        'B': ['a0', 'a1', 'a2', 'a3'],
    })
    right2 = pd.DataFrame({
        'key1': ['k0', 'k0', 'k1', 'k2'],
        'key2': ['k0', 'k0', 'k0', 'k0'],
        'C': ['c0', 'c1', 'c2', 'c3'],
        'D': ['d0', 'd1', 'd2', 'd3'],
    })
    res2 = left2.merge(right2, how='inner', on=['key1', 'key2'])

    # Join side by side using the row index of both frames instead of columns.
    res3 = left2.merge(right2, how='outer', left_index=True, right_index=True)
    print(res3)

    # When a non-key column exists in both frames with different values and
    # both must be kept, `suffixes` names the two resulting columns.
    boys = pd.DataFrame({'k': ['k0', 'k1', 'k2'], 'age': [1, 2, 3]})
    girls = pd.DataFrame({'k': ['k0', 'k0', 'k3'], 'age': [4, 5, 6]})

    res4 = boys.merge(girls, how='inner', on='k', suffixes=('_boy', '_girl'))
    print(res4)
    

    18 plot图表

    import matplotlib.pyplot as plt
    # Line data (Series): 1000 random values indexed 0..999.
    data = pd.Series(np.random.randn(1000),index=np.arange(1000))
    data = data.cumsum()  # cumulative sum
    # data.plot()
    # plt.show()  # display the figure
    # Matrix data (DataFrame): 1000 rows of random values in columns A-D.
    data2 = pd.DataFrame(np.random.randn(1000,4),
                        index=np.arange(1000),
                        columns=list('ABCD'))
    data2 = data2.cumsum()
    print(data2.head())
    # data2.plot()
    # Draw two scatter series on the same axes: the second call reuses `ax`.
    ax = data2.plot.scatter(x='A',y='B',color='DarkBlue',label='Class1')
    data2.plot.scatter(x='A',y='C',color='DarkGreen',label='Class2',ax=ax)
    plt.show()  # display the figure

    # Plot methods: 'bar','hist','box','kde','area','scatter','hexbin','pie'
    
    

    感谢:

    视频链接

    莫烦PYTHON

    Pandas速查手册中文版 - CSDN博客

    相关文章

      网友评论

        本文标题:Numpy&Pandas视频笔记

        本文链接:https://www.haomeiwen.com/subject/imvkfftx.html