美文网首页
numpy,pandas,matplotlib

numpy,pandas,matplotlib

作者: esskeetit | 来源:发表于2018-06-01 18:14 被阅读0次

    1.numpy

    1.1 create a numpy array

    # NumPy must be imported before its first use; every snippet below refers to it as `np`.
    import numpy as np

    # Build an array from a tuple, forcing an integer dtype.
    t = (1, 2, 3)
    a = np.array(t, dtype='int')
    a.dtype

    # Build an array from a list, this time as 32-bit floats.
    # The list is named `list1` so that the reads below (and `list2`) agree with
    # its definition; binding it as `list_1` made the next line raise NameError.
    list1 = [1, 2, 3]
    a = np.array(list1, dtype='float32')

    # Stack two equally long lists into a 2x3 array (ints are promoted to float).
    list2 = [4.2, 5.1, 6.3]
    b = np.array([list1, list2])

    # Nested lists give a 2-D array; dtype=complex stores complex numbers.
    c = np.array([[1, 2], [3, 4]], dtype=complex)
    c.shape

    # Common constructors.
    np.zeros([2, 2])         # 2x2 array of zeros
    np.ones([2, 3])          # 2x3 array of ones
    np.empty([2, 3])         # 2x3 array with uninitialised contents
    np.eye(5)                # 5x5 identity matrix
    np.arange(2, 10, 2)      # 2, 4, 6, 8 (stop value excluded)
    np.linspace(0, 10, 4)    # 4 evenly spaced values, endpoint 10 included
    

    1.2 Basic Operations of Arrays

    # Arithmetic operators on arrays work element by element.
    a = np.array( [[1,2,3,4],[5,6,7,8]])
    a * a  # element-wise multiplication
    a ** 3 # element-wise cube
    a + a # element-wise addition
    1/a # element-wise reciprocal
    
    A = np.array( [[1,3],[0,1]] )
    B = np.array( [[2,2],[3,4]] )
    A*B                # `*` multiplies element by element (element-wise product)
    A.dot(B)           # .dot() is the matrix product
    np.dot(A, B)       # same matrix product, function form
    np.add(A, B)         # element-wise (matrix) addition
    
    a = np.random.random([3,3])  # np.random.random draws from the half-open interval [0.0, 1.0)
    np.random.randn(3,3) # 3x3 array of samples from the standard normal distribution
    a.sum() # sum of all elements
    a.min() # minimum over all elements
    a.max() # maximum over all elements
    
    # Reductions along one axis of a 6x4 array.
    b = np.arange(24).reshape(6,4)
    b.sum(axis=0) # sum of each column
    b.sum(axis=1) # sum of each row
    b.min(axis=1) # min of each row
    b.cumsum(axis=1) # cumulative sum along each row
    

    1.3 Indexing, Slicing and Iterating

    1.3.1 One-dimensional arrays

    # Index, slice and assign into a 1-D array, then contrast a slice (a view) with an explicit copy.
    a = np.arange(10)**2
    print(a)  # the squares 0, 1, 4, 9, ..., 81 (the slice assignment below has not run yet)
    a[2]
    a[2:6] 
    a[0:3] = 100  # a scalar assigned to a slice is written to every selected element
    print(a)  # [100 100 100   9  16  25  36  49  64  81]
    
    a_slice = a[0:5]
    print(a_slice) # [100 100 100   9  16]
    
    # Slicing a NumPy array creates a view rather than a copy,
    # so writing through the slice changes the original array.
    # To leave the original array untouched, copy the selected part first.
    
    a_slice_2 = a[0:5].copy() 
    print(a_slice_2)
    a_slice_2[0] = 500
    print(a)  # unchanged: only the copy was modified
    

    1.3.2 Multidimensional arrays

    def f(x,y):
        """Return four times ``x`` plus ``y``.

        Only ``*`` and ``+`` are used, so the function also works element by
        element when ``x`` and ``y`` are NumPy arrays.
        """
        scaled_x = 4 * x
        return scaled_x + y
    # Build a 3x2 array whose entries are computed by f from their own indices.
    b = np.fromfunction(f,(3,2),dtype=int)
    # f receives the index grids of the array, i.e. it is evaluated at the positions
    # (0,0), (0,1), (1,0), (1,1), (2,0), (2,1)
    
    b[1] # second row
    b[1][0] # second row, first column (two chained indexing steps)
    b[1,0]  # same element with a single index tuple
    b[:, 1] # second column
    
    b.shape[0]  # number of rows
    b.shape[1]  # number of columns
    
    # Start from a 5x5 array of zeros and fill every row with its own row number.
    c = np.zeros([5, 5])
    c

    # c[i] selects row i, so the loop must run over the number of rows
    # (shape[0]); iterating over shape[1] only worked because c is square.
    for i in range(c.shape[0]):
        c[i] = i  # the scalar is written to every element of row i
        print(c[i])
    

    1.3.3 Boolean Indexing

    # Select data with a boolean mask built from a comparison.
    week_days=np.array(['Monday','Tuesday','Wednesday','Thursday','Friday'])
    work_time=np.round(np.random.randn(5,5)+8.0, 2)  # 5x5 normal samples shifted by 8.0, rounded to 2 decimals
    week_days == 'Tuesday' # boolean array, True only at the 'Tuesday' position
    work_time.T[week_days == 'Tuesday']  # mask applied to the transpose, i.e. it picks a column of work_time
    

    1.4 Matrix Operations II

    a = np.array([[1, 2, 3], [3, 4, 5]], dtype='float')
    a.T  # transpose
    a.transpose()  # same result as a.T
    np.dot(a.T, a)  # 3x3 product; a has only 2 rows, so this matrix has rank <= 2 and is singular

    # Invert the 2x2 product a . a.T instead: it is non-singular (determinant 24),
    # whereas inverting the singular 3x3 product above either raises LinAlgError
    # or returns numerically meaningless values.
    gram = np.dot(a, a.T)
    print(np.linalg.inv(gram))  # matrix inverse

    from numpy.linalg import inv
    gram_inv = inv(gram)  # same inverse via the directly imported function
    
    # Verify that the computed inverse is correct.
    np.allclose # Returns True if two arrays are element-wise equal within a tolerance (defaults: rtol=1e-05, atol=1e-08)
    a = np.array([[1., 2.], [3., 4.]])
    ainv = inv(a)
    
    # A matrix times its inverse, in either order, should be the identity matrix
    # up to floating-point rounding.
    np.allclose(np.dot(a, ainv), np.eye(2))
    np.allclose(np.dot(ainv, a), np.eye(2))
    
    # Import only the name that is used; a wildcard import would pull every public
    # numpy.linalg function into the namespace and hide where names come from.
    from numpy.linalg import eig
    a = np.array([[1, 2, 3], [3, 4, 5]], dtype='float')
    # eig returns the eigenvalues and the normalized (unit-length) eigenvectors
    eig(np.dot(a.T, a))
    
    # Transposing arrays with more than two axes.
    a = np.array([[[1,2,3,0],[3,4,5,2]]])
    a.shape  #(1, 2, 4)
    a.T      # .T reverses the order of all axes
    a.T.shape  #(4, 2, 1)
    a.transpose( [0,2,1] ) 
    # transpose accepts an explicit axis order: for the original shape (1, 2, 4),
    # the order [0,2,1] keeps the first axis and swaps the last two,
    # so the result has shape (1, 4, 2)
    
    a = np.array([[1., 2.], [3., 4.]])
    np.trace(a) # sum of the main diagonal
    a.swapaxes(0,1) # swap the two axes (a transpose for a 2-D array)
    
    a3d = np.arange(50).reshape([5,5,2])
    a3d.transpose([0,2,1])  # result has shape (5, 2, 5); axes are reordered, which is not the same as .reshape([5,2,5])
    
    np.bincount(np.array([1, 1, 2, 10, 2, 4, 7])) # number of occurrences of each integer from 0 to 10
    

    1.5 Array processing

    1.5.1 Generate Grid

    import numpy as np
    import matplotlib.pyplot as plt
    %matplotlib inline
    # Evaluate sqrt(x^2 + y^2) on a 2-D grid and display the result as an image.
    # Both axes run from -10 up to (but not including) 10 in steps of 0.02.
    x,y=np.meshgrid(np.arange(-10,10,0.02),np.arange(-10,10,0.02))
    z= np.sqrt(x**2+y**2)  # distance of every grid point from the origin
    plt.imshow(z)
    plt.colorbar()
    plt.show()
    
    image.png

    1.5.2 numpy where function

    # np.where(condition, x, y) picks from x where condition is True and from y elsewhere.
    A = np.array([1,2,3,4])
    B= np.array([5,1,7,2])
    condition = np.array([True,False,False,False])
    np.where(condition,A,B) # take A where condition holds, otherwise B
    
    b = np.random.randn(5,5)
    np.where(b < 0,0,b)  # replace negative numbers with 0, keep the rest
    

    1.5.3 Some Statistical Processing

    # Basic statistics over a 3x3 array.
    c = np.array([[1,2,3],[4,5,6],[7,8,9]])
    print(c)
    c.sum() # sum of all elements
    c.sum(axis=1) # sum of each row
    c.mean() # mean of all elements
    c.std() # standard deviation of all elements
    c.var() # variance of all elements
    

    1.5.4 Array Sort

    d = np.random.randn(10)
    d.sort()  # sorts d in place, ascending

    # Membership test: for each element of the first array, report whether it
    # appears in the second array. np.isin is used instead of np.in1d, which is
    # deprecated in NumPy 2.0; the result is the same for 1-D input.
    e = np.array([1, 2, 3, 3, 4, 4, 5])
    np.isin([2, 4, 8], e)   # array([ True,  True, False])

    # unique
    np.unique(e)  # sorted array of the distinct values
    

    1.6 Save and Load Array

    ## Saving array in binary format (.npy)
    import numpy as np
    a = np.array([1,2,3,4,5])
    np.save('array_a',a)    # the '.npy' extension is appended to the file name
    np.load('array_a.npy')
    
    ## Saving multiple arrays into a zip file
    b = np.array([[1,2,3],[4,5,6]])
    np.savez('two_arrays.npz',x=b,y=b.T)  # keyword names become the keys inside the archive
    np.load('two_arrays.npz')['x']
    np.load('two_arrays.npz')['y']
    
    ## Saving and loading into text files
    np.savetxt('array_text.txt',b,delimiter=',')
    np.loadtxt('array_text.txt',delimiter=',')
    

    2. pandas

    2.1 series

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    %matplotlib inline
    # A Series is a 1-D labelled array; without an explicit index it gets 0..n-1.
    s1 = pd.Series([1,2,3,4,np.nan,5,6,7])
    s1.values # all values as an array
    s1.index  #RangeIndex(start=0, stop=8, step=1)
    
    s2 = pd.Series([21,23,42,21,23],index=['Jack','Lucy','Helen','Milky','Jasper']) # an explicit index can be supplied
    s2['Jack']  # look up a value by label
    s2.loc['Jack']  # label-based access
    s2.iloc[0]  # position-based access
    print (s2.shape) #(5,)
    print(s2.size) #5
    s2.head(2) # first two entries
    s2.describe() # descriptive statistics
    s2.sort_values() # sort by value, ascending
    s2[s2>22] # keep the entries whose value is greater than 22
    s2.plot.bar() # bar chart with one bar per entry (not a histogram)
    'Lucy' in s2 # tests whether 'Lucy' is one of the index labels (not the values)
    s2_dict = s2.to_dict() # convert the Series to a dict
    s2_series = pd.Series(s2_dict) # convert the dict back to a Series
    name = ['Jack','Lucy','Helen','Milky','Tom','Jasper','Helen'] # index labels can be kept in a list first
    s2_new = pd.Series(s2_dict,index = name)  # labels absent from the dict (here 'Tom') get NaN
    s2_new.drop_duplicates() ## drop entries whose value duplicates an earlier one
    pd.isnull(s2_new) # True where the value is NaN
    

    2.2 DataFrame + Titanic Example

    # A DataFrame can be built from a dict of columns or from nested lists plus labels.
    df = pd.DataFrame({'Student_1':[90,100, 95], 'Student_2':[60, 80, 100]}, index=['Monday', 'Wednesday', 'Friday'])
    df1 = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=['A', 'B'], columns=['C1', 'C2', 'C3'])
    df1.values # all values of df1 as an array
    df1.index  # row labels of df1
    df1.columns # column labels of df1
    df1.T # transpose
    df1.shape 
    df1.size # number of elements in df1
    df1.head(2) # first two rows
    df1.tail(1) # last row
    df1.describe() # descriptive statistics for every column
    df1.loc['B'] # all elements of the row labelled 'B'
    df1.loc['B'].loc['C2']  # loc works on index
    df1['C2'].loc['B']
    df1.loc['B', 'C2']
    df1.iloc[1, 1]     # iloc works on position (only take integers)
    df1 + 10 * 15     # element-wise operations (10 * 15 is evaluated first, so 150 is added)
    # axis=1 applies the lambda to each row; the result overwrites column C2
    df1['C2'] = df1.apply(lambda x: x['C2'] ** 2 + 10, axis=1)
    # assign returns a new frame with recomputed C2 and C3 (df1 itself is not modified);
    # the chained calls then take row 'A' and its maximum
    df1.assign(C2 = lambda x: x['C2'] ** 2 + 10,\
               C3 = lambda x: x['C3'] * 2 - 10).loc['A'] .max()
    
    from IPython.display import Image
    Image("./variable.png")    # display an image stored in the same directory as the Jupyter notebook
    
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    %matplotlib inline
    import seaborn as sns
    sns.set()
    
    # Load the Titanic training data and take a first look at it.
    df = pd.read_csv('train.csv')
    df.shape  # number of rows and columns: (891, 12)
    df.head(5) # first five rows
    df.tail(2) # last two rows
    df.dtypes  # data type of every column; string columns are stored with dtype object
    df.Survived.value_counts()  # number of survivors and non-survivors
    df.isnull().sum() # number of missing values in every column
    df.Survived.value_counts().plot(kind='bar') # bar chart of survivor / non-survivor counts
    df.Pclass.value_counts()  # number of passengers in each cabin class
    df.isnull().sum().plot(kind='bar') # bar chart of missing values per column
    

    How to deal with missing value ?

    df1 = df.drop('Cabin', axis=1) # drop the 'Cabin' column, the one with the most missing values
    df1.shape #(891, 11)
    df1['Age']=df1['Age'].fillna(20) # fill missing Age values with 20
    #df1['Age']=df1['Age'].fillna(df.Age.mean()) alternative: fill with the mean age instead
    df2 = df1[df1['Embarked'].notnull()] # drop the two rows where Embarked is missing
    df2.shape # (889, 11)
    
    # missing value removal: the same three steps written as one chain
    df3 = df.drop('Cabin', axis=1).assign(Age = lambda x: x['Age'].fillna(20))
    df3=df3.loc[df3['Embarked'].notnull()]
    

    Exploration (basic statistics)

    df3.loc[10:14, ['Name', 'Sex', 'Survived']] # select the rows labelled 10 through 14 (both ends included) and the three columns 'Name', 'Sex', 'Survived'
    df3.columns # column names
    df3.pivot_table(values='PassengerId', index='Survived', columns='Sex', aggfunc='count')  # passenger counts by Survived (rows) and Sex (columns)
    df4 = df3.loc[df3['Survived'] == 1] # passengers who survived
    df5 = df3.loc[df3['Age'] > 30] # passengers older than 30
    df6 = df3[['PassengerId', 'Name']].merge(df3[['PassengerId', 'Age']], on='PassengerId', how='outer') # merge two frames on PassengerId
    # correlation coefficient between Survived and Pclass
    # NOTE(review): the second operand comes from df rather than df3; Series.corr
    # appears to align the two on their index first, so the result should match
    # using df3['Pclass'] - confirm and consider using df3 for consistency
    df3['Survived'].corr(df['Pclass'])
    

    2.3 Index Objects

    import numpy as np
    import pandas as pd
    # A Series indexed by six consecutive daily timestamps starting on 2017-01-02.
    s = pd.Series([1,2,3,4,5,6], index=pd.date_range('20170102', periods=6))
    s.index     # the DatetimeIndex object
    s.index[2]  # a single timestamp
    s.index[2:] #DatetimeIndex(['2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07'], dtype='datetime64[ns]', freq='D')
    

    2.4 Reindex

    2.4.1 Series Reindex

    np.random.randn(1, 4).tolist()  # a nested list holding one row of four random numbers
    s1 = pd.Series(np.random.randn(1, 4).tolist()[0], index=['A', 'B','C','D']) 
    s2 = s1.reindex(['A', 'B','C','D','E','F','G']) # the new labels E, F, G get NaN
    s3 = s2.reindex(['A', 'B','C','D','E','F','G','H'],fill_value=0) # the new label H gets 0
    

    2.4.2 Pandas Reindex

    df = pd.DataFrame(np.random.randn(4, 4), index = ['r1','r2','r3','r4'], columns=['c1','c2','c3','c4'])
    df.reindex(['r1','r2','r3','r6','r4','r5']) # returns a new frame in the given row order; df itself is not changed
    df.reindex(columns=['c1','c2','c3','c4','c5'])  # reindex the columns instead of the rows
    

    2.5 drop data

    2.5.1 Drop Series

    s1 = pd.Series(np.arange(5),index=[1,2,3,4,5])
    s1.drop(4) # drop the entry whose index label is 4 (a label, not a position)
    

    2.5.2 Drop DataFrame

    df1 = pd.DataFrame(np.random.randn(4,5),index=['r1','r2','r3','r4'],columns=['c1','c2','c3','c4','c5'])
    df1.drop('r1') # drop the row whose index label is 'r1'
    df1.drop('c5',axis=1) # drop the column named 'c5'
    df1.drop('c5',axis=1,inplace=True) # inplace=True modifies df1 itself
    
    

    2.6 slicing data

    s1 = pd.Series(np.arange(5),index=['A','B','C','D','E'])
    s2=s1[:3] # first three entries
    # Set the value at label 'A' of the slice to 2018. The original note says s1 changes too.
    # NOTE(review): that relies on the slice being a view; with pandas
    # copy-on-write enabled s1 would stay untouched - confirm for the pandas version in use
    s2['A']=2018
    s1[['A','B','C']]  # select several labels at once
    s1[s1>2] # entries of s1 greater than 2
    s1[s1>3] = 10 # set the entries of s1 greater than 3 to 10
    s1[(s1>2)&(s1<5)]=6  # set the entries strictly between 2 and 5 to 6
    
    df1 = pd.DataFrame(np.random.randn(4,5),index=['r1','r2','r3','r4'],columns=['c1','c2','c3','c4','c5'])
    df1['c1'] # column c1
    df1[['c1','c4','c3']] # several columns
    df1[df1['c2']>0] # all rows where column c2 is greater than 0
    df1<0 # boolean DataFrame
    df1.iloc[1] # second row (by position)
    df1.loc['r2'] # row labelled 'r2'
    df1.iloc[[0,2,3],[0,2]]  # rows at positions 0, 2, 3 and columns at positions 0, 2
    df2 = df1.copy()
    df2['c6'] = ['one', 'one','two','three'] # df2 is a copy, so changing it leaves the original DataFrame untouched
    df2['c6'].isin(['two','three']) # for each value of column c6, whether it is one of 'two' / 'three'
    df2[df2['c6'].isin(['two','three'])]  # keep only the rows where that test is True
    

    2.7 Data Alignment

    # Arithmetic between pandas objects aligns the operands on their labels first.
    s1 = pd.Series(np.arange(5),index=['A','B','C','D','E'])
    s2 = pd.Series(np.arange(3),index=['A','B','C'])
    s1 + s2 # values with the same index label are added; labels missing from one side give NaN
    df1 = pd.DataFrame(np.random.randn(4,5),index=['r1','r2','r3','r4'],columns=['c1','c2','c3','c4','c5'])
    df2 = pd.DataFrame(np.random.randn(3,4),index=['r1','r2','r3'],columns=['c1','c2','c3','c4'])
    df1+df2 # cells with matching index and column labels are added
    df1.add(df2,fill_value=100) # a cell present in df1 but missing from df2 is treated as 100 in df2 before adding
    

    2.8 rank and sort

    s1 = pd.Series(np.arange(5),index=['B','D','C','A','E'])
    s1.sort_index() # sort by index label, ascending
    s1.sort_values(ascending=False) # sort by value, descending
    s2 = pd.Series(np.random.randn(6))
    s2.rank()  # rank of every value within the Series

    3. matplotlib

    待续

    相关文章

      网友评论

          本文标题:numpy,pandas,matplotlib

          本文链接:https://www.haomeiwen.com/subject/hdwujftx.html