美文网首页
Pandas学习笔记1

Pandas学习笔记1

作者: 山雾幻华 | 来源:发表于2019-04-20 21:44 被阅读0次
    import pandas as pd
    import numpy as np
    
    s = pd.Series([1,3,6,np,np.nan,44,1])
    dates = pd.date_range('20160101',periods=6)
    df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
    df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
    df2 = pd.DataFrame({"A":1.,"B":pd.Categorical(["test",'tesst'])})
    print('S',s)
    print('data',dates)
    print('df',df)
    print('df1',df1)
    print('df1',df2)
    
    S 0                                                    1
    1                                                    3
    2                                                    6
    3    <module 'numpy' from 'E:\\program\\Anaconda3\\...
    4                                                  NaN
    5                                                   44
    6                                                    1
    dtype: object
    data DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
                   '2016-01-05', '2016-01-06'],
                  dtype='datetime64[ns]', freq='D')
    df                    a         b         c         d
    2016-01-01 -1.018450  0.116236  0.171505 -0.989516
    2016-01-02  0.608064 -0.523510  0.335354 -0.895313
    2016-01-03  2.804357 -0.105679 -0.821816 -0.340312
    2016-01-04  1.239038 -1.592973 -0.060665 -1.785517
    2016-01-05 -0.307189 -1.255811 -0.420571  1.545740
    2016-01-06 -1.469857  2.646400  0.101558  0.438751
    df1    0  1   2   3
    0  0  1   2   3
    1  4  5   6   7
    2  8  9  10  11
    df1      A      B
    0  1.0   test
    1  1.0  tesst
    
    print("df2.dtypes",df2.dtypes)
    print("df2.index",df2.index)
    print("df2.columns",df2.columns)
    print("df2.values",df2.values)
    print("df2.describe",df2.describe())
    print("df2.T",df2.T)
    print("df2.sort_index",df2.sort_index(axis=1,ascending=False))
    print("df2.sort_index",df2.sort_values(by='B'))
    
    df2.dtypes A     float64
    B    category
    dtype: object
    df2.index RangeIndex(start=0, stop=2, step=1)
    df2.columns Index(['A', 'B'], dtype='object')
    df2.values [[1.0 'test']
     [1.0 'tesst']]
    df2.describe          A
    count  2.0
    mean   1.0
    std    0.0
    min    1.0
    25%    1.0
    50%    1.0
    75%    1.0
    max    1.0
    df2.T       0      1
    A     1      1
    B  test  tesst
    df2.sort_index        B    A
    0   test  1.0
    1  tesst  1.0
    df2.sort_index      A      B
    1  1.0  tesst
    0  1.0   test
    

    选择数据

    dates=pd.date_range('20130101',periods=6)
    df=pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
    print(df)
    print(df['A'],df.A)
    print(df[0:3],'\n\n',df["20130102":"20130104"])
    # select by label:loc
    print(df.loc['20130102'])
    print(df.loc[:,["A","B"]])
    # select by position:iloc
    print(df.iloc[3,1])
    print(df.iloc[3:5,1])
    # mixed selection:ix
    print(df.ix[:3,['A','C']])
    #Boolean indexing
    print(df[df.A>8])
    
                 A   B   C   D
    2013-01-01   0   1   2   3
    2013-01-02   4   5   6   7
    2013-01-03   8   9  10  11
    2013-01-04  12  13  14  15
    2013-01-05  16  17  18  19
    2013-01-06  20  21  22  23
    2013-01-01     0
    2013-01-02     4
    2013-01-03     8
    2013-01-04    12
    2013-01-05    16
    2013-01-06    20
    Freq: D, Name: A, dtype: int32 2013-01-01     0
    2013-01-02     4
    2013-01-03     8
    2013-01-04    12
    2013-01-05    16
    2013-01-06    20
    Freq: D, Name: A, dtype: int32
                A  B   C   D
    2013-01-01  0  1   2   3
    2013-01-02  4  5   6   7
    2013-01-03  8  9  10  11 
    
                  A   B   C   D
    2013-01-02   4   5   6   7
    2013-01-03   8   9  10  11
    2013-01-04  12  13  14  15
    A    4
    B    5
    C    6
    D    7
    Name: 2013-01-02 00:00:00, dtype: int32
                 A   B
    2013-01-01   0   1
    2013-01-02   4   5
    2013-01-03   8   9
    2013-01-04  12  13
    2013-01-05  16  17
    2013-01-06  20  21
    13
    2013-01-04    13
    2013-01-05    17
    Freq: D, Name: B, dtype: int32
                A   C
    2013-01-01  0   2
    2013-01-02  4   6
    2013-01-03  8  10
                 A   B   C   D
    2013-01-04  12  13  14  15
    2013-01-05  16  17  18  19
    2013-01-06  20  21  22  23
    
    
    E:\program\Anaconda3\lib\site-packages\ipykernel_launcher.py:13: DeprecationWarning: 
    .ix is deprecated. Please use
    .loc for label based indexing or
    .iloc for positional indexing
    
    See the documentation here:
    http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
      del sys.path[0]
    

    设置值

    df.iloc[2,2] = 11111
    df['Z'] = 0
    df["E"]=pd.Series([1,2,3,4,5,6],index=pd.date_range('20130101',periods=6))
    print(df)
    
    
                 A   B      C   D  Z  E
    2013-01-01   0   1      2   3  0  1
    2013-01-02   4   5      6   7  0  2
    2013-01-03   8   9  11111  11  0  3
    2013-01-04  12  13     14  15  0  4
    2013-01-05  16  17     18  19  0  5
    2013-01-06  20  21     22  23  0  6
    

    处理丢失值

    dates=pd. date_range('20130101', periods=6)
    df=pd. DataFrame(np. arange(24). reshape((6,4)), index=dates, columns=['A','B','C','D'])
    df.iloc[0,1]=np. nan
    df.iloc[1,2]=np. nan 
    print(df)
    print(df.dropna())#丢掉行
    print(df.dropna(axis=0,how='any'))#丢掉行how={fany','all}
    
    print(df.fillna(value=0))#填充0
    
    print(np.any(df.isna()) == True)#判断有无null
    
                 A     B     C   D
    2013-01-01   0   NaN   2.0   3
    2013-01-02   4   5.0   NaN   7
    2013-01-03   8   9.0  10.0  11
    2013-01-04  12  13.0  14.0  15
    2013-01-05  16  17.0  18.0  19
    2013-01-06  20  21.0  22.0  23
                 A     B     C   D
    2013-01-03   8   9.0  10.0  11
    2013-01-04  12  13.0  14.0  15
    2013-01-05  16  17.0  18.0  19
    2013-01-06  20  21.0  22.0  23
                 A     B     C   D
    2013-01-03   8   9.0  10.0  11
    2013-01-04  12  13.0  14.0  15
    2013-01-05  16  17.0  18.0  19
    2013-01-06  20  21.0  22.0  23
                 A     B     C   D
    2013-01-01   0   0.0   2.0   3
    2013-01-02   4   5.0   0.0   7
    2013-01-03   8   9.0  10.0  11
    2013-01-04  12  13.0  14.0  15
    2013-01-05  16  17.0  18.0  19
    2013-01-06  20  21.0  22.0  23
    True
    

    导入导出

    read_csv
    read_excel
    read_hdf
    read_sql
    read_json
    read_msgpack(experimental)
    read_html
    read_gbg(experimental)
    read_stata
    read_sas
    read_clijpboard
    read_pickle
    
    to_csv
    to_excel
    to_hdf
    to_sql
    to_json
    to_msgpack(experimental)
    to_html
    to_gbg(experimental)
    to_stata
    to_sas
    to_clijpboard
    to_pickle
    

    合并

    # concatenating 
    df1=pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
    df2=pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
    df3=pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
    res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
    print(res)
    
    # join,['inner','outer']
    df1=pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
    df2=pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'])
    res1 = pd.concat([df1,df2],join='outer',ignore_index=True)
    res2 = pd.concat([df1,df2],join='inner',ignore_index=True)
    print(res1,res2)
    
    # join_axes 
    dfl=pd. DataFrame(np. ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
    df2=pd. DataFrame(np. ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
    res=pd. concat([ df1, df2], axis=1, join_axes=[ df1. index])
    print(res)
    
    # append 
    df1=pd. DataFrame(np. ones((3,4))*0, columns=['a','b','c','d'])
    df2=pd. DataFrame(np. ones((3,4))*1, columns=['a','b','c','d'])
    df3=pd. DataFrame(np. ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
    res=dfl. append(df2, ignore_index=True)
    res=dfl. append([ df2, df3])
    print(res)
    
         a    b    c    d
    0  0.0  0.0  0.0  0.0
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
    3  1.0  1.0  1.0  1.0
    4  1.0  1.0  1.0  1.0
    5  1.0  1.0  1.0  1.0
    6  2.0  2.0  2.0  2.0
    7  2.0  2.0  2.0  2.0
    8  2.0  2.0  2.0  2.0
    
    
    E:\program\Anaconda3\lib\site-packages\ipykernel_launcher.py:11: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
    of pandas will change to not sort by default.
    
    To accept the future behavior, pass 'sort=False'.
    
    To retain the current behavior and silence the warning, pass 'sort=True'.
    
      # This is added back by InteractiveShellApp.init_path()
    
    
         a    b    c    d    e
    0  0.0  0.0  0.0  0.0  NaN
    1  0.0  0.0  0.0  0.0  NaN
    2  0.0  0.0  0.0  0.0  NaN
    3  NaN  1.0  1.0  1.0  1.0
    4  NaN  1.0  1.0  1.0  1.0
    5  NaN  1.0  1.0  1.0  1.0      b    c    d
    0  0.0  0.0  0.0
    1  0.0  0.0  0.0
    2  0.0  0.0  0.0
    3  1.0  1.0  1.0
    4  1.0  1.0  1.0
    5  1.0  1.0  1.0
         a    b    c    d    b    c    d    e
    0  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
    1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
    2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
         a    b    c    d    e
    1  0.0  0.0  0.0  0.0  NaN
    2  0.0  0.0  0.0  0.0  NaN
    3  0.0  0.0  0.0  0.0  NaN
    0  1.0  1.0  1.0  1.0  NaN
    1  1.0  1.0  1.0  1.0  NaN
    2  1.0  1.0  1.0  1.0  NaN
    2  NaN  1.0  1.0  1.0  1.0
    3  NaN  1.0  1.0  1.0  1.0
    4  NaN  1.0  1.0  1.0  1.0
    
    
    E:\program\Anaconda3\lib\site-packages\pandas\core\frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
    of pandas will change to not sort by default.
    
    To accept the future behavior, pass 'sort=False'.
    
    To retain the current behavior and silence the warning, pass 'sort=True'.
    
      sort=sort)
    
    # merging two df by key/keys.(may be used in database)
    # simple example 
    left=pd. DataFrame({"key":['KO','K1','K2','K3'],
                        'A':['A0','A1','A2','A3'], 
                        'B':['BO','B1','B2','B3']})
    right=pd. DataFrame({"key":['K0','K1','K2','K3'],
                         'C':['CO','C1','C2','C3'],
                         'D':['DO','D1','D2','D3']})
    print(left,right)
    
    res = pd.merge(left,right,on='key')
    print(res)
    
    # consider two keys
    left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                                 'key2': ['K0', 'K1', 'K0', 'K1'],
                                 'A': ['A0', 'A1', 'A2', 'A3'],
                                 'B': ['B0', 'B1', 'B2', 'B3']})
    right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                                  'key2': ['K0', 'K0', 'K0', 'K0'],
                                  'C': ['C0', 'C1', 'C2', 'C3'],
                                  'D': ['D0', 'D1', 'D2', 'D3']})
    print(left)
    print(right)
    res = pd.merge(left, right, on=['key1', 'key2'], how='inner')  # default for how='inner'
    print(res)
    # how = ['left', 'right', 'outer', 'inner']
    res = pd.merge(left, right, on=['key1', 'key2'], how='left')
    print(res)
    
    # indicator
    df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
    df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
    print(df1)
    print(df2)
    res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
    # give the indicator a custom name
    res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
    
    # merged by index
    left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                                      'B': ['B0', 'B1', 'B2']},
                                      index=['K0', 'K1', 'K2'])
    right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                                         'D': ['D0', 'D2', 'D3']},
                                          index=['K0', 'K2', 'K3'])
    print(left)
    print(right)
    # left_index and right_index
    res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
    print(res)
    res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
    print(res)
    
    # handle overlapping
    boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
    girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
    res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
    print(res)
    
    # join function in pandas is similar with merge. If know merge, you will understand join
    
      key   A   B
    0  KO  A0  BO
    1  K1  A1  B1
    2  K2  A2  B2
    3  K3  A3  B3   key   C   D
    0  K0  CO  DO
    1  K1  C1  D1
    2  K2  C2  D2
    3  K3  C3  D3
      key   A   B   C   D
    0  K1  A1  B1  C1  D1
    1  K2  A2  B2  C2  D2
    2  K3  A3  B3  C3  D3
      key1 key2   A   B
    0   K0   K0  A0  B0
    1   K0   K1  A1  B1
    2   K1   K0  A2  B2
    3   K2   K1  A3  B3
      key1 key2   C   D
    0   K0   K0  C0  D0
    1   K1   K0  C1  D1
    2   K1   K0  C2  D2
    3   K2   K0  C3  D3
      key1 key2   A   B   C   D
    0   K0   K0  A0  B0  C0  D0
    1   K1   K0  A2  B2  C1  D1
    2   K1   K0  A2  B2  C2  D2
      key1 key2   A   B    C    D
    0   K0   K0  A0  B0   C0   D0
    1   K0   K1  A1  B1  NaN  NaN
    2   K1   K0  A2  B2   C1   D1
    3   K1   K0  A2  B2   C2   D2
    4   K2   K1  A3  B3  NaN  NaN
       col1 col_left
    0     0        a
    1     1        b
       col1  col_right
    0     1          2
    1     2          2
    2     2          2
         A   B
    K0  A0  B0
    K1  A1  B1
    K2  A2  B2
         C   D
    K0  C0  D0
    K2  C2  D2
    K3  C3  D3
          A    B    C    D
    K0   A0   B0   C0   D0
    K1   A1   B1  NaN  NaN
    K2   A2   B2   C2   D2
    K3  NaN  NaN   C3   D3
         A   B   C   D
    K0  A0  B0  C0  D0
    K2  A2  B2  C2  D2
        k  age_boy  age_girl
    0  K0        1         4
    1  K0        1         5
    

    plt画图

    import matplotlib.pyplot as plt
    
    # plot data
    
    # Series
    data = pd.Series(np.random.randn(1000), index=np.arange(1000))
    data = data.cumsum()
    ##data.plot()
    
    # DataFrame
    data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
    data = data.cumsum()
    # plot methods:
    # 'bar', 'hist', 'box', 'kde', 'area', scatter', hexbin', 'pie'
    ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
    data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)
    
    plt.show() 
    

    参考

    https://morvanzhou.github.io/tutorials/data-manipulation/np-pd/

    相关文章

      网友评论

          本文标题:Pandas学习笔记1

          本文链接:https://www.haomeiwen.com/subject/jhkrgqtx.html