15 Pandas.DataFrame Functions and Methods


Author: 7125messi | Published 2018-10-08 20:34

    1. abs(): return the absolute value of every element of the DataFrame; all elements must be numeric.

    import pandas as pd
    import numpy as np
    
    df=pd.read_excel('南京银行.xlsx',index_col='Date')
    df1=df[:5]
    df1.iat[0,1]=-df1.iat[0,1]
    df1
                Open  High   Low  Close  Turnover    Volume
    Date                                                   
    2017-09-15  8.06 -8.08  8.03   8.04    195.43  24272800
    2017-09-18  8.05  8.13  8.03   8.06    200.76  24867600
    2017-09-19  8.03  8.06  7.94   8.00    433.76  54253100
    2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
    2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600
    
    df1.abs()
                Open  High   Low  Close  Turnover      Volume
    Date                                                     
    2017-09-15  8.06  8.08  8.03   8.04    195.43  24272800.0
    2017-09-18  8.05  8.13  8.03   8.06    200.76  24867600.0
    2017-09-19  8.03  8.06  7.94   8.00    433.76  54253100.0
    2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700.0
    2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600.0
    

    2. add(other, axis='columns', level=None, fill_value=None): add the elements of a Series or another DataFrame to this DataFrame, matching on columns by default (a fill_value sketch follows the examples).

    ar1=[8.1,8.2,8.0,8.15,200.00,32000000]
    cl1=['Open','High','Low','Close','Turnover','Volume']
    
    se1=pd.Series(data=ar1,index=cl1)
    se1
    
    Open               8.10
    High               8.20
    Low                8.00
    Close              8.15
    Turnover         200.00
    Volume      32000000.00
    dtype: float64
    
    df1.add(se1)
                 Open   High    Low  Close  Turnover      Volume
    Date
    2017-09-15  16.16   0.12  16.03  16.19    395.43  56272800.0
    2017-09-18  16.15  16.33  16.03  16.21    400.76  56867600.0
    2017-09-19  16.13  16.26  15.94  16.15    633.76  86253100.0
    2017-09-20  16.07  16.26  15.95  16.18    519.94  71909700.0
    2017-09-21  16.12  16.30  15.99  16.19    441.94  62056600.0
    
    df1.add(df1)
    
                 Open   High    Low  Close  Turnover     Volume
    Date                                                       
    2017-09-15  16.12 -16.16  16.06  16.08    390.86   48545600
    2017-09-18  16.10  16.26  16.06  16.12    401.52   49735200
    2017-09-19  16.06  16.12  15.88  16.00    867.52  108506200
    2017-09-20  15.94  16.12  15.90  16.06    639.88   79819400
    2017-09-21  16.04  16.20  15.98  16.08    483.88   60113200
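
    The fill_value parameter in the signature above is not demonstrated; a minimal sketch of its effect (not from the original post), using a small made-up frame rather than the stock data:

    # fill_value stands in for a missing entry in either operand before adding;
    # positions missing in both operands remain NaN
    a = pd.DataFrame({'x': [1.0, np.nan], 'y': [2.0, 3.0]})
    b = pd.DataFrame({'x': [10.0, 20.0], 'y': [np.nan, 30.0]})
    a.add(b, fill_value=0)
    #       x     y
    # 0  11.0   2.0
    # 1  20.0  33.0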
    

    3. add_prefix() and add_suffix(): add a prefix or a suffix to each column name.

    df1.add_prefix('list')
    
                listOpen  listHigh  listLow  listClose  listTurnover  listVolume
    Date                                                                        
    2017-09-15      8.06      8.08     8.03       8.04        195.43    24272800
    2017-09-18      8.05      8.13     8.03       8.06        200.76    24867600
    2017-09-19      8.03      8.06     7.94       8.00        433.76    54253100
    2017-09-20      7.97      8.06     7.95       8.03        319.94    39909700
    2017-09-21      8.02      8.10     7.99       8.04        241.94    30056600
    
    df1.add_suffix('list')
    
                Openlist  Highlist  Lowlist  Closelist  Turnoverlist  Volumelist
    Date                                                                        
    2017-09-15      8.06      8.08     8.03       8.04        195.43    24272800
    2017-09-18      8.05      8.13     8.03       8.06        200.76    24867600
    2017-09-19      8.03      8.06     7.94       8.00        433.76    54253100
    2017-09-20      7.97      8.06     7.95       8.03        319.94    39909700
    2017-09-21      8.02      8.10     7.99       8.04        241.94    30056600
    

    4. agg(func, axis=0, *args, **kwargs): aggregation; commonly used functions include min, max, prod, mean, std, var, median, etc.

    #apply a single function to all columns
    df1.agg(sum)
    Open        4.013000e+01
    High        4.043000e+01
    Low         3.994000e+01
    Close       4.017000e+01
    Turnover    1.391830e+03
    Volume      1.733598e+08
    dtype: float64
    
    #apply two functions to all columns
    df1.agg(['sum','min'])
          Open   High    Low  Close  Turnover     Volume
    sum  40.13  40.43  39.94  40.17   1391.83  173359800
    min   7.97   8.06   7.94   8.00    195.43   24272800
    
    #apply different functions to different columns
    df1.agg({'Open':['sum','min'],'Close':['sum','max']})
         Close   Open
    max   8.06    NaN
    min    NaN   7.97
    sum  40.17  40.13
    

    5. align(): align a DataFrame with a Series or another DataFrame; the commonly used join modes are inner, outer, left and right.

    df2=df[3:5]
    df2
    Out[68]: 
                Open  High   Low  Close  Turnover    Volume
    Date                                                   
    2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
    2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600
    
    df1.align(df2,join='inner') #the return value is a tuple of the two aligned objects
    (            Open  High   Low  Close  Turnover    Volume
     Date                                                   
     2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
     2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600,
                 Open  High   Low  Close  Turnover    Volume
     Date                                                   
     2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
     2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600)
    
    df1.align(df2,join='left')
    Out[69]: 
    (            Open  High   Low  Close  Turnover    Volume
     Date                                                   
     2017-09-15  8.06  8.08  8.03   8.04    195.43  24272800
     2017-09-18  8.05  8.13  8.03   8.06    200.76  24867600
     2017-09-19  8.03  8.06  7.94   8.00    433.76  54253100
     2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
     2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600,
                 Open  High   Low  Close  Turnover      Volume
     Date                                                     
     2017-09-15   NaN   NaN   NaN    NaN       NaN         NaN
     2017-09-18   NaN   NaN   NaN    NaN       NaN         NaN
     2017-09-19   NaN   NaN   NaN    NaN       NaN         NaN
     2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700.0
     2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600.0)
    
    df1.align(df2,join='left')[0]
    Out[70]: 
                Open  High   Low  Close  Turnover    Volume
    Date                                                   
    2017-09-15  8.06  8.08  8.03   8.04    195.43  24272800
    2017-09-18  8.05  8.13  8.03   8.06    200.76  24867600
    2017-09-19  8.03  8.06  7.94   8.00    433.76  54253100
    2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
    2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600
    

    6. all() and any(): test whether all elements, or any element, along the selected axis of the DataFrame are truthy (non-zero/non-empty); the return value is Boolean.

    df1.all(axis=0)
    Out[72]: 
    Open        True
    High        True
    Low         True
    Close       True
    Turnover    True
    Volume      True
    dtype: bool
    
    df1.all(axis=1)
    Out[73]: 
    Date
    2017-09-15    True
    2017-09-18    True
    2017-09-19    True
    2017-09-20    True
    2017-09-21    True
    dtype: bool
    
    df1.any()
    Out[74]: 
    Open        True
    High        True
    Low         True
    Close       True
    Turnover    True
    Volume      True
    dtype: bool
    

    7. append(): append the rows of another object to the end of this DataFrame and return a new object.

    df2=df[5:7]
    df2
    Out[93]: 
                Open  High   Low  Close  Turnover    Volume
    Date                                                   
    2017-09-22  8.01  8.10  8.00   8.08    300.13  37212200
    2017-09-25  8.06  8.07  7.97   7.99    262.30  32754500
    
    df1.append(df2)
    Out[94]: 
                Open  High   Low  Close  Turnover    Volume
    Date                                                   
    2017-09-15  8.06  8.08  8.03   8.04    195.43  24272800
    2017-09-18  8.05  8.13  8.03   8.06    200.76  24867600
    2017-09-19  8.03  8.06  7.94   8.00    433.76  54253100
    2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
    2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600
    2017-09-22  8.01  8.10  8.00   8.08    300.13  37212200
    2017-09-25  8.06  8.07  7.97   7.99    262.30  32754500
    
    #a more efficient approach
    >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
    ...           ignore_index=True)
       A
    0  0
    1  1
    2  2
    3  3
    4  4
    

    8. apply(func, axis=0, broadcast=False, raw=False, reduce=None, args=(), **kwds): apply a function along the rows or columns of the DataFrame.

    df1.apply(np.mean,axis=0)
    Out[96]: 
    Open        8.026000e+00
    High        8.086000e+00
    Low         7.988000e+00
    Close       8.034000e+00
    Turnover    2.783660e+02
    Volume      3.467196e+07
    dtype: float64
    
    df1.apply(np.max,axis=1)
    Out[97]: 
    Date
    2017-09-15    24272800.0
    2017-09-18    24867600.0
    2017-09-19    54253100.0
    2017-09-20    39909700.0
    2017-09-21    30056600.0
    dtype: float64
    

    9. applymap(func): apply a function element-wise to the DataFrame.

    df1.applymap(lambda x:'%.3f' %x)
    Out[100]: 
                 Open   High    Low  Close Turnover        Volume
    Date                                                         
    2017-09-15  8.060  8.080  8.030  8.040  195.430  24272800.000
    2017-09-18  8.050  8.130  8.030  8.060  200.760  24867600.000
    2017-09-19  8.030  8.060  7.940  8.000  433.760  54253100.000
    2017-09-20  7.970  8.060  7.950  8.030  319.940  39909700.000
    2017-09-21  8.020  8.100  7.990  8.040  241.940  30056600.000
    

    10. as_blocks() and as_matrix(): convert the DataFrame into a dict of DataFrames keyed by dtype, and into a 2-D array, respectively (see the note after the example).

    df1.as_blocks()
    Out[105]: 
    {'float64':             Open  High   Low  Close  Turnover
     Date                                         
     2017-09-15  8.06  8.08  8.03   8.04    195.43
     2017-09-18  8.05  8.13  8.03   8.06    200.76
     2017-09-19  8.03  8.06  7.94   8.00    433.76
     2017-09-20  7.97  8.06  7.95   8.03    319.94
     2017-09-21  8.02  8.10  7.99   8.04    241.94, 'int64':               Volume
     Date                
     2017-09-15  24272800
     2017-09-18  24867600
     2017-09-19  54253100
     2017-09-20  39909700
     2017-09-21  30056600}
    
    df1.as_matrix()
    Out[106]: 
    array([[  8.06000000e+00,   8.08000000e+00,   8.03000000e+00,
              8.04000000e+00,   1.95430000e+02,   2.42728000e+07],
           [  8.05000000e+00,   8.13000000e+00,   8.03000000e+00,
              8.06000000e+00,   2.00760000e+02,   2.48676000e+07],
           [  8.03000000e+00,   8.06000000e+00,   7.94000000e+00,
              8.00000000e+00,   4.33760000e+02,   5.42531000e+07],
           [  7.97000000e+00,   8.06000000e+00,   7.95000000e+00,
              8.03000000e+00,   3.19940000e+02,   3.99097000e+07],
           [  8.02000000e+00,   8.10000000e+00,   7.99000000e+00,
              8.04000000e+00,   2.41940000e+02,   3.00566000e+07]])
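
    A side note not in the original post: as_blocks() and as_matrix() were deprecated in later pandas releases. On a recent version the usual replacement for as_matrix() is to_numpy():

    # current equivalent of df1.as_matrix()
    arr = df1.to_numpy()   # 2-D numpy array of the underlying values
    arr.shape              # (5, 6) for the five-row frame above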
    

    11. asfreq(freq, method=None, how=None, normalize=False, fill_value=None): convert a time series to the given frequency.

    #create a series with four timestamps at one-minute intervals
    >>> index=pd.date_range('1/1/2017',periods=4,freq='T')
    >>> series=pd.Series([0.0,None,2.0,3.0],index=index)
    >>> df=pd.DataFrame({'S':series})
    >>> df
                           S
    2017-01-01 00:00:00  0.0
    2017-01-01 00:01:00  NaN
    2017-01-01 00:02:00  2.0
    2017-01-01 00:03:00  3.0
    
    #upsample the series to 30-second intervals
    >>> df.asfreq(freq='30S')
                           S
    2017-01-01 00:00:00  0.0
    2017-01-01 00:00:30  NaN
    2017-01-01 00:01:00  NaN
    2017-01-01 00:01:30  NaN
    2017-01-01 00:02:00  2.0
    2017-01-01 00:02:30  NaN
    2017-01-01 00:03:00  3.0
    
    #upsample again with fill_value=5.0; values that existed before upsampling (including the original NaN) are left unchanged, only the newly created slots are filled
    >>> df.asfreq(freq='30S',fill_value=5.0)
                           S
    2017-01-01 00:00:00  0.0
    2017-01-01 00:00:30  5.0
    2017-01-01 00:01:00  NaN
    2017-01-01 00:01:30  5.0
    2017-01-01 00:02:00  2.0
    2017-01-01 00:02:30  5.0
    2017-01-01 00:03:00  3.0
    
    #upsample again, this time with a method: empty values are backfilled with the next value
    >>> df.asfreq(freq='30S',method='bfill')
                           S
    2017-01-01 00:00:00  0.0
    2017-01-01 00:00:30  NaN
    2017-01-01 00:01:00  NaN
    2017-01-01 00:01:30  2.0
    2017-01-01 00:02:00  2.0
    2017-01-01 00:02:30  3.0
    2017-01-01 00:03:00  3.0
    

    12. asof(where, subset=None): for each requested label, return the last row (at or before it) whose values are not NaN.

    >>> df.asof(index[0])
    S    0.0
    Name: 2017-01-01 00:00:00, dtype: float64
    
    >>> df.asof(index)
                           S
    2017-01-01 00:00:00  0.0
    2017-01-01 00:01:00  0.0
    2017-01-01 00:02:00  2.0
    2017-01-01 00:03:00  3.0
    

    13. assign(**kwargs): add new columns to the DataFrame, returning a new object that contains both the original columns and the new ones.

    >>> df=pd.DataFrame({'A':range(1,11),'B':np.random.randn(10)})
    >>> df
        A         B
    0   1  0.540750
    1   2  0.099605
    2   3  0.165043
    3   4 -1.379514
    4   5  0.357865
    5   6 -0.060789
    6   7 -0.544788
    7   8 -0.347995
    8   9  0.372269
    9  10 -0.212716
    
    >>> df.assign(ln_A=lambda x:np.log(x.A))
        A         B      ln_A
    0   1  0.540750  0.000000
    1   2  0.099605  0.693147
    2   3  0.165043  1.098612
    3   4 -1.379514  1.386294
    4   5  0.357865  1.609438
    5   6 -0.060789  1.791759
    6   7 -0.544788  1.945910
    7   8 -0.347995  2.079442
    8   9  0.372269  2.197225
    9  10 -0.212716  2.302585
    
    #assign returns a new object each time; the ln_A column added above is missing here because that result was never saved
    >>> df.assign(abs_B=lambda x:np.abs(x.B))
        A         B     abs_B
    0   1  0.540750  0.540750
    1   2  0.099605  0.099605
    2   3  0.165043  0.165043
    3   4 -1.379514  1.379514
    4   5  0.357865  0.357865
    5   6 -0.060789  0.060789
    6   7 -0.544788  0.544788
    7   8 -0.347995  0.347995
    8   9  0.372269  0.372269
    9  10 -0.212716  0.212716
    

    14. astype(dtype, copy=True, errors='raise', **kwargs): cast the pandas object to the specified dtype.

    >>> ser=pd.Series([5,6],dtype='int32')
    >>> ser
    0    5
    1    6
    dtype: int32
    >>> ser.astype('int64')
    0    5
    1    6
    dtype: int64
    
    #convert to a categorical dtype
    >>> ser.astype('category')
    0    5
    1    6
    dtype: category
    Categories (2, int64): [5, 6]
    >>> 
    
    #convert to an ordered categorical with a custom category list (older pandas syntax; see the sketch below)
    >>> ser.astype('category',ordered=True,categories=[1,2])
    0   NaN
    1   NaN
    dtype: category
    Categories (2, int64): [1 < 2]
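
    The categories=/ordered= keywords to astype() used above belong to older pandas releases; on newer versions the same conversion goes through CategoricalDtype. A sketch, not from the original post:

    from pandas.api.types import CategoricalDtype

    # ordered categorical with an explicit category list; values outside the
    # list (here 5 and 6) become NaN, as in the example above
    cat_type = CategoricalDtype(categories=[1, 2], ordered=True)
    ser.astype(cat_type)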
    

    15. at_time() and between_time(): select the rows at a given time of day, or within a time-of-day range. The daily data here carries a midnight timestamp, so '9:00AM' matches nothing while '00:00' matches every row.

    df1.at_time('9:00AM')
    Out[115]: 
    Empty DataFrame
    Columns: [Open, High, Low, Close, Turnover, Volume]
    Index: []
    
    df1.between_time('9:00AM','9:30AM')
    Out[114]: 
    Empty DataFrame
    Columns: [Open, High, Low, Close, Turnover, Volume]
    Index: []
    
    df1.at_time('00:00AM')
    Out[116]: 
                Open  High   Low  Close  Turnover    Volume
    Date                                                   
    2017-09-15  8.06  8.08  8.03   8.04    195.43  24272800
    2017-09-18  8.05  8.13  8.03   8.06    200.76  24867600
    2017-09-19  8.03  8.06  7.94   8.00    433.76  54253100
    2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
    2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600
    

    16. bfill(axis=None, inplace=False, limit=None, downcast=None) is equivalent to fillna(method='bfill'); a short sketch follows.
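
    The original gives no example here; a minimal sketch of the equivalence on a small made-up Series:

    s = pd.Series([1.0, np.nan, np.nan, 4.0])
    s.bfill()                    # NaNs filled with the next valid value (4.0)
    s.fillna(method='bfill')     # identical result
    s.bfill(limit=1)             # with limit=1 only one NaN per gap is filled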

    17. boxplot(column=None, by=None, ax=None, fontsize=None, rot=0, grid=True, figsize=None, layout=None, return_type=None, **kwds)

    Draw a box plot from the DataFrame's columns, optionally grouped by other columns (a by= sketch follows the example).

    df1.boxplot('Open')
    Out[117]: <matplotlib.axes._subplots.AxesSubplot at 0x20374716860>
    
    df1.boxplot(['Open','Close'])
    Out[118]: <matplotlib.axes._subplots.AxesSubplot at 0x2037477da20>
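
    The optional grouping is not shown above; a sketch of by=, assuming the Date index was parsed as datetimes and using a hypothetical Month column:

    # group the rows by a derived Month column and draw one box per group
    tmp = df1.copy()
    tmp['Month'] = tmp.index.month
    tmp.boxplot(column='Close', by='Month')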
    

    18. mode(axis=0, numeric_only=False): get the mode(s) along the given axis.

    Signature: df.mode(axis=0, numeric_only=False)
    Docstring:
    Gets the mode(s) of each element along the axis selected. Adds a row
    for each mode per label, fills in gaps with nan.
    
    Note that there could be multiple values returned for the selected
    axis (when more than one item share the maximum frequency), which is
    the reason why a dataframe is returned. If you want to impute missing
    values with the mode in a dataframe ``df``, you can just do this:
    ``df.fillna(df.mode().iloc[0])``
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        * 0 or 'index' : get mode of each column
        * 1 or 'columns' : get mode of each row
    numeric_only : boolean, default False
        if True, only apply to numeric columns
    
    import pandas as pd
    df = pd.DataFrame({'A': [1, 2, 1, 1, 1, 2, 3],
                      'B':[3,2,1,7,7,4,4]})
    df.mode()
    Out[1]:
    A   B
    0   1.0 4
    1   NaN 7
    
    df = pd.DataFrame({'A': [1, 2, 1, 1, 1, 2, 3],
                      'B':[3,2,7,7,7,4,4]})
    df.mode()
    Out[2]:
    A   B
    0   1   7
    
    df = pd.DataFrame({'A': [1, 2, 1, 1, 1, 2, 3],
                      'B':[3,1,1,7,7,4,4]})
    df.mode()
    Out[3]:
    A   B
    0   1.0 1
    1   NaN 4
    2   NaN 7
    
    df = pd.DataFrame({'A': [1, 2, 2, 1, 1, 2, 3],
                      'B':[3,2,1,7,6,4,4]})
    df.mode()
    Out[5]:
    A   B
    0   1   4.0
    1   2   NaN
    
    df = pd.DataFrame({'A': [1, 2, 2, 1, 1, 2, 3],
                      'B':[3,2,1,7,7,4,4]})
    df.mode()
    Out[6]:
    A   B
    0   1   4
    1   2   7
    
    df = pd.DataFrame({'A': [1, 4, 2, 1, 3, 2, 3],
                      'B':[3,2,7,7,7,4,4]})
    df.mode()
    Out[8]:
    A   B
    0   1   7.0
    1   2   NaN
    2   3   NaN
    

    19. pandas: computing the per-group mode after a groupby

    Suppose we have a DataFrame and want the mode of column b for every category in column a (a reconstruction of that frame is sketched below).
    dir(df.groupby('a')) shows that the groupby object has no mode method, so df.groupby('a').mode().reset_index() cannot be used directly.

    Note: .reset_index() turns the resulting Series back into a DataFrame.

    Since df.groupby('a') has no mode function, and the aggregation method agg can take external functions, I searched for solutions online, tried them out, and summarise the following approaches here (you can of course jump straight to the last one).
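
    The post does not show the DataFrame being grouped; a small reconstruction consistent with all of the outputs below (group A has the single mode 1, group B has the two tied modes 2 and 3) might be:

    df = pd.DataFrame({'a': ['A', 'A', 'A', 'B', 'B', 'B', 'B'],
                       'b': [ 1,   1,   2,   2,   2,   3,   3 ]})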

    (1) Using scipy.stats.mode(): category B in df has two modes, and the mode returned for B is the smaller of the two.

    >>> from scipy import stats
    >>> df.groupby('a').agg(lambda x: stats.mode(x)[0][0]).reset_index()
       a  b
    0  A  1
    1  B  2
    

    (2) Using value_counts()

    (1) First, what value_counts() does: in the result the index holds the values and the data holds their counts, sorted by count in descending order (with the larger value listed first here when counts tie), so index[0] picks the most frequent value; when two or more values tie for the mode, the larger one is therefore returned.

    >>> ss = pd.Series([1,2,2,3,3])
    >>> ss
    0    1
    1    2
    2    2
    3    3
    4    3
    dtype: int64
    >>> ss.value_counts()
    3    2
    2    2
    1    1
    dtype: int64
    >>> ss.value_counts().index[0]
    3
    

    (2) Applied as the aggregation function after the groupby:

    >>> df.groupby('a').agg(lambda x: x.value_counts().index[0]).reset_index()
       a  b
    0  A  1
    1  B  3
    

    (3) Using pd.Series.mode(): this returns the mode(s) of a Series; when there is more than one mode it returns a list-like result containing all of them.

    >>> df.groupby('a').agg(pd.Series.mode).reset_index()
       a       b
    0  A       1
    1  B  [2, 3]
    

    (4) Using pd.Series.mode() together with np.mean(): the result above is clearly not what I want, but taking the mean of the tied modes as the new "mode" is satisfactory enough, and can be done like this:

    >>> import numpy as np
    >>> df.groupby('a').agg(lambda x: np.mean(pd.Series.mode(x))).reset_index()
      a    b
    0  A  1.0
    1  B  2.5
    

    20. Pandas descriptive statistics functions

    For descriptive statistics, pandas defines the axis argument of its three data objects as follows (a short sketch follows the list below):
    Series: no axis argument
    DataFrame: "index" (axis=0, default), "columns" (axis=1)
    Panel: "items" (axis=0), "major" (axis=1, default), "minor" (axis=2)
    The descriptive statistics functions are:

    df.count()        #number of non-null values
    df.min()          #minimum
    df.max()          #maximum
    df.idxmin()       #position of the minimum, like R's which.min
    df.idxmax()       #position of the maximum, like R's which.max
    df.quantile(0.1)  #10% quantile
    df.sum()          #sum
    df.mean()         #mean
    df.median()       #median
    df.mode()         #mode
    df.var()          #variance
    df.std()          #standard deviation
    df.mad()          #mean absolute deviation
    df.skew()         #skewness
    df.kurt()         #kurtosis
    df.describe()     #several descriptive statistics at once
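
    A minimal sketch of the axis convention above, on a tiny made-up frame (not from the original post):

    tiny = pd.DataFrame({'x': [1, 2, 3], 'y': [10, 20, 30]})
    tiny.sum()          # axis=0 (default): one result per column -> x 6, y 60
    tiny.sum(axis=1)    # axis=1: one result per row -> 11, 22, 33
    tiny.describe()     # count/mean/std/min/quantiles/max for every column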
    
    
    Function        Description
    count           Number of non-null observations
    sum             Sum of values
    mean            Mean of values
    mad             Mean absolute deviation
    median          Arithmetic median of values
    min             Minimum
    max             Maximum
    mode            Mode
    abs             Absolute value
    prod            Product of values
    std             Bessel-corrected sample standard deviation
    var             Unbiased variance
    sem             Standard error of the mean
    skew            Sample skewness (3rd moment)
    kurt            Sample kurtosis (4th moment)
    quantile        Sample quantile (value at %)
    cumsum          Cumulative sum
    cumprod         Cumulative product
    cummax          Cumulative maximum
    cummin          Cumulative minimum
    cov()           Covariance
    corr()          Correlation
    rank()          Rank by values
    pct_change()    Percent change (time series)
    

    21. Notes on using the pandas replace function

    import numpy as np
    import pandas as pd
    df = pd.read_csv('emp.csv')
    df
    
    #replacing values in a Series
    s = df.iloc[2]#take the row at position 2
    #single-value replacement
    s.replace('?',np.nan)#replace ? with np.nan
    s.replace({'?':'NA'})#replace ? with NA
    #multi-value replacement
    s.replace(['?',r'$'],[np.nan,'NA'])#list form
    s.replace({'?':np.nan,'$':'NA'})#dict mapping
    #filling methods, analogous to filling missing values
    s.replace(['?','$'],method='pad')#forward fill
    s.replace(['?','$'],method='ffill')#forward fill
    s.replace(['?','$'],method='bfill')#backward fill
    #the limit parameter caps the number of consecutive fills
    s.replace(['?','$'],method='bfill',limit=1)
    #replacing values in a DataFrame
    #single-value replacement
    df.replace('?',np.nan)#replace ? with np.nan
    df.replace({'?':'NA'})#replace ? with NA
    #single-value replacement restricted to specific columns
    df.replace({'EMPNO':'?'},np.nan)#replace ? in the EMPNO column with np.nan
    df.replace({'EMPNO':'?','ENAME':'.'},np.nan)#replace ? in EMPNO and . in ENAME with np.nan
    #multi-value replacement
    df.replace(['?','.','$'],[np.nan,'NA','None'])#replace ? with np.nan, . with NA, $ with 'None'
    df.replace({'?':'NA','$':None})#replace ? with NA and $ with None
    df.replace(['?','$'],['NA',None])#the same replacement written in list form
    #regex replacement
    df.replace(r'\?|\.|\$',np.nan,regex=True)#replace a literal ?, . or $ with np.nan
    df.replace([r'\?',r'\$'],np.nan,regex=True)#replace ? and $ with np.nan
    df.replace([r'\?',r'\$'],[np.nan,'NA'],regex=True)#replace ? with np.nan and $ with NA
    df.replace(regex={r'\?':None})
    #passing the value parameter explicitly
    df.replace(regex=[r'\?|\.|\$'],value=np.nan)#replace a literal ?, . or $ with np.nan
    
    df['BrandName'].replace(
        to_replace=['ABC', 'AB'],
        value='A',
        inplace=True
    )
    
