Pandas Basic

作者: 闫_锋 | 来源:发表于2018-07-25 11:19 被阅读17次

    转载:
    https://morvanzhou.github.io/tutorials/data-manipulation/np-pd/3-1-pd-intro/

    Pandas基本介绍:

    Pandas基于Numpy构建,带有标签,易于使用。

    两个数据结构:
    Series
    DataFrame

    #Series
    import pandas as pd
    import numpy as np
    # One float NaN among the values: the printed dtype below is float64.
    s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])

    print(s)
    """
    0     1.0
    1     3.0
    2     6.0
    3     NaN
    4    44.0
    5     1.0
    dtype: float64
    """
    
    # DataFrame: six consecutive dates as row labels, four named columns,
    # filled with random normal values (the numbers below are one sample run).
    dates = pd.date_range('20160101', periods=6)
    df = pd.DataFrame(
        data=np.random.randn(6, 4),
        index=dates,
        columns=['a', 'b', 'c', 'd'],
    )

    print(df)
    """
                       a         b         c         d
    2016-01-01 -0.253065 -2.071051 -0.640515  0.613663
    2016-01-02 -1.147178  1.532470  0.989255 -0.499761
    2016-01-03  1.221656 -2.390171  1.862914  0.778070
    2016-01-04  1.473877 -0.046419  0.610046  0.204672
    2016-01-05 -1.584752 -0.700592  1.487264 -1.778293
    2016-01-06  0.633675 -1.414157 -0.277066 -0.442545
    """
    

    DataFrame是表格型数据结构,每一列的元素可以是不同的值类型。DataFrame既有行索引也有列索引。

    # Selecting one column by its label returns a Series.
    print(df['b'])
    """
    2016-01-01   -2.071051
    2016-01-02    1.532470
    2016-01-03   -2.390171
    2016-01-04   -0.046419
    2016-01-05   -0.700592
    2016-01-06   -1.414157
    Freq: D, Name: b, dtype: float64
    """

    # A bare expression only shows its value in an interactive session;
    # print explicitly so the snippet also produces output when run as a
    # script, like the other examples here.
    print(df.index)       # row labels
    print(df.columns)     # column labels
    print(df.values)      # underlying data as a NumPy array
    print(df.describe())  # per-column summary statistics
    
    

    Pandas选择数据

    # Six consecutive dates label the rows; the cells hold 0..23 row by row.
    dates = pd.date_range('20130101', periods=6)
    df = pd.DataFrame(
        data=np.arange(24).reshape((6, 4)),
        index=dates,
        columns=['A', 'B', 'C', 'D'],
    )

    """
                 A   B   C   D
    2013-01-01   0   1   2   3
    2013-01-02   4   5   6   7
    2013-01-03   8   9  10  11
    2013-01-04  12  13  14  15
    2013-01-05  16  17  18  19
    2013-01-06  20  21  22  23
    """

    # Key access and attribute access pick the same column.
    print(df['A'])
    print(df.A)

    """
    2013-01-01     0
    2013-01-02     4
    2013-01-03     8
    2013-01-04    12
    2013-01-05    16
    2013-01-06    20
    Freq: D, Name: A, dtype: int64
    """

    # An integer slice selects rows by position; the end position is excluded.
    print(df[:3])

    """
                A  B   C   D
    2013-01-01  0  1   2   3
    2013-01-02  4  5   6   7
    2013-01-03  8  9  10  11
    """
    
    # Slicing with date strings selects rows by label; unlike a positional
    # slice, both endpoints are included in the result.
    print(df['20130102':'20130104'])

    """
                 A   B   C   D
    2013-01-02   4   5   6   7
    2013-01-03   8   9  10  11
    2013-01-04  12  13  14  15
    """
    

    行标签 loc

    loc 主要通过标签名字选择某一行数据,也可以先选择某一行或所有行(: 代表所有行),再选取其中的某一列或某几列数据。

    # One row selected by its index label; the row is printed as a Series
    # whose entries are keyed by column name.
    print(df.loc['20130102'])
    """
    A    4
    B    5
    C    6
    D    7
    Name: 2013-01-02 00:00:00, dtype: int64
    """
    
    # ':' keeps every row; the list restricts the result to columns A and B.
    print(df.loc[:,['A','B']]) 
    """
                 A   B
    2013-01-01   0   1
    2013-01-02   4   5
    2013-01-03   8   9
    2013-01-04  12  13
    2013-01-05  16  17
    2013-01-06  20  21
    """
    
    # A single row label combined with a column list: only those cells of that row.
    print(df.loc['20130102',['A','B']])
    """
    A    4
    B    5
    Name: 2013-01-02 00:00:00, dtype: int64
    """
    

    位置索引 iloc

    # Row position 3 and column position 1 (both zero-based) address one cell.
    print(df.iloc[3,1])
    # 13
    
    # Positional slices exclude the end: row positions 3-4, column positions 1-2.
    print(df.iloc[3:5,1:3])
    """
                 B   C
    2013-01-04  13  14
    2013-01-05  17  18
    """
    
    # An explicit list of row positions combined with a column slice.
    print(df.iloc[[1,3,5],1:3])
    """
                 B   C
    2013-01-02   5   6
    2013-01-04  13  14
    2013-01-06  21  22
    
    """
    

    混合选择 ix(ix 已在 pandas 1.0 中移除,新代码请改用 loc / iloc)

    # `.ix` (mixed position/label indexing) was removed in pandas 1.0 and now
    # raises AttributeError. Same selection without it: turn the first three
    # row positions into labels via df.index, then pick columns by label.
    print(df.loc[df.index[:3], ['A', 'C']])
    """
                A   C
    2013-01-01  0   2
    2013-01-02  4   6
    2013-01-03  8  10
    """
    

    Pandas设置值

    # Fresh 6x4 integer frame (values 0..23) indexed by six consecutive dates.
    dates = pd.date_range('20130101', periods=6)
    df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

    """
                 A   B   C   D
    2013-01-01   0   1   2   3
    2013-01-02   4   5   6   7
    2013-01-03   8   9  10  11
    2013-01-04  12  13  14  15
    2013-01-05  16  17  18  19
    2013-01-06  20  21  22  23
    """
    # loc & iloc: overwrite a single cell by position, then one by label.
    df.iloc[2, 2] = 1111
    df.loc['20130101', 'B'] = 2222

    """
                 A     B     C   D
    2013-01-01   0  2222     2   3
    2013-01-02   4     5     6   7
    2013-01-03   8     9  1111  11
    2013-01-04  12    13    14  15
    2013-01-05  16    17    18  19
    2013-01-06  20    21    22  23
    """

    # Set values by condition. Use one .loc call with a boolean row mask and
    # a column label so the write lands in df itself. The chained form
    # `df.B[df.A>4] = 0` assigns through an intermediate Series: it emits
    # SettingWithCopyWarning and leaves df unchanged under copy-on-write.
    df.loc[df.A > 4, 'B'] = 0
    """
                 A     B     C   D
    2013-01-01   0  2222     2   3
    2013-01-02   4     5     6   7
    2013-01-03   8     0  1111  11
    2013-01-04  12     0    14  15
    2013-01-05  16     0    18  19
    2013-01-06  20     0    22  23
    """

    # Assigning a scalar creates the column and fills every row with it.
    df['F'] = np.nan
    """
                 A     B     C   D   F
    2013-01-01   0  2222     2   3 NaN
    2013-01-02   4     5     6   7 NaN
    2013-01-03   8     0  1111  11 NaN
    2013-01-04  12     0    14  15 NaN
    2013-01-05  16     0    18  19 NaN
    2013-01-06  20     0    22  23 NaN
    """

    # Assigning a Series matches its values to df's rows by index label,
    # so the Series is given the same date index as df.
    df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130101', periods=6))
    """
                 A     B     C   D   F  E
    2013-01-01   0  2222     2   3 NaN  1
    2013-01-02   4     5     6   7 NaN  2
    2013-01-03   8     0  1111  11 NaN  3
    2013-01-04  12     0    14  15 NaN  4
    2013-01-05  16     0    18  19 NaN  5
    2013-01-06  20     0    22  23 NaN  6
    """
    

    相关文章

      网友评论

        本文标题:Pandas Basic

        本文链接:https://www.haomeiwen.com/subject/arnumftx.html