美文网首页
Pandas03:DataFrame&Series

Pandas03:DataFrame&Series

作者: 罗泽坤 | 来源:发表于2020-03-28 21:07 被阅读0次

    selecting and indexing data

    一、Construction of Series and DataFrame

    只列举出常用操作,具体内容请参见官方文档。
    
    from pandas import DataFrame,Series
    x1 = Series([1,2,3,4])
    x2 = Series(data=[1,2,3,4],index=['a','b','c','d'])
    Dict = {'a':1,'b':2,'c':3,'d':4}
    x3 = Series(Dict)
    print(x1)
    print(x2)
    print(x3)
    
    0    1
    1    2
    2    3
    3    4
    dtype: int64
    a    1
    b    2
    c    3
    d    4
    dtype: int64
    a    1
    b    2
    c    3
    d    4
    dtype: int64
    
    # dataframe使用
    from pandas import Series, DataFrame
    data = {'Chinese': [66, 95, 93, 90,80], 'Math': [30, 98, 96, 77, 90], 'English': [65, 85, 92, 88, 90]}
    df1 = DataFrame(data)
    df2 = DataFrame(data, index=['ZhangFei', 'GuanYu', 'LiuBei', 'DianWei', 'XuChu'], columns=['Chinese', 'Math', 'English'])
    print(df1)
    print(df2)
    
    # 对列名进行更换
    df2.rename(columns={'Chinese': '语文', 'English': '英语', 'Math': '数学'}, inplace = True)
    print(df2.isnull())  #检验是否有缺失值
    # 输出df2的统计概要:计数,均值,标准差,最小值,最大值以及各分位数
    print(df2.describe())
    
       Chinese  Math  English
    0       66    30       65
    1       95    98       85
    2       93    96       92
    3       90    77       88
    4       80    90       90
              Chinese  Math  English
    ZhangFei       66    30       65
    GuanYu         95    98       85
    LiuBei         93    96       92
    DianWei        90    77       88
    XuChu          80    90       90
                 语文     数学     英语
    ZhangFei  False  False  False
    GuanYu    False  False  False
    LiuBei    False  False  False
    DianWei   False  False  False
    XuChu     False  False  False
                  语文         数学         英语
    count   5.000000   5.000000   5.000000
    mean   84.800000  78.200000  84.000000
    std    11.987493  28.163807  10.931606
    min    66.000000  30.000000  65.000000
    25%    80.000000  77.000000  85.000000
    50%    90.000000  90.000000  88.000000
    75%    93.000000  96.000000  90.000000
    max    95.000000  98.000000  92.000000
    

    二、indexing

    import pandas as pd
    import numpy as np
    
    datas = pd.date_range('28/3/2020',periods = 8)
    df = pd.DataFrame(data = np.random.randn(8,4),index=datas,columns=['A','B','C','D'])
    print(df)
    a = df['A']   # 用列名索引DataFrame,得到的是一个Series
    print()
    print(a)
    print()
    print(a[datas[5]])
    
                       A         B         C         D
    2020-03-28  0.040467  2.502838  1.750702  0.752269
    2020-03-29 -0.591560 -0.607783  0.390446  0.759989
    2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
    2020-03-31  2.107192  1.267613 -0.421305 -0.286911
    2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
    2020-04-02 -0.668462 -0.668671  0.525188 -0.134835
    2020-04-03 -1.193671  2.798576  3.594377 -0.016369
    2020-04-04  1.592835 -0.351965  1.728636  0.547841
    
    2020-03-28    0.040467
    2020-03-29   -0.591560
    2020-03-30   -0.201351
    2020-03-31    2.107192
    2020-04-01   -0.263185
    2020-04-02   -0.668462
    2020-04-03   -1.193671
    2020-04-04    1.592835
    Freq: D, Name: A, dtype: float64
    
    -0.6684622593090315
    
    print(df)
    print(df[['A','B']])  # indexing A B 两列
    print(df[:3]) # 用切片选取行(此处为前3行),切片同样支持反向和步长
    
                       A         B         C         D
    2020-03-28  0.040467  2.502838  1.750702  0.752269
    2020-03-29 -0.591560 -0.607783  0.390446  0.759989
    2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
    2020-03-31  2.107192  1.267613 -0.421305 -0.286911
    2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
    2020-04-02 -0.668462 -0.668671  0.525188 -0.134835
    2020-04-03 -1.193671  2.798576  3.594377 -0.016369
    2020-04-04  1.592835 -0.351965  1.728636  0.547841
                       A         B
    2020-03-28  0.040467  2.502838
    2020-03-29 -0.591560 -0.607783
    2020-03-30 -0.201351 -1.134350
    2020-03-31  2.107192  1.267613
    2020-04-01 -0.263185 -0.974481
    2020-04-02 -0.668462 -0.668671
    2020-04-03 -1.193671  2.798576
    2020-04-04  1.592835 -0.351965
                       A         B         C         D
    2020-03-28  0.040467  2.502838  1.750702  0.752269
    2020-03-29 -0.591560 -0.607783  0.390446  0.759989
    2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
    
    # 使用loc索引器按标签同时索引行和列;对Series而言则是按标签索引行
    print(df)
    print(df.loc[:,['A','B']])  
    #此种形式错误print(df[[:],['A','B']])
    
                       A         B         C         D
    2020-03-28  0.040467  2.502838  1.750702  0.752269
    2020-03-29 -0.591560 -0.607783  0.390446  0.759989
    2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
    2020-03-31  2.107192  1.267613 -0.421305 -0.286911
    2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
    2020-04-02 -0.668462 -0.668671  0.525188 -0.134835
    2020-04-03 -1.193671  2.798576  3.594377 -0.016369
    2020-04-04  1.592835 -0.351965  1.728636  0.547841
                       A         B
    2020-03-28  0.040467  2.502838
    2020-03-29 -0.591560 -0.607783
    2020-03-30 -0.201351 -1.134350
    2020-03-31  2.107192  1.267613
    2020-04-01 -0.263185 -0.974481
    2020-04-02 -0.668462 -0.668671
    2020-04-03 -1.193671  2.798576
    2020-04-04  1.592835 -0.351965
    

    利用索引修改series和DataFrame值

    sa = pd.Series(range(4),list('abcd'))
    dfa = df.copy()
    print(sa)
    print(dfa)
    
    a    0
    b    1
    c    2
    d    3
    dtype: int64
                       A         B         C         D
    2020-03-28  0.040467  2.502838  1.750702  0.752269
    2020-03-29 -0.591560 -0.607783  0.390446  0.759989
    2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
    2020-03-31  2.107192  1.267613 -0.421305 -0.286911
    2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
    2020-04-02 -0.668462 -0.668671  0.525188 -0.134835
    2020-04-03 -1.193671  2.798576  3.594377 -0.016369
    2020-04-04  1.592835 -0.351965  1.728636  0.547841
    
    sa['a']=100
    print(sa)
    dfa['A'] = list(range(len(dfa.index)))
    print(dfa)
    
    a    100
    b      1
    c      2
    d      3
    dtype: int64
                A         B         C         D
    2020-03-28  0  2.502838  1.750702  0.752269
    2020-03-29  1 -0.607783  0.390446  0.759989
    2020-03-30  2 -1.134350 -0.912988 -0.313645
    2020-03-31  3  1.267613 -0.421305 -0.286911
    2020-04-01  4 -0.974481 -0.655299 -0.672898
    2020-04-02  5 -0.668671  0.525188 -0.134835
    2020-04-03  6  2.798576  3.594377 -0.016369
    2020-04-04  7 -0.351965  1.728636  0.547841
    
    x = pd.DataFrame({'y':[1,2,3],'z':[4,5,6]})
    print(x)
    print(x.iloc[1:]) # iloc[1:]与DataFrame[1:]等价,都是按位置对行切片
    x.iloc[1]=[20,50]
    print(x)
    
       y  z
    0  1  4
    1  2  5
    2  3  6
       y  z
    1  2  5
    2  3  6
        y   z
    0   1   4
    1  20  50
    2   3   6
    
    #boolean indexing
    print(df)
    print(df>0)  # 返回bool类型
    
                       A         B         C         D
    2020-03-28  0.040467  2.502838  1.750702  0.752269
    2020-03-29 -0.591560 -0.607783  0.390446  0.759989
    2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
    2020-03-31  2.107192  1.267613 -0.421305 -0.286911
    2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
    2020-04-02 -0.668462 -0.668671  0.525188 -0.134835
    2020-04-03 -1.193671  2.798576  3.594377 -0.016369
    2020-04-04  1.592835 -0.351965  1.728636  0.547841
                    A      B      C      D
    2020-03-28   True   True   True   True
    2020-03-29  False  False   True   True
    2020-03-30  False  False  False  False
    2020-03-31   True   True  False  False
    2020-04-01  False  False  False  False
    2020-04-02  False  False   True  False
    2020-04-03  False   True   True  False
    2020-04-04   True  False   True   True
    
    dfA = df['A']
    print(dfA)
    print(dfA>0)
    
    2020-03-28    0.040467
    2020-03-29   -0.591560
    2020-03-30   -0.201351
    2020-03-31    2.107192
    2020-04-01   -0.263185
    2020-04-02   -0.668462
    2020-04-03   -1.193671
    2020-04-04    1.592835
    Freq: D, Name: A, dtype: float64
    2020-03-28     True
    2020-03-29    False
    2020-03-30    False
    2020-03-31     True
    2020-04-01    False
    2020-04-02    False
    2020-04-03    False
    2020-04-04     True
    Freq: D, Name: A, dtype: bool
    
    print(df)
    print(df.loc['2020-03-29'])
    print(df.loc[:,df.loc['2020-03-29']>0])
    
                       A         B         C         D
    2020-03-28  0.040467  2.502838  1.750702  0.752269
    2020-03-29 -0.591560 -0.607783  0.390446  0.759989
    2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
    2020-03-31  2.107192  1.267613 -0.421305 -0.286911
    2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
    2020-04-02 -0.668462 -0.668671  0.525188 -0.134835
    2020-04-03 -1.193671  2.798576  3.594377 -0.016369
    2020-04-04  1.592835 -0.351965  1.728636  0.547841
    A   -0.591560
    B   -0.607783
    C    0.390446
    D    0.759989
    Name: 2020-03-29 00:00:00, dtype: float64
                       C         D
    2020-03-28  1.750702  0.752269
    2020-03-29  0.390446  0.759989
    2020-03-30 -0.912988 -0.313645
    2020-03-31 -0.421305 -0.286911
    2020-04-01 -0.655299 -0.672898
    2020-04-02  0.525188 -0.134835
    2020-04-03  3.594377 -0.016369
    2020-04-04  1.728636  0.547841
    
    |-iloc与loc的差别:
        loc只能通过index和columns中对应的标签值进行索引和切片;
        而iloc使用从0开始的整数位置(下标)来定位行和列,因此可以直接通过下标
        进行索引和切片。例子如下:
    
    S1 = pd.Series(data=list(range(0,8,2)),index=list('abcd'))
    print(S1)
    df1 = pd.DataFrame(data=np.random.randn(4,6),index=list('abcd'),columns=list(range(2,14,2)))
    print(df1)
    
    a    0
    b    2
    c    4
    d    6
    dtype: int64
             2         4         6         8         10        12
    a -2.445292  0.048598  0.050947 -0.713184  2.017222 -1.389391
    b  1.909918 -1.212520  0.552249  1.115173 -0.024809 -0.192347
    c -0.776439  0.877586  0.569017 -1.741527 -0.022756  0.154204
    d  0.537282  2.366709 -0.606037  0.860133 -0.707234 -0.297887
    
    print(S1.iloc[1:])
    # 此方式不行(S1的索引是字符标签,不能用整数对loc切片): print(S1.loc[1:])
    print(df1.iloc[1:4,2:5])
    # 此方式不行(df1的行索引是字符标签,不能用整数对loc的行切片): print(df1.loc[1:4,2:5])
    print(df1.loc['b':,6:10])  # 与上面的iloc写法结果相同;注意loc按标签切片时包含右端点(此处包含列10)
    
    b    2
    c    4
    d    6
    dtype: int64
             6         8         10
    b  0.552249  1.115173 -0.024809
    c  0.569017 -1.741527 -0.022756
    d -0.606037  0.860133 -0.707234
             6         8         10
    b  0.552249  1.115173 -0.024809
    c  0.569017 -1.741527 -0.022756
    d -0.606037  0.860133 -0.707234
    

    GitHub

    https://github.com/luozekun1230/MyPyhonProgram/tree/master/Pandas

    相关文章

      网友评论

          本文标题:Pandas03:DataFrame&Series

          本文链接:https://www.haomeiwen.com/subject/kfxeuhtx.html