美文网首页
Pandas——数据框DataFrame

Pandas——数据框DataFrame

作者: d1b0f55d8efb | 来源:发表于2018-06-01 18:19 被阅读0次

    DataFrame是一个类似表格的数据结构,索引包括列索引和行索引,包含有一组有序的列,每列可以是不同的值类型(数值、字符串、布尔值等)。DataFrame的每一行和每一列都是一个Series,这个Series的name属性为当前的行索引名/列索引名。

    使用字典生成DataFrame

    #使用字典生成DataFrame
    from pandas import DataFrame
    data = DataFrame({'state':['ok', 'ok', 'good', 'bad'],
            'year':[2000, 2001, 2002, 2003],
            'pop':[3.7, 3.6, 2.4, 0.9]})
    print (data )# 行索引index默认为0,1,2,3 
      state  year  pop
    0    ok  2000  3.7
    1    ok  2001  3.6
    2  good  2002  2.4
    3   bad  2003  0.9
    #指定列索引columns,不匹配的列为NaN
    print (DataFrame(data, columns = ['year', 'state', 'pop','debt']))
       year state  pop debt
    0  2000    ok  3.7  NaN
    1  2001    ok  3.6  NaN
    2  2002  good  2.4  NaN
    3  2003   bad  0.9  NaN
    #指定行索引index
    x = DataFrame(data,
                        columns = ['year', 'state', 'pop', 'debt'],
                        index = ['one', 'two', 'three', 'four'])
    print(x)
           year state  pop debt
    one    2000    ok  3.7  NaN
    two    2001    ok  3.6  NaN
    three  2002  good  2.4  NaN
    four   2003   bad  0.9  NaN
    #按列访问
    print(DataFrame(data)['state'])
    0      ok
    1      ok
    2    good
    3     bad
    Name: state, dtype: object
    

    DataFrame元素的索引与修改

    #原数据框
           year state  pop debt
    one    2000    ok  3.7  NaN
    two    2001    ok  3.6  NaN
    three  2002  good  2.4  NaN
    four   2003   bad  0.9  NaN
    
    import numpy
    print(x['state'])
    one        ok
    two        ok
    three    good
    four      bad
    Name: state, dtype: object
    # 修改一整列数据
    x['debt'] = 16.5
    print(x)
           year state  pop  debt
    one    2000    ok  3.7  16.5
    two    2001    ok  3.6  16.5
    three  2002  good  2.4  16.5
    four   2003   bad  0.9  16.5
    # 用numpy数组修改元素
    x.debt = numpy.arange(4)
    print(x)
           year state  pop  debt
    one    2000    ok  3.7     0
    two    2001    ok  3.6     1
    three  2002  good  2.4     2
    four   2003   bad  0.9     3
    #用Series修改元素(需先 from pandas import Series);按行索引对齐,Series中没有对应行索引的位置填充NaN
    val = Series([-1.2, -1.5, -1.7,0], index = ['one', 'two', 'five','six']) 
    x.debt = val # DataFrame的行索引不变
    print(x)
           year state  pop  debt
    one    2000    ok  3.7  -1.2
    two    2001    ok  3.6  -1.5
    three  2002  good  2.4   NaN
    four   2003   bad  0.9   NaN
    
    #增加一行
    x.loc[len(x)]=[2,3,4,5]
    print(x)
           year state  pop  debt
    one    2000    ok  3.7  -1.2
    two    2001    ok  3.6  -1.5
    three  2002  good  2.4   NaN
    four   2003   bad  0.9   NaN
    4         2     3  4.0   5.0
    #增加一列
    x['newColumn']=[1,1,1,1,1]
    print(x)
           year state  pop  debt  newColumn
    one    2000    ok  3.7  -1.2          1
    two    2001    ok  3.6  -1.5          1
    three  2002  good  2.4   NaN          1
    four   2003   bad  0.9   NaN          1
    4         2     3  4.0   5.0          1
    #DataFrame转置
    print(x.T)
               one  two three four  4
    year      2000 2001  2002 2003  2
    state       ok   ok  good  bad  3
    pop        3.7  3.6   2.4  0.9  4
    debt      -1.2 -1.5   NaN  NaN  5
    newColumn    1    1     1    1  1
    

    DataFrame算术:不重叠部分为NaN,重叠部分元素运算

    x = DataFrame(numpy.arange(9.).reshape((3, 3)),
                    columns = ['A','B','C'],
                    index = ['a', 'b', 'c'])
    y = DataFrame(numpy.arange(12).reshape((4, 3)),
                    columns = ['A','B','C'],
                    index = ['a', 'b', 'c', 'd'])
    print(x)
         A    B    C
    a  0.0  1.0  2.0
    b  3.0  4.0  5.0
    c  6.0  7.0  8.0
    print(y)
       A   B   C
    a  0   1   2
    b  3   4   5
    c  6   7   8
    d  9  10  11
    print(x+y)
          A     B     C
    a   0.0   2.0   4.0
    b   6.0   8.0  10.0
    c  12.0  14.0  16.0
    d   NaN   NaN   NaN
    
    #DataFrame与Series运算:每行/列进行运算
    frame = DataFrame(numpy.arange(9).reshape((3, 3)),
                      columns = ['A','B','C'],
                      index = ['a', 'b', 'c'])
    print(frame)
       A  B  C
    a  0  1  2
    b  3  4  5
    c  6  7  8
    series=frame.ix[0]
    print(series)
    A    0
    B    1
    C    2
    print(frame-series)
       A  B  C
    a  0  0  0
    b  3  3  3
    c  6  6  6
    # 按行运算:缺失列则为NaN
    series2 = Series(range(4), index = ['A','B','C','D'])
    print(series2)
    A    0
    B    1
    C    2
    D    3
    print (frame + series2 )
       A  B   C   D
    a  0  2   4 NaN
    b  3  5   7 NaN
    c  6  8  10 NaN
    series3 = frame.A
    print(series3)
    a    0
    b    3
    c    6
    Name: A, dtype: int64
     
    print(frame.sub(series3,axis=0))#按列运算:每一列都减去series3(按行索引对齐)
       A  B  C
    a  0  1  2
    b  0  1  2
    c  0  1  2
    

    额外运算

    df = DataFrame({
        'column1': numpy.random.randn(5),
        'column2': numpy.random.randn(5)
    })
    print(df)
        column1   column2
    0 -0.336839 -0.420312
    1 -1.172474  0.671025
    2 -0.481245  0.292897
    3  1.335457 -1.167297
    4 -0.170178  0.140632
    #每列最小
    print(df.apply(min))
    column1   -1.172474
    column2   -1.167297
    dtype: float64
    #每行最小
    print(df.apply(min, axis=1))
    0   -0.420312
    1   -1.172474
    2   -0.481245
    3   -1.167297
    4   -0.170178
    dtype: float64
    #判断每一行(axis=1)的值是否都大于0
    print(df.apply(
        lambda x: numpy.all(x>0),
        axis=1
    ))
    0    False
    1    False
    2    False
    3    False
    4    False
    dtype: bool
    #筛选出所有值都大于0的行(注意:randn生成的随机数每次运行都不同,下面的示例输出与上文打印的df不是同一次运行的结果)
    print(DataFrame(df[df.apply(
        lambda x: numpy.all(x>0),
        axis=1
    )]))
        column1   column2
    3  0.826535  0.415204
    

    相关文章

      网友评论

          本文标题:Pandas——数据框DataFrame

          本文链接:https://www.haomeiwen.com/subject/xfpksftx.html