美文网首页
8. 日月光华 Python数据分析-Pandas-合并数据集

8. 日月光华 Python数据分析-Pandas-合并数据集

作者: 薛东弗斯 | 来源:发表于2023-07-08 10:08 被阅读0次
    import pandas as pd
    import numpy as np
    
    data1 = pd.DataFrame({'one':['a','b','a','a','c','b'],'two':range(6)})    # 列明分别为one/two 键值即是列名
    data2 = pd.DataFrame({'one':['a','b','c','d'],'two':range(10,14)})
    
    data1
    #   one two
    # 0 a   0
    # 1 b   1
    # 2 a   2
    # 3 a   3
    # 4 c   4
    # 5 b   5
    
    data2
    #   one two
    # 0 a   10
    # 1 b   11
    # 2 c   12
    # 3 d   13
    
    data = pd.merge(data1, data2, on='one', how='right')  
    # pd.merge 列与列之间进行横向合并
    # on='one' 以one列为基准,根据one这一列进行合并
    # how='right'  以右边的data2为准
    # how=’left‘ 以坐标的data1为准
    # how=outer  并集,没有的值用NA值不错
    # how=inner 交集
    
    data       # how=’right‘,以右边的data2中的one这一列为准。data1对应为准没有的部分以NaN填充
    #   one two_x   two_y
    # 0 a   0.0 10
    # 1 a   2.0 10
    # 2 a   3.0 10
    # 3 b   1.0 11
    # 4 b   5.0 11
    # 5 c   4.0 12
    # 6 d   NaN 13
    
    data = pd.merge(data1, data2, on='one', how='left')   # 以左边为准,最后只保留data1中one这一列的值
    data
    # one   two_x   two_y
    # 0 a   0   10
    # 1 b   1   11
    # 2 a   2   10
    # 3 a   3   10
    # 4 c   4   12
    # 5 b   5   11
    
    data1 = pd.DataFrame({'one1':['a','b','a','a','c','b'],'two1':range(6)})
    data2 = pd.DataFrame({'one2':['a','b','c','d'],'two2':range(10,14)})
    
    data1
    #   one1    two1
    # 0 a   0
    # 1 b   1
    # 2 a   2
    # 3 a   3
    # 4 c   4
    # 5 b   5
    
    data2
    #   one2    two2
    # 0 a   10
    # 1 b   11
    # 2 c   12
    # 3 d   13
    
    data = pd.merge(data1, data2, left_on='one1', right_on='one2')   # 默认是inner的合并,因此d这一行就欸有体现了
    data
    #   one1    two1    one2    two2
    # 0 a   0   a   10
    # 1 a   2   a   10
    # 2 a   3   a   10
    # 3 b   1   b   11
    # 4 b   5   b   11
    # 5 c   4   c   12
    
    data = pd.merge(data1, data2, left_on='two1', right_index=True, how='outer')   # 根据index进行合并
    data
    #   one1    two1    one2    two2
    # 0 a                 0 a   10.0
    # 1 b                 1 b   11.0
    # 2 a                 2 c   12.0
    # 3 a                 3 d   13.0
    # 4 c                    4  NaN NaN
    # 5 b                 5 NaN NaN
    
    data = pd.merge(data1, data2, left_on='two1', right_index=True) 
    data
    #   one1    two1    one2    two2
    # 0 a   0   a   10
    # 1 b   1   b   11
    # 2 a   2   c   12
    # 3 a   3   d   13
    

    join,要求两个dataframe的index行数必须相等

    data1
    #   one1    two1
    # 0 a   0
    # 1 b   1
    # 2 a   2
    # 3 a   3
    
    data2
    #   one2    two2
    # 0 a   10
    # 1 b   11
    # 2 c   12
    # 3 d   13
    
    data1.join(data2)   # 要求data1 data2 index相同(如果不同,以data1为准,针对data2没有的部分补充NaN),且没有重叠的列
    #   one1    two1    one2    two2
    # 0 a   0   a   10
    # 1 b   1   b   11
    # 2 a   2   c   12
    # 3 a   3   d   13
    

    assign 快速添加列

    data1
    #   one1    two1
    # 0 a   0
    # 1 b   1
    # 2 a   2
    # 3 a   3
    
    data = data1.assign(three=np.arange(4))   # 添加一列,列名为three,该列对应的行数必须与data1相同
    data
    #   one1    two1    three
    # 0 a   0   0
    # 1 b   1   1
    # 2 a   2   2
    # 3 a   3   3
    

    纵向合并,增加行

    data1 = pd.DataFrame(np.random.randn(3,4), columns = ['a','b','c','d'])
    data2 = pd.DataFrame(np.random.randn(2,3), columns = ['b','d','a'])
    data1
    #         a                b                        c                    d
    # 0 0.151172            -0.173789   -0.484290   0.078217
    # 1 -0.775621   -0.317611   -0.739203   -1.568968
    # 2 -0.445466   1.300033    0.438614    -1.535612
    
    data2
    #         b             d                   a
    # 0 0.538695            -0.081372   1.291073
    # 1 -2.111805   -0.443657   -0.112507
    
    data = pd.concat([data1, data2], ignore_index=True)
    data
    #   a                     b                 c                   d
    # 0 0.151172            -0.173789   -0.484290   0.078217
    # 1 -0.775621   -0.317611   -0.739203   -1.568968
    # 2 -0.445466   1.300033            0.438614            -1.535612
    # 3 1.291073            0.538695        NaN                 -0.081372
    # 4 -0.112507   -2.111805   NaN                 -0.443657
    

    相关文章

      网友评论

          本文标题:8. 日月光华 Python数据分析-Pandas-合并数据集

          本文链接:https://www.haomeiwen.com/subject/oqrdudtx.html