import pandas as pd
import numpy as np
data1 = pd.DataFrame({'one':['a','b','a','a','c','b'],'two':range(6)}) # 列明分别为one/two 键值即是列名
data2 = pd.DataFrame({'one':['a','b','c','d'],'two':range(10,14)})
data1
# one two
# 0 a 0
# 1 b 1
# 2 a 2
# 3 a 3
# 4 c 4
# 5 b 5
data2
# one two
# 0 a 10
# 1 b 11
# 2 c 12
# 3 d 13
data = pd.merge(data1, data2, on='one', how='right')
# pd.merge 列与列之间进行横向合并
# on='one' 以one列为基准,根据one这一列进行合并
# how='right' 以右边的data2为准
# how=’left‘ 以坐标的data1为准
# how=outer 并集,没有的值用NA值不错
# how=inner 交集
data # how=’right‘,以右边的data2中的one这一列为准。data1对应为准没有的部分以NaN填充
# one two_x two_y
# 0 a 0.0 10
# 1 a 2.0 10
# 2 a 3.0 10
# 3 b 1.0 11
# 4 b 5.0 11
# 5 c 4.0 12
# 6 d NaN 13
data = pd.merge(data1, data2, on='one', how='left') # 以左边为准,最后只保留data1中one这一列的值
data
# one two_x two_y
# 0 a 0 10
# 1 b 1 11
# 2 a 2 10
# 3 a 3 10
# 4 c 4 12
# 5 b 5 11
data1 = pd.DataFrame({'one1':['a','b','a','a','c','b'],'two1':range(6)})
data2 = pd.DataFrame({'one2':['a','b','c','d'],'two2':range(10,14)})
data1
# one1 two1
# 0 a 0
# 1 b 1
# 2 a 2
# 3 a 3
# 4 c 4
# 5 b 5
data2
# one2 two2
# 0 a 10
# 1 b 11
# 2 c 12
# 3 d 13
data = pd.merge(data1, data2, left_on='one1', right_on='one2') # 默认是inner的合并,因此d这一行就欸有体现了
data
# one1 two1 one2 two2
# 0 a 0 a 10
# 1 a 2 a 10
# 2 a 3 a 10
# 3 b 1 b 11
# 4 b 5 b 11
# 5 c 4 c 12
data = pd.merge(data1, data2, left_on='two1', right_index=True, how='outer') # 根据index进行合并
data
# one1 two1 one2 two2
# 0 a 0 a 10.0
# 1 b 1 b 11.0
# 2 a 2 c 12.0
# 3 a 3 d 13.0
# 4 c 4 NaN NaN
# 5 b 5 NaN NaN
data = pd.merge(data1, data2, left_on='two1', right_index=True)
data
# one1 two1 one2 two2
# 0 a 0 a 10
# 1 b 1 b 11
# 2 a 2 c 12
# 3 a 3 d 13
join,要求两个dataframe的index行数必须相等
data1
# one1 two1
# 0 a 0
# 1 b 1
# 2 a 2
# 3 a 3
data2
# one2 two2
# 0 a 10
# 1 b 11
# 2 c 12
# 3 d 13
data1.join(data2) # 要求data1 data2 index相同(如果不同,以data1为准,针对data2没有的部分补充NaN),且没有重叠的列
# one1 two1 one2 two2
# 0 a 0 a 10
# 1 b 1 b 11
# 2 a 2 c 12
# 3 a 3 d 13
assign 快速添加列
data1
# one1 two1
# 0 a 0
# 1 b 1
# 2 a 2
# 3 a 3
data = data1.assign(three=np.arange(4)) # 添加一列,列名为three,该列对应的行数必须与data1相同
data
# one1 two1 three
# 0 a 0 0
# 1 b 1 1
# 2 a 2 2
# 3 a 3 3
纵向合并,增加行
data1 = pd.DataFrame(np.random.randn(3,4), columns = ['a','b','c','d'])
data2 = pd.DataFrame(np.random.randn(2,3), columns = ['b','d','a'])
data1
# a b c d
# 0 0.151172 -0.173789 -0.484290 0.078217
# 1 -0.775621 -0.317611 -0.739203 -1.568968
# 2 -0.445466 1.300033 0.438614 -1.535612
data2
# b d a
# 0 0.538695 -0.081372 1.291073
# 1 -2.111805 -0.443657 -0.112507
data = pd.concat([data1, data2], ignore_index=True)
data
# a b c d
# 0 0.151172 -0.173789 -0.484290 0.078217
# 1 -0.775621 -0.317611 -0.739203 -1.568968
# 2 -0.445466 1.300033 0.438614 -1.535612
# 3 1.291073 0.538695 NaN -0.081372
# 4 -0.112507 -2.111805 NaN -0.443657
网友评论