本章内容
- 数据处理
- 数据合并
一、数据处理
通过dropna()函数滤除缺失数据:
# 1. Handling a Series
# numpy and pandas are imported here because nothing earlier in this script
# brings `np` / `pd` into scope; without them the first statement below
# fails with NameError.
import numpy as np
import pandas as pd

se = pd.Series([4, np.nan, 8, np.nan, 5])
print(se)
# dropna() returns a new Series without the missing entries.
print(se.dropna())
# isnull() returns a boolean mask that is True where a value is missing.
print(se.isnull())
'''
0 4.0
1 NaN
2 8.0
3 NaN
4 5.0
dtype: float64
0 4.0
2 8.0
4 5.0
dtype: float64
0 False
1 True
2 False
3 True
4 False
dtype: bool
'''
# Indexing with a boolean Series filters out the missing values as well:
print(se[se.notnull()])
'''
0 4.0
2 8.0
4 5.0
dtype: float64
'''
# 2. Handling a DataFrame
df1 = pd.DataFrame([
    [1, 2, 3],
    [np.nan, np.nan, 2],
    [np.nan, np.nan, np.nan],
    [8, 8, np.nan],
])
print(df1)
'''
0 1 2
0 1.0 2.0 3.0
1 NaN NaN 2.0
2 NaN NaN NaN
3 8.0 8.0 NaN
'''
# By default dropna() removes every row that contains at least one NaN:
print(df1.dropna())
'''
0 1 2
0 1.0 2.0 3.0
'''
# how='all' removes only the rows in which every value is NaN
# (the default is how='any': one NaN is enough to drop the row):
print(df1.dropna(how='all'))
'''
0 1 2
0 1.0 2.0 3.0
1 NaN NaN 2.0
3 8.0 8.0 NaN
'''
# axis=1 filters columns instead of rows:
print(df1.dropna(axis=1, how="all"))
# thresh=n keeps the rows that have at least n non-NaN values:
print(df1.dropna(thresh=1))
'''
0 1 2
0 1.0 2.0 3.0
1 NaN NaN 2.0
3 8.0 8.0 NaN
'''
# fillna() with a constant returns a new object; df1 itself is unchanged,
# which the second print shows.
print(df1.fillna(0))
print(df1)
# inplace=True modifies the object it is called on. It is demonstrated on a
# copy so that df1 keeps its NaNs: the examples below (and their sample
# outputs) only make sense while the missing values are still present.
df1_zero_filled = df1.copy()
df1_zero_filled.fillna(0, inplace=True)
print(df1_zero_filled)
# A dict fills each column with its own constant (keys 0, 1, 2 are column labels):
print(df1.fillna({0: 10, 1: 20, 2: 30}))
'''
0 1 2
0 1.0 2.0 3.0
1 10.0 20.0 2.0
2 10.0 20.0 30.0
3 8.0 8.0 30.0
'''
# Fill every column with that column's mean:
print(df1.fillna(df1.mean()))
# Fill a single column. Assign the filled column back instead of calling
# fillna(inplace=True) on the selection df1.iloc[:, 1]: that call returns
# None and operates on a temporary selection, so df1 is not reliably updated.
df1.iloc[:, 1] = df1.iloc[:, 1].fillna(5)
print(df1)
'''
0 1 2
0 1.0 2.0 3.0
1 NaN 5.0 2.0
2 NaN 5.0 NaN
3 8.0 8.0 NaN
'''
填充缺失数据:ffill与bfill
# Directional filling: ffill() copies the previous valid value forward and
# bfill() copies the next valid value backward. These replace the older
# fillna(method='ffill' / 'bfill') spelling, which is deprecated.
# NOTE: the data is random and the sample outputs below show different
# numbers from one another, so they were evidently captured from separate runs.
df2 = pd.DataFrame(np.random.randint(0, 10, (5, 5)))
# Columns 3 and 4 are about to receive NaN; cast them to float explicitly
# rather than relying on pandas silently upcasting an integer column.
df2 = df2.astype({3: float, 4: float})
df2.iloc[1:4, 3] = np.nan
df2.iloc[2:4, 4] = np.nan
print(df2)
# Fill each NaN with the next valid value below it:
print(df2.bfill())
'''
0 1 2 3 4
0 9 9 3 9.0 9.0
1 9 2 8 6.0 3.0
2 9 5 5 6.0 8.0
3 6 3 7 6.0 8.0
4 9 0 0 6.0 8.0
'''
# limit=n fills at most n consecutive NaNs per gap:
print(df2.bfill(limit=1))
'''
0 1 2 3 4
0 2 7 0 4.0 7.0
1 3 0 4 NaN 4.0
2 6 3 8 NaN NaN
3 1 3 2 4.0 3.0
4 9 5 5 4.0 3.0
'''
# axis=1 fills along each row (left to right for ffill) instead of down columns:
print(df2.ffill(limit=1, axis=1))
'''
0 1 2 3 4
0 8.0 1.0 6.0 0.0 2.0
1 3.0 8.0 2.0 2.0 5.0
2 2.0 5.0 2.0 2.0 NaN
3 5.0 5.0 4.0 4.0 NaN
4 2.0 8.0 8.0 8.0 3.0
'''
移除重复数据
df1 = pd.DataFrame({
    'A': [1, 1, 1, 2, 2, 3, 1],
    'B': ['a', 'a', 'b', 'b', 'b', 'c', 'a'],
})
print(df1)
# duplicated() flags each row that repeats an earlier row (True = duplicate).
print(df1.duplicated())
'''
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
'''
# Drop rows that are duplicates across all columns:
print(df1.drop_duplicates())
'''
A B
0 1 a
2 1 b
3 2 b
5 3 c
'''
# Only consider column 'A' when looking for duplicates:
print(df1.drop_duplicates(subset=['A']))
'''
A B
0 1 a
3 2 b
5 3 c
'''
# keep='last' retains the last row of each duplicate group instead of the first:
print(df1.drop_duplicates(subset=['A'], keep='last'))
'''
A B
4 2 b
5 3 c
6 1 a
'''
# inplace=True removes the duplicates from df1 itself (nothing is printed here):
df1.drop_duplicates(subset=['A', 'B'], inplace=True)
'''
A B
0 1 a
2 1 b
3 2 b
5 3 c
'''
二、数据合并
join:按行索引对齐,把另一个DataFrame的列合并进来
import pandas as pd
df3 = pd.DataFrame(
    {'Red': [1, 3, 5], 'Green': [5, 0, 3]},
    index=['a', 'b', 'c'],
)
df4 = pd.DataFrame(
    {'Blue': [1, 9, 8], 'Yellow': [6, 6, 7]},
    index=['c', 'd', 'e'],
)
print(df3)
'''
Red Green
a 1 5
b 3 0
c 5 3
'''
print(df4)
'''
Blue Yellow
c 1 6
d 9 6
e 8 7
'''
# The join calls below are bare expressions: their results are not printed
# or stored, so the sample outputs are what an interactive session would show.
# how='left' keeps the row labels of df3:
df3.join(df4, how='left')
'''
Red Green Blue Yellow
a 1 5 NaN NaN
b 3 0 NaN NaN
c 5 3 1.0 6.0
'''
# how='right' keeps the row labels of df4:
df3.join(df4, how='right')
'''
Red Green Blue Yellow
c 5.0 3.0 1 6
d NaN NaN 9 6
e NaN NaN 8 7
'''
# how='outer' keeps the union of both sets of row labels:
df3.join(df4, how='outer')
'''
Red Green Blue Yellow
a 1.0 5.0 NaN NaN
b 3.0 0.0 NaN NaN
c 5.0 3.0 1.0 6.0
d NaN NaN 9.0 6.0
e NaN NaN 8.0 7.0
'''
# Several DataFrames can be joined at once by passing a list:
df5 = pd.DataFrame(
    {'Brown': [3, 4, 5], 'white': [1, 1, 2]},
    index=['a', 'e', 'd'],
)
df3.join([df4, df5])
'''
Red Green Blue Yellow Brown white
a 1.0 5.0 NaN NaN 3.0 1.0
b 3.0 0.0 NaN NaN NaN NaN
c 5.0 3.0 1.0 6.0 NaN NaN
'''
merge:根据一个或多个列(键)的值连接两个DataFrame
df1 = pd.DataFrame(
    {
        '名字': ['A', 'B', 'C', 'D', 'E'],
        '性别': ['男', '女', '男', '男', '女'],
        '职称': ['副教授', '讲师', '助教', '教授', '助教'],
    },
    index=range(1001, 1006),
)
df1.columns.name = '学院老师'
df1.index.name = '编号'
print(df1)
'''
学院老师 名字 性别 职称
编号
1001 A 男 副教授
1002 B 女 讲师
1003 C 男 助教
1004 D 男 教授
1005 E 女 助教
'''
df2 = pd.DataFrame(
    {
        '名字': ['A', 'B', 'D', 'A', 'X'],
        '课程': ['C++', '计算机导论', '汇编', '数据结构', '马克思原理'],
        '职称': ['副教授', '讲师', '教授', '副教授', '讲师'],
    },
    index=[1001, 1002, 1004, 1001, 3001],
)
df2.columns.name = '课程'
df2.index.name = '编号'
print(df2)
'''
课程 名字 课程 职称
编号
1001 A C++ 副教授
1002 B 计算机导论 讲师
1004 D 汇编 教授
1001 A 数据结构 副教授
3001 X 马克思原理 讲师
'''
# With no arguments, merge joins on the columns that have the same name in
# both frames and uses how='inner'.
print(pd.merge(df1, df2))
'''
名字 性别 职称 课程
0 A 男 副教授 C++
1 A 男 副教授 数据结构
2 B 女 讲师 计算机导论
3 D 男 教授 汇编
'''
# Join on one named column; suffixes distinguish the remaining columns that
# share a name in both frames.
print(pd.merge(df1, df2, on='名字', suffixes=('_1', '_2')))
# Choose the join type (results are not printed or stored here):
pd.merge(df1, df2, how='left')
pd.merge(df1, df2, how='right')
pd.merge(df1, df2, how='outer')
# Join on several key columns:
pd.merge(df1, df2, on=['职称', '名字'])
轴向连接-Concat
s1 = pd.Series([1, 2], index=list('ab'))
s2 = pd.Series([3, 4, 5], index=list('bde'))
# Default axis=0: the Series are stacked end to end, so label 'b' appears twice.
print(pd.concat([s1, s2]))
'''
a 1
b 2
b 3
d 4
e 5
dtype: int64
'''
# axis=1: the Series become columns, aligned on the union of their labels.
print(pd.concat([s1, s2], axis=1))
'''
0 1
a 1.0 NaN
b 2.0 3.0
d NaN 4.0
e NaN 5.0
'''
# join='inner' keeps only the labels present in every input
# (concat accepts only 'inner' and 'outer' for join).
print(pd.concat([s1, s2], axis=1, join='inner'))
'''
0 1
b 2 3
'''
# Keep a chosen set of labels: reindex the concatenated result. The former
# join_axes argument of concat no longer exists; reindex is its replacement.
concat_on_abc = pd.concat([s1, s2], axis=1).reindex(list('abc'))
print(concat_on_abc)
'''
0 1
a 1.0 NaN
b 2.0 3.0
c NaN NaN
'''
# keys builds a hierarchical index whose outer level names each input:
print(pd.concat([s1, s2], keys=['A', 'B']))
'''
A a 1
b 2
B b 3
d 4
e 5
dtype: int64
'''
# With axis=1 (side-by-side concatenation) the keys become the column names:
print(pd.concat([s1, s2], keys=['A', 'D'], axis=1))
'''
A D
a 1.0 NaN
b 2.0 3.0
d NaN 4.0
e NaN 5.0
'''
DataFrame对象之间的连接:
df3 = pd.DataFrame(
    {'red': [1, 3, 5], 'Green': [5, 0, 3]},
    index=['a', 'b', 'd'],
)
df4 = pd.DataFrame(
    {'Blue': [1, 9], 'Yellow': [6, 6]},
    index=['c', 'e'],
)
print(df3)
print(df4)
'''
red Green
a 1 5
b 3 0
d 5 3
Blue Yellow
c 1 6
e 9 6
'''
# Default axis=0: rows are stacked; columns missing from one frame become NaN.
# NOTE(review): the sample below lists the columns alphabetically; recent
# pandas versions appear to keep first-seen order unless sort=True - confirm.
print(pd.concat([df3, df4]))
'''
Blue Green Yellow red
a NaN 5.0 NaN 1.0
b NaN 0.0 NaN 3.0
d NaN 3.0 NaN 5.0
c 1.0 NaN 6.0 NaN
e 9.0 NaN 6.0 NaN
'''
# axis=1 with keys: each frame's columns are grouped under its key.
print(pd.concat([df3, df4], axis=1, keys=['A', 'B']))
'''
A B
red Green Blue Yellow
a 1.0 5.0 NaN NaN
b 3.0 0.0 NaN NaN
c NaN NaN 1.0 6.0
d 5.0 3.0 NaN NaN
e NaN NaN 9.0 6.0
'''
# Passing a dict gives the same hierarchical columns, using the dict keys:
print(pd.concat({'A': df3, 'B': df4}, axis=1))
'''
A B
red Green Blue Yellow
a 1.0 5.0 NaN NaN
b 3.0 0.0 NaN NaN
c NaN NaN 1.0 6.0
d 5.0 3.0 NaN NaN
e NaN NaN 9.0 6.0
'''
网友评论