美文网首页
pandas基础教程——Day2

pandas基础教程——Day2

作者: XaviSong | 来源:发表于2020-08-19 16:54 被阅读0次

本章内容

  1. 数据处理
  2. 数据合并

一、数据处理

通过dropna()函数滤除缺失数据:
# Every snippet in this tutorial uses these two aliases, so import them first.
import numpy as np
import pandas as pd

# 1. Missing values in a Series
se = pd.Series([4, np.nan, 8, np.nan, 5])
print(se)
print(se.dropna())  # returns a new Series without the NaN entries
print(se.isnull())  # boolean mask: True where the value is NaN
'''
0    4.0
1    NaN
2    8.0
3    NaN
4    5.0
dtype: float64
0    4.0
2    8.0
4    5.0
dtype: float64
0    False
1     True
2    False
3     True
4    False
dtype: bool
'''

# Boolean indexing with notnull() filters out the NaN entries as well:
print(se[se.notnull()])
'''
0    4.0
2    8.0
4    5.0
dtype: float64
'''
# 2. Missing values in a DataFrame
df1 = pd.DataFrame([[1, 2, 3], [np.nan, np.nan, 2], [np.nan, np.nan, np.nan], [8, 8, np.nan]])
print(df1)
'''
     0    1    2
0  1.0  2.0  3.0
1  NaN  NaN  2.0
2  NaN  NaN  NaN
3  8.0  8.0  NaN
'''

# By default dropna() removes every row that contains at least one NaN:
print(df1.dropna())
'''
     0    1    2
0  1.0  2.0  3.0
'''

# how='all' removes only the rows in which every value is NaN:
print(df1.dropna(how='all'))  # the default is how='any': a single NaN drops the row
'''
     0    1    2
0  1.0  2.0  3.0
1  NaN  NaN  2.0
3  8.0  8.0  NaN
'''

# axis=1 filters columns instead of rows:
print(df1.dropna(axis=1, how="all"))

# thresh=n keeps the rows that have at least n non-NaN values:
print(df1.dropna(thresh=1))
'''
     0    1    2
0  1.0  2.0  3.0
1  NaN  NaN  2.0
3  8.0  8.0  NaN
'''

# Fill with a constant. fillna() returns a new object; df1 itself is unchanged.
print(df1.fillna(0))
print(df1)

# inplace=True modifies the object it is called on. This is demonstrated on a
# copy so that df1 keeps its NaN values for the examples that follow.
df1_zero_filled = df1.copy()
df1_zero_filled.fillna(0, inplace=True)
print(df1_zero_filled)

# A dict fills each column with its own constant
print(df1.fillna({0: 10, 1: 20, 2: 30}))  # keys 0, 1, 2 are the column labels
'''
      0     1     2
0   1.0   2.0   3.0
1  10.0  20.0   2.0
2  10.0  20.0  30.0
3   8.0   8.0  30.0
'''
# Fill with the column means
print(df1.fillna(df1.mean()))
# Fill a single column: assign the filled column back. An in-place fillna on
# df1.iloc[:, 1] returns None (so printing it shows "None") and acts on a
# selection of df1 rather than on df1 directly.
df1[1] = df1[1].fillna(5)
print(df1)
'''
     0    1    2
0  1.0  2.0  3.0
1  NaN  5.0  2.0
2  NaN  5.0  NaN
3  8.0  8.0  NaN
'''
填充缺失数据:ffill与bfill
# Forward / backward filling of missing values
df2 = pd.DataFrame(np.random.randint(0, 10, (5, 5)))
# Columns 3 and 4 are about to receive NaN, which an integer column cannot
# hold, so convert them to float explicitly instead of relying on upcasting.
df2 = df2.astype({3: float, 4: float})
df2.iloc[1:4, 3] = np.nan
df2.iloc[2:4, 4] = np.nan
print(df2)
# ffill() propagates the previous valid value forward; bfill() pulls the next
# valid value backward. fillna(method=...) is deprecated in recent pandas, so
# the dedicated methods are used here.
df2_bfilled = df2.bfill()
print(df2_bfilled)
'''
   0  1  2    3    4
0  9  9  3  9.0  9.0
1  9  2  8  6.0  3.0
2  9  5  5  6.0  8.0
3  6  3  7  6.0  8.0
4  9  0  0  6.0  8.0
'''

# limit=n caps how many consecutive NaN values are filled:
df2_bfill_one = df2.bfill(limit=1)
print(df2_bfill_one)
'''
   0  1  2    3    4
0  2  7  0  4.0  7.0
1  3  0  4  NaN  4.0
2  6  3  8  NaN  NaN
3  1  3  2  4.0  3.0
4  9  5  5  4.0  3.0
'''

# axis=1 fills along each row (left to right) instead of down each column:
df2_row_ffill_one = df2.ffill(limit=1, axis=1)
print(df2_row_ffill_one)
'''
     0    1    2    3    4
0  8.0  1.0  6.0  0.0  2.0
1  3.0  8.0  2.0  2.0  5.0
2  2.0  5.0  2.0  2.0  NaN
3  5.0  5.0  4.0  4.0  NaN
4  2.0  8.0  8.0  8.0  3.0
'''
移除重复数据
# Sample frame in which rows 1 and 6 repeat row 0, and row 4 repeats row 3.
df1 = pd.DataFrame({'A': [1, 1, 1, 2, 2, 3, 1], 'B': list("aabbbca")})
print(df1)

# Flag every row that repeats an earlier row (True marks the duplicate).
print(df1.duplicated())
'''
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
'''

# Drop all fully duplicated rows.
print(df1.drop_duplicates())
'''
   A  B
0  1  a
2  1  b
3  2  b
5  3  c
'''

# Compare only the listed columns when looking for duplicates.
print(df1.drop_duplicates(subset=['A']))
'''
   A  B
0  1  a
3  2  b
5  3  c
'''

# Keep the last occurrence of each duplicate group instead of the first.
print(df1.drop_duplicates(subset=['A'], keep='last'))
'''
   A  B
4  2  b
5  3  c
6  1  a
'''

# De-duplicate and modify df1 itself; afterwards df1 holds the rows below.
df1.drop_duplicates(subset=['A', 'B'], inplace=True)
'''
   A  B
0  1  a
2  1  b
3  2  b
5  3  c
'''

二、数据合并

join:按行索引(index)对齐,把多个DataFrame的列拼接到一起
import pandas as pd
# Two frames whose indexes overlap only on the label 'c'.
df3 = pd.DataFrame({'Red': [1, 3, 5], 'Green': [5, 0, 3]}, index=['a', 'b', 'c'])
df4 = pd.DataFrame({'Blue': [1, 9, 8], 'Yellow': [6, 6, 7]}, index=['c', 'd', 'e'])
print(df3)
'''
   Red  Green
a    1      5
b    3      0
c    5      3
'''
print(df4)
'''
   Blue  Yellow
c     1       6
d     9       6
e     8       7
'''

# Left join: keep the index labels of df3.
df3.join(other=df4, how='left')
'''
    Red Green   Blue    Yellow
a   1   5       NaN     NaN
b   3   0       NaN     NaN
c   5   3       1.0     6.0
'''
# Right join: keep the index labels of df4.
df3.join(other=df4, how='right')
'''
    Red Green   Blue    Yellow
c   5.0 3.0     1       6
d   NaN NaN     9       6
e   NaN NaN     8       7   
'''
# Outer join: keep the union of both indexes.
df3.join(other=df4, how='outer')
'''
    Red Green   Blue    Yellow
a   1.0 5.0     NaN     NaN
b   3.0 0.0     NaN     NaN
c   5.0 3.0     1.0     6.0
d   NaN NaN     9.0     6.0
e   NaN NaN     8.0     7.0
'''
# Several frames can be joined at once by passing a list.
df5 = pd.DataFrame({'Brown': [3, 4, 5], 'white': [1, 1, 2]}, index=['a', 'e', 'd'])
df3.join(other=[df4, df5])
'''
Red  Green  Blue  Yellow  Brown  white
a  1.0    5.0   NaN     NaN    3.0    1.0
b  3.0    0.0   NaN     NaN    NaN    NaN
c  5.0    3.0   1.0     6.0    NaN    NaN
'''
merge:按列(键)的取值进行连接,类似SQL中的join
# Teachers table, indexed by staff number.
df1 = pd.DataFrame(
    {
        '名字': list('ABCDE'),
        '性别': ['男', '女', '男', '男', '女'],
        '职称': ['副教授', '讲师', '助教', '教授', '助教'],
    },
    index=range(1001, 1006),
)
df1.columns.name = '学院老师'
df1.index.name = '编号'
print(df1)
'''
学院老师 名字 性别   职称
编号             
1001  A  男  副教授
1002  B  女   讲师
1003  C  男   助教
1004  D  男   教授
1005  E  女   助教
'''

# Courses table; teacher A appears twice and teacher X is not in df1.
df2 = pd.DataFrame(
    {
        '名字': list('ABDAX'),
        '课程': ['C++', '计算机导论', '汇编', '数据结构', '马克思原理'],
        '职称': ['副教授', '讲师', '教授', '副教授', '讲师'],
    },
    index=[1001, 1002, 1004, 1001, 3001],
)
df2.columns.name = '课程'
df2.index.name = '编号'
print(df2)
'''
课程   名字     课程        职称
编号                 
1001    A       C++       副教授
1002    B       计算机导论  讲师
1004    D       汇编       教授
1001    A       数据结构    副教授
3001    X       马克思原理  讲师
'''

# With no arguments, the columns that share a name in both frames become the
# join keys and the join type is how='inner'.
print(df1.merge(df2))
'''
 名字 性别   职称     课程
0  A  男  副教授    C++
1  A  男  副教授   数据结构
2  B  女   讲师  计算机导论
3  D  男   教授     汇编
'''

# Join on one named column; suffixes tell the other shared column apart.
print(df1.merge(df2, on='名字', suffixes=['_1', '_2']))
# Choose the join type explicitly.
df1.merge(df2, how='left')
df1.merge(df2, how='right')
df1.merge(df2, how='outer')
# Join on several key columns.
df1.merge(df2, on=['职称', '名字'])
轴向连接-Concat
# Two Series whose indexes overlap only on the label 'b'.
s1 = pd.Series([1, 2], index=list('ab'))
s2 = pd.Series([3, 4, 5], index=list('bde'))
# Default axis=0: stack one Series after the other; duplicate labels are kept.
print(pd.concat([s1, s2]))
'''
a    1
b    2
b    3
d    4
e    5
dtype: int64
'''
# axis=1: place the Series side by side as columns, aligned on the index.
print(pd.concat([s1, s2], axis=1))
'''
     0    1
a  1.0  NaN
b  2.0  3.0
d  NaN  4.0
e  NaN  5.0
'''

# join='inner' keeps only the labels present in every input.
# concat() accepts only 'inner' or 'outer' (the default) for join.
print(pd.concat([s1, s2], axis=1, join='inner'))
'''
   0  1
b  2  3
'''
# Keep a chosen set of index labels. The join_axes argument was removed from
# concat() in pandas 1.0; reindexing the result gives the same table.
concat_abc = pd.concat([s1, s2], axis=1).reindex(list('abc'))
print(concat_abc)
'''
     0    1
a  1.0  NaN
b  2.0  3.0
c  NaN  NaN
'''
# keys= builds a hierarchical index with one outer level per input.
print(pd.concat([s1, s2], keys=['A', 'B']))
'''
A  a    1
   b    2
B  b    3
   d    4
   e    5
dtype: int64
'''
# When concatenating side by side (axis=1), the keys become the column names.
print(pd.concat([s1, s2], keys=['A', 'D'], axis=1))
'''
     A    D
a  1.0  NaN
b  2.0  3.0
d  NaN  4.0
e  NaN  5.0
'''
DataFrame对象之间的连接:
# Two frames that share neither index labels nor column names.
df3 = pd.DataFrame({'red': [1, 3, 5], 'Green': [5, 0, 3]}, index=list('abd'))
df4 = pd.DataFrame({'Blue': [1, 9], 'Yellow': [6, 6]}, index=list('ce'))
print(df3)
print(df4)
'''
   red  Green
a    1      5
b    3      0
d    5      3
   Blue  Yellow
c     1       6
e     9       6
'''

# Stack the rows. sort=True orders the combined column labels, which is what
# the output below shows; since pandas 1.0 the default (sort=False) keeps the
# columns in order of appearance instead.
stacked = pd.concat([df3, df4], sort=True)
print(stacked)
'''
   Blue  Green  Yellow  red
a   NaN    5.0     NaN  1.0
b   NaN    0.0     NaN  3.0
d   NaN    3.0     NaN  5.0
c   1.0    NaN     6.0  NaN
e   9.0    NaN     6.0  NaN
'''
# Side by side with keys: each key becomes the outer level of the columns.
# sort=True orders the combined row index, matching the output below.
side_by_side = pd.concat([df3, df4], axis=1, keys=['A', 'B'], sort=True)
print(side_by_side)
'''
     A          B       
   red Green Blue Yellow
a  1.0   5.0  NaN    NaN
b  3.0   0.0  NaN    NaN
c  NaN   NaN  1.0    6.0
d  5.0   3.0  NaN    NaN
e  NaN   NaN  9.0    6.0
'''

# Passing a dict gives the same hierarchical columns: its keys act as `keys`.
from_dict = pd.concat({'A': df3, 'B': df4}, axis=1, sort=True)
print(from_dict)
'''
     A          B       
   red Green Blue Yellow
a  1.0   5.0  NaN    NaN
b  3.0   0.0  NaN    NaN
c  NaN   NaN  1.0    6.0
d  5.0   3.0  NaN    NaN
e  NaN   NaN  9.0    6.0
'''

相关文章

网友评论

      本文标题:pandas基础教程——Day2

      本文链接:https://www.haomeiwen.com/subject/ojkdjktx.html