if-then和if-then-else逻辑实现
import pandas as pd
import numpy as np
df = pd.DataFrame(
{'AAA': [4,5,6,7], 'BBB': [10,20,30,40], 'CCC': [100,50,-30,-50] })
# loc函数
# if-then 的逻辑在某列上,【选取满足的条件的行数据,然后对指定的列赋值】
df.loc[df.AAA >= 5, 'BBB'] = -1
df.loc[df.AAA >= 5,['BBB','CCC']] = 555
df.loc[df.AAA < 5,['BBB','CCC']] = 2000
# where函数
# 根据每个位置元素的true或false,去赋值,true原值不变,false改变
df_mask = pd.DataFrame({'AAA' : [True] * 4, 'BBB' : [False] * 4,'CCC' : [True,False] * 2})
# df_mask的'AAA'列全是true,'BBB'列全是false,'CCC'是true,false,true,false
df.where(df_mask, -1000)
Out[6]:
AAA BBB CCC
0 4 -1000 2000
1 5 -1000 -1000
2 6 -1000 555
3 7 -1000 -1000
# 利用numpy中的where函数,实现if-then-else逻辑
df = pd.DataFrame(
{'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]})
# 'AAA'列如果大于5,赋值某列(新列)为high,否则赋值low
df['logic'] = np.where(df['AAA']>5, 'high', 'low')
筛选出某列满足条件的数据
df = pd.DataFrame(
{'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]})
Out[9]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50
# 筛选出某列满足条件的数据
dflow = df[df.AAA <= 5]
dflow
Out[10]:
AAA BBB CCC
0 4 10 100
1 5 20 50
dfhigh = df[df.AAA > 5]
dfhigh
Out[11]:
AAA BBB CCC
2 6 30 -30
3 7 40 -50
筛选出多列满足条件的数据
df = pd.DataFrame(
{'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]})
# 利用&表示同时存在关系,| 表示或关系,~ 表示取反的关系
newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA']
newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA']
# 对筛选出满足条件的数据赋值
df.loc[(df['BBB'] > 25) | (df['CCC'] >= 75), 'AAA'] = 0.1
df
Out[15]:
AAA BBB CCC
0 0.1 10 100
1 5.0 20 50
2 0.1 30 -30
3 0.1 40 -50
# 对某列运算并排序
aValue = 43.0
df.loc[(df.CCC-aValue).abs().argsort()]
Out[18]:
AAA BBB CCC
1 5 20 50 # 7
0 4 10 100 # 57
2 6 30 -30 # abs(-73)
3 7 40 -50 # abs(-93)
#
df = pd.DataFrame(
....: {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df
....:
Out[19]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50
Crit1 = df.AAA <= 5.5 # [True, True, False, False]
Crit2 = df.BBB == 10.0 # [True, False, False, False]
Crit3 = df.CCC > -40.0 # [True, True, True, False]
AllCrit = Crit1 & Crit2 & Crit3 # [True, False, False, False]
df[AllCrit]
Out[25]:
AAA BBB CCC
0 4 10 100
# 或者下面实现方式
CritList = [Crit1,Crit2,Crit3]
AllCrit = functools.reduce(lambda x,y: x & y, CritList)
df[AllCrit]
Out[26]:
AAA BBB CCC
0 4 10 100
############
df[(df.AAA <= 6) & (df.index.isin([0,2,4]))]
Out[28]:
AAA BBB CCC
0 4 10 100
2 6 30 -30
###################
df[~((df.AAA <= 6) & (df.index.isin([0,2,4])))]
Out[38]:
AAA BBB CCC
1 5 20 50
3 7 40 -50
切片用法
df['AAA'] # 正确
df.loc['AAA'] # 错误, 因为loc函数放进去的是index,而不是columns,注意与前面讲的筛选条件的用法。
data = {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}
df = pd.DataFrame(data=data,index=['foo','bar','boo','kar'])
df
Out[30]:
AAA BBB CCC
foo 4 10 100
bar 5 20 50
boo 6 30 -30
kar 7 40 -50
# loc函数的切片用法
df.loc['bar':'kar'] #Label
Out[31]:
AAA BBB CCC
bar 5 20 50
boo 6 30 -30
kar 7 40 -50
# iloc函数与loc函数不同,一个是位置切片(不包含尾边界),一个是定义的index切片(包含尾边界)
df.iloc[0:3]
Out[32]:
AAA BBB CCC
foo 4 10 100
bar 5 20 50
boo 6 30 -30
panels
rng = pd.date_range('1/1/2013',periods=100,freq='D')
data = np.random.randn(100, 4)
cols = ['A','B','C','D']
df1, df2, df3 = pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols)
pf = pd.Panel({'df1':df1,'df2':df2,'df3':df3})
pf
Out[43]:
<class 'pandas.core.panel.Panel'>
Dimensions: 3 (items) x 100 (major_axis) x 4 (minor_axis)
Items axis: df1 to df3
Major_axis axis: 2013-01-01 00:00:00 to 2013-04-10 00:00:00
Minor_axis axis: A to D
# 增加多一列,由原来的3*100*4到3*100*5
pf.loc[:,:,'F'] = pd.DataFrame(data, rng, cols);pf
Out[44]:
<class 'pandas.core.panel.Panel'>
Dimensions: 3 (items) x 100 (major_axis) x 5 (minor_axis)
Items axis: df1 to df3
Major_axis axis: 2013-01-01 00:00:00 to 2013-04-10 00:00:00
Minor_axis axis: A to F
创建新列
df = pd.DataFrame(
{'AAA' : [1,2,1,3], 'BBB' : [1,1,2,2], 'CCC' : [2,1,3,1]})
source_cols = df.columns
new_cols = [str(x) + "_cat" for x in source_cols]
# 建立一个字典
categories = {1 : 'Alpha', 2 : 'Beta', 3 : 'Charlie' }
# 多个列中元素,通过字典一一映射,注意使用字典的函数get通过键获取对应值
df[new_cols] = df[source_cols].applymap(categories.get)
df
Out[49]:
AAA BBB CCC AAA_cat BBB_cat CCC_cat
0 1 1 2 Alpha Alpha Beta
1 2 1 1 Beta Alpha Alpha
2 1 2 3 Alpha Beta Charlie
3 3 2 1 Charlie Beta Alpha
分组,组内使用函数
df = pd.DataFrame(
{'AAA' : [1,1,1,2,2,2,3,3], 'BBB' : [2,1,3,4,5,1,2,3]})
# 根据列'AAA'分组,取每组的最小值,注意idxmin是取组内最小值的索引
df.loc[df.groupby("AAA")["BBB"].idxmin()]
Out[51]:
AAA BBB
1 1 1
5 2 1
6 3 2
# 或者用第二种方法 ,先根据列'BBB'排序,然后以列'AAA'分组,取第一位值即可。
df.sort_values(by="BBB").groupby("AAA", as_index=False).first()
多索引
df = pd.DataFrame({'row' : [0,1,2],
'One_X' : [1.1,1.1,1.1],
'One_Y' : [1.2,1.2,1.2],
'Two_X' : [1.11,1.11,1.11],
'Two_Y' : [1.22,1.22,1.22]});
df
Out[53]:
row One_X One_Y Two_X Two_Y
0 0 1.1 1.2 1.11 1.22
1 1 1.1 1.2 1.11 1.22
2 2 1.1 1.2 1.11 1.22
# 将某一列作为索引
df = df.set_index('row');
df
Out[54]:
One_X One_Y Two_X Two_Y
row
0 1.1 1.2 1.11 1.22
1 1.1 1.2 1.11 1.22
2 1.1 1.2 1.11 1.22
# 改变原df的列名,从而列名成为嵌套列名,比如列名不再简单是['a','b'],而是这种格式[('a', 'aa'), ('b', 'bb')],也就是多列名形式,和多索引形式类似
df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) for c in df.columns]);
df
Out[55]:
One Two
X Y X Y
row
0 1.1 1.2 1.11 1.22
1 1.1 1.2 1.11 1.22
2 1.1 1.2 1.11 1.22
# 使用stack函数,参数0表示最外层列变成索引,在索引里是内层位置,
# reset_index,参数为1表示选取最外层做索引,从而内层索引变成一列
df = df.stack(0).reset_index(1);df
Out[56]:
level_1 X Y
row
0 One 1.10 1.20
0 Two 1.11 1.22
1 One 1.10 1.20
1 Two 1.11 1.22
2 One 1.10 1.20
2 Two 1.11 1.22
#
df.columns = ['Sample','All_X','All_Y'];
df
Out[57]:
Sample All_X All_Y
row
0 One 1.10 1.20
0 Two 1.11 1.22
1 One 1.10 1.20
1 Two 1.11 1.22
2 One 1.10 1.20
2 Two 1.11 1.22
算术
cols = pd.MultiIndex.from_tuples([ (x,y) for x in ['A','B','C'] for y in ['O','I']])
df = pd.DataFrame(np.random.randn(2,6),index=['n','m'],columns=cols); df
Out[59]:
A B C
O I O I O I
n 1.920906 -0.388231 -2.314394 0.665508 0.402562 0.399555
m -1.765956 0.850423 0.388054 0.992312 0.744086 -0.739776
# df中所有元素除以对应df['C'],两两对应相处
df = df.div(df['C'],level=1); df
Out[60]:
A B C
O I O I O I
n 4.771702 -0.971660 -5.749162 1.665625 1.0 1.0
m -2.373321 -1.149568 0.521518 -1.341367 1.0 1.0
多索引切片
coords = [('AA','one'),('AA','six'),('BB','one'),('BB','two'),('BB','six')]
index = pd.MultiIndex.from_tuples(coords)
df = pd.DataFrame([11,22,33,44,55],index,['MyData']);
df
Out[63]:
MyData
AA one 11
six 22
BB one 33
two 44
six 55
# 获取多索引中'BB'索引,且Level=0情况,也就是最内层
df.xs('BB',level=0,axis=0) #Note : level and axis are optional, and default to zero
Out[64]:
MyData
one 33
two 44
six 55
# 同上,参数取值不一样而已
df.xs('six',level=1,axis=0)
Out[65]:
MyData
AA 22
BB 55
# 搞个复杂一点的,看看多索引,多列名的切片取值
index = list(itertools.product(['Ada','Quinn','Violet'],['Comp','Math','Sci']))
headr = list(itertools.product(['Exams','Labs'],['I','II']))
indx = pd.MultiIndex.from_tuples(index,names=['Student','Course'])
cols = pd.MultiIndex.from_tuples(headr) #Notice these are un-named
data = [[70+x+y+(x*y)%3 for x in range(4)] for y in range(9)]
df = pd.DataFrame(data,indx,cols);
df
Out[71]:
Exams Labs
I II I II
Student Course
Ada Comp 70 71 72 73
Math 71 73 75 74
Sci 72 75 75 75
Quinn Comp 73 74 75 76
Math 74 76 78 77
Sci 75 78 78 78
Violet Comp 76 77 78 79
Math 77 79 81 80
Sci 78 81 81 81
All = slice(None)
df.loc['Violet']
Out[73]:
Exams Labs
I II I II
Course
Comp 76 77 78 79
Math 77 79 81 80
Sci 78 81 81 81
df.loc[(All,'Math'),All]
Out[74]:
Exams Labs
I II I II
Student Course
Ada Math 71 73 75 74
Quinn Math 74 76 78 77
Violet Math 77 79 81 80
df.loc[(slice('Ada','Quinn'),'Math'),All]
Out[75]:
Exams Labs
I II I II
Student Course
Ada Math 71 73 75 74
Quinn Math 74 76 78 77
df.loc[(All,'Math'),('Exams')]
Out[76]:
I II
Student Course
Ada Math 71 73
Quinn Math 74 76
Violet Math 77 79
df.loc[(All,'Math'),(All,'II')]
Out[77]:
Exams Labs
II II
Student Course
Ada Math 73 74
Quinn Math 76 77
Violet Math 79 80
网友评论