selecting and indexing data
一、Construction of Series and DataFrame
只列举常用操作,具体内容请参见 pandas 官方文档。
from pandas import DataFrame,Series
# Build a Series from a plain list; pandas supplies the default 0..n-1 index.
x1 = Series(list(range(1, 5)))
# Same values, this time with explicit string labels as the index.
x2 = Series(list(range(1, 5)), index=list('abcd'))
# A dict's keys become the index and its values become the data.
Dict = dict(zip('abcd', range(1, 5)))
x3 = Series(Dict)
for series in (x1, x2, x3):
    print(series)
0 1
1 2
2 3
3 4
dtype: int64
a 1
b 2
c 3
d 4
dtype: int64
a 1
b 2
c 3
d 4
dtype: int64
# DataFrame usage
from pandas import Series, DataFrame

data = {
    'Chinese': [66, 95, 93, 90, 80],
    'Math': [30, 98, 96, 77, 90],
    'English': [65, 85, 92, 88, 90],
}
# Without an explicit index the rows get the default integer labels.
df1 = DataFrame(data)
# Same data with explicit row labels and an explicit column order.
df2 = DataFrame(
    data,
    index=['ZhangFei', 'GuanYu', 'LiuBei', 'DianWei', 'XuChu'],
    columns=['Chinese', 'Math', 'English'],
)
print(df1)
print(df2)

# Rename the columns (modifies df2 itself).
column_names = {'Chinese': '语文', 'English': '英语', 'Math': '数学'}
df2.rename(columns=column_names, inplace=True)

# Cell-by-cell check for missing values.
print(df2.isna())
# Summary of df2: count, mean, standard deviation, min, max and quartiles.
print(df2.describe())
Chinese Math English
0 66 30 65
1 95 98 85
2 93 96 92
3 90 77 88
4 80 90 90
Chinese Math English
ZhangFei 66 30 65
GuanYu 95 98 85
LiuBei 93 96 92
DianWei 90 77 88
XuChu 80 90 90
语文 数学 英语
ZhangFei False False False
GuanYu False False False
LiuBei False False False
DianWei False False False
XuChu False False False
语文 数学 英语
count 5.000000 5.000000 5.000000
mean 84.800000 78.200000 84.000000
std 11.987493 28.163807 10.931606
min 66.000000 30.000000 65.000000
25% 80.000000 77.000000 85.000000
50% 90.000000 90.000000 88.000000
75% 93.000000 96.000000 90.000000
max 95.000000 98.000000 92.000000
二、indexing
import pandas as pd
import numpy as np
# Eight consecutive daily timestamps starting on 28 March 2020.
# The start date is written in ISO 8601 form (YYYY-MM-DD) so that it is parsed
# unambiguously; a day-first string such as '28/3/2020' relies on the parser
# guessing the field order and can trigger a date-parsing warning.
datas = pd.date_range('2020-03-28', periods=8)
# 8x4 frame of random normal samples, rows labelled by the dates above.
df = pd.DataFrame(data=np.random.randn(8, 4), index=datas, columns=['A', 'B', 'C', 'D'])
print(df)
a = df['A']  # selecting a single column with [] returns a Series
print()
print(a)
print()
# Look up one element of the Series by its index label (the sixth date).
print(a[datas[5]])
A B C D
2020-03-28 0.040467 2.502838 1.750702 0.752269
2020-03-29 -0.591560 -0.607783 0.390446 0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
2020-03-31 2.107192 1.267613 -0.421305 -0.286911
2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
2020-04-02 -0.668462 -0.668671 0.525188 -0.134835
2020-04-03 -1.193671 2.798576 3.594377 -0.016369
2020-04-04 1.592835 -0.351965 1.728636 0.547841
2020-03-28 0.040467
2020-03-29 -0.591560
2020-03-30 -0.201351
2020-03-31 2.107192
2020-04-01 -0.263185
2020-04-02 -0.668462
2020-04-03 -1.193671
2020-04-04 1.592835
Freq: D, Name: A, dtype: float64
-0.6684622593090315
print(df)
# Passing a list of labels to [] selects those columns (here A and B).
ab_columns = df[['A', 'B']]
print(ab_columns)
# A slice inside [] selects rows; negative bounds and a step are supported too.
first_three_rows = df[:3]
print(first_three_rows)
A B C D
2020-03-28 0.040467 2.502838 1.750702 0.752269
2020-03-29 -0.591560 -0.607783 0.390446 0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
2020-03-31 2.107192 1.267613 -0.421305 -0.286911
2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
2020-04-02 -0.668462 -0.668671 0.525188 -0.134835
2020-04-03 -1.193671 2.798576 3.594377 -0.016369
2020-04-04 1.592835 -0.351965 1.728636 0.547841
A B
2020-03-28 0.040467 2.502838
2020-03-29 -0.591560 -0.607783
2020-03-30 -0.201351 -1.134350
2020-03-31 2.107192 1.267613
2020-04-01 -0.263185 -0.974481
2020-04-02 -0.668462 -0.668671
2020-04-03 -1.193671 2.798576
2020-04-04 1.592835 -0.351965
A B C D
2020-03-28 0.040467 2.502838 1.750702 0.752269
2020-03-29 -0.591560 -0.607783 0.390446 0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
# Use the loc indexer to select by a row/column label combination;
# on a Series it selects rows.
print(df)
ab_by_label = df.loc[:, ['A', 'B']]
print(ab_by_label)
# This form is an error: print(df[[:],['A','B']])
A B C D
2020-03-28 0.040467 2.502838 1.750702 0.752269
2020-03-29 -0.591560 -0.607783 0.390446 0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
2020-03-31 2.107192 1.267613 -0.421305 -0.286911
2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
2020-04-02 -0.668462 -0.668671 0.525188 -0.134835
2020-04-03 -1.193671 2.798576 3.594377 -0.016369
2020-04-04 1.592835 -0.351965 1.728636 0.547841
A B
2020-03-28 0.040467 2.502838
2020-03-29 -0.591560 -0.607783
2020-03-30 -0.201351 -1.134350
2020-03-31 2.107192 1.267613
2020-04-01 -0.263185 -0.974481
2020-04-02 -0.668462 -0.668671
2020-04-03 -1.193671 2.798576
2020-04-04 1.592835 -0.351965
利用索引修改 Series 和 DataFrame 的值
# Small labelled Series holding 0..3 under the labels a..d.
sa = pd.Series(data=range(4), index=list('abcd'))
# Work on a copy so the assignments that follow leave df untouched.
dfa = df.copy()
for obj in (sa, dfa):
    print(obj)
a 0
b 1
c 2
d 3
dtype: int64
A B C D
2020-03-28 0.040467 2.502838 1.750702 0.752269
2020-03-29 -0.591560 -0.607783 0.390446 0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
2020-03-31 2.107192 1.267613 -0.421305 -0.286911
2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
2020-04-02 -0.668462 -0.668671 0.525188 -0.134835
2020-04-03 -1.193671 2.798576 3.594377 -0.016369
2020-04-04 1.592835 -0.351965 1.728636 0.547841
# Overwrite a single Series element through its label.
sa.loc['a'] = 100
print(sa)
# Replace the whole column A with the row positions 0..n-1.
row_count = len(dfa.index)
dfa['A'] = list(range(row_count))
print(dfa)
a 100
b 1
c 2
d 3
dtype: int64
A B C D
2020-03-28 0 2.502838 1.750702 0.752269
2020-03-29 1 -0.607783 0.390446 0.759989
2020-03-30 2 -1.134350 -0.912988 -0.313645
2020-03-31 3 1.267613 -0.421305 -0.286911
2020-04-01 4 -0.974481 -0.655299 -0.672898
2020-04-02 5 -0.668671 0.525188 -0.134835
2020-04-03 6 2.798576 3.594377 -0.016369
2020-04-04 7 -0.351965 1.728636 0.547841
x = pd.DataFrame({'y': [1, 2, 3], 'z': [4, 5, 6]})
print(x)
# Positional row slice: x.iloc[1:] selects the same rows as x[1:].
print(x.iloc[1:])
# Assigning a list to one positional row overwrites that row, column by column.
x.iloc[1] = [20, 50]
print(x)
y z
0 1 4
1 2 5
2 3 6
y z
1 2 5
2 3 6
y z
0 1 4
1 20 50
2 3 6
# Boolean indexing
print(df)
# Comparing the frame with a scalar yields a frame of booleans of the same shape.
positive_mask = df > 0
print(positive_mask)
A B C D
2020-03-28 0.040467 2.502838 1.750702 0.752269
2020-03-29 -0.591560 -0.607783 0.390446 0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
2020-03-31 2.107192 1.267613 -0.421305 -0.286911
2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
2020-04-02 -0.668462 -0.668671 0.525188 -0.134835
2020-04-03 -1.193671 2.798576 3.594377 -0.016369
2020-04-04 1.592835 -0.351965 1.728636 0.547841
A B C D
2020-03-28 True True True True
2020-03-29 False False True True
2020-03-30 False False False False
2020-03-31 True True False False
2020-04-01 False False False False
2020-04-02 False False True False
2020-04-03 False True True False
2020-04-04 True False True True
# Column A as a Series.
dfA = df['A']
print(dfA)
# The same comparison on a Series yields a boolean Series.
a_is_positive = dfA > 0
print(a_is_positive)
2020-03-28 0.040467
2020-03-29 -0.591560
2020-03-30 -0.201351
2020-03-31 2.107192
2020-04-01 -0.263185
2020-04-02 -0.668462
2020-04-03 -1.193671
2020-04-04 1.592835
Freq: D, Name: A, dtype: float64
2020-03-28 True
2020-03-29 False
2020-03-30 False
2020-03-31 True
2020-04-01 False
2020-04-02 False
2020-04-03 False
2020-04-04 True
Freq: D, Name: A, dtype: bool
print(df)
# One row selected by its date label; the result is a Series indexed by column name.
row_0329 = df.loc['2020-03-29']
print(row_0329)
# Use that row's sign as a boolean mask over the columns:
# keep only the columns whose value on 2020-03-29 is positive.
positive_on_0329 = row_0329 > 0
print(df.loc[:, positive_on_0329])
A B C D
2020-03-28 0.040467 2.502838 1.750702 0.752269
2020-03-29 -0.591560 -0.607783 0.390446 0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
2020-03-31 2.107192 1.267613 -0.421305 -0.286911
2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
2020-04-02 -0.668462 -0.668671 0.525188 -0.134835
2020-04-03 -1.193671 2.798576 3.594377 -0.016369
2020-04-04 1.592835 -0.351965 1.728636 0.547841
A -0.591560
B -0.607783
C 0.390446
D 0.759989
Name: 2020-03-29 00:00:00, dtype: float64
C D
2020-03-28 1.750702 0.752269
2020-03-29 0.390446 0.759989
2020-03-30 -0.912988 -0.313645
2020-03-31 -0.421305 -0.286911
2020-04-01 -0.655299 -0.672898
2020-04-02 0.525188 -0.134835
2020-04-03 3.594377 -0.016369
2020-04-04 1.728636 0.547841
|- iloc 与 loc 的差别:
loc 的索引和切片只能通过 index 和 columns 中对应的标签值(或布尔数组)进行;
而 iloc 按位置工作:行和列都从 0 开始编号,因此可以直接使用整数下标
进行索引和切片。例子如下:
# Series of even numbers labelled a..d.
S1 = pd.Series([0, 2, 4, 6], index=['a', 'b', 'c', 'd'])
print(S1)
# 4x6 random frame: string row labels, integer column labels 2, 4, ..., 12.
row_labels = ['a', 'b', 'c', 'd']
column_labels = [2, 4, 6, 8, 10, 12]
df1 = pd.DataFrame(data=np.random.randn(4, 6), index=row_labels, columns=column_labels)
print(df1)
a 0
b 2
c 4
d 6
dtype: int64
2 4 6 8 10 12
a -2.445292 0.048598 0.050947 -0.713184 2.017222 -1.389391
b 1.909918 -1.212520 0.552249 1.115173 -0.024809 -0.192347
c -0.776439 0.877586 0.569017 -1.741527 -0.022756 0.154204
d 0.537282 2.366709 -0.606037 0.860133 -0.707234 -0.297887
# iloc slices by position: everything from the second element onwards.
tail_of_s1 = S1.iloc[1:]
print(tail_of_s1)
# This does not work: print(S1.loc[1:])
# Positional block: rows 1..3 and columns 2..4.
block_by_position = df1.iloc[1:4, 2:5]
print(block_by_position)
# This does not work: print(df1.loc[1:4,2:5])
# Label-based slice that is equivalent to the iloc selection above.
block_by_label = df1.loc['b':, 6:10]
print(block_by_label)
b 2
c 4
d 6
dtype: int64
6 8 10
b 0.552249 1.115173 -0.024809
c 0.569017 -1.741527 -0.022756
d -0.606037 0.860133 -0.707234
6 8 10
b 0.552249 1.115173 -0.024809
c 0.569017 -1.741527 -0.022756
d -0.606037 0.860133 -0.707234
GitHub
https://github.com/luozekun1230/MyPyhonProgram/tree/master/Pandas
网友评论