pandas笔记
插入pandas
import pandas as pd
import numpy as np
创建序列
s = pd.Series([1,3,6,np.nan,44,1]) #nan = not a number
print(s)
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
创建dataframe
dates = pd.date_range('20160101',periods=6)
print(dates)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
print(df)
print('-------------------------------------------------------------------')
df1 = pd.DataFrame(np.random.randn(6,4))
print(df1)
print('-------------------------------------------------------------------')
df2 = pd.DataFrame({'A':1.,
'B':pd.Timestamp('20200101'),
'C':pd.Series(1,index=list(range(4)),dtype='float32'),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'})
print(df2)
print('-------------------------------------------------------------------')
print(df2.dtypes) #输出类型
print('-------------------------------------------------------------------')
print(df2.index) #输出行索引
print('-------------------------------------------------------------------')
print(df2.columns) #输出列索引
print('-------------------------------------------------------------------')
print(df2.values) #输出值
print('-------------------------------------------------------------------')
print(df2.describe()) #输出常见的统计数据 求和/均值/标准差/最小值...
print('-------------------------------------------------------------------')
print(df2.T) #转置
print('-------------------------------------------------------------------')
print(df2.sort_index(axis=1,ascending=False)) #索引列倒序 axis=0 行倒序
print('-------------------------------------------------------------------')
print(df2.sort_values(by='E')) #值排序
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
'2016-01-05', '2016-01-06'],
dtype='datetime64[ns]', freq='D')
a b c d
2016-01-01 -1.671207 -0.932499 -1.655480 0.769094
2016-01-02 -0.680207 -0.666975 -1.760835 0.337892
2016-01-03 0.426209 1.478592 1.028569 -1.529521
2016-01-04 -0.906435 0.813065 0.638096 -0.283077
2016-01-05 -0.790560 0.233199 -0.730250 0.056732
2016-01-06 0.791728 0.958603 1.275976 1.052596
-------------------------------------------------------------------
0 1 2 3
0 0.280672 1.326906 -1.078034 -1.739482
1 -0.282193 -0.392775 -0.512235 -0.361789
2 -0.402586 0.326123 -0.291029 0.129753
3 -0.607459 0.177432 -0.527313 -0.734037
4 -1.011148 0.069085 -0.324561 -0.687408
5 -0.301261 0.752312 0.567507 -0.432943
-------------------------------------------------------------------
A B C D E F
0 1.0 2020-01-01 1.0 3 test foo
1 1.0 2020-01-01 1.0 3 train foo
2 1.0 2020-01-01 1.0 3 test foo
3 1.0 2020-01-01 1.0 3 train foo
-------------------------------------------------------------------
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
-------------------------------------------------------------------
Int64Index([0, 1, 2, 3], dtype='int64')
-------------------------------------------------------------------
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
-------------------------------------------------------------------
[[1.0 Timestamp('2020-01-01 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2020-01-01 00:00:00') 1.0 3 'train' 'foo']
[1.0 Timestamp('2020-01-01 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2020-01-01 00:00:00') 1.0 3 'train' 'foo']]
-------------------------------------------------------------------
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
-------------------------------------------------------------------
0 1 2 \
A 1 1 1
B 2020-01-01 00:00:00 2020-01-01 00:00:00 2020-01-01 00:00:00
C 1 1 1
D 3 3 3
E test train test
F foo foo foo
3
A 1
B 2020-01-01 00:00:00
C 1
D 3
E train
F foo
-------------------------------------------------------------------
F E D C B A
0 foo test 3 1.0 2020-01-01 1.0
1 foo train 3 1.0 2020-01-01 1.0
2 foo test 3 1.0 2020-01-01 1.0
3 foo train 3 1.0 2020-01-01 1.0
-------------------------------------------------------------------
A B C D E F
0 1.0 2020-01-01 1.0 3 test foo
2 1.0 2020-01-01 1.0 3 test foo
1 1.0 2020-01-01 1.0 3 train foo
3 1.0 2020-01-01 1.0 3 train foo
选择数据
dates = pd.date_range('20200101',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df)
print('-------------------------------------------------------------------')
print(df.A)#打印A列
print('-------------------------------------------------------------------')
print(df[0:2])#0-1行
print('-------------------------------------------------------------------')
print(df['20200102':'20200104'])
print('-------------------------------------------------------------------')
# slect by label:loc
print(df.loc['20200103']) #根据横向标签选择
print('-------------------------------------------------------------------')
print(df.loc[:,['A','B']]) #根据列标签选择
print('-------------------------------------------------------------------')
# select by position:iloc
print(df.iloc[3:,1:2]) #根据索引选择
print('-------------------------------------------------------------------')
print(df.iloc[[1,3,5],1:3])
print('-------------------------------------------------------------------')
# mixed selection:ix 综合loc、iloc
# pandas的1.0.0版本开始,移除了Series.ix and DataFrame.ix 方法
#print(df.ix[:3,['A','C']])
print('-------------------------------------------------------------------')
#Boolean indexing
print(df[df.A<8]) #输出小于8 的行
A B C D
2020-01-01 0 1 2 3
2020-01-02 4 5 6 7
2020-01-03 8 9 10 11
2020-01-04 12 13 14 15
2020-01-05 16 17 18 19
2020-01-06 20 21 22 23
-------------------------------------------------------------------
2020-01-01 0
2020-01-02 4
2020-01-03 8
2020-01-04 12
2020-01-05 16
2020-01-06 20
Freq: D, Name: A, dtype: int32
-------------------------------------------------------------------
A B C D
2020-01-01 0 1 2 3
2020-01-02 4 5 6 7
-------------------------------------------------------------------
A B C D
2020-01-02 4 5 6 7
2020-01-03 8 9 10 11
2020-01-04 12 13 14 15
-------------------------------------------------------------------
A 8
B 9
C 10
D 11
Name: 2020-01-03 00:00:00, dtype: int32
-------------------------------------------------------------------
A B
2020-01-01 0 1
2020-01-02 4 5
2020-01-03 8 9
2020-01-04 12 13
2020-01-05 16 17
2020-01-06 20 21
-------------------------------------------------------------------
B
2020-01-04 13
2020-01-05 17
2020-01-06 21
-------------------------------------------------------------------
B C
2020-01-02 5 6
2020-01-04 13 14
2020-01-06 21 22
-------------------------------------------------------------------
-------------------------------------------------------------------
A B C D
2020-01-01 0 1 2 3
2020-01-02 4 5 6 7
设置值
dates = pd.date_range('20200101',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
df.iloc[2,2]=1111 #根据索引位置修改值
print(df)
print('-------------------------------------------------------------------')
df.loc['20200101','A'] = 2222 #根据标签名修改值
print(df)
print('-------------------------------------------------------------------')
#df[df.A>4]=0 #对A列大于4的行全部赋值为0
print(df)
print('-------------------------------------------------------------------')
df.B[df.A>4]=0 #对A列大于4的数 B列全部赋值为0
print(df)
print('-------------------------------------------------------------------')
df['E'] = np.nan #添加新的列
print(df)
print('-------------------------------------------------------------------')
df['F'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('20200101',periods=6))#索引一定要对齐
print(df)
A B C D
2020-01-01 0 1 2 3
2020-01-02 4 5 6 7
2020-01-03 8 9 1111 11
2020-01-04 12 13 14 15
2020-01-05 16 17 18 19
2020-01-06 20 21 22 23
-------------------------------------------------------------------
A B C D
2020-01-01 2222 1 2 3
2020-01-02 4 5 6 7
2020-01-03 8 9 1111 11
2020-01-04 12 13 14 15
2020-01-05 16 17 18 19
2020-01-06 20 21 22 23
-------------------------------------------------------------------
A B C D
2020-01-01 2222 1 2 3
2020-01-02 4 5 6 7
2020-01-03 8 9 1111 11
2020-01-04 12 13 14 15
2020-01-05 16 17 18 19
2020-01-06 20 21 22 23
-------------------------------------------------------------------
A B C D
2020-01-01 2222 0 2 3
2020-01-02 4 5 6 7
2020-01-03 8 0 1111 11
2020-01-04 12 0 14 15
2020-01-05 16 0 18 19
2020-01-06 20 0 22 23
-------------------------------------------------------------------
A B C D E
2020-01-01 2222 0 2 3 NaN
2020-01-02 4 5 6 7 NaN
2020-01-03 8 0 1111 11 NaN
2020-01-04 12 0 14 15 NaN
2020-01-05 16 0 18 19 NaN
2020-01-06 20 0 22 23 NaN
-------------------------------------------------------------------
A B C D E F
2020-01-01 2222 0 2 3 NaN 1
2020-01-02 4 5 6 7 NaN 2
2020-01-03 8 0 1111 11 NaN 3
2020-01-04 12 0 14 15 NaN 4
2020-01-05 16 0 18 19 NaN 5
2020-01-06 20 0 22 23 NaN 6
处理丢失数据
dates = pd.date_range('20200101',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
df.iloc[0,1]=np.nan #设置丢失数据
df.iloc[1,2]=np.nan
print(df)
#print(df.dropna(axis=0,how='any')) # axis=0按行丢掉 =1按列丢掉 any有一个nan就丢掉 all 全部nan才丢掉
print('-------------------------------------------------------------------')
#print(df.fillna(value=0)) #将NaN填充为0
print('-------------------------------------------------------------------')
print(df.isnull()) #查找是否有缺失数据 返回一个布尔列表
print('-------------------------------------------------------------------')
print(np.any(df.isnull())==True) #判断很大的表格是否有缺失,至少有一个等于true 则返回true
A B C D
2020-01-01 0 NaN 2.0 3
2020-01-02 4 5.0 NaN 7
2020-01-03 8 9.0 10.0 11
2020-01-04 12 13.0 14.0 15
2020-01-05 16 17.0 18.0 19
2020-01-06 20 21.0 22.0 23
-------------------------------------------------------------------
-------------------------------------------------------------------
A B C D
2020-01-01 False True False False
2020-01-02 False False True False
2020-01-03 False False False False
2020-01-04 False False False False
2020-01-05 False False False False
2020-01-06 False False False False
-------------------------------------------------------------------
True
导入导出数据
pandas可读取的文件

a.png
可保存的文件

b.png
# -*- coding:utf-8 -*-
#coding: unicode_escape
#上面两行以及是encoding='gbk' 是对编码问题报错 处理
data = pd.read_csv('data/watermelon_3.csv',encoding='gbk')
print(data)
#存储文件
data.to_pickle('watermelon_3.pickle')
编号 色泽 根蒂 敲声 纹理 脐部 触感 密度 含糖率 好瓜
0 1 青绿 蜷缩 浊响 清晰 凹陷 硬滑 0.697 0.460 是
1 2 乌黑 蜷缩 沉闷 清晰 凹陷 硬滑 0.774 0.376 是
2 3 乌黑 蜷缩 浊响 清晰 凹陷 硬滑 0.634 0.264 是
3 4 青绿 蜷缩 沉闷 清晰 凹陷 硬滑 0.608 0.318 是
4 5 浅白 蜷缩 浊响 清晰 凹陷 硬滑 0.556 0.215 是
5 6 青绿 稍蜷 浊响 清晰 稍凹 软粘 0.403 0.237 是
6 7 乌黑 稍蜷 浊响 稍糊 稍凹 软粘 0.481 0.149 是
7 8 乌黑 稍蜷 浊响 清晰 稍凹 硬滑 0.437 0.211 是
8 9 乌黑 稍蜷 沉闷 稍糊 稍凹 硬滑 0.666 0.091 否
9 10 青绿 硬挺 清脆 清晰 平坦 软粘 0.243 0.267 否
10 11 浅白 硬挺 清脆 模糊 平坦 硬滑 0.245 0.057 否
11 12 浅白 蜷缩 浊响 模糊 平坦 软粘 0.343 0.099 否
12 13 青绿 稍蜷 浊响 稍糊 凹陷 硬滑 0.639 0.161 否
13 14 浅白 稍蜷 沉闷 稍糊 凹陷 硬滑 0.657 0.198 否
14 15 乌黑 稍蜷 浊响 清晰 稍凹 软粘 0.360 0.370 否
15 16 浅白 蜷缩 浊响 模糊 平坦 硬滑 0.593 0.042 否
16 17 青绿 蜷缩 沉闷 稍糊 稍凹 硬滑 0.719 0.103 否
合并数据
#concatenating
df = pd.DataFrame(np.ones((3,4))*0,columns=['A','B','C','D'])
df1= pd.DataFrame(np.ones((3,4))*1,columns=['A','B','C','D'])
df2 = pd.DataFrame(np.ones((3,4))*2,columns=['A','B','C','D'])
print(df)
print(df1)
print(df2)
print('-------------------------------------------------------------------')
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True) #axis=0 竖向合并 axis=1横向合并 ignore_index=True将序列索引重新排序
print(res)
A B C D
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
A B C D
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
A B C D
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
-------------------------------------------------------------------
A B C D
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
3 2.0 2.0 2.0 2.0
4 2.0 2.0 2.0 2.0
5 2.0 2.0 2.0 2.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
合并数据-join、append
#join,['inner','outer']
df1= pd.DataFrame(np.ones((3,4))*1,index=[1,2,3],columns=['A','B','C','D'])
df2 = pd.DataFrame(np.ones((3,4))*2,index=[2,3,4],columns=['B','C','D','E'])
print(df1)
print(df2)
print('-------------------------------------------------------------------')
res = pd.concat([df1,df2],join='inner',ignore_index=True) #默认outer填充缺失值为NaN \\ inner:只保留有共同标签的值
print(res)
print('-------------------------------------------------------------------')
#根据给定列填充 join_axes=[df1.index] 在新版本中被删除 无法执行 所以直接使用merge
#res1 = pd.concat([df1,df2],axis=1,join_axes=[df1.index])
#append
res = df1.append(df2,ignore_index=True)
print(res)
print('-------------------------------------------------------------------')
df1= pd.DataFrame(np.ones((3,4))*1,index=[1,2,3],columns=['A','B','C','D'])
s1 = pd.Series([1,2,3,4],index = ['A','B','C','D'])
print(df1)
print(s1)
print('-------------------------------------------------------------------')
res = df1.append(s1,ignore_index=True)
print(res)b
A B C D
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
B C D E
2 2.0 2.0 2.0 2.0
3 2.0 2.0 2.0 2.0
4 2.0 2.0 2.0 2.0
-------------------------------------------------------------------
B C D
0 1.0 1.0 1.0
1 1.0 1.0 1.0
2 1.0 1.0 1.0
3 2.0 2.0 2.0
4 2.0 2.0 2.0
5 2.0 2.0 2.0
-------------------------------------------------------------------
A B C D E
0 1.0 1.0 1.0 1.0 NaN
1 1.0 1.0 1.0 1.0 NaN
2 1.0 1.0 1.0 1.0 NaN
3 NaN 2.0 2.0 2.0 2.0
4 NaN 2.0 2.0 2.0 2.0
5 NaN 2.0 2.0 2.0 2.0
-------------------------------------------------------------------
A B C D
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
A 1
B 2
C 3
D 4
dtype: int64
-------------------------------------------------------------------
A B C D
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
3 1.0 2.0 3.0 4.0
merge合并
left = pd.DataFrame({'key':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
print(right)
print('-------------------------------------------------------------------')
res = pd.merge(left,right,on='key')
print(res)
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
-------------------------------------------------------------------
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
merge合并-考虑两个key
# consider two keys
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
print('-------------------------------------------------------------------')
res = pd.merge(left, right, on=['key1', 'key2'], how='inner') # default for how='inner'
# how = ['left', 'right', 'outer', 'inner']
res = pd.merge(left, right, on=['key1', 'key2'], how='left') #基于左边数据合并
print(res)
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
-------------------------------------------------------------------
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
merge合并-indicator
# indicator 的作用给你一个直观的显示数据合并情况
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
print(df1)
print(df2)
print('-------------------------------------------------------------------')
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
# give the indicator a custom name
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res)
col1 col_left
0 0 a
1 1 b
col1 col_right
0 1 2
1 2 2
2 2 2
-------------------------------------------------------------------
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
merge合并-index
# merged by index
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
print(left)
print(right)
# left_index and right_index
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
# res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
merge合并-处理重叠数据
# handle overlapping
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(boys)
print(girls)
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res)
# join function in pandas is similar with merge. If know merge, you will understand join
k age
0 K0 1
1 K1 2
2 K2 3
k age
0 K0 4
1 K0 5
2 K3 6
k age_boy age_girl
0 K0 1 4
1 K0 1 5
plot画图
import matplotlib.pyplot as plt
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()#累加过程
data.plot()
plt.show()
print('-------------------------------------------------------------------')
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
print(data.head())
data.plot()
plt.show()
print('-------------------------------------------------------------------')
# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', scatter', hexbin', 'pie'
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)
plt.show()

output_32_0.png
-------------------------------------------------------------------
A B C D
0 0.346351 -2.522668 2.509865 -0.845426
1 0.527578 -2.943785 3.523533 2.287084
2 1.156596 -5.018539 3.974923 1.797585
3 3.635815 -5.410221 5.063978 0.616211
4 5.230207 -4.764272 3.630786 2.111103

output_32_2.png
-------------------------------------------------------------------

output_32_4.png
网友评论