1 简介
NumPy的底层用C语言编写,数组运算快速、方便,是TensorFlow等库的常用基础。pandas构建在NumPy之上,提供带标签的数据结构(Series和DataFrame)。
2 安装
3 基本属性
import numpy as np
# Build a 2x3 integer matrix and inspect its basic attributes.
# The rows must be separated by a comma; without it Python parses
# `[1, 2, 3][2, 5, 8]` as indexing the first list and raises TypeError.
array = np.array([[1, 2, 3],
                  [2, 5, 8]], dtype=int)
print(array.ndim)   # number of dimensions
print(array.shape)  # length of each dimension: (rows, columns)
print(array.size)   # total number of elements
4 创建数组array
import numpy as np
# Common ways to construct arrays.
zero_array = np.zeros((3, 4))  # 3 rows x 4 columns, every element 0.0
one_array = np.ones((2, 4), dtype=int)  # every element is the integer 1
# np.empty only allocates the array: its contents are uninitialized and
# arbitrary (they may merely look close to 0), so fill it before reading.
empty_array = np.empty((3, 2))
# arange(12) yields the 12 integers 0..11 (the stop value 12 is excluded),
# which are then laid out as 2 rows x 6 columns.
range_array = np.arange(12).reshape((2, 6))
# 5 evenly spaced numbers from 0 to 10, both endpoints included.
line_arr = np.linspace(0, 10, 5)
5 基础运算
import numpy as np
# Element-wise versus matrix arithmetic on two 2x2 arrays.
a = np.array([[10, 20], [1, 0]])
b = np.arange(4).reshape(2, 2)
c = a * b  # element-by-element product
c_dot = a.dot(b)  # matrix product
print(c)
print(c_dot)
# Reductions over a 2x4 array of random samples drawn from [0, 1).
a = np.random.random((2, 4))
print(a)
print(a.sum())  # sum of all elements
print(a.max(axis=1))  # maximum of each row
print(a.min(axis=0))  # minimum of each column
6 基础运算2
# Clipping and per-axis means on a 3x4 array counting down from 14 to 3.
a = np.arange(14, 2, -1).reshape(3, 4)
print(a)
print(a.clip(5, 9))  # values below 5 become 5, values above 9 become 9
print(a.mean(axis=0))  # mean of each column
print(a.mean(axis=1))  # mean of each row
7 numpy索引
# Indexing a 1-D array and its 3x4 reshaped form.
a = np.arange(3, 15)
print('a', a)
print(a[3])
b = a.reshape(3, 4)
print(b)
print(b[2, 1])  # row 2, column 1 (indices start at 0)
print(b[:, 2])  # every element of column 2
print(b[0])  # every element of row 0
print(b[:2, 0])  # column 0 of rows 0 and 1 (the stop index 2 is excluded)
# Iterating over a 2-D array with for loops.
# The loop bodies must be indented; flush-left bodies are a syntax error.
a = np.arange(3, 15).reshape((3, 4))
print(a)
print('row:')
for row in a:  # iterating a 2-D array yields one row at a time
    print(row)
print('column:')
for column in a.T:  # iterate over the transpose to walk the columns
    print(column)
print('flat:')
print(a.flatten())  # all elements as a single 1-D array
for item in a.flat:  # a.flat iterates over the elements one by one
    print(item)
8 numpy array合并
# Stacking and concatenating two 1-D arrays.
a = np.array([1, 1, 1])
b = np.array([2, 2, 2])
c = np.vstack([a, b])  # vertical stack: a placed on top of b
d = np.hstack([a, b])  # horizontal stack: b placed after a
print(c.shape, d.shape)
print(c)
print(d)
print(a[np.newaxis, :])  # insert a new leading (row) axis
print(a[:, np.newaxis])  # insert a new trailing (column) axis
d = np.concatenate([a, b, b, a], axis=0)  # join several arrays end to end
print(d)
9 array分割
# Splitting a 3x4 array into pieces.
a = np.arange(12).reshape(3, 4)
print(a)
print(np.split(a, 2, axis=1))  # two equal pieces of 2 columns each
print(np.split(a, 3, axis=0))  # three equal pieces of 1 row each
# array_split also accepts an uneven division: 4 columns into 3 pieces.
print(np.array_split(a, 3, axis=1))
10 numpy copy & deep copy
# Plain assignment shares one array object; .copy() creates a separate one.
a = np.arange(4)
print(a)
b = c = d = a  # all four names are bound to the same array object
a[0] = 11
print(a)
# A write through any one of the names is visible through all the others.
print(b is a)
d[1:3] = [22, 33]
print(c is a)
# After .copy(), b is a different object, so the later write to a does not
# show up in b.
b = a.copy()
a[3] = 15
print(a is b)
11 pandas基本介绍
import pandas as pd
import numpy as np
# Series: a one-dimensional array printed together with its index;
# np.nan stands for a missing value.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)
# Six consecutive daily timestamps starting on 2018-03-04.
dates = pd.date_range('20180304', periods=6)
print(dates)
# DataFrame: a two-dimensional labelled table. Rows are labelled by `dates`,
# columns by the listed names.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
print(df)
# With no labels given, rows and columns are numbered from 0.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df1)
# Built from a dict: every key becomes a column, and scalar values are
# repeated for each of the four rows.
df2 = pd.DataFrame({
    'A': 1,
    'B': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'C': pd.Timestamp('20180102'),
    'D': np.array([3, 3, 3, 3], dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})
print(df2)
print(df2.dtypes)  # dtype of each column
print(df2.index)  # row labels
print(df2.columns)  # column labels
print(df2.values)  # the underlying data as a NumPy array
print(df2.describe())  # summary statistics
print(df2.T)  # transpose
print(df2.sort_index(axis=1, ascending=False))  # order columns by label, descending
print(df2.sort_values(by='E'))  # order rows by the values in column E
12 pandas选择数据
# Selecting data from a 6x4 frame indexed by six consecutive dates.
dates = pd.date_range('20180308', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['a', 'b', 'c', 'd'])
# Slices select ROWS: by position with [], by label with .loc (a label
# slice includes both endpoints).
print(df[0:3], df.loc['20180309':'20180311'])
# A column name selects a COLUMN, either as a key or as an attribute.
print(df['a'], df.a)
print(df.loc['20180309'])  # one row, chosen by its index label
print(df.loc[:, ['b', 'c']])  # all rows, columns chosen by label
print(df.iloc[3:5, 1:3])  # rows and columns chosen by integer position
# df.ix (mixed label/position selection) is deprecated; use .loc or .iloc.
print(df[df.a > 8])  # boolean mask: keep the rows where column a exceeds 8
13 pandas设置值
# Assigning values into `df` (the frame built in the previous section).
df.iloc[2, 2] = 1111  # by integer position: row 2, column 2
df.loc['20180309', 'a'] = 2222  # by label: row 2018-03-09, column 'a'
# Set column b to 0 wherever column a exceeds 8. A single .loc call assigns
# directly into df; the chained form `df.b[mask] = 0` assigns through an
# intermediate Series and is not guaranteed to write back to df.
df.loc[df.a > 8, 'b'] = 0
14 如何处理丢失数据
# Handling missing data (NaN).
dates = pd.date_range('20180308', periods=6)
# Float data, so NaN can be stored without changing any column's dtype.
df = pd.DataFrame(
    np.arange(24, dtype=float).reshape((6, 4)),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
# Drop rows containing NaN. how='any' drops a row with at least one NaN;
# how='all' drops a row only when every value in it is NaN.
print(df.dropna(axis=0, how='any'))
print(df.fillna(value=0))  # replace every NaN with 0
# isnull must be CALLED; printing `df.isnull` only shows the bound method.
null_mask = df.isnull()
print(null_mask)  # True where a value is missing
print(null_mask.values.any())  # True when at least one value is missing
15 导入导出数据
# 'filepath' is a placeholder; substitute real paths before running.
data = pd.read_csv(filepath_or_buffer='filepath')  # import: load a CSV file
data.to_pickle(path='filepath')  # export: write the frame as a pickle file
16 合并concatenating
# Three 3x4 frames with identical columns, filled with 0, 1 and 2.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
# Stacking all three vertically (axis=0) with a fresh index would be:
# result = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

# join: 'outer' (the default) keeps the union of the columns,
# 'inner' keeps only the columns present in every frame.
df4 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['c', 'd', 'e', 'f'])
df5 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
res = pd.concat([df4, df5], axis=0, join='inner', ignore_index=True)
print(res)

# Side-by-side concatenation that follows df4's index. pd.concat no longer
# accepts join_axes; reindexing the result on df4.index is the replacement.
res2 = pd.concat([df4, df5], axis=1).reindex(df4.index)
print(res2)

# Appending rows. DataFrame.append is no longer available in current pandas;
# pd.concat does the same job.
res3 = pd.concat([df1, df2], ignore_index=True)
print(res3)
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# Turn the Series into a one-row frame so its labels line up with columns.
res4 = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res4)
17 合并merge
# Merge two frames on a single shared key column.
left = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'A': ['b0', 'b1', 'b2', 'b3'],
    'B': ['a0', 'a1', 'a2', 'a3'],
})
right = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'C': ['c0', 'c1', 'c2', 'c3'],
    'D': ['d0', 'd1', 'd2', 'd3'],
})
res = left.merge(right, on='key')

# Merge on a pair of key columns; an inner join keeps only the key
# combinations that occur in both frames.
left2 = pd.DataFrame({
    'key1': ['k0', 'k0', 'k1', 'k2'],
    'key2': ['k0', 'k1', 'k0', 'k1'],
    'A': ['b0', 'b1', 'b2', 'b3'],
    'B': ['a0', 'a1', 'a2', 'a3'],
})
right2 = pd.DataFrame({
    'key1': ['k0', 'k0', 'k1', 'k2'],
    'key2': ['k0', 'k0', 'k0', 'k0'],
    'C': ['c0', 'c1', 'c2', 'c3'],
    'D': ['d0', 'd1', 'd2', 'd3'],
})
res2 = left2.merge(right2, on=['key1', 'key2'], how='inner')

# Merge side by side using the row index of both frames as the key.
res3 = left2.merge(right2, left_index=True, right_index=True, how='outer')
print(res3)

# When both frames carry a same-named column with different values and both
# must be kept, `suffixes` tells the two copies apart.
boys = pd.DataFrame({'k': ['k0', 'k1', 'k2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['k0', 'k0', 'k3'], 'age': [4, 5, 6]})
res4 = boys.merge(girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res4)
18 plot图表
import matplotlib.pyplot as plt
# Series: the running total of 1000 standard-normal samples.
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()
# Calling data.plot() followed by plt.show() would draw it as a line chart.

# DataFrame: four such running totals, one per column A-D.
data2 = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=['A', 'B', 'C', 'D'],
).cumsum()
print(data2.head())
# data2.plot() would draw all four columns as lines. Below, two scatter
# series are drawn on the same axes by passing the first call's `ax` on.
ax = data2.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
data2.plot.scatter(x='A', y='C', color='DarkGreen', label='Class2', ax=ax)
plt.show()  # display the figure
# Plot methods: 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
感谢:
网友评论