numpy,pandas,matplotlib

作者: esskeetit | 来源:发表于2018-06-01 18:14 被阅读0次

1.numpy

1.1 Create a NumPy array

import numpy as np  # every snippet in this section refers to NumPy as `np`

# Build an array from a tuple, forcing an integer dtype.
t = (1, 2, 3)
a = np.array(t, dtype='int')
a.dtype

# Build a float32 array from a list.
list1 = [1, 2, 3]
a = np.array(list1, dtype='float32')

# A list of lists gives a 2-D array (one row per inner list).
list2 = [4.2, 5.1, 6.3]
b = np.array([list1, list2])

c = np.array([[1, 2], [3, 4]], dtype=complex)
c.shape

# Common constructors.
np.zeros([2, 2])
np.ones([2, 3])
np.empty([2, 3])  # contents are uninitialised
np.eye(5)
np.arange(2, 10, 2)
np.linspace(0, 10, 4)  # the endpoint 10 is included

1.2 Basic Operations of Arrays

# Arithmetic operators work element by element.
a = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
a * a   # element-wise product
a ** 3  # element-wise cube
a + a   # element-wise sum
1 / a   # element-wise reciprocal

A = np.array([[1, 3], [0, 1]])
B = np.array([[2, 2], [3, 4]])
A * B         # `*` is the element-wise product
A.dot(B)      # .dot() is the matrix product
np.dot(A, B)
np.add(A, B)  # element-wise addition

a = np.random.random([3, 3])  # uniform samples from the half-open interval [0.0, 1.0)
np.random.randn(3, 3)         # samples from the standard normal distribution
a.sum()  # sum of all elements
a.min()  # smallest element
a.max()  # largest element

b = np.arange(24).reshape((6, 4))
b.sum(axis=0)     # sum of each column
b.sum(axis=1)     # sum of each row
b.min(axis=1)     # min of each row
b.cumsum(axis=1)  # cumulative sum along each row

1.3 Indexing, Slicing and Iterating

1.3.1 One-dimensional arrays

a = np.arange(10) ** 2
print(a)  # [ 0  1  4  9 16 25 36 49 64 81]
a[2]
a[2:6]
a[:3] = 100  # assigning a scalar to a slice sets every selected element
print(a)  # [100 100 100   9  16  25  36  49  64  81]

a_slice = a[:5]
print(a_slice)  # [100 100 100   9  16]

# Slicing a NumPy array creates a view, not a copy: writing through the
# slice writes into the original array.
# To leave the original untouched, copy the selected part explicitly.

a_slice_2 = a[:5].copy()
print(a_slice_2)
a_slice_2[0] = 500
print(a)  # `a` is unchanged because a_slice_2 is an independent copy

1.3.2 Multidimensional arrays

def f(x, y):
    """Return ``4 * x + y``; also works element-wise on index arrays."""
    return 4 * x + y


# np.fromfunction passes f the row-index and column-index grids of a
# (3, 2) array, so b[i, j] == 4 * i + j for the positions
# (0,0), (0,1), (1,0), (1,1), (2,0), (2,1).
b = np.fromfunction(f, (3, 2), dtype=int)

b[1]     # second row
b[1][0]  # second row, first column
b[1, 0]  # same element with a single index expression
b[:, 1]  # second column

b.shape[0]  # number of rows
b.shape[1]  # number of columns

c = np.zeros([5, 5])
c

# Iterate over rows, so the bound is the row count shape[0]
# (shape[1] only gave the same result because c is square).
for i in range(c.shape[0]):
    c[i] = i  # every element of row i becomes i
    print(c[i])

1.3.3 Boolean Indexing

week_days = np.array(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'])
work_time = np.round(np.random.randn(5, 5) + 8.0, 2)
week_days == 'Tuesday'  # comparison yields a boolean array
# The mask picks rows of the transpose, i.e. columns of work_time.
work_time.T[week_days == 'Tuesday']

1.4 Matrix Operations II

a = np.array([[1, 2, 3], [3, 4, 5]], dtype='float')
a.T  # transpose
a.transpose()
np.dot(a.T, a)  # 3x3 product of a 2x3 matrix with itself: rank <= 2, so singular

# a.T . a cannot be inverted (it is singular), so the inverse is
# demonstrated on the 2x2 product a . a.T, which has full rank.
gram = np.dot(a, a.T)
print(np.linalg.inv(gram))  # matrix inverse

from numpy.linalg import inv
gram_inv = inv(gram)  # same inverse through the imported name

# Verify the inverse with np.allclose, which returns True when two arrays
# are element-wise equal within a tolerance (defaults: rtol=1e-05, atol=1e-08).
np.allclose(np.dot(gram, gram_inv), np.eye(2))

a = np.array([[1., 2.], [3., 4.]])
ainv = inv(a)

# Check that a . ainv and ainv . a both equal the identity matrix.
np.allclose(np.dot(a, ainv), np.eye(2))
np.allclose(np.dot(ainv, a), np.eye(2))

from numpy.linalg import eig
a = np.array([[1, 2, 3], [3, 4, 5]], dtype='float')
# Returns the eigenvalues and the normalized eigenvectors.
eig(np.dot(a.T, a))

a = np.array([[[1, 2, 3, 0], [3, 4, 5, 2]]])
a.shape    # (1, 2, 4)
a.T
a.T.shape  # (4, 2, 1)
# transpose accepts an explicit axis order. (0, 2, 1) keeps the first
# axis and swaps the last two, so shape (1, 2, 4) becomes (1, 4, 2).
a.transpose((0, 2, 1))

a = np.array([[1., 2.], [3., 4.]])
np.trace(a)       # sum of the main diagonal
a.swapaxes(0, 1)  # exchange two axes

a3d = np.arange(50).reshape((5, 5, 2))
a3d.transpose((0, 2, 1))  # result has shape (5, 2, 5)

# Count how often each integer from 0 to 10 occurs.
np.bincount(np.array([1, 1, 2, 10, 2, 4, 7]))

1.5 Array processing

1.5.1 Generate Grid

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Same coordinates along both axes: -10 up to (not including) 10 in steps of 0.02.
ticks = np.arange(-10, 10, 0.02)
x, y = np.meshgrid(ticks, ticks)
z = np.sqrt(x ** 2 + y ** 2)  # distance of every grid point from the origin
plt.imshow(z)
plt.colorbar()
plt.show()

image.png

1.5.2 numpy where function

A = np.array([1, 2, 3, 4])
B = np.array([5, 1, 7, 2])
condition = np.array([True, False, False, False])
# Take the element from A where condition is True, otherwise from B.
np.where(condition, A, B)

b = np.random.randn(5, 5)
np.where(b < 0, 0, b)  # replace negative numbers with 0

1.5.3 Some Statistical Processing

c = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
print(c)
c.sum()        # sum of all elements
c.sum(axis=1)  # sum of each row
c.mean()       # mean of all elements
c.std()        # standard deviation of all elements
c.var()        # variance of all elements

1.5.4 Array Sort

d = np.random.randn(10)
d.sort()  # sorts in place, ascending

# isin (the successor of the deprecated in1d): for each element of the
# first argument, test whether it appears in the second array.
e = np.array([1, 2, 3, 3, 4, 4, 5])
np.isin([2, 4, 8], e)  # array([ True,  True, False])

# unique: the distinct values of the array.
np.unique(e)

1.6 Save and Load Array

## Saving array in binary format (.npy)
import numpy as np
a = np.array([1, 2, 3, 4, 5])
np.save('array_a', a)  # written as 'array_a.npy'
np.load('array_a.npy')

## Saving multiple arrays into a zip file
b = np.array([[1, 2, 3], [4, 5, 6]])
np.savez('two_arrays.npz', x=b, y=b.T)
# Loading an .npz archive keeps a file handle open; the with-block closes it.
with np.load('two_arrays.npz') as archive:
    archive['x']
    archive['y']

## Saving and loading into text files
np.savetxt('array_text.txt', b, delimiter=',')
np.loadtxt('array_text.txt', delimiter=',')

2. pandas

2.1 series

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
s1 = pd.Series([1, 2, 3, 4, np.nan, 5, 6, 7])
s1.values  # all values as an array
s1.index   # RangeIndex(start=0, stop=8, step=1)

# An explicit index can be supplied.
s2 = pd.Series(
    [21, 23, 42, 21, 23],
    index=['Jack', 'Lucy', 'Helen', 'Milky', 'Jasper'],
)
s2['Jack']      # look up by label
s2.loc['Jack']  # label-based
s2.iloc[0]      # position-based
print(s2.shape)  # (5,)
print(s2.size)   # 5
s2.head(2)        # first two rows
s2.describe()     # descriptive statistics
s2.sort_values()  # values in ascending order
s2[s2 > 22]       # people who are older than 22
s2.plot.bar()  # bar chart
'Lucy' in s2   # membership test against the index labels
s2_dict = s2.to_dict()          # Series -> dict
s2_series = pd.Series(s2_dict)  # dict -> Series
# The index can be kept in a list first.
name = ['Jack', 'Lucy', 'Helen', 'Milky', 'Tom', 'Jasper', 'Helen']
s2_new = pd.Series(s2_dict, index=name)
s2_new.drop_duplicates()  # drop rows whose value is a duplicate
pd.isnull(s2_new)         # True where the value is NaN

2.2 DataFrame + Titanic Example

df = pd.DataFrame(
    {'Student_1': [90, 100, 95], 'Student_2': [60, 80, 100]},
    index=['Monday', 'Wednesday', 'Friday'],
)
df1 = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6]],
    index=['A', 'B'],
    columns=['C1', 'C2', 'C3'],
)
df1.values   # all values of df1
df1.index    # row labels
df1.columns  # column labels
df1.T        # transpose
df1.shape
df1.size     # number of elements
df1.head(2)  # first two rows
df1.tail(1)  # last row
df1.describe()  # descriptive statistics for every column
df1.loc['B']    # the row labelled 'B'
df1.loc['B'].loc['C2']  # loc works on index labels
df1['C2'].loc['B']
df1.loc['B', 'C2']
df1.iloc[1, 1]  # iloc works on integer positions only
df1 + 10 * 15   # element-wise: adds 150 to every cell
df1['C2'] = df1.apply(lambda row: row['C2'] ** 2 + 10, axis=1)
df1.assign(
    C2=lambda frame: frame['C2'] ** 2 + 10,
    C3=lambda frame: frame['C3'] * 2 - 10,
).loc['A'].max()

from IPython.display import Image

# Display a picture that lives in the same directory as the Jupyter notebook.
Image("./variable.png")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
sns.set()

df = pd.read_csv('train.csv')
df.shape  #查看行数和列数 (891, 12)
df.head(5) #查看前五行
df.tail(2) #查看最后两行
df.dtypes  #查看每一列的数据类型 dataframe会将string类型存为object类型
df.Survived.value_counts()  #查看存活和遇难的人数
df.isnull().sum()#查看每一列的缺失值
df.Survived.value_counts().plot(kind='bar') #存活和遇难人数的直方图
df.Pclass.value_counts()  #查看不同舱房等级的人数
df.isnull().sum().plot(kind='bar') # 每一列缺失值的柱状图

How to deal with missing values?

# Cabin is the column with the most missing values, so drop it entirely.
df1 = df.drop('Cabin', axis=1)
df1.shape  # (891, 11)
# Fill the missing ages with 20.
df1['Age'] = df1['Age'].fillna(20)
# Alternative: fill with the mean instead
# df1['Age'] = df1['Age'].fillna(df.Age.mean())
# Drop the rows whose Embarked value is missing (two rows).
df2 = df1[df1['Embarked'].notnull()]
df2.shape  # (889, 11)

# The same missing-value handling written as one chain.
df3 = df.drop('Cabin', axis=1).assign(Age=lambda frame: frame['Age'].fillna(20))
df3 = df3.loc[df3['Embarked'].notnull()]

Exploration (basic statistics)

# Select the rows labelled 10 through 14 and the three named columns.
df3.loc[10:14, ['Name', 'Sex', 'Survived']]
df3.columns  # column names
# Passenger counts broken down by survival and sex.
df3.pivot_table(values='PassengerId', index='Survived', columns='Sex', aggfunc='count')
df4 = df3.loc[df3['Survived'] == 1]  # the survivors
df5 = df3.loc[df3['Age'] > 30]       # passengers older than 30
# Merge two frames on their shared key column.
df6 = df3[['PassengerId', 'Name']].merge(df3[['PassengerId', 'Age']], on='PassengerId', how='outer')
# Correlation coefficient; both columns come from the cleaned frame df3
# instead of mixing df3 with the unfiltered df.
df3['Survived'].corr(df3['Pclass'])

2.3 Index Objects

import numpy as np
import pandas as pd
# Six consecutive days starting on 2017-01-02 serve as the index.
s = pd.Series(
    [1, 2, 3, 4, 5, 6],
    index=pd.date_range('20170102', periods=6),
)
s.index
s.index[2]
# DatetimeIndex(['2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07'], dtype='datetime64[ns]', freq='D')
s.index[2:]

2.4 Reindex

2.4.1 Series Reindex

np.random.randn(1, 4).tolist()
s1 = pd.Series(
    np.random.randn(1, 4).tolist()[0],
    index=['A', 'B', 'C', 'D'],
)
# Labels that did not exist before (E, F, G) get NaN.
s2 = s1.reindex(['A', 'B', 'C', 'D', 'E', 'F', 'G'])
# fill_value only applies to labels that are new in this call, so H becomes 0.
s3 = s2.reindex(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'], fill_value=0)

2.4.2 Pandas Reindex

df = pd.DataFrame(
    np.random.randn(4, 4),
    index=['r1', 'r2', 'r3', 'r4'],
    columns=['c1', 'c2', 'c3', 'c4'],
)
# reindex returns a new frame; df itself is not modified.
df.reindex(['r1', 'r2', 'r3', 'r6', 'r4', 'r5'])
df.reindex(columns=['c1', 'c2', 'c3', 'c4', 'c5'])

2.5 drop data

2.5.1 Drop Series

s1 = pd.Series(np.arange(5), index=[1, 2, 3, 4, 5])
# Remove the row whose index label is 4 (the result is a new Series).
s1.drop(4)

2.5.2 Drop DataFrame

df1 = pd.DataFrame(
    np.random.randn(4, 5),
    index=['r1', 'r2', 'r3', 'r4'],
    columns=['c1', 'c2', 'c3', 'c4', 'c5'],
)
df1.drop('r1')          # drop the row labelled r1
df1.drop('c5', axis=1)  # drop the column labelled c5
# With inplace=True the drop modifies df1 itself.
df1.drop('c5', axis=1, inplace=True)

2.6 slicing data

s1 = pd.Series(np.arange(5),index=['A','B','C','D','E'])
s2=s1[:3] #取出前三行
s2['A']=2018 #将s2这个slice对应A的值改为2018，s1也会改变
s1[['A','B','C']]
s1[s1>2] #取出s1中大于2的数
s1[s1>3] = 10 #将s1中大于3的数改为10
s1[(s1>2)&(s1<5)]=6

df1 = pd.DataFrame(
    np.random.randn(4, 5),
    index=['r1', 'r2', 'r3', 'r4'],
    columns=['c1', 'c2', 'c3', 'c4', 'c5'],
)
df1['c1']                # one column
df1[['c1', 'c4', 'c3']]  # several columns
df1[df1['c2'] > 0]       # rows where column c2 is positive
df1 < 0                  # boolean DataFrame
df1.iloc[1]              # second row, by position
df1.loc['r2']            # row r2, by label
df1.iloc[[0, 2, 3], [0, 2]]
df2 = df1.copy()
# df2 is a copy, so adding a column to it leaves df1 unchanged.
df2['c6'] = ['one', 'one', 'two', 'three']
# For each entry of column c6: is it one of 'two' / 'three'?
df2['c6'].isin(['two', 'three'])
df2[df2['c6'].isin(['two', 'three'])]

2.7 Data Alignment

s1 = pd.Series(np.arange(5), index=['A', 'B', 'C', 'D', 'E'])
s2 = pd.Series(np.arange(3), index=['A', 'B', 'C'])
s1 + s2  # values are added label by label
df1 = pd.DataFrame(
    np.random.randn(4, 5),
    index=['r1', 'r2', 'r3', 'r4'],
    columns=['c1', 'c2', 'c3', 'c4', 'c5'],
)
df2 = pd.DataFrame(
    np.random.randn(3, 4),
    index=['r1', 'r2', 'r3'],
    columns=['c1', 'c2', 'c3', 'c4'],
)
df1 + df2  # cells with matching index and column labels are added
# Cells present in df1 but missing from df2 are treated as 100 before adding.
df1.add(df2, fill_value=100)

2.8 rank and sort

s1 = pd.Series(np.arange(5), index=['B', 'D', 'C', 'A', 'E'])
s1.sort_index()                  # order by index label, ascending
s1.sort_values(ascending=False)  # order by value, descending
s2 = pd.Series(np.random.randn(6))
s2.rank()

3. matplotlib

待续