[TOC]
Numpy 构造数据
1、tuple和list转成numpy的array数据类型
my_list = [12,23.24,-19]
my_list #[12, 23.24, -19]
my_np_array = np.array(my_list) #array([ 12. , 23.24, -19. ])
my_tuple = (11,2,3,6,7)
my_tuple*3 #(11, 2, 3, 6, 7, 11, 2, 3, 6, 7, 11, 2, 3, 6, 7)
np.array(my_tuple)*3 #array([33, 6, 9, 18, 21])
2、numpy内置函数arange的那些事
np.arange(2,9)-1 #array([1, 2, 3, 4, 5, 6, 7])
len(np.arange(2,9)) #7
np.arange(2,9).size #7
np_two = np.arange(9).reshape(3,3)
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
len(np_two)#3
np_two.size#9 元素个数
ndim # 维度
3、线性空间linspace,ones,zeros 多维数据及len,size,shape
np.linspace(5,15,9) #array([ 5. , 6.25, 7.5 , 8.75, 10. , 11.25, 12.5 , 13.75, 15. ])
np.linspace(5,15,9,retstep=True) #(array([ 5. , 6.25, 7.5 , 8.75, 10. , 11.25, 12.5 , 13.75, 15. ]), 1.25)
np.zeros((2,3))
# array([[ 0., 0., 0.],
# [ 0., 0., 0.]])
np.ones(5,dtype='int32')
#array([1, 1, 1, 1, 1])
array.shape#行列
数据索引切割及遍历
1、数据切片slice

2、numpy 逻辑掩码高效筛选数据.

3、numpy数据广播及常用矩阵操作
5 * my_array -2 #广播操作
np.subtract(np.multiply(5,my_3d_array),2)
.dot #向量点积
my_3d_array
"""
array([[[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14],
[15, 16, 17, 18, 19],
[20, 21, 22, 23, 24],
[25, 26, 27, 28, 29],
[30, 31, 32, 33, 34]],
[[35, 36, 37, 38, 39],
[40, 41, 42, 43, 44],
[45, 46, 47, 48, 49],
[50, 51, 52, 53, 54],
[55, 56, 57, 58, 59],
[60, 61, 62, 63, 64],
[65, 66, 67, 68, 69]]])
"""
my_3d_array.sum() #2415
my_3d_array.sum(axis=0)
"""
array([[ 35, 37, 39, 41, 43],
[ 45, 47, 49, 51, 53],
[ 55, 57, 59, 61, 63],
[ 65, 67, 69, 71, 73],
[ 75, 77, 79, 81, 83],
[ 85, 87, 89, 91, 93],
[ 95, 97, 99, 101, 103]])
"""
my_3d_array.sum(axis=1)
array([[105, 112, 119, 126, 133],
[350, 357, 364, 371, 378]])
my_3d_array.sum(axis=2)
array([[ 10, 35, 60, 85, 110, 135, 160],
[185, 210, 235, 260, 285, 310, 335]])
np.random.random((7,5)) #随机数
4、numpy基于复杂数据类型构建与操作(*)
person_data_def = [('name','S8'),('height','f8'),('weight','f8'),('age','i8')]
people_array = np.zeros((4,),dtype=person_data_def)
people_array[0] = ('steven',175,70,42)
people_array[2] = ('peter',172,70,32)
"""
array([(b'steven', 175., 70., 42), (b'', 0., 0., 0),
(b'peter', 172., 70., 32), (b'', 0., 0., 0)],
dtype=[('name', 'S8'), ('height', '<f8'), ('weight', '<f8'), ('age', '<i8')])
"""
people_array['name']
#array([b'steven', b'', b'peter', b''], dtype='|S8')
people_array['age']
#array([42, 0, 32, 0], dtype=int64)
==============================================================
person_record_array = np.rec.array([('peter',172,65,29),('bevear',175,70,42),('steven',175,70,42)],dtype=person_data_def)
person_record_array
"""
rec.array([(b'peter', 172., 65., 29), (b'bevear', 175., 70., 42),
(b'steven', 175., 70., 42)],
dtype=[('name', 'S8'), ('height', '<f8'), ('weight', '<f8'), ('age', '<i8')])
"""
person_record_array.age #array([29, 42, 42], dtype=int64)
数据可视化
1、matplotlib第一幅数据可视化图-直方图
import numpy as np
import matplotlib.pyplot as plt
mu, sigma = 100, 15
data_set = mu + sigma * np.random.randn(10000)
# the histogram of the data
n, bins, patches = plt.hist(data_set, 50, normed=1, facecolor='m', alpha=0.75)
plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title('Histogram of IQ')
plt.text(60, .025, r'$\mu=100,\ \sigma=15$')
plt.axis([40, 160, 0, 0.03])
plt.grid(True)
plt.show()

`hist(ages, bins=20, color='lightblue', normed=True)`
这里 hist 函数的输入参数有 ages 数组、直方图的 bins 数目(即区间数,这里为 20),同时,还使用了 normed=True 参数来归一化直方图,即让每个方条表示年龄在该区间内的数量占总数量的比(注:新版 Matplotlib 中 normed 参数已被 density 参数取代).
2、多副图组合 figure和subplot
my_first_figure = plt.figure("My first figure")
#面向对象的方式
subplot1 = my_first_figure.add_subplot(2,2,1)
subplot6 = my_first_figure.add_subplot(2,2,4)
========
x = np.arange(0, 100)
fig = plt.figure()
ax1 = fig.add_subplot(221)
ax1.plot(x, x)
ax2 = fig.add_subplot(222)
ax2.plot(x, -x)
ax3 = fig.add_subplot(223)
ax3.plot(x, x ** 2)
ax4 = fig.add_subplot(224)
ax4.plot(x, np.log(x))
plt.show()
=======
#pyplot的方式
x = np.arange(0, 100)
plt.subplot(221)
plt.plot(x, x)
plt.subplot(222)
plt.plot(x, -x)
plt.subplot(223)
plt.plot(x, x ** 2)
plt.subplot(224)
plt.plot(x, np.log(x))
plt.show()
===========
for i,color in enumerate("rgby"):
plt.subplot(221+i, axisbg=color)
plt.show()
3、matplotlib多曲线图
t1 = np.arange(0.0, 2.0, 0.1)
t2 = np.arange(0.0, 2.0, 0.01)
# note that plot returns a list of lines. The "l1, = plot" usage
# extracts the first element of the list into l1 using tuple
# unpacking. So l1 is a Line2D instance, not a sequence of lines
l1, = plt.plot(t2, np.exp(-t2))
l2, l3 = plt.plot(t2, np.sin(2 * np.pi * t2), '--o', t1, np.log(1 + t1), '.')
l4, = plt.plot(t2, np.exp(-t2) * np.sin(2 * np.pi * t2), 's-.')
plt.legend((l1, l4), ('oscillatory', 'damped'), loc='upper right', shadow=True)
plt.xlabel('time')
plt.ylabel('volts')
plt.title('Damped oscillation')
plt.show()
plt.plot(x, y, color="r", linestyle="--", marker="*", linewidth=1.0)

4、Tick图以及Grid网格
plt.subplot(facecolor="g") #背景色
plt.grid(color="k", linestyle=":")#网格
plt.text(-0.5, 3, "exp functions", fontsize=10) #注释
labels = subplot_1.set_xticklabels(['one', 'two', 'three', 'four', 'five'], rotation=45, fontsize='small') #设置x轴标签
5、饼状图与柱状图
labels = 'Frogs', 'Hogs', 'Dogs', 'Logs'
sizes = [15, 30, 45, 10]
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral']
explode = (0, 0.1, 0, 0) # only "explode" the 2nd slice (i.e. 'Hogs')
plt.pie(x=sizes, explode=explode, labels=labels, colors=colors,
autopct='%1.1f%%', shadow=True, startangle=90)
# Set aspect ratio to be equal so that pie is drawn as a circle.
plt.axis('equal')
plt.show()
#autopct,圆里面的文本格式,%3.1f%%表示总宽度至少为3个字符、保留1位小数的浮点数,后面再跟一个百分号
# shadow,饼是否有阴影
# startangle,起始角度,0,表示从0开始逆时针转,为第一块。一般选择从90度开始比较好看
# labeldistance,文本的位置离原点(圆心)有多远,1.1指1.1倍半径的位置
plot.legend(loc='upper left', bbox_to_anchor=(-0.1, 1))
# loc: 表示legend的位置,包括'upper right','upper left','lower right','lower left'等
# bbox_to_anchor: 表示legend距离图形之间的距离,当出现图形与legend重叠时,可使用bbox_to_anchor进行调整legend的位置

N = 5
menMeans = (20, 35, 30, 35, 27)
ind = np.arange(N) # the x locations for the groups
width = 0.35 # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, menMeans, width, color='r')
womenMeans = (25, 32, 34, 20, 25)
rects2 = ax.bar(ind + width, womenMeans, width, color='y')
# add some text for labels, title and axes ticks
ax.set_ylabel('Scores')
ax.set_title('Scores by group and gender')
ax.set_xticks(ind + width)
ax.set_xticklabels(('G1', 'G2', 'G3', 'G4', 'G5'))
ax.legend((rects1[0], rects2[0]), ('Men', 'Women'))
plt.show()

参考:https://www.jianshu.com/p/3170304baf55
6、箱型图
箱型图:参考:http://matplotlib.org/examples/statistics/boxplot_vs_violin_demo.html
参考:https://blog.csdn.net/zenghaihong/article/details/53291372

下边缘(Q1),表示最小值;
下四分位数(Q2),又称“第一四分位数”,等于该样本中所有数值由小到大排列后第25%的数字;
中位数(Q3),又称“第二四分位数”等于该样本中所有数值由小到大排列后第50%的数字;
上四分位数(Q4),又称“第三四分位数”等于该样本中所有数值由小到大排列后第75%的数字;
上边缘(Q5),表示最大值。
第三四分位数与第一四分位数的差距又称四分位间距。

#首先导入基本的绘图包
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#添加成绩表
plt.style.use("ggplot")
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.sans-serif']=['SimHei']
#新建一个空的DataFrame
df=pd.DataFrame()
#添加成绩单,最后显示成绩单表格
df["英语"]=[76,90,97,71,70,93,86,83,78,85,81]
df["经济数学"]=[65,95,51,74,78,63,91,82,75,71,55]
df["西方经济学"]=[93,81,76,88,66,79,83,92,78,86,78]
df["计算机应用基础"]=[85,78,81,95,70,67,82,72,80,81,77]
#用matplotlib来画出箱型图
plt.boxplot(x=df.values,labels=df.columns,whis=1.5)
plt.show()
#用pandas自带的画图工具更快
df.boxplot()
plt.show()

深入理解Numpy
1、数据浅拷贝
import numpy as np
mi_casa = np.array([-45,-31,2,25,51,99])
su_casa = mi_casa#浅拷贝(这里只是引用赋值,并没有复制数据,两个变量指向同一个数组对象)
su_casa is mi_casa #True id(mi_casa)
mi_casa[0] =2 # su_casa的值也会发生改变 array([ 2, -31, 2, 25, 51, 99])
2、数据视图及深拷贝
tree_house = np.array([-45,-31,2,25,51,99])
farm_house = tree_house.view()
farm_house.shape=(2,3)
tree_house[0] = 0
tree_house #array([ 0, -31, 2, 25, 51, 99])
farm_house
"""
array([[ 0, -31, 2],
[ 25, 51, 99]])
"""
#视图数组中的数据实际上保存在base数组中
#对一个数组的切片操作也会返回一个原数组的视图(view)
=====
#深拷贝
dog_house = np.copy(tree_house) #id 不一样
3、数据属性详解
- append
- horizontal stacking
- vertical stacking
- insert

c
array([[ 6, 7, 8],
[ 9, 10, 11]])
=======================
np.append(a,c,axis=0)
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11]])
np.append(a,c,axis=1)
array([[ 0, 1, 2, 6, 7, 8],
[ 3, 4, 5, 9, 10, 11]])

np.insert(arr, obj, values, axis=None)

4、数据删除delete函数
#d
array([[[ 3., 13., 23., 33.],
[ 43., 53., 63., 73.],
[ 83., 93., 103., 113.]],
[[ 123., 133., 143., 153.],
[ 163., 173., 183., 193.],
[ 203., 213., 223., 233.]]])
np.delete(d,1,axis=0)
array([[[ 3., 13., 23., 33.],
[ 43., 53., 63., 73.],
[ 83., 93., 103., 113.]]])
np.delete(d,1,axis=1)
array([[[ 3., 13., 23., 33.],
[ 83., 93., 103., 113.]],
[[ 123., 133., 143., 153.],
[ 203., 213., 223., 233.]]])
np.delete(d,1,axis=2)
array([[[ 3., 23., 33.],
[ 43., 63., 73.],
[ 83., 103., 113.]],
[[ 123., 143., 153.],
[ 163., 183., 193.],
[ 203., 223., 233.]]])
5、数据合并分离
a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6]])
together = np.concatenate((a, b), axis=0)
array([[1, 2],
[3, 4],
[5, 6]])
c = np.array([[1, 2], [3, 4]]) *3 + 5
array([[ 8, 11],
[14, 17]])
np.concatenate((a, c), axis=1)
array([[ 1, 2, 8, 11],
[ 3, 4, 14, 17]])

6、玩转数据形状shape
#reshape
#ravel
my_38_array = my_start_array.reshape((3,8))
array([[ 0, 1, 2, 3, 4, 5, 6, 7],
[ 8, 9, 10, 11, 12, 13, 14, 15],
[16, 17, 18, 19, 20, 21, 22, 23]])
my_ravel_array = my_38_array.ravel()
my_ravel_array
array([1111, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23])
#flat
type(my_38_array.flat)
numpy.flatiter
for n in my_38_array.flat:
print(n)
7、快速排序及旋转数据
fliplr #左右反转数据
flipud #上下反转数据
roll(array,k) #将array中的最后k个元素放到最前;如果是-k,表示将前面k个元素放到array后面
rot90 #整体旋转向左90° k=-1整体向右旋转90°
8、数据类转置操作
transpose
swapaxes#arr.swapaxes(2,1) #就是将第三个维度和第二个维度交换
rollaxis
参考:https://blog.csdn.net/liaoyuecai/article/details/80193996
9、覆瓦式数据拓展
my_start_array
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
np.tile(my_start_array,3)
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11])
--------
np.tile(my_start_array,3).reshape((3,12))
array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]])
my_second_array
array([0, 1, 2, 3, 4, 5, 6])
np.repeat(my_second_array,3)
array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6])
my_repeatable_array = np.array(np.arange(24)).reshape(2,3,4)
my_repeatable_array
array([[[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]],
[[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23]]])
np.repeat(my_repeatable_array,2,axis=0)
array([[[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]],
[[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]],
[[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23]],
[[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23]]])
np.repeat(my_repeatable_array,2,axis=1)
array([[[ 0, 1, 2, 3],
[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11],
[ 8, 9, 10, 11]],
[[12, 13, 14, 15],
[12, 13, 14, 15],
[16, 17, 18, 19],
[16, 17, 18, 19],
[20, 21, 22, 23],
[20, 21, 22, 23]]])
np.repeat(my_repeatable_array,2,axis=2)
array([[[ 0, 0, 1, 1, 2, 2, 3, 3],
[ 4, 4, 5, 5, 6, 6, 7, 7],
[ 8, 8, 9, 9, 10, 10, 11, 11]],
[[12, 12, 13, 13, 14, 14, 15, 15],
[16, 16, 17, 17, 18, 18, 19, 19],
[20, 20, 21, 21, 22, 22, 23, 23]]])
网友评论