一、Numpy知识点
import numpy as np
x = np.arange(10)
x # array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) # 1-D vector
X = np.arange(15).reshape((5,3))
X
# array([[ 0, 1, 2], # 2-D matrix (5 rows x 3 cols)
# [ 3, 4, 5],
# [ 6, 7, 8],
# [ 9, 10, 11],
# [12, 13, 14]])
1. 访问
# Basic element access: integer indexing.
x[2]
# 2
X[1,1]  # row 1, column 1 of the 2-D array
# 4
2. 切片
格式:[行切片,列切片]
# 1-D slice: elements from index 5 to the end.
x[5:]
# array([5, 6, 7, 8, 9])
# [row slice, column slice]
X[2:4,1:]
# array([[ 7, 8],
# [10, 11]])
X[2:,:2]
# array([[ 6, 7],
# [ 9, 10],
# [12, 13]])
如果说不关心reshape的另外一个参数,我们可以写成-1,numpy自动推导出这个参数
# -1 tells numpy to infer that dimension (10 elements / 5 rows -> 2 cols).
x.reshape(5,-1)
# array([[0, 1],
# [2, 3],
# [4, 5],
# [6, 7],
# [8, 9]])
3. numpy的运算
numpy's universal function
# Universal functions: arithmetic and np.sin apply elementwise to the array.
X
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11],
[12, 13, 14]])
# scalar addition broadcasts over every element
X + 1
array([[ 1, 2, 3],
[ 4, 5, 6],
[ 7, 8, 9],
[10, 11, 12],
[13, 14, 15]])
# scalar multiplication, also elementwise
X * 2
array([[ 0, 2, 4],
[ 6, 8, 10],
[12, 14, 16],
[18, 20, 22],
[24, 26, 28]])
# np.sin is a ufunc: applied to each element independently
np.sin(X)
array([[ 0. , 0.84147098, 0.90929743],
[ 0.14112001, -0.7568025 , -0.95892427],
[-0.2794155 , 0.6569866 , 0.98935825],
[ 0.41211849, -0.54402111, -0.99999021],
[-0.53657292, 0.42016704, 0.99060736]])
4. NUMPY中的argsort()
argsort 返回的不是排序后的元素本身,而是能将数组按升序排列的下标(索引)序列
x = np.arange(16)
x
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
# shuffle the array in place (numpy's own RNG, not random.shuffle)
# from random import shuffle
np.random.shuffle(x)
x
array([ 1, 8, 6, 12, 4, 14, 13, 7, 0, 15, 11, 3, 9, 5, 2, 10])
# argsort returns the indices that would sort the array ascending
np.argsort(x)
array([ 8, 0, 14, 11, 4, 13, 2, 7, 1, 12, 15, 10, 3, 6, 5, 9],
dtype=int64)
5.Numpy 中的布尔索引
布尔型索引可以应用于数据的筛选
布尔型索引应用于修改值
# Boolean indexing: use a boolean mask to select and to assign rows.
names = np.array(['Bob','Joe','Will','Bob','Will','Joe','Joe'])
names
array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], dtype='<U4')
# use np.random.randn to generate normally distributed random data
data = np.random.randn(7,4)
data
array([[ 1.8450457 , 1.91791784, -0.90133072, -0.96715706],
[ 0.26275727, 1.27134679, 0.33692668, -1.00586409],
[-0.60143482, -1.41361787, 0.62431237, 0.50040347],
[ 0.0129754 , 1.92856064, 1.3711845 , -1.17564517],
[ 1.43999704, -0.87670553, 0.23952736, -0.64149065],
[-0.81460157, -1.7537682 , -0.82011688, -0.29424883],
[-1.00896275, -1.38725507, -1.03945652, -1.19849684]])
# assume each name corresponds to one row of data
names == 'Bob'
array([ True, False, False, True, False, False, False])
# boolean masks can be used to filter rows
data[names =='Bob']
array([[ 1.8450457 , 1.91791784, -0.90133072, -0.96715706],
[ 0.0129754 , 1.92856064, 1.3711845 , -1.17564517]])
# boolean masks can also be used to assign values
# select every 'Joe' row and set it to 0
data[names == 'Joe'] = 0
data
array([[ 1.8450457 , 1.91791784, -0.90133072, -0.96715706],
[ 0. , 0. , 0. , 0. ],
[-0.60143482, -1.41361787, 0.62431237, 0.50040347],
[ 0.0129754 , 1.92856064, 1.3711845 , -1.17564517],
[ 1.43999704, -0.87670553, 0.23952736, -0.64149065],
[ 0. , 0. , 0. , 0. ],
[ 0. , 0. , 0. , 0. ]])
# select every 'Will' row and zero out its last two columns
data[names == 'Will',2:] = 0
data
array([[ 1.8450457 , 1.91791784, -0.90133072, -0.96715706],
[ 0. , 0. , 0. , 0. ],
[-0.60143482, -1.41361787, 0. , 0. ],
[ 0.0129754 , 1.92856064, 1.3711845 , -1.17564517],
[ 1.43999704, -0.87670553, 0. , 0. ],
[ 0. , 0. , 0. , 0. ],
[ 0. , 0. , 0. , 0. ]])
二、绘制散点图
使用 matplotlib 绘制散点图
三、鸢尾花数据集散点图绘制
sklearn库封装了很多机器学习算法
1
2
3
四、机器学习初识
监督学习(supervised learning),无监督学习(unsupervised learning),半监督学习(Semi-Supervised Learning),强化学习(reinforcement Learning )
- 监督学习(supervised learning)和无监督学习(unsupervised learning)的判断:是否有监督(supervised),就看输入数据是否有标签(label)。输入数据有标签,则为有监督学习,没标签则为无监督学习。
- 监督学习:回归(Regression,连续)、分类(Classification,离散)
- 无监督学习:聚类(clustering)
分类算法KNN:
K近邻算法,即K-Nearest Neighbor algorithm,简称KNN算法。
可认为是:找出距离待预测样本最近的 K 个邻居,由这 K 个邻居的类别投票决定预测结果。
实例:肿瘤良,恶性判断(手动实现)
自己实现KNN算法
数据集
import numpy as np
from matplotlib import pyplot as plt
from math import sqrt # square root
from collections import Counter # Counter is very handy for vote counting
# Training features: 10 points in 2-D (two measurements per tumor sample).
raw_data_X = [[3.393533211, 2.331273381],
[3.110073483, 1.781539638],
[1.343808831, 3.368360954],
[3.582294042, 4.679179110],
[2.280362439, 2.866990263],
[7.423436942, 4.696522875],
[5.745051997, 3.533989803],
[9.172168622, 2.511101045],
[7.792783481, 3.424088941],
[7.939820817, 0.791637231]
]
# Training labels: 0 = benign, 1 = malignant (first 5 vs last 5 rows).
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
# convert to ndarray
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)
对数据进行可视化
# Visualize the training set: class 0 in green, class 1 in red.
plt.scatter(X_train[y_train==0,0],X_train[y_train==0,1],color='g')
plt.scatter(X_train[y_train==1,0],X_train[y_train==1,1],color='r')
plt.show()
数据可视化
预测
# Suppose a new sample x arrives; predict whether it is malignant or benign.
x = np.array([8.093607318, 3.365731514])
plt.scatter(X_train[y_train==0,0],X_train[y_train==0,1],color='g')
plt.scatter(X_train[y_train==1,0],X_train[y_train==1,1],color='r')
plt.scatter(x[0],x[1],color='b')  # the query point, in blue
plt.show()
预测
通过knn算法来预测
# Compute the distance from x to all ten training points, then pick the k nearest.
# distances = []
# for x_train in X_train:
# d = sqrt(np.sum((x_train-x)**2))
# distances.append(d)
distances = [sqrt(np.sum((x_train-x)**2)) for x_train in X_train] # Euclidean distance
distances
'''
[4.812566907609877,
5.229270827235305,
6.749798999160064,
4.6986266144110695,
5.83460014556857,
1.4900114024329525,
2.354574897431513,
1.3761132675144652,
0.3064319992975,
2.5786840957478887]
'''
# Indices of the training points sorted by distance to x, closest first.
nearest = np.argsort(distances)
nearest
# array([8, 7, 5, 6, 9, 3, 0, 1, 4, 2], dtype=int64)
# Choose k = 6 neighbours.
k = 6
# Labels of the k nearest neighbours.
# Bug fix: slice with k, not a hard-coded 6, so changing k above takes effect.
top_k_y = [y_train[i] for i in nearest[:k]]
top_k_y
# [1, 1, 1, 1, 1, 0]
当类别数量较多、需要统计投票时,可以使用 collections.Counter 来计数
# Count the votes among the k nearest labels.
votes = Counter(top_k_y)
votes
# Counter({1: 5, 0: 1})
# most_common(i) returns the i most frequent (label, count) pairs
votes.most_common(1)
predict_y = votes.most_common(1)[0][0]
predict_y
1
# conclusion: x is predicted malignant (label 1)
网友评论