1knn算法
import numpy as np
import matplotlib.pyplot as plt
# Toy training set: ten 2-D samples; the first five are labelled 0 and the
# last five are labelled 1.
row_data_x = [
    [3.39, 2.33],
    [3.11, 1.78],
    [1.34, 3.37],
    [3.58, 4.68],
    [2.28, 2.87],
    [7.42, 4.69],
    [5.75, 3.53],
    [9.17, 2.51],
    [7.79, 3.42],
    [7.93, 0.79],
]
row_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

x_train = np.array(row_data_x)
y_train = np.array(row_data_y)

# Query point whose class we want to predict.
x = np.array([8.09, 3.37])

# Boolean masks pick the rows of each class; columns 0 and 1 are the two
# features plotted on the horizontal and vertical axes.
plt.scatter(x_train[y_train == 0, 0], x_train[y_train == 0, 1], color='g', marker='o')
plt.scatter(x_train[y_train == 1, 0], x_train[y_train == 1, 1], color='r', marker='+')
plt.scatter(x[0], x[1], color='b')
plt.show()
1.1knn的过程
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
from collections import Counter

# Toy training set: ten 2-D samples; the first five are labelled 0 and the
# last five are labelled 1.
row_data_x = [
    [3.39, 2.33],
    [3.11, 1.78],
    [1.34, 3.37],
    [3.58, 4.68],
    [2.28, 2.87],
    [7.42, 4.69],
    [5.75, 3.53],
    [9.17, 2.51],
    [7.79, 3.42],
    [7.93, 0.79],
]
row_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

x_train = np.array(row_data_x)
y_train = np.array(row_data_y)

# Query point whose class we want to predict.
x = np.array([8.09, 3.37])

# Step 1: Euclidean distance from the query point to every training sample.
distances = [sqrt(np.sum((sample - x) ** 2)) for sample in x_train]

# Step 2: indices of the training samples ordered from nearest to farthest.
nearest = np.argsort(distances)

# Step 3: labels of the k nearest samples.
k = 6
topK_y = [y_train[i] for i in nearest[:k]]

# Step 4: majority vote among those labels.
votes = Counter(topK_y)
# The two most frequent labels, as (label, count) pairs.
tem = votes.most_common(2)
# The single most frequent label is the prediction.
predict_y = votes.most_common(1)[0][0]
1.2scikit-learn中的knn
from sklearn.neighbors import KNeighborsClassifier

# Build a 6-nearest-neighbours classifier and fit it on the training set
# (x_train, y_train and x are the arrays prepared in the previous snippet).
Knn_classifier = KNeighborsClassifier(n_neighbors=6)
Knn_classifier.fit(x_train, y_train)

# predict() expects a 2-D array with one row per sample, so the single 1-D
# query point is reshaped into a one-row matrix before predicting.
x_predict = x.reshape(1, -1)
result = Knn_classifier.predict(x_predict)
1.3scikit-learn机器学习算法封装
import numpy as np
from math import sqrt
from collections import Counter
def KNN_classify(k, X_train, y_train, x):
    """Predict the label of a single sample with k-nearest-neighbours voting.

    Args:
        k: Number of neighbours that vote; must satisfy
            ``1 <= k <= X_train.shape[0]``.
        X_train: 2-D array of training samples, one row per sample.
        y_train: 1-D array of labels with one entry per row of ``X_train``.
        x: 1-D feature vector of the sample to classify.

    Returns:
        The label that occurs most often among the ``k`` training samples
        closest to ``x`` (Euclidean distance).

    Raises:
        AssertionError: If ``k`` is out of range or the number of samples and
            labels differ.
    """
    assert 1 <= k <= X_train.shape[0], 'k must be valid'
    assert X_train.shape[0] == y_train.shape[0], '两个数据集尺寸需相同'

    # Euclidean distance from x to every training sample.
    distances = [sqrt(np.sum((sample - x) ** 2)) for sample in X_train]
    # Indices of the training samples ordered from nearest to farthest.
    nearest = np.argsort(distances)

    top_k_labels = [y_train[i] for i in nearest[:k]]
    votes = Counter(top_k_labels)
    return votes.most_common(1)[0][0]
1.4封装自己的sklearn近邻算法
import numpy as np
from math import sqrt
from collections import Counter


class KNNClassifier:
    """Minimal k-nearest-neighbours classifier with a scikit-learn-like API.

    Call ``fit`` to store the training data, then ``predict`` to label new
    samples by majority vote among the ``k`` closest training samples
    (Euclidean distance).
    """

    def __init__(self, k):
        """Create a classifier that lets ``k`` neighbours vote.

        Args:
            k: Number of neighbours; must be at least 1.

        Raises:
            AssertionError: If ``k`` is smaller than 1.
        """
        assert k >= 1, "k must be valid"
        self.k = k
        # Training data is stored by fit(); None means "not fitted yet".
        self._x_train = None
        self._y_train = None

    def fit(self, x_train, y_train):
        """Store the training set; kNN has no training step beyond this.

        Args:
            x_train: 2-D array of training samples, one row per sample.
            y_train: 1-D array of labels, one per row of ``x_train``.

        Returns:
            This classifier, so calls can be chained.

        Raises:
            AssertionError: If sample and label counts differ, or there are
                fewer than ``k`` training samples.
        """
        assert x_train.shape[0] == y_train.shape[0], "the size of x_train must be equal to the size of y_train"
        assert self.k <= x_train.shape[0], "the size of x_train must be at least k"
        self._x_train = x_train
        self._y_train = y_train
        return self

    def predict(self, x_predict):
        """Predict a label for every row of ``x_predict``.

        Args:
            x_predict: 2-D array of samples with the same number of columns
                as the training data.

        Returns:
            1-D numpy array with one predicted label per input row.

        Raises:
            AssertionError: If called before ``fit`` or the feature counts
                do not match.
        """
        assert self._x_train is not None and self._y_train is not None, "must fit before predict"
        assert x_predict.shape[1] == self._x_train.shape[1], "the feature number of x_predict must be equal to x_train"
        y_predict = [self._predict(x) for x in x_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Predict the label of one 1-D sample ``x`` by majority vote."""
        assert x.shape[0] == self._x_train.shape[1], "the feature number of x must be equal to x_train"
        # Distance to every training row at once (broadcast over rows).
        distances = np.sqrt(np.sum((self._x_train - x) ** 2, axis=1))
        nearest = np.argsort(distances)
        top_k_labels = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(top_k_labels)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return "KNNClassifier(k=%d)" % self.k
2算法效率测试
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

# Load the iris dataset: x holds the feature rows, y the class labels.
iris = datasets.load_iris()
x = iris.data
y = iris.target

# Shuffle the row indices 0..len(x)-1; features and labels are then indexed
# with the same permutation so they stay aligned.
shuffle_indexes = np.random.permutation(len(x))

# Fraction of the data held out for testing.
test_ratio = 0.2
# Number of samples in the test set.
test_size = int(len(x) * test_ratio)

# The first test_size shuffled indices form the test set, the rest the
# training set.
test_indexes = shuffle_indexes[:test_size]
train_indexes = shuffle_indexes[test_size:]

x_train = x[train_indexes]
y_train = y[train_indexes]
x_test = x[test_indexes]
y_test = y[test_indexes]

# Create a 3-nearest-neighbours classifier and fit it on the training set.
my_knn_clf = KNeighborsClassifier(n_neighbors=3)
my_knn_clf.fit(x_train, y_train)

# Predict the held-out samples.
y_predict = my_knn_clf.predict(x_test)

# Accuracy: fraction of test samples whose prediction matches the true label.
percent = sum(y_predict == y_test) / len(y_test)
2数字识别
在 IPython 中执行 ??func,可以查看函数 func 的源码
from sklearn import datasets
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the handwritten-digits dataset and print its keys and description.
digits = datasets.load_digits()
print(digits.keys())
descr = digits.DESCR
print(descr)

x = digits.data
y = digits.target

# One sample is a flat vector of 64 values; reshape it to 8x8 so it can be
# drawn as an image.
some_digit = x[666]
some_digit_image = some_digit.reshape(8, 8)
# Per the original note, this sample renders as a handwritten 0.
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
plt.show()

# Hold out 20% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit a 3-nearest-neighbours classifier and predict the held-out samples.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
y_predict = knn.predict(x_test)

# Accuracy computed three ways: by hand, with accuracy_score, and with the
# classifier's own score() (which takes the raw test features and labels).
sum(y_predict == y_test) / len(y_test)
accuracy_score(y_test, y_predict)
knn.score(x_test, y_test)
3封装自己的训练测试分割集
import numpy as np
def train_test_split(x, y, test_radio=0.2, random_state=None):
    """Randomly split features and labels into training and test subsets.

    Args:
        x: Array of samples, one row per sample.
        y: Array of labels with one entry per row of ``x``.
        test_radio: Fraction of the samples placed in the test set; must lie
            in ``[0, 1]``. The test size is ``int(n_samples * test_radio)``.
        random_state: Optional seed passed to ``np.random.seed`` so the split
            is reproducible. ``None`` leaves the global random state as is.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        AssertionError: If ``x`` and ``y`` have different numbers of rows or
            ``test_radio`` is outside ``[0, 1]``.
    """
    assert x.shape[0] == y.shape[0], 'x和y行数需相同'
    assert 0 <= test_radio <= 1, '分割比例需在0-1之间'

    # Compare against None rather than truthiness so that a seed of 0 is
    # honoured instead of being silently ignored.
    if random_state is not None:
        np.random.seed(random_state)

    # One shared permutation keeps each sample paired with its label.
    shuffled_indexes = np.random.permutation(x.shape[0])
    test_size = int(x.shape[0] * test_radio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    x_train = x[train_indexes]
    y_train = y[train_indexes]
    x_test = x[test_indexes]
    y_test = y[test_indexes]
    return x_train, x_test, y_train, y_test
网友评论