k近邻算法是比较简单的机器学习算法。不需要较多的数学知识就能理解它的重要思想。k近邻算法可以用来解决分类问题,也可以用来解决回归问题。
我们先以分类问题为例。今天是个下雨天,那就以下雨为例:假设我们已知一些坐标点,并且知道这些坐标点是否下了雨。通过这些训练数据,我们能否判断一个新的坐标点是否下雨呢?
k近邻算法的做法是:在训练样本中寻找与测试样本距离最近的k个点,然后查看这k个点中哪个类别占的比较多,就把新的点归为这一类。
# X_train.shape[0]表示的是样本的个数,X_train.shape[1]表示的是特征维度
# y_train是一维数组,它只有.shape[0],即y_train.shape[0]表示的是样本个数
# x也是一维数组,x.shape[0]表示的是测试样本的特征维度(即特征个数)
import numpy as np
from math import sqrt
from collections import Counter
def kNN_classify(K, X_train, y_train, x):
    """Classify one sample by majority vote among its K nearest training samples.

    Distances are Euclidean and are computed for all training samples at once.

    Args:
        K: Number of neighbours that vote; must satisfy
            1 <= K <= number of training samples.
        X_train: 2-D array-like with one row per training sample and one
            column per feature.
        y_train: 1-D array-like of labels, one per row of ``X_train``.
        x: 1-D array-like holding the features of the sample to classify.

    Returns:
        The label that occurs most often among the K nearest training samples.

    Raises:
        AssertionError: If K is out of range or the input shapes do not agree.
    """
    # Accept plain Python lists as well as ndarrays.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    x = np.asarray(x)

    assert 1 <= K <= X_train.shape[0], "K should be in range of [1, 样本个数]"
    assert X_train.shape[0] == y_train.shape[0], "the number of sample should correspond to the number of label"
    assert X_train.shape[1] == x.shape[0], "the feature number of train should correspond to the feature number of test"

    # Broadcasting subtracts x from every training row in a single step.
    distances = np.sqrt(np.sum((X_train - x) ** 2, axis=1))
    # A stable sort keeps equidistant samples in their original order, so the
    # set of K neighbours is deterministic.
    nearest_index = np.argsort(distances, kind="stable")[:K]
    votes = Counter(y_train[i] for i in nearest_index)
    return votes.most_common(1)[0][0]
# Toy training set: ten 2-D samples, the first five labelled 0 and the last
# five labelled 1.
raw_data_X = [
    [3.39, 2.33],
    [3.11, 1.78],
    [1.34, 3.36],
    [3.58, 4.67],
    [2.28, 2.86],
    [7.42, 4.69],
    [5.74, 3.53],
    [9.17, 2.51],
    [7.79, 3.42],
    [7.93, 0.79],
]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)

# Classify one query sample using its 3 nearest neighbours.
x = np.array([8.09, 3.36])
label = kNN_classify(3, X_train, y_train, x)
# The call form with a single argument prints the same text on Python 2 and
# Python 3; the statement form `print label` is a SyntaxError on Python 3.
print(label)
使用sklearn中的k近邻(kNN)分类器
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Build a k-nearest-neighbours classifier configured with n_neighbors=6.
kNN_classifier = KNeighborsClassifier(n_neighbors=6)
# Fit on X_train / y_train, which must already exist at module level.
kNN_classifier.fit(X_train, y_train)
# reshape(1, -1) turns the 1-D sample x into a single-row 2-D array.
# NOTE(review): the prediction is neither stored nor printed here; presumably
# this was run interactively. Confirm whether the result should be shown.
kNN_classifier.predict(x.reshape(1, -1))
使用类来封装KNN代码
# -*- coding: UTF-8 -*-
import numpy as np
from math import sqrt
from collections import Counter
class KNNClassifier(object):
    """k-nearest-neighbours classifier with a fit/predict interface.

    ``fit`` stores the training data; ``predict`` labels each query sample by
    majority vote among the ``k`` training samples closest to it (Euclidean
    distance).
    """

    def __init__(self, k):
        """Create an unfitted classifier that votes among ``k`` neighbours.

        Args:
            k: Number of neighbours that vote; an integer >= 1. NumPy integer
                scalars are accepted as well as Python ints.

        Raises:
            AssertionError: If ``k`` is not an integer or is smaller than 1.
        """
        assert isinstance(k, (int, np.integer)), "k must be an int value"
        assert k >= 1, "k must be valid"
        self.k = k
        # Both stay None until fit() is called; predict() checks for that.
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Args:
            X_train: 2-D array, one row per sample and one column per feature.
            y_train: 1-D array of labels, one per row of ``X_train``.

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            AssertionError: If the sample and label counts differ, or if
                ``k`` exceeds the number of samples.
        """
        assert X_train.shape[0] == y_train.shape[0], "number of sample should be the same"
        assert self.k <= X_train.shape[0], "k must be less than the number of sample"
        self._X_train = X_train
        self._y_train = y_train
        # Returning self allows chained calls.
        return self

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        Args:
            X_predict: 2-D array of query samples with the same number of
                columns as the training data.

        Returns:
            A numpy array holding one predicted label per query row.

        Raises:
            AssertionError: If the classifier has not been fitted, or if the
                feature counts of query and training data differ.
        """
        assert self._X_train is not None and self._y_train is not None, "train data shouldn't be None"
        assert X_predict.shape[1] == self._X_train.shape[1], "number of features must be same between train and test samples"
        return np.array([self._predict(x) for x in X_predict])

    def _predict(self, x):
        """Return the majority label among the ``k`` nearest neighbours of ``x``.

        Args:
            x: 1-D array holding the features of a single query sample.

        Raises:
            AssertionError: If ``x`` has a different number of features than
                the training data.
        """
        assert self._X_train.shape[1] == x.shape[0], "the feature of train samples and x must be equal"
        # Broadcasting subtracts x from every training row in a single step.
        distances = np.sqrt(np.sum((self._X_train - x) ** 2, axis=1))
        # A stable sort keeps equidistant samples in their original order, so
        # the chosen neighbours are deterministic.
        nearest_index = np.argsort(distances, kind="stable")[: self.k]
        votes = Counter(self._y_train[i] for i in nearest_index)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        """Return a short description that includes the configured ``k``."""
        return "KNN(k={})".format(self.k)
网友评论