k-临近算法是采用测量不同特征值之间的距离方法进行分类。
优点:
精度高,对异常值不敏感,无数据输入的假定
缺点:
计算复杂度、空间复杂度高
适用数据范围: 数值型和标称型
计算步骤
(1) 计算已知类别数据集中的点与当前点之间的距离
(2) 按照距离递增次序排列
(3) 选取与当前点距离最小的k个点
(4) 确定前k个点所在的类别出现频率
(5) 返回前k个点出现频率最高的类别作为当前点的预测分类
from numpy import *
import operator
def create_dataset():
    """Return the toy training set: four 2-D points and their class labels.

    Returns:
        (ndarray, list): a (4, 2) array of feature vectors and the
        matching list of label strings.
    """
    points = [[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]]
    labels = ['A', 'A', 'B', 'B']
    return array(points), labels
def normalize(dataset):
    """Min-max normalize each feature column of *dataset* into [0, 1].

    Args:
        dataset: (m, n) numpy array of numeric feature values.

    Returns:
        (norm_dataset, ranges, min_vals): the normalized (m, n) array,
        the per-column range (max - min), and the per-column minimum.
        `ranges` and `min_vals` let a caller apply the same scaling
        to new samples.
    """
    min_vals = dataset.min(0)
    max_vals = dataset.max(0)
    ranges = max_vals - min_vals
    # Guard against constant columns: the original divided by `ranges`
    # directly, producing NaN whenever a feature had zero spread.
    safe_ranges = ranges.copy()
    safe_ranges[safe_ranges == 0] = 1
    # NumPy broadcasting replaces the explicit tile() of the original.
    norm_dataset = (dataset - min_vals) / safe_ranges
    return norm_dataset, ranges, min_vals
def classify(in_vector, dataset, labels, k):
    """Classify *in_vector* by majority vote among its k nearest neighbors.

    Args:
        in_vector: the feature vector to classify (length-n sequence).
        dataset: (m, n) numpy array of known training points.
        labels: length-m list of class labels, aligned with `dataset` rows.
        k: number of nearest neighbors that vote.

    Returns:
        The label that occurs most often among the k nearest points
        (Euclidean distance); ties resolve by the first-encountered label.
    """
    dataset_size = dataset.shape[0]
    # Broadcast the query against every training row and take the
    # squared per-feature differences.
    diff_mat = tile(in_vector, (dataset_size, 1)) - dataset
    sq_diff = diff_mat ** 2
    # The original called diff_mat.argsort() here and threw the result
    # away — dead code, removed.
    sq_distance = sq_diff.sum(axis=1)
    distances = sq_distance ** 0.5
    # Indices of training points in increasing order of distance.
    sorted_indices = distances.argsort()
    # Tally the labels of the k closest points.
    class_count = {}
    for i in range(k):
        vote_label = labels[sorted_indices[i]]
        class_count[vote_label] = class_count.get(vote_label, 0) + 1
    # .items() instead of the Python-2-only .iteritems().
    sorted_votes = sorted(class_count.items(),
                          key=operator.itemgetter(1), reverse=True)
    return sorted_votes[0][0]
if __name__ == '__main__':
    dataset, labels = create_dataset()
    # Normalize so no single feature dominates the distance metric.
    norm_dataset, ranges, min_vals = normalize(dataset)
    label = classify([0.1, 0], norm_dataset, labels, 2)
    # print() function call instead of the Python-2 print statement.
    print(label)
网友评论