# marry_data: the data file read by date_class_test() (path './marry_data')
from numpy import *
import operator
from os import listdir
def knn_class(inx, dataset, labels, k):
    """Classify one sample with the k-nearest-neighbours majority vote.

    Args:
        inx: Feature vector of the sample to classify. It must broadcast
            against the rows of ``dataset`` (same number of features).
        dataset: 2-D NumPy array of training samples, one sample per row.
        labels: Sequence of labels; ``labels[i]`` belongs to ``dataset[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` training samples
        closest to ``inx`` (Euclidean distance). With insertion-ordered
        dicts, a tie goes to the label whose first vote came from the
        nearer neighbour.

    Raises:
        IndexError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``dataset``.
    """
    # Broadcasting subtracts inx from every row; the sign is irrelevant
    # because the differences are squared.
    diff_mat = dataset - inx
    sq_distances = (diff_mat ** 2).sum(axis=1)  # sum over features, per row
    distances = sq_distances ** 0.5
    # argsort() yields row indices ordered from nearest to farthest.
    sorted_dist_indicies = distances.argsort()

    # Count how many of the k nearest neighbours carry each label.
    class_count = {}
    for i in range(k):
        vote_label = labels[sorted_dist_indicies[i]]
        class_count[vote_label] = class_count.get(vote_label, 0) + 1

    # items() works on Python 2 and 3 (iteritems() is Python 2 only).
    # sorted() is used instead of max() because this file's
    # `from numpy import *` can shadow the builtin max.
    sorted_class_count = sorted(class_count.items(),
                                key=operator.itemgetter(1), reverse=True)
    return sorted_class_count[0][0]
def file_to_matrix(filename):
    """Parse a tab-separated data file into a feature matrix and labels.

    Every non-blank line must contain three numeric columns followed by a
    label column, for example (tab-separated)::

        40920   8.326976    0.953952    largeDoses

    Args:
        filename: Path of the data file to read.

    Returns:
        A tuple ``(data_matrix, class_label)``. ``data_matrix`` is an
        ``n x 3`` float array holding the first three columns of each
        non-blank line. ``class_label`` is a list with one entry per row:
        the last column as an int when it consists only of digits,
        otherwise its code from the name table below (``None`` for a name
        that is not in the table).

    Raises:
        ValueError: If one of the first three columns of a line is missing
            or is not a number.
    """
    love_dict = {'largeDoses': 3, 'smallDoses': 2, 'didntLike': 1}

    # The with-block closes the file even if parsing fails. Blank lines
    # (such as a trailing newline) are skipped so they neither crash the
    # parse nor leave empty rows in the matrix.
    with open(filename) as data_file:
        rows = [line.strip().split('\t') for line in data_file if line.strip()]

    data_matrix = zeros((len(rows), 3))  # one row per record, 3 features
    class_label = []
    for idx, msgs in enumerate(rows):
        data_matrix[idx, :] = [float(value) for value in msgs[0:3]]
        label = msgs[-1]
        if label.isdigit():
            class_label.append(int(label))
        else:
            # Map the textual label to its numeric code (3 / 2 / 1).
            class_label.append(love_dict.get(label))
    return data_matrix, class_label
# Normalize the data into the range [0, 1]
def auto_norm(data_set):
    """Min-max scale every column of ``data_set`` into the range [0, 1].

    Each value ``z`` of a column with minimum ``x`` and maximum ``y`` becomes
    ``(z - x) / (y - x)``; e.g. for the range [1, 9], ``z = 4`` maps to
    ``(4 - 1) / (9 - 1)``. A column whose values are all equal has a range
    of 0 and is mapped to 0 instead of producing NaN.

    Args:
        data_set: 2-D NumPy array, one sample per row, one feature per column.

    Returns:
        A tuple ``(norm_data, ranges, min_vals)``: the scaled array, the
        per-column ``max - min`` (0 for a constant column), and the
        per-column minimum.
    """
    min_vals = data_set.min(0)  # column-wise minimum
    max_vals = data_set.max(0)  # column-wise maximum
    ranges = max_vals - min_vals
    row_size = data_set.shape[0]
    print('row_size', row_size)
    # Divide constant columns by 1 rather than 0; their numerator is already
    # 0, so they normalize to 0. The returned ``ranges`` is left untouched.
    safe_ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column minimum and range to every row.
    norm_data = (data_set - min_vals) / safe_ranges
    return norm_data, ranges, min_vals
# Format of the records in the data file (tab-separated columns):
# 40920 8.326976 0.953952 largeDoses
# 14488 7.153469 1.673904 smallDoses
# 26052 1.441871 0.805124 didntLike
# 75136 13.147394 0.428964 didntLike
# 38344 1.669788 0.134296 didntLike
# 72993 10.141740 1.032955 didntLike
def date_class_test(ratio=0.1, filename='./marry_data', k=3):
    """Measure the hold-out error rate of the k-NN classifier on a data file.

    The first ``ratio`` share of the records is used as the test set and the
    remaining records as the training set. Features are min-max normalized
    first so that every feature carries the same weight in the distance.

    Args:
        ratio: Fraction of the records held out for testing (default 0.1,
            i.e. 90% training / 10% validation).
        filename: Path of the data file handed to ``file_to_matrix``.
        k: Number of neighbours handed to ``knn_class``.

    Returns:
        The percentage (0-100, float) of test records that were
        misclassified.

    Raises:
        ZeroDivisionError: If ``int(record_count * ratio)`` is 0, i.e. there
            are no test records.
    """
    data_matrix, class_label = file_to_matrix(filename)
    norm_matrix = auto_norm(data_matrix)[0]  # only the scaled matrix is needed
    norm_size = norm_matrix.shape[0]
    test_num = int(norm_size * ratio)

    # The training portion is the same for every test record; slice it once.
    train_matrix = norm_matrix[test_num:norm_size, :]
    train_labels = class_label[test_num:norm_size]

    error_count = 0.0
    for i in range(test_num):
        result = knn_class(norm_matrix[i, :], train_matrix, train_labels, k)
        if result != class_label[i]:
            error_count += 1.0
    return (error_count / float(test_num)) * 100
# Report the hold-out error rate as a percentage. The formatting is done
# inside the print() call so it also works on Python 3, where print()
# returns None and `print(...) % value` raises TypeError. '%.1f' keeps the
# fractional part that '%d' would truncate.
print('error_count: %.1f%%' % date_class_test())  # original author noted 5.0%
# Reader comments (section heading left over from the source web page)