这段代码来自《机器学习实战》
注释是我自己加的,英语不好应该有很多语法甚至单词错误,多多包涵
也没什么解释,因为需要注意的都在代码里面了
# The k-nearest neighbors (kNN) algorithm classifies a sample whose class
# is unknown. We start from samples whose classes are already known,
# compute how far the new sample is from each of them, then take the
# nearest ones and assign the new sample to the class most common among them.
# #
from numpy import *
import operator
def createDataSet():
    """Build the four-sample demo data set used by the kNN example.

    The samples form two clusters on a 2-D plane (x is the first column,
    y is the second column):

        y
        1.1 |                     A
        1.0 |                     A
            |
        0.1 | B
        0.0 | B
            +------------------------ x
              0.0                 1.0

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 NumPy array holding
        one sample per row, and ``labels`` is a list with the class label
        of each row, in the same order.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
# Build the demo data at module level, then show the samples followed by
# their labels (one object per output line).
group, labels = createDataSet()
print(group, labels, sep="\n")
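# #
# About the parameters of the classify0 function:
#   inX     - the input vector that we want to classify
#   dataSet - the known samples (one row per sample) that inX is compared with
#   labels  - the class label of each row in dataSet
#   k       - how many of the nearest samples take part in the vote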
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Steps:
      1. Subtract every row of ``dataSet`` from ``inX``.
      2. Square the differences, sum them per row and take the square root
         to get the Euclidean distance. For [x, y, z] and [a, b, c] that is
         ((x - a)^2 + (y - b)^2 + (z - c)^2) ^ 0.5.
      3. Sort the row indices from the smallest distance to the largest.
      4. Count the labels of the first ``k`` rows and return the label with
         the highest count.

    Example with the four-sample demo data and ``inX = [0, 0]``:
    the distances are [1.48660687, 1.41421356, 0.0, 0.1], so the indices
    sorted by distance are [2, 3, 1, 0].

    Args:
        inX: The vector to classify, one value per feature.
        dataSet: The known samples, one row per sample (NumPy array or
            nested sequence) with the same number of features as ``inX``.
        labels: The class label of each row of ``dataSet``.
        k: The number of nearest samples that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` nearest samples.
        If several labels share the highest count, the one whose first
        vote came from the nearer sample wins.

    Raises:
        ValueError: If ``k`` is not between 1 and the number of samples,
            or if ``labels`` has fewer entries than ``dataSet`` has rows.
    """
    samples = asarray(dataSet)
    sample_count = samples.shape[0]
    if not 1 <= k <= sample_count:
        raise ValueError(
            "k must be between 1 and the number of samples (%d), got %r"
            % (sample_count, k)
        )
    if len(labels) < sample_count:
        raise ValueError("labels must provide one label per dataSet row")

    # Broadcasting subtracts each sample row from inX, so there is no need
    # to build a tiled copy of inX first.
    diffs = asarray(inX) - samples
    distances = (diffs ** 2).sum(axis=1) ** 0.5

    # A stable sort keeps equally distant samples in dataSet order, which
    # makes the result reproducible when distances tie.
    nearest = distances.argsort(kind="stable")[:k]

    # Tally the votes; the dict keeps labels in the order they were first seen.
    votes = {}
    for row in nearest:
        label = labels[row]
        votes[label] = votes.get(label, 0) + 1

    # key=itemgetter(1) ranks the (label, count) pairs by count, and
    # reverse=True puts the largest count first (descending order).
    # sorted() is stable, so tied labels stay in first-seen order.
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
# Demo: classify the point (0, 0) using its 3 nearest samples from the
# module-level demo data and print the resulting label.
print(classify0(inX=[0, 0], dataSet=group, labels=labels, k=3))
网友评论