最近生统课上有一个作业,要求不调用现成的包、自己实现一个 KNN 算法,用于预测鸢尾花(iris)数据集中样本的品种。代码如下:
# Six hand-written flowers to classify; the columns mirror the four numeric
# measurement columns of the built-in iris data set.
test_samples <- data.frame(
  Sepal.Length = c(6.1, 5.9, 6.7, 5.6, 7.0, 6.5),
  Sepal.Width  = c(2.5, 5.0, 4.0, 3.1, 3.6, 3.2),
  Petal.Length = c(1.7, 2.0, 6.5, 1.5, 6.3, 4.8),
  Petal.Width  = c(0.3, 1.2, 2.2, 0.1, 2.5, 1.5),
  row.names    = paste0("sample", seq_len(6))  # sample1 ... sample6
)
# Show the samples (auto-printed when run at top level).
test_samples
# Euclidean distance between two equal-length numeric feature vectors.
#
# Args:
#   vector1: numeric vector (or one-row data frame) of feature values.
#   vector2: numeric vector (or one-row data frame) with the same features
#            in the same order.
#
# Returns:
#   A single non-negative number, sqrt(sum((vector1 - vector2)^2)).
cal.dist <- function(vector1, vector2){
  # Flatten both inputs so a one-row data frame is treated exactly like a
  # plain numeric vector.
  v.diff <- as.numeric(unlist(vector1)) - as.numeric(unlist(vector2))
  # The square root has to be applied AFTER summing the squares; taking it
  # element-wise yields sum(|diff|), i.e. the Manhattan distance.
  sqrt(sum(v.diff^2))
}
# Classify one sample by majority vote among its k nearest training rows.
#
# Args:
#   train: data frame holding numeric feature columns plus at least one
#          non-numeric column; the first non-numeric column is the class label.
#   test:  the sample to classify (numeric vector or one-row data frame) whose
#          values line up with the numeric columns of `train`.
#   k:     number of neighbours that vote (default 3); values larger than
#          nrow(train) are clamped to nrow(train).
#
# Returns:
#   The winning class label as a character string. When several classes tie,
#   the one that comes first in the label's level order is returned.
k.nearest.neighbors <- function(train, test, k=3){
  stopifnot(is.data.frame(train), nrow(train) > 0)
  if (!is.numeric(k) || length(k) != 1 || is.na(k) || k < 1) {
    stop("k must be a single number >= 1", call. = FALSE)
  }

  is.feature <- vapply(train, is.numeric, logical(1))
  if (!any(is.feature)) {
    stop("train must contain at least one numeric feature column",
         call. = FALSE)
  }
  if (all(is.feature)) {
    stop("train must contain a non-numeric label column", call. = FALSE)
  }

  # drop = FALSE keeps a data frame even when there is only one feature
  # column; otherwise apply() below would receive a bare vector and fail.
  train.numeric <- train[, is.feature, drop = FALSE]
  train.label <- train[[which(!is.feature)[1]]]

  # Distance from the test sample to every training row.
  distances <- apply(train.numeric, MARGIN = 1, cal.dist, test)

  # Keep the labels of the k closest rows (never more rows than exist).
  n.votes <- min(floor(k), length(distances))
  nearest <- order(distances, decreasing = FALSE)[seq_len(n.votes)]

  # Vote: which.max() returns the first maximum, so ties resolve to the
  # earliest label in table order.
  counts <- table(train.label[nearest])
  names(counts)[which.max(counts)]
}
# main: classify every row of test_samples against the iris training data
# and print the predicted species, one per line.
# seq_len() is used so that an empty test_samples frame runs zero iterations
# (1:nrow(...) would iterate over c(1, 0)).
for (i in seq_len(nrow(test_samples))){
  line <- test_samples[i, ]
  print(k.nearest.neighbors(iris, line, k=3)) # print the output vote results
}
预测结果如下:
setosa
setosa
virginica
setosa
virginica
versicolor
网友评论