No packages! Roll your own KNN function (updated version)


Author: 陈有朴 | Published 2022-09-25 23:59

KNN, short for the K-Nearest Neighbors algorithm, is a non-parametric, supervised classification method.

This raises a question: how do you write a KNN algorithm in R?

First, we break the task of writing KNN down into the following problems:

1) How is the distance between an observation and the train dataset computed?

2) Once the distance matrix is obtained, how do we decide which class an observation belongs to?

The first problem has many possible answers; this article uses the Euclidean distance, computed as
    d(x, y) = \sqrt{(x_{1}-y_{1})^2+(x_{2}-y_{2})^2+\cdots+(x_{n}-y_{n})^2}
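To make the formula concrete, here is a quick hand check in base R (a throwaway snippet of mine, not part of the functions below):

    # d between x = (5, 3, 1) and y = (4, 1, 3): sqrt(1 + 4 + 4) = 3
    x <- c(5, 3, 1)
    y <- c(4, 1, 3)
    sqrt(sum((x - y)^2))  # 3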
For the second problem, my solution is:

take a vector of candidate values for K (e.g. starting at 1 and ending at 20), compute the classification accuracy of the KNN result for every K, repeat the simulation many times, and judge the final K from a boxplot of the results:

1) set a soft-power-style parameter sweep (cf. WGCNA)

2) bootstrapping

The code

This part uses the built-in iris dataset as the example.

It is hardcore for a beginner, but very easy for an experienced R programmer.

The code breaks down into the following parts:

1) a function that splits the original dataset: dataset.grouping

2) a function that computes Euclidean distance: euclidean_distance

3) the core KNN functions: k.nearest.neighbors and k.picker

4) the bootstrap step: bootstrap.df

While writing the code I forgot to allow for plugging in an external test dataset, so at the end I wrote one more function, k.nearest.neighbors.spec.

    # test dataset
    test_samples <- data.frame(Sepal.Length = c(6.1, 5.9, 6.7, 5.6, 7.0, 6.5),
                               Sepal.Width = c(2.5, 5.0, 4.0, 3.1, 3.6, 3.2),
                               Petal.Length = c(1.7, 2.0, 6.5, 1.5, 6.3, 4.8),
                               Petal.Width = c(0.3, 1.2, 2.2, 0.1, 2.5, 1.5),
                               row.names = paste('sample', 1:6, sep = ''))
    test_samples
    
    
    # Sub-functions
    # 1. Generation of training dataset and testing dataset (stratified 70/30 split)
    dataset.grouping <- function(data, group) {
        # Locate the grouping column and list its classes
        index <- which(names(data)==group)
        grouping.vector <- names(table(data[, index]))

        # Build empty data frames (assumes a 70/30 split that divides evenly,
        # as it does for iris: 105 train rows, 45 test rows)
        train.df <- data.frame(matrix(0, nrow = nrow(data)*0.7, ncol = ncol(data)))
        test.df <- data.frame(matrix(0, nrow = nrow(data)*0.3, ncol = ncol(data)))

        # Create train and test datasets class by class
        count.1 <- 1
        count.2 <- 1
        for (i in grouping.vector){
            df.i <- data[which(data[, index] == i), ]
            # Sample 70% of each class without replacement
            row.index <- sample(nrow(df.i), floor(nrow(df.i)*0.7), replace = FALSE)
            train.df.i <- df.i[row.index, ]
            test.df.i <- df.i[-row.index, ]

            # Append this class's samples to the train and test datasets
            count.3 <- count.1 + nrow(train.df.i) - 1
            count.4 <- count.2 + nrow(test.df.i) - 1
            train.df[count.1:count.3, ] <- train.df.i
            test.df[count.2:count.4, ] <- test.df.i
            count.1 <- count.1 + nrow(train.df.i)
            count.2 <- count.2 + nrow(test.df.i)
        }

        # Reform the grouping col: the class labels were coerced to numeric
        # codes during the row assignments above, so map them back to the names
        case.vector <- unique(train.df[, index])
        for (i in 1:length(case.vector)){
            train.df[which(train.df[, index] == case.vector[i]), index] <- grouping.vector[i]
            test.df[which(test.df[, index] == case.vector[i]), index] <- grouping.vector[i]
        }
        # Restore the column names lost in pre-allocation
        names(train.df) <- names(data); names(test.df) <- names(data)
        # Return the train df and test df
        return(list(train.df, test.df))
    }
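A quick way to sanity-check the split (a usage sketch I added; the counts below assume iris):

    # Stratified 70/30 split of iris
    set.seed(42)  # only for a reproducible split
    split.list <- dataset.grouping(iris, "Species")
    train.df <- split.list[[1]]; test.df <- split.list[[2]]
    dim(train.df)            # 105 x 5
    dim(test.df)             # 45 x 5
    table(train.df$Species)  # 35 rows per species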
    
    # 2. Define basic functions, e.g. distance
    euclidean_distance <- function(trainline, testline){
        # Euclidean distance between two numeric vectors of equal length
        if (length(trainline) == length(testline)){
            sqrt(sum((trainline - testline)^2))
        } else {
            stop('Vectors must have the same length')
        }
    }
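A one-line check that the function behaves (a 3-4-5 right triangle):

    euclidean_distance(c(0, 0), c(3, 4))  # 5
    # euclidean_distance(c(0, 0), c(1, 2, 3))  # would stop(): lengths differ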
    
    
    # KNN
    k.nearest.neighbors <- function(data, group, k){
        index <- which(names(data)==group)
        df.list <- dataset.grouping(data, group = group)
        train.df <- df.list[[1]]; test.df <- df.list[[2]]
        # Scale the test features with the training mean/sd so both datasets
        # live in the same space
        train.scale <- scale(train.df[, -index])
        test.scale <- scale(test.df[, -index],
                            center = attr(train.scale, "scaled:center"),
                            scale = attr(train.scale, "scaled:scale"))
        train.classifier <- train.df[, index]; test.classifier <- test.df[, index]

        predict <- c()
        for (i in 1:nrow(test.df)) {
            # Distance from test row i to every training row
            dist.mat <- apply(train.scale, 1, euclidean_distance, test.scale[i, ])
            # Keep the ordered labels in a local variable so train.classifier
            # stays aligned with the rows of train.scale across iterations
            neighbor.labels <- train.classifier[order(dist.mat, decreasing = FALSE)]
            freq.df <- data.frame(table(neighbor.labels[1:k]))
            # which.max breaks ties, so exactly one prediction per test row
            predict <- c(predict, as.character(freq.df$Var1[which.max(freq.df$Freq)]))
        }

        info <- list(test.classifier, predict)
        names(info) <- c('original', 'predict')
        return(info)
    }
    # table(info)  # confusion matrix
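A minimal run looks like this (my usage sketch; the seed is only for reproducibility):

    set.seed(1)
    info <- k.nearest.neighbors(iris, "Species", k = 5)
    table(info)                          # confusion matrix: original vs predict
    mean(info$original != info$predict)  # misclassification rate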
    
    # Pick the best K
    k.picker <- function(data, group, case.vector) {
        # case.vector holds the candidate K values to choose from
        err.vector <- c()
        for (i in case.vector){
            info <- k.nearest.neighbors(data, group, i)
            # Misclassification rate for this K on a fresh random split
            err <- mean(info$original != info$predict)
            err.vector <- c(err.vector, err)
        }
        return(err.vector)
    }
    # Candidate K values to sweep ('soft power' style, cf. WGCNA)
    soft.power <- 1:20
    # errs <- k.picker(iris, "Species", soft.power)
    # errs
    
    # Use bootstrapping to pick the best k: repeat the random split 1000 times
    # and record the error rate for every candidate K (this can take a while)
    bootstrap.df <- data.frame(matrix(0, nrow = 1000, ncol = length(soft.power)))
    for (i in 1:nrow(bootstrap.df)){
        bootstrap.df[i, ] <- k.picker(iris, "Species", soft.power)
    }
    boxplot(bootstrap.df)
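To turn the boxplot into a single choice, one option (my addition, not in the original workflow) is the K with the lowest median error across runs:

    # K whose median error over the 1000 bootstrap runs is smallest
    median.err <- apply(bootstrap.df, 2, median)
    soft.power[which.min(median.err)]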
    
    
    # KNN for an external, unlabeled test dataset, trained on the full data
    k.nearest.neighbors.spec <- function(data, test, group, k){
        index <- which(names(data)==group)
        # Scale the test features with the training mean/sd; test must contain
        # the same feature columns, in the same order, as data minus the group column
        data.scale <- scale(data[, -index])
        test.scale <- scale(test,
                            center = attr(data.scale, "scaled:center"),
                            scale = attr(data.scale, "scaled:scale"))
        data.classifier <- data[, index]

        predict <- c()
        for (i in 1:nrow(test.scale)) {
            dist.mat <- apply(data.scale, 1, euclidean_distance, test.scale[i, ])
            # Keep the ordered labels in a local variable so data.classifier
            # stays aligned with the rows of data.scale
            neighbor.labels <- data.classifier[order(dist.mat, decreasing = FALSE)]
            freq.df <- data.frame(table(neighbor.labels[1:k]))
            # which.max breaks ties, so exactly one prediction per test row
            predict <- c(predict, as.character(freq.df$Var1[which.max(freq.df$Freq)]))
        }

        info <- list(predict)
        names(info) <- c('predict')
        return(info)
    }
    
    k.nearest.neighbors.spec(iris, test_samples, "Species", 3)
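As a cross-check (my addition, assuming the recommended class package that ships with most R installations is available), the same predictions can be compared against class::knn, scaling the external samples with the training statistics as above:

    # Reference implementation from the 'class' package
    library(class)
    iris.scale <- scale(iris[, -5])
    spec.scale <- scale(test_samples,
                        center = attr(iris.scale, "scaled:center"),
                        scale = attr(iris.scale, "scaled:scale"))
    knn(train = iris.scale, test = spec.scale, cl = iris$Species, k = 3)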
    
