Machine Learning Series: Code Implementation (R)

Author: 挽山 | Published 2020-02-25 16:42

    I. Getting Started (reposted)

    1. Linear Regression

    • Python code
    #Import Library
    #Import other necessary libraries like pandas, numpy...
    from sklearn import linear_model
    #Load Train and Test datasets
    #Identify feature and response variable(s); values must be numeric NumPy arrays
    
    x_train=input_variables_values_training_datasets
    y_train=target_variables_values_training_datasets
    x_test=input_variables_values_test_datasets
    
    # Create linear regression object
    linear = linear_model.LinearRegression()
    
    # Train the model using the training sets and check score
    linear.fit(x_train, y_train)
    linear.score(x_train, y_train)
    
    #Equation coefficients and intercept
    print('Coefficient: \n', linear.coef_)
    print('Intercept: \n', linear.intercept_)
    
    #Predict Output
    predicted = linear.predict(x_test)
    
    • R code
    #Load Train and Test datasets
    #Identify feature and response variable(s); values must be numeric
    
    x_train <- input_variables_values_training_datasets
    y_train <- target_variables_values_training_datasets
    x_test <- input_variables_values_test_datasets
    x <- cbind(x_train,y_train)
    
    # Train the model using the training sets and check score
    linear <- lm(y_train ~ ., data = x)
    summary(linear)
    
    #Predict Output
    predicted <- predict(linear, x_test)
    

    2. Logistic Regression

    • Python code
    #Import Library
    from sklearn.linear_model import LogisticRegression
    
    #Assume you have X (predictors) and y (target) for the training set, and x_test (predictors) for the test set
    
    # Create logistic regression object
    model = LogisticRegression()
    
    # Train the model using the training sets and check score
    model.fit(X, y)
    model.score(X, y)
    
    #Equation coefficients and intercept
    print('Coefficient: \n', model.coef_)
    print('Intercept: \n', model.intercept_)
    
    #Predict Output
    predicted = model.predict(x_test)
    
    • R code
    x <- cbind(x_train,y_train)
    
    # Train the model using the training sets and check score
    logistic <- glm(y_train ~ ., data = x,family='binomial')
    summary(logistic)
    
    #Predict Output (predicted probabilities)
    predicted <- predict(logistic, x_test, type = "response")
    

    3. Decision Tree

    • Python code
    #Import Library
    #Import other necessary libraries like pandas, numpy...
    
    from sklearn import tree
    #Assume you have X (predictors) and y (target) for the training set, and x_test (predictors) for the test set
    
    # Create tree object
    model = tree.DecisionTreeClassifier(criterion='gini')  # for classification; the criterion can be 'gini' (the default) or 'entropy' (information gain)
    
    # model = tree.DecisionTreeRegressor() for regression
    
    # Train the model using the training sets and check score
    model.fit(X, y)
    model.score(X, y)
    
    #Predict Output
    predicted = model.predict(x_test)
    
    • R code
    library(rpart)
    x <- cbind(x_train,y_train)
    
    # grow tree
    fit <- rpart(y_train ~ ., data = x,method="class")
    summary(fit)
    
    #Predict Output (class labels)
    predicted <- predict(fit, x_test, type = "class")
    

    4. Support Vector Machine (SVM)

    • Python code
    #Import Library
    from sklearn import svm
    
    #Assume you have X (predictors) and y (target) for the training set, and x_test (predictors) for the test set
    
    # Create SVM classification object
    model = svm.SVC()  # many options are available; this is a simple classification setup
    
    # Train the model using the training sets and check score
    model.fit(X, y)
    model.score(X, y)
    
    #Predict Output
    predicted = model.predict(x_test)
    
    • R code
    library(e1071)
    x <- cbind(x_train,y_train)
    # Fitting model
    fit <- svm(y_train ~ ., data = x)
    summary(fit)
    
    #Predict Output
    predicted <- predict(fit, x_test)
    

    5. Naive Bayes

    • Python code
    #Import Library
    from sklearn.naive_bayes import GaussianNB
    
    #Assume you have X (predictors) and y (target) for the training set, and x_test (predictors) for the test set
    
    # Create Gaussian Naive Bayes object
    model = GaussianNB()  # other variants exist for other distributions, e.g. Bernoulli or multinomial Naive Bayes
    
    # Train the model using the training sets and check score
    model.fit(X, y)
    
    #Predict Output
    predicted = model.predict(x_test)
    
    • R code
    library(e1071)
    x <- cbind(x_train,y_train)
    
    # Fitting model
    fit <- naiveBayes(y_train ~ ., data = x)
    summary(fit)
    
    #Predict Output
    predicted <- predict(fit, x_test)
    

    6. k-Nearest Neighbors (kNN)

    • Python code
    #Import Library
    from sklearn.neighbors import KNeighborsClassifier
    
    #Assume you have X (predictors) and y (target) for the training set, and x_test (predictors) for the test set
    
    # Create KNeighbors classifier object
    model = KNeighborsClassifier(n_neighbors=6)  # default value for n_neighbors is 5
    
    # Train the model using the training sets and check score
    model.fit(X, y)
    
    #Predict Output
    predicted = model.predict(x_test)
    
    • R code
    library(caret)  # there is no CRAN package "knn"; knn3 from caret supports the formula interface
    x <- cbind(x_train,y_train)
    # Fitting model
    fit <- knn3(y_train ~ ., data = x, k = 5)
    summary(fit)
    #Predict Output (class labels)
    predicted <- predict(fit, x_test, type = "class")
    

    7. K-Means

    • Python code
    #Import Library
    from sklearn.cluster import KMeans
    
    #Assume you have X (attributes) for the training set and x_test (attributes) for the test set
    
    # Create KMeans object
    k_means = KMeans(n_clusters=3, random_state=0)

    # Train the model using the training sets and check score
    k_means.fit(X)

    #Predict Output
    predicted = k_means.predict(x_test)
    
    • R code
    library(cluster)
    fit <- kmeans(X, 3)  # 3-cluster solution; cluster assignments are in fit$cluster
    
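    Unlike the Python version above, the kmeans object in base R has no predict method. A minimal sketch for assigning new observations to the nearest learned centroid, assuming x_test is a numeric matrix with the same columns as X:

    # assign each test row to the closest cluster centre (squared Euclidean distance)
    closest_cluster <- apply(x_test, 1, function(row) {
      which.min(colSums((t(fit$centers) - row)^2))
    })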

    8. Random Forest

    For more detail on this algorithm, comparisons with decision trees, and tuning of model parameters, the following articles are recommended:

    1. Introduction to Random Forest: simplified

    2. Comparing a CART model with Random Forest (part 1)

    3. Comparing Random Forest with a CART model (part 2)

    4. Tuning the parameters of your Random Forest model

    • Python code
    #Import Library
    from sklearn.ensemble import RandomForestClassifier
    
    #Assume you have X (predictors) and y (target) for the training set, and x_test (predictors) for the test set
    
    # Create Random Forest object
    model = RandomForestClassifier()
    
    # Train the model using the training sets and check score
    model.fit(X, y)
    
    #Predict Output
    predicted = model.predict(x_test)
    
    • R代码
    library(randomForest)
    x <- cbind(x_train,y_train)
    
    # Fitting model (Species is a leftover from the iris example; use the response defined above)
    fit <- randomForest(y_train ~ ., data = x, ntree = 500)
    summary(fit)
    
    #Predict Output
    predicted <- predict(fit, x_test)
    

    9. Dimensionality Reduction

    For more on these algorithms, see the Beginner's Guide to Dimensionality Reduction Algorithms.

    • Python code
    #Import Library
    from sklearn import decomposition
    
    #Assumed you have training and test data sets as train and test
    # Create PCA object
    pca = decomposition.PCA(n_components=k)  # default value of k = min(n_samples, n_features)
    
    # For Factor analysis
    #fa= decomposition.FactorAnalysis()
    
    # Reduce the dimension of the training set using PCA
    train_reduced = pca.fit_transform(train)

    # Reduce the dimension of the test set
    test_reduced = pca.transform(test)
    
    • R code
    library(stats)
    pca <- princomp(train, cor = TRUE)
    train_reduced  <- predict(pca,train)
    test_reduced  <- predict(pca,test)
    

    ※※※ Further reading on dimensionality reduction: https://blog.csdn.net/fnqtyr45/article/details/82836063

    10. Gradient Boosting and AdaBoost

    More: a detailed look at Gradient Boosting and AdaBoost

    • Python code
    #Import Library
    from sklearn.ensemble import GradientBoostingClassifier
    
    #Assume you have X (predictors) and y (target) for the training set, and x_test (predictors) for the test set
    
    # Create Gradient Boosting Classifier object
    model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
    
    # Train the model using the training sets and check score
    model.fit(X, y)
    
    #Predict Output
    predicted = model.predict(x_test)
    
    • R code
    library(caret)
    x <- cbind(x_train,y_train)
    
    # Fitting model
    fitControl <- trainControl(method = "repeatedcv", number = 4, repeats = 4)
    fit <- train(y_train ~ ., data = x, method = "gbm", trControl = fitControl, verbose = FALSE)

    predicted <- predict(fit, x_test, type = "prob")[,2]
    
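    The section title also mentions AdaBoost, for which no snippet appears above. A minimal R sketch, assuming the gbm package and a 0/1-coded y_train; gbm's "adaboost" distribution fits boosted trees under the AdaBoost exponential loss:

    library(gbm)
    x <- cbind(x_train, y_train)
    # Fit 100 boosted trees under the AdaBoost exponential loss
    fit <- gbm(y_train ~ ., data = x, distribution = "adaboost", n.trees = 100)
    #Predict Output (scores; larger values mean class 1 is more likely)
    predicted <- predict(fit, x_test, n.trees = 100)
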
    • GradientBoostingClassifier and random forest are two different tree ensembles that people often ask to compare: gradient boosting builds trees sequentially, each one correcting the errors of its predecessors (boosting), whereas random forest grows its trees independently on bootstrap samples (bagging).

    t-SNE

    https://blog.csdn.net/scythe666/article/details/79203239
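    t-SNE is a nonlinear dimensionality-reduction technique used mainly to visualize high-dimensional data in two or three dimensions; see the link above for details. A minimal R sketch, assuming the Rtsne package and a numeric matrix X with no duplicated rows:

    library(Rtsne)
    set.seed(42)  # t-SNE is stochastic; fix the seed for reproducibility
    tsne <- Rtsne(as.matrix(X), dims = 2, perplexity = 30)
    plot(tsne$Y, main = "t-SNE projection")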

    Closing remarks

    By now I'm confident you have a general picture of the commonly used machine learning algorithms. The sole purpose of writing this article and providing Python and R code is to help you start right away. If you want to master machine learning, start now: work through the exercises, build an understanding of the whole process, apply the code, and enjoy it!

    II. Project Template

    • Reference: http://www.shujuren.org/article/984.html
      1. Prepare Problem
        a) Load libraries
        b) Load dataset
        c) Split-out validation dataset
      2. Summarize Data
        a) Descriptive statistics
        b) Data visualizations
      3. Prepare Data
        a) Data Cleaning
        b) Feature Selection
        c) Data Transforms
      4. Evaluate Algorithms
        a) Test options and evaluation metric
        b) Spot Check Algorithms
        c) Compare Algorithms
      5. Improve Accuracy
        a) Algorithm Tuning
        b) Ensembles
      6. Finalize Model
        a) Predictions on validation dataset
        b) Create standalone model on entire training dataset
        c) Save model for later use (see the sketch after the walkthrough below)

    Workflow example: an end-to-end machine learning project based on this template, solving a breast cancer classification problem.

    Reference (a particularly good blog):

        # Breast cancer classification
        # Binary classification problem
        # Problem description: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)
        # World-Class Results: http://www.is.umk.pl/projects/datasets.html#Wisconsin
        # Load R packages
        library(mlbench)
        library(caret)
        library(doMC)
        registerDoMC(cores=8)
        # Load the dataset
        data(BreastCancer)
        # Split-out validation dataset
        set.seed(7)
        validation_index <- createDataPartition(BreastCancer$Class, p=0.80, list=FALSE)
        # select 20% of the data for validation
        validation <- BreastCancer[-validation_index,]
        # use the remaining 80% of data to training and testing the models
        dataset <- BreastCancer[validation_index,]
        # Summarize data
        # number of samples and variables
        dim(dataset)
        # peek at the data
        head(dataset, n=20)
        # variable types
        sapply(dataset, class)
        # remove the Id column
        dataset <- dataset[,-1]
        # convert the input variables to numeric
        for(i in 1:9) {
            dataset[,i] <- as.numeric(as.character(dataset[,i]))
        }
        # summary statistics
        summary(dataset)
        # class distribution
        cbind(freq=table(dataset$Class), percentage=prop.table(table(dataset$Class))*100)
        # correlations between input variables
        complete_cases <- complete.cases(dataset)
        cor(dataset[complete_cases,1:9])
        # histograms of each input variable
        par(mfrow=c(3,3))
        for(i in 1:9) {
            hist(dataset[,i], main=names(dataset)[i])
        }
        # density plots of each input variable
        par(mfrow=c(3,3))
        complete_cases <- complete.cases(dataset)
        for(i in 1:9) {
            plot(density(dataset[complete_cases,i]), main=names(dataset)[i])
        }
        # boxplots of each input variable
        par(mfrow=c(3,3))
        for(i in 1:9) {
            boxplot(dataset[,i], main=names(dataset)[i])
        }
        # scatterplot matrix
        jittered_x <- sapply(dataset[,1:9], jitter)
        pairs(jittered_x, names(dataset[,1:9]), col=dataset$Class)
        # bar plots of each input variable, split by class
        par(mfrow=c(3,3))
        for(i in 1:9) {
            barplot(table(dataset$Class,dataset[,i]), main=names(dataset)[i], legend.text=unique(dataset$Class))
        }
        # Evaluate algorithms
        # 10-fold cross-validation with 3 repeats
        control <- trainControl(method="repeatedcv", number=10, repeats=3)
        metric <- "Accuracy"
        # LG
        set.seed(7)
        fit.glm <- train(Class~., data=dataset, method="glm", metric=metric, trControl=control)
        # LDA
        set.seed(7)
        fit.lda <- train(Class~., data=dataset, method="lda", metric=metric, trControl=control)
        # GLMNET
        set.seed(7)
        fit.glmnet <- train(Class~., data=dataset, method="glmnet", metric=metric, trControl=control)
        # KNN
        set.seed(7)
        fit.knn <- train(Class~., data=dataset, method="knn", metric=metric, trControl=control)
        # CART
        set.seed(7)
        fit.cart <- train(Class~., data=dataset, method="rpart", metric=metric, trControl=control)
        # Naive Bayes
        set.seed(7)
        fit.nb <- train(Class~., data=dataset, method="nb", metric=metric, trControl=control)
        # SVM
        set.seed(7)
        fit.svm <- train(Class~., data=dataset, method="svmRadial", metric=metric, trControl=control)
        # Compare algorithms
        results <- resamples(list(LG=fit.glm, LDA=fit.lda, GLMNET=fit.glmnet, KNN=fit.knn, CART=fit.cart, NB=fit.nb, SVM=fit.svm))
        summary(results)
        dotplot(results)
        # Evaluate algorithms on Box-Cox transformed data
        # 10-fold cross validation with 3 repeats
        control <- trainControl(method="repeatedcv", number=10, repeats=3)
        metric <- "Accuracy"
        # LG
        set.seed(7)
        fit.glm <- train(Class~., data=dataset, method="glm", metric=metric, preProc=c("BoxCox"), trControl=control)
        # LDA
        set.seed(7)
        fit.lda <- train(Class~., data=dataset, method="lda", metric=metric, preProc=c("BoxCox"), trControl=control)
        # GLMNET
        set.seed(7)
        fit.glmnet <- train(Class~., data=dataset, method="glmnet", metric=metric, preProc=c("BoxCox"), trControl=control)
        # KNN
        set.seed(7)
        fit.knn <- train(Class~., data=dataset, method="knn", metric=metric, preProc=c("BoxCox"), trControl=control)
        # CART
        set.seed(7)
        fit.cart <- train(Class~., data=dataset, method="rpart", metric=metric, preProc=c("BoxCox"), trControl=control)
        # Naive Bayes
        set.seed(7)
        fit.nb <- train(Class~., data=dataset, method="nb", metric=metric, preProc=c("BoxCox"), trControl=control)
        # SVM
        set.seed(7)
        fit.svm <- train(Class~., data=dataset, method="svmRadial", metric=metric, preProc=c("BoxCox"), trControl=control)
        # Compare algorithms
        transform_results <- resamples(list(LG=fit.glm, LDA=fit.lda, GLMNET=fit.glmnet, KNN=fit.knn, CART=fit.cart, NB=fit.nb, SVM=fit.svm))
        summary(transform_results)
        dotplot(transform_results)
        # Improve accuracy
        # algorithm tuning
        # Tune SVM
        # 10-fold cross validation with 3 repeats
        control <- trainControl(method="repeatedcv", number=10, repeats=3)
        metric <- "Accuracy"
        set.seed(7)
        grid <- expand.grid(.sigma=c(0.025, 0.05, 0.1, 0.15), .C=seq(1, 10, by=1))
        fit.svm <- train(Class~., data=dataset, method="svmRadial", metric=metric, tuneGrid=grid, preProc=c("BoxCox"), trControl=control)
        print(fit.svm)
        plot(fit.svm)
        # Tune kNN
        # 10-fold cross validation with 3 repeats
        control <- trainControl(method="repeatedcv", number=10, repeats=3)
        metric <- "Accuracy"
        set.seed(7)
        grid <- expand.grid(.k=seq(1,20,by=1))
        fit.knn <- train(Class~., data=dataset, method="knn", metric=metric, tuneGrid=grid, preProc=c("BoxCox"), trControl=control)
        print(fit.knn)
        plot(fit.knn)
        # ensembles
        # Ensembles: Boosting and Bagging
        # 10-fold cross validation with 3 repeats
        control <- trainControl(method="repeatedcv", number=10, repeats=3)
        metric <- "Accuracy"
        # Bagged CART
        set.seed(7)
        fit.treebag <- train(Class~., data=dataset, method="treebag", metric=metric, trControl=control)
        # Random Forest
        set.seed(7)
        fit.rf <- train(Class~., data=dataset, method="rf", metric=metric, preProc=c("BoxCox"), trControl=control)
        # Stochastic Gradient Boosting
        set.seed(7)
        fit.gbm <- train(Class~., data=dataset, method="gbm", metric=metric, preProc=c("BoxCox"), trControl=control, verbose=FALSE)
        # C5.0
        set.seed(7)
        fit.c50 <- train(Class~., data=dataset, method="C5.0", metric=metric, preProc=c("BoxCox"), trControl=control)
        # Compare results
        ensemble_results <- resamples(list(BAG=fit.treebag, RF=fit.rf, GBM=fit.gbm, C50=fit.c50))
        summary(ensemble_results)
        dotplot(ensemble_results)
        # Finalize model
        # prepare parameters for data transform
        set.seed(7)
        dataset_nomissing <- dataset[complete.cases(dataset),]
        x <- dataset_nomissing[,1:9]
        preprocessParams <- preProcess(x, method=c("BoxCox"))
        x <- predict(preprocessParams, x)
        # prepare the validation dataset
        set.seed(7)
        # remove id column
        validation <- validation[,-1]
        # remove missing values (not allowed in this implementation of knn)
        validation <- validation[complete.cases(validation),]
        # convert to numeric
        for(i in 1:9) {
            validation[,i] <- as.numeric(as.character(validation[,i]))
        }
        # transform the validation dataset
        validation_x <- predict(preprocessParams, validation[,1:9])
        # make predictions
        set.seed(7)
        predictions <- knn3Train(x, validation_x, dataset_nomissing$Class, k=9, prob=FALSE)
        confusionMatrix(predictions, validation$Class)
    
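    Step 6c of the template (save the model for later use) is not shown in the walkthrough above. A minimal sketch using base R serialization, assuming one of the fitted caret models (for example fit.svm) is kept as the final model; caret's predict.train re-applies the stored BoxCox preprocessing automatically:

        # Save the final model to disk for later use (template step 6c)
        saveRDS(fit.svm, "final_model.rds")
        # Later, in a fresh session: reload the model and predict on new data
        final_model <- readRDS("final_model.rds")
        predictions <- predict(final_model, validation[,1:9])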

    III. Individual Projects

    (1)
