美文网首页
R语言数据分析必备

R语言数据分析必备

作者: Neural_PDE | 来源:发表于2021-01-17 09:39 被阅读0次

    一. 加载模块包

    install.packages("tidyverse")
    library(tidyverse)
    

    其他方法

    # initial settings
    # Install packages into a project-local library next to the working
    # directory so the script is self-contained.
    library_path <- file.path(getwd(), "packages")   # fixed typo: "libary_path"
    dir.create(library_path, showWarnings = FALSE)
    .libPaths(library_path)

    # Load each required package, installing it first if it is missing.
    # (Replaces seven copy-pasted if(!require(...)) blocks.)
    for (pkg in c("testthat", "repr", "MASS", "gclus", "GGally",
                  "ggcorrplot", "tidyverse")) {
      if (!require(pkg, character.only = TRUE)) {
        install.packages(pkg)
        library(pkg, character.only = TRUE)
      }
    }

    # Plot size depending on your screen resolution: 4 x 3
    options(repr.plot.width = 4, repr.plot.height = 3)
    

    二. 读取数据:读取txt/csv等数据文件(Excel可先另存为csv再读取)

    方法1:将数据和文件放置在同一文件夹下直接读取
    rt <- read.table("3.txt",head=TRUE)
    
    方法2:设置路径后读取
    # Read txt data after setting the working directory.
    # NOTE: setwd() changes global state for the whole session.
    setwd("D:\\data")
    rt <- read.table("3.txt", header = TRUE)  # `header`, not partial `head=`
    
    方法3:弹窗手动选择数据
    data <- read.delim(file.choose())
    data <- read.csv(file.choose())
    
    方法4:直接路径读取数据
    data <- read.delim("D:/data/xxx.csv")
    
    方法5:复制表格内容
    #右键复制表格内容后输入如下代码 以便导入复制在内存中数据:
    data=read.table("clipboard",sep="\t",header = T)
    
    制作数据
    c1 <- 1:5  
    c2 <- c(2,5,1,8,6)
    c3 <- c("w","q","r","t","p")
    c4 <- c("i","w","u","z","v")
    cite <- data.frame(c1,c2,c3,c4) ##生成数据框
    
    查看数据
    View(data)
    str(data)
    summary(data)
    
    查看行和列的个数是(多少行 多少列)
    dim(data)
    #结果是:1,2 这样1行2列
    

    查看某列元素(不重复)

    unique(data$列名)
    unique(data[,1])
    table(data[,2])
    

    查看 列名 & 行名

    names(data)
    row.names(data)
    

    设置 列名 & 行名

    names(data) = c("A","B","C")
    row.names(data)=c("A","B","C")
    

    增加一列 总计

    data$total=rowSums(data)
    

    增加一行 总计

    data[7,]=colSums(data)
    #这里7表示行数+1,所以如下也可以
    #data[nrow(data)+1,]=colSums(data)
    #或者使用合并方法
    #rbind(df1,colSums(data))
    

    三. 数据处理

    改变数值类型

    for (i in 2:dim(df)[2]){
    df[,i]=as.numeric(df[,i])}
    

    取消科学计数,保留小数

    options(scipen = 200)
    

    去重查看

    unique(data)
    

    归一化(min-max缩放,结果落在[0,1]内,不会小于0) 和 标准化(z-score,即scale(),结果可能为负)

    #################  custom min-max normalization ######################
    # Rescale values to [0, 1]. NOTE: calling normalize() on the whole data
    # frame would use ONE global min/max across all columns; each column
    # must be normalized separately (as the commented variant below shows),
    # so apply it per column with lapply().
    normalize = function(x) {return ((x - min(x)) / (max(x) - min(x)))}
    data[, -11] = lapply(data[, -11], normalize)   # column 11 is the label column
    head(data)
    ## Alternatively, build a new data set:
    #data_Stand=as.data.frame(lapply(data[,-11],normalize))
    #data_Stand$class=data$class
    #head(data_Stand)
    
    ################# z-score standardization with scale() ################
    # Column 11 is the label (target) column, so it is excluded.
    # scale() centers each column to mean 0 and scales to sd 1, so the
    # results can be negative (unlike min-max normalization).
    data[,-11]=scale(data[,-11])
    #scale(x, center = TRUE, scale = TRUE)  # the defaults used above
    View(data)
    

    处理查看空缺值

    ################## Processing  "NAN"  ################## 
    na_flag = apply(is.na(data), 2, sum)
    View(na_flag)
    #删掉所在行
    dataset<- na.omit(data)
    na_flag = apply(is.na(dataset), 2, sum)
    View(na_flag)
    

    删除空值所在列

    ##################删除空值大于30的列##################
    flag <- apply(data, 2, function(x) sum(is.na(x)) <= 30)
    newdata <- data[, which(flag)]
    na_flag = apply(is.na(newdata ), 2, sum)
    View(na_flag)
    

    盖帽法替换异常值

    ############################### 数据处理 #####################
    #绘制频率分布直方图
    par(mfrow=c(2,5))
    for(i in 1:10)
    {
    hist(data[,i],main=paste(names(data)[i],"的频率分布直方图"),xlab=NA) 
    }  
    
    
    #绘制箱线图检验离群值
    par(mfrow=c(2,5))
    for(i in 1:10)
    {
    boxplot(data[i],main=paste(names(data)[i],"的箱线图"))   
    }   
    
    #或者另一种方法绘制箱线图检验离群值
    par(mfrow=c(1,1))
    boxplot(data[,-11])
    
    ###### 绘制散点图 #######
    ##par(mfrow=c(4,4))
    ##for(i in 2:17)
    ##{
    ##plot(data[,i],main=paste(names(data)[i],"的散点图"),xlab=NA,ylab=NA)
    ##} 
    #######################
    
    # Capping ("winsorizing") of outliers: values below the lower-quantile
    # cap are raised to it, values above the upper-quantile cap are lowered
    # to it. The defaults (10% / 90%) are aggressive and for reference only.
    # (The original comment said 99% but the code used the 90% quantile, and
    # named the variable `q99` -- both fixed here.)
    #
    # x        numeric vector to cap
    # lower    cap the low end?   upper   cap the high end?
    # lower_p  lower quantile probability (default 0.1)
    # upper_p  upper quantile probability (default 0.9)
    block <- function(x, lower = TRUE, upper = TRUE, lower_p = 0.1, upper_p = 0.9) {
      if (lower) {
        q_lo <- quantile(x, lower_p)
        x[x <= q_lo] <- q_lo
      }
      if (upper) {
        q_hi <- quantile(x, upper_p)
        x[x > q_hi] <- q_hi
      }
      return(x)
    }
    
    data[,-11] = sapply(data[,-11],block)
    par(mfrow=c(1,1))
    boxplot(data[,-11],frame = T)
    str(data)
    head(data)
    

    相关图

    ##### 绘制替换异常值后的相关图  ##########
    #相关系数cor
    b=cor(data[,-11])
    b
    #安装ggpubr包可视化
    #install.packages("ggpubr")
    #install.packages("corrplot")
    library("corrplot")
    par(mfrow=c(1,1))
    corrplot(b)
    #或通过饼图查看相关性
    corrplot(b,method="pie")
    #用颜色显示,同时显示相关系数。
    corrplot(b,method="color",addCoef.col="grey")
    #显示数字,增强图形可读性。
    col=colorRampPalette(c("navy", "white", "firebrick3")) #设置颜色
    corrplot(b,add=TRUE, type="lower", method="number",diag=FALSE,tl.pos="n", cl.pos="n",col=col(10))
    
    
    #绘制变量间相关性多图混合
    par(mfrow=c(1,1))
    library(PerformanceAnalytics)
    chart.Correlation(data[,-11])
    

    关于日期

    # Select the date range
    start = as.Date("2020-1-22", "%Y-%m-%d")   # start date
    end   = as.Date("2021-1-4", "%Y-%m-%d")    # end date
    # end - start is a difftime object; coerce it to a plain integer day
    # count before using it as a length.
    ndays = as.integer(end - start) + 1        # total days, inclusive
    # Build the vector of consecutive daily dates
    dates = seq(from = start, by = 1, length.out = ndays)
    View(dates)
    

    拼接变量和字符串

    paste("xxx",i)
    

    四. 划分训练集和测试集

    方法1

    # data("iris") only loads the data set and RETURNS ITS NAME ("iris"),
    # so the data frame itself must be assigned separately.
    data(iris)
    data = iris
    set.seed(2021)
    # Draw 80% of the row indices at random for training (iris has 150
    # rows; the original sample(10000, 8000) was out of range).
    train_sample = sample(nrow(data), 0.8 * nrow(data))
    train = data[train_sample, ]
    test  = data[-train_sample, ]
    

    方法2

    # Split iris into roughly 70% training / 30% testing rows.
    data("iris")
    set.seed(2)
    # Each row independently draws label 1 (train, p = 0.7) or 2 (test, p = 0.3).
    group <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
    is_train <- group == 1
    trainset <- iris[is_train, ]
    testset  <- iris[!is_train, ]
    

    查看测试集和样本集

    #查看测试集和样本集
    str(classdata_train)
    str(classdata_test)
    #接着我们查看所抽取训练集和测试集标签中各字母所占的比例
    dataclassper=prop.table(table(data$class))
    dataclassper
    train=prop.table(table(classdata_train$class))
    train
    test=prop.table(table(classdata_test$class))
    test
    

    五. 条件,循环,函数

    1条件

    1.1条件选行 给 一个新的向量

    a=df[df$列名 == 条件,]
    

    例子

    a=iris[iris$Species=="virginica",]
    head(a)

    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    6.3 3.3 6.0 2.5 virginica
    5.8 2.7 5.1 1.9 virginica
    7.1 3.0 5.9 2.1 virginica

    按行条件筛选

    library(tidyfst)
    library(survival)  # 加载数据集所需的包
    data("colon") # 加载数据集
    
    # 单条件筛选
    filter_dt(colon, sex == 1)  # 筛选colon数据集中男性患者
    
    # 多条件筛选
    # 筛选≥50岁的男性患者
    filter_dt(colon,sex==1 & age>=50)
    
    # 筛选男性或年龄≥50岁的患者
    filter_dt(colon, sex == 1 | age >= 50)
    
    # 筛选age中大于age平均值的行
    colon %>% filter_dt(age > mean(age, na.rm = TRUE)) 
    
    # 筛选男性中age大于age平均值的行
    colon %>% filter_dt(sex == 1 & age > mean(age, na.rm = TRUE))
    
    # 筛选 50 ≤ age < 60的患者
    filter_dt(colon, age >= 50 & age < 60)
    # 筛选 50 ≤ age ≤ 60的患者
    colon %>% filter_dt(between(age,50,60))
    
    # 筛选肿瘤分化程度differ为1和2,且性别为男性的行
    colon %>% filter_dt(differ %in% c(1,2), sex == 1)
    
    
    # 筛选不同性别中年龄大于各自性别的年龄平均值的行  
    colon %>% 
      group_dt(
        by = sex,
        filter_dt(age > mean(age, na.rm = TRUE))
      )
    

    iris %>% filter_dt(Sepal.Length == max(Sepal.Length))

    2循环

    2.1循环添加元素到空向量

    # Grow a vector by appending one element per loop iteration.
    # NOTE: c(Vector, i) copies the whole vector every time (O(n^2) overall);
    # fine for a demo, but preallocate or vectorize for large n.
    Vector=c()
    for (i in 1:5){
    Vector=c(Vector,i)
    }
    Vector
    

    2.2循环添加向量到空数据框

    # Grow a data frame by row-binding one row per loop iteration.
    # `f(i)` stands for a user-supplied function returning one row.
    # NOTE: rbind in a loop copies on every iteration; for large n collect
    # rows in a list and bind once at the end.
    D=data.frame()
    for (i in 1:5){
    D=rbind(D,f(i))
    }
    D  # the original printed `Vector` here, left over from the previous example
    

    六 各类算法模型

    1分类算法

    data = read.csv(file.choose())
    View(data)
    
    #histplot
    par(mfrow=c(2,5))
    for(i in 1:10)
    {
    hist(data[,i],main=paste(names(data)[i]),xlab=NA,col="pink") 
    }  
    
    #################  standardization (min-max) #########################
    # Rescale each predictor column to [0, 1]; column 11 is the label.
    # Apply per column with lapply() -- calling normalize() on the whole
    # data frame would use one global min/max across all columns.
    normalize = function(x) {return ((x - min(x)) / (max(x) - min(x)))}
    data[, -11] = lapply(data[, -11], normalize)
    head(data)
    
    
    
    #boxplot
    par(mfrow=c(2,5))
    for(i in 1:10)
    {
    boxplot(data[i],main=paste("Boxplot of",names(data)[i]))   
    }   
    
    
    
    # Capping ("winsorizing"): replace values outside the [10%, 90%]
    # quantiles with the quantile values themselves.
    block <- function(x, lower = TRUE, upper = TRUE) {  # TRUE/FALSE, not T/F
      if (lower) { q_lo <- quantile(x, 0.10); x[x <= q_lo] <- q_lo }
      if (upper) { q_hi <- quantile(x, 0.90); x[x > q_hi] <- q_hi }   # was misnamed q99
      return(x)
    }
    
    data[,-11] = sapply(data[,-11],block)
    par(mfrow=c(1,1))
    boxplot(data[,-11],frame = T,col="purple")
    str(data)
    head(data)
    
    
    # Train eight classifiers on the same 80/20 split of `data` and compare
    # their test-set accuracy and training time.
    #
    # Assumptions visible in the code below -- confirm against your data:
    #   * data$class is a two-level label with levels "g" and "h"
    #     (used when decoding logistic-regression probabilities);
    #   * column 11 is the label column (used by the C5.0 call).
    #
    # data      data frame with predictor columns plus a `class` column
    # out_file  path the result table is written to (CSV)
    # Returns an 8 x 2 matrix (rows = models, cols = Accuracy / Time),
    # which is also written to `out_file`.
    Model_comparison = function(data, out_file = "D:/result.csv") {
      # Load required packages, installing any that are missing.
      # (The original kernlab branch loaded the wrong packages.)
      for (pkg in c("nnet", "gmodels", "MASS", "C50", "kernlab", "randomForest")) {
        if (!require(pkg, character.only = TRUE)) {
          install.packages(pkg)
          library(pkg, character.only = TRUE)
        }
      }

      # Partition the data set: 80% training, 20% testing.
      set.seed(2021)
      train_sample = sample(nrow(data), 0.8 * nrow(data))
      train = data[train_sample, ]
      test  = data[-train_sample, ]

      # Per-model training time (user CPU seconds) and test accuracy.
      t = numeric(8)
      Accuracy = numeric(8)

      ######################  logistic regression   ######################
      ptm <- proc.time()
      model = glm(as.factor(train$class) ~ ., data = train, family = binomial())
      a = proc.time() - ptm
      t[1] = a[[1]]
      # type = "response" yields predicted probabilities.
      prob = predict(model, test, type = "response")
      # Probability > 0.5 is decoded as label "h", otherwise "g".
      pred = factor(prob > 0.5, levels = c(FALSE, TRUE), labels = c("g", "h"))
      # mean() of a logical vector is the proportion correct; unlike
      # prop.table(table(...))[[2]], it also works at 0% or 100% accuracy.
      Accuracy[1] = mean(pred == test$class)

      ################  Linear discriminant analysis   ##################
      ptm <- proc.time()
      model = lda(as.factor(train$class) ~ ., data = train)
      a = proc.time() - ptm
      t[2] = a[[1]]
      Accuracy[2] = mean(predict(model, newdata = test)$class == test$class)

      ###########   Quadratic discriminant analysis   ####################
      ptm <- proc.time()
      model = qda(as.factor(train$class) ~ ., data = train)
      a = proc.time() - ptm
      t[3] = a[[1]]
      Accuracy[3] = mean(predict(model, newdata = test)$class == test$class)

      ###########   Bayes discriminant analysis   ########################
      # LDA with equal prior probabilities.
      ptm <- proc.time()
      model = lda(as.factor(train$class) ~ ., data = train, prior = c(1, 1) / 2)
      a = proc.time() - ptm
      t[4] = a[[1]]
      Accuracy[4] = mean(predict(model, newdata = test)$class == test$class)

      ##############################  SVM   ##############################
      ptm <- proc.time()
      model = ksvm(as.factor(class) ~ ., data = train, kernel = "vanilladot")
      a = proc.time() - ptm
      t[5] = a[[1]]
      Accuracy[5] = mean(predict(model, test) == test$class)

      ########################## decision trees ##########################
      # C5.0 takes predictors and labels separately; column 11 is the label.
      ptm <- proc.time()
      model = C5.0(train[, -11], as.factor(train$class))
      a = proc.time() - ptm
      t[6] = a[[1]]
      Accuracy[6] = mean(predict(model, test) == test$class)

      ########################## Random Forest ###########################
      ptm <- proc.time()
      model = randomForest(as.factor(class) ~ ., data = train,
                           importance = TRUE, proximity = TRUE)
      a = proc.time() - ptm
      t[7] = a[[1]]
      Accuracy[7] = mean(predict(model, test) == test$class)

      ########################## neural network ##########################
      # (The original forgot to reset the timer here, so its t[8] also
      # included the random-forest time.)
      ptm <- proc.time()
      model = nnet(as.factor(train$class) ~ ., size = 20, data = train)
      a = proc.time() - ptm
      t[8] = a[[1]]
      Accuracy[8] = mean(predict(model, test, type = "class") == test$class)

      # Summarise: one row per model, columns Accuracy and Time.
      df = t(data.frame(rbind(Accuracy, t)))
      rownames(df) = c("logistic regression", "Linear discriminant analysis",
                       "Quadratic discriminant analysis",
                       "Bayes discriminant analysis", "SVM", "decision trees",
                       "Random Forest", "neural network")
      # After t() `df` is a matrix, so colnames() must be used -- the
      # original's names() call had no effect on the columns.
      colnames(df) = c("Accuracy", "Time")
      # Export results
      write.csv(df, out_file)
      return(df)
    }
    result=Model_comparison(data)
    View(result)
    

    回归

    #install.packages("randomForest")
    #install.packages("dplyr")
    #install.packages("ggplot2")
    #install.packages("pROC")
    #install.packages("nnet")
    #install.packages("DMwR")
    #install.packages("kernlab")
    library(randomForest)
    library(dplyr)
    library(ggplot2)
    library(nnet)
    library(DMwR)
    library(e1071)
    
    ################################  load data  ###########################
    data=read.csv("D:/data/cardekho.csv")
    ########################图###################
    ##图一
    library(ggplot2)
    str(data)
    ggplot(data,aes(x=brand,y=selling_price,fill=brand))+
      geom_violin()+
      geom_boxplot(width=0.3)+
      labs(x = "",y="selling_price",Title="")+
      facet_wrap(~brand,scale="free")+
      theme_classic()
    ##图二
    ggplot(data,aes(x=brand,y=selling_price,fill=brand))+
      geom_bar(stat = 'identity')+
      labs(x = "brand",y="selling_price",Title="")+
      theme_classic() +
      coord_flip()
    ##图三
    library(ggridges)
    ggplot(data) +
      geom_density(aes(x = selling_price,fill=brand))+ 
      facet_wrap(~brand,scale="free")+
      theme_classic()
    ##图4
    ggplot(data,aes(y=as.factor(brand) ,x=max_cost_price,fill=brand))+
      geom_density_ridges()+
      theme_classic()
    
    
    ################## Processing  "NA"  ################## 
    na_flag = apply(is.na(data), 2, sum)
    View(na_flag)
    #删掉所在行
    dataset<- na.omit(data)
    na_flag = apply(is.na(dataset), 2, sum)
    View(na_flag)
    str(dataset)
    
    ####################### 数据整理 ################
    df=dataset[-5]
    df$car_name=as.numeric(as.factor(df$car_name))
    df$brand=as.numeric(as.factor(df$brand))
    df$model=as.numeric(as.factor(df$model))
    df$seller_type=as.numeric(as.factor(df$seller_type))
    df$fuel_type=as.numeric(as.factor(df$fuel_type))
    df$transmission_type=as.numeric(as.factor(df$transmission_type))
    str(df)
    View(df)
    
    ########相关###########
    #install.packages("corrplot")
    library("corrplot")
    b=cor(df)
    b
    corrplot(b,method="color",addCoef.col="grey")
    write.csv(b,"D:/data/cor.csv")
    
    plot(df$max_power,df$selling_price,col='3')
    plot(df$max_cost_price,df$selling_price,col='4')
    plot(df$engine,df$selling_price,col='5')
    
    
    ## 挑选属性
    testdata=df
    ddf=testdata[,-1]
    df=ddf[,-2]
    str(df)
    
    #################  custom min-max normalization ######################
    # Rescale each predictor column to [0, 1]; column 14 is the target.
    # Apply per column with lapply() -- normalizing the whole data frame at
    # once would use one global min/max across all columns.
    normalize = function(x) {return ((x - min(x)) / (max(x) - min(x)))}
    df[, -14] = lapply(df[, -14], normalize)
    str(df)
    
    
    
    #200位数以内不使用科学计数法
    options(scipen = 200)
    ###################### 开始模型准备 ########################
    #Partition data set
    set.seed(2021)
    train_sample = sample(nrow(df),0.99*nrow(df))
    train=df[train_sample, ]
    test =df[-train_sample, ]
    ### 未处理数据作为对比的对照模型
    #set.seed(2021)
    #train_sample = sample(nrow(testdata),0.99*nrow(df))
    #train=testdata[train_sample, ]
    #test =testdata[-train_sample, ]
    
    #####################################################################
    #####################################################################
    ########################       线性回归      ########################
    #####################################################################
    #####################################################################
    model = lm(train$selling_price~., data=train)
    pred=predict(model,newdata = test)
    
    regr.eval(test$selling_price,pred)
    print("R-squared");cor(test$selling_price,pred)^2
    result=data.frame(实际=test$selling_price,线性回归=pred)
    plot(test$selling_price,pred,main="linear regression") 
    
    #####################################################################
    #####################################################################
    #############################  randomForest  ########################
    #####################################################################
    #####################################################################
    set.seed(2021)
    model2 = randomForest(train$selling_price~., data=train, importance=TRUE)
    pred2 = predict(model2, test)
    
    regr.eval(test$selling_price,pred2)
    print("R-squared");cor(test$selling_price,pred2)^2
    result$随机森林回归=pred2
    plot(test$selling_price,pred2,main="Random forest regression") 
    
    #####################################################################
    #####################################################################
    ########################       svm回归      ########################
    #####################################################################
    #####################################################################
    library(e1071)
    model3 = svm(train$selling_price~., data=train)
    pred3=predict(model3,newdata = test)
    
    regr.eval(test$selling_price,pred3)
    print("R-squared");cor(test$selling_price,pred3)^2
    result$svm=pred3
    plot(test$selling_price,pred3,main="svm regression") 
    
    #####################################################################
    #####################################################################
    ########################       决策树回归      ########################
    #####################################################################
    #####################################################################
    library(rpart)
    library(rpart.plot)
    model4=rpart(train$selling_price~., data=train)
    pred4=predict(model4,newdata = test)
    
    regr.eval(test$selling_price,pred4)
    print("R-squared");cor(test$selling_price,pred4)^2
    result$决策树=pred4
    plot(test$selling_price,pred4,main="Decision tree regression") 
    
    library(rattle)
    fancyRpartPlot(model4)
    #####################################################################
    #####################################################################
    ########################       xgboost     ########################
    #####################################################################
    #####################################################################
    #install.packages("xgboost")
    library("xgboost")
    library(Matrix)
    
    traindata1 <- Matrix(data.matrix(train[,-16]),sparse=T) # 利用Matrix函数,将sparse参数设置为TRUE,转化为稀疏矩阵
    traindata2 <- data.matrix(train[,16]) # 将因变量转化为numeric
    traindata3 <- list(data=traindata1,label=traindata2) # 将自变量和因变量拼接为list
    
    dtrain <- xgb.DMatrix(data = traindata3$data, label = traindata3$label) # 构造模型需要的xgb.DMatrix对象,处理对象为稀疏矩阵
    dtest = xgb.DMatrix(data.matrix(test[,-16])) # 构造模型需要的xgb.DMatrix对象,处理对象为稀疏矩阵
    
    param <- list(max_depth=20,eta=0.03) # 定义模型参数
    model <- xgboost(params=param,data = dtrain,nrounds = 15)
    pred5 <- predict(model,dtest)
     
    regr.eval(test$selling_price,pred5)
    print("R-squared");cor(test$selling_price,pred5)^2
    
    result$xgboost=pred5 
    plot(test$selling_price,pred5,main="xgboost regression") 
    View(result)
    #去D盘data文件夹,result表格中复制需要的内容哈~
    write.csv(result,"D:/data/result.csv")
    
    #############################################################
    #############################################################
    ######################## 误差曲线图 ############################
    #############################################################
    
    result$随机森林误差 = abs(result$实际-result$随机森林回归)
    result$线性回归误差 = abs(result$实际-result$线性回归)
    result$svm误差 = abs(result$实际-result$svm)
    result$决策树误差 = abs(result$实际-result$决策树)
    result$xgboost误差 = abs(result$实际-result$xgboost)
    
    result$随机森林相对误差 = abs(result$实际-result$随机森林回归)/result$实际
    result$线性回归相对误差 = abs(result$实际-result$线性回归)/result$实际
    result$svm相对误差 = abs(result$实际-result$svm)/result$实际
    result$决策树相对误差 = abs(result$实际-result$决策树)/result$实际
    result$xgboost相对误差 = abs(result$实际-result$xgboost)/result$实际
    View(result)
    
    #误差
    x = 1:length(result$线性回归误差)
    plot(x, result$随机森林误差, type = "n", xlab = "", ylab = "", ylim = range(result$xgboost误差, result$随机森林误差) * c(10,1))
    points(x, result$线性回归误差, type = "o", pch = 1, col = 1, lty = 1, lwd = 2)
    points(x, result$随机森林误差, type = "o", pch = 2, col = 2, lty = 2, lwd = 2)
    points(x, result$svm误差, type = "o", pch = 3, col = 3, lty = 3, lwd = 2)
    points(x, result$决策树误差, type = "o", pch = 3, col = 4, lty = 4, lwd = 2)
    points(x, result$xgboost误差, type = "o", pch = 3, col = 5, lty = 5, lwd = 2)
    title(main = "误差图", xlab = "测试集序号", ylab = "误差")
    legend("topleft", inset=.05, c("线性回归误差","随机森林误差","svm","决策树误差","xgboost误差"),lty=c(1, 2,3,4,5), pch=c(1, 2,3,4,5),col=c(1, 2,3,4,5))
    
    #相对误差
    x = 1:length(result$线性回归相对误差)
    plot(x, result$随机森林相对误差, type = "n", xlab = "", ylab = "", ylim = range(result$线性回归相对误差, result$随机森林相对误差) * c(11,1))
    points(x, result$线性回归相对误差, type = "o", pch = 1, col = 1, lty = 1, lwd = 2)
    points(x, result$随机森林相对误差, type = "o", pch = 2, col = 2, lty = 2, lwd = 2)
    points(x, result$svm相对误差, type = "o", pch = 3, col = 3, lty = 3, lwd = 2)
    points(x, result$决策树相对误差, type = "o", pch = 3, col = 4, lty = 4, lwd = 2)
    points(x, result$xgboost相对误差, type = "o", pch = 3, col = 5, lty = 5, lwd = 2)
    title(main = "相对误差图", xlab = "测试集序号", ylab = "误差")
    legend("topleft", inset=.05, c("线性回归相对误差","随机森林相对误差","svm","决策树相对误差","xgboost相对误差"),lty=c(1, 2,3,4,5), pch=c(1, 2,3,4,5),col=c(1, 2,3,4,5))
    
    #相对误差部分观察
    x = 1:length(result$线性回归相对误差)
    plot(x, result$随机森林相对误差, type = "n", xlab = "", ylab = "", ylim = range(result$决策树相对误差, result$随机森林相对误差) * c(10,1))
    points(x, result$随机森林相对误差, type = "o", pch = 2, col = 2, lty = 2, lwd = 2)
    points(x, result$svm相对误差, type = "o", pch = 3, col = 3, lty = 3, lwd = 2)
    points(x, result$决策树相对误差, type = "o", pch = 3, col = 4, lty = 4, lwd = 2)
    title(main = "相对误差图", xlab = "测试集序号", ylab = "误差")
    legend("topleft", inset=.05, c("随机森林相对误差","svm","决策树相对误差"),lty=c(2,3,4), pch=c(2,3,4),col=c(2,3,4))
    
    
    ##########随机森林参数调整############
    A=c()
    #注意循环次数越大,耗时越久(这里to表示最多尝试树的数目,by为间隔)
    ns=seq(from=50, to=350, by=50)
    for (i in ns){
    set.seed(2021)
    pred=randomForest(train$selling_price~., data=train, importance=TRUE, ntree=i) %>% predict(test)
    p_acc=regr.eval(test$selling_price,pred)
    A=c(A,p_acc[[4]])
    }
    A
    
    
    plot(ns,A,main="随机森林所包含树的数目与MAPE关系图", ylab="MAPE", xlab="随机森林所包含的树的数目", frame.plot = TRUE,axes=FALSE)
    lines(ns,A)
    axis(1, at = ns, labels = formatC(ns, format = "fg"))
    axis(2, at = seq(from=0.25, to=0.30, by=0.005), labels = paste(round(seq(from=25, to=30, by=0.5),2),"%"))
    text(ns,A,round(A,4),pos=1,cex=0.7,offset=-.8)
    
    
    
    
    #####################################################################
    #####################################################################
    #############################  最优 randomForest  ####################
    #####################################################################
    #####################################################################
    set.seed(2021)
    model2 = randomForest(train$selling_price~., data=train, importance=TRUE,ntree=300)
    pred2 = predict(model2, test)
    
    regr.eval(test$selling_price,pred2)
    print("R-squared");cor(test$selling_price,pred2)^2
    plot(test$selling_price,pred2,main="Random forest regression") 
    
    str(df)
    fill=brand
    
    ggplot(df) +
      geom_density_ridges(aes(y = brand, x = selling_price))
    
    
    
    
    
    

    相关文章

      网友评论

          本文标题:R语言数据分析必备

          本文链接:https://www.haomeiwen.com/subject/halaaktx.html