1. Loading packages
install.packages("tidyverse")
library(tidyverse)
Another approach
# initial settings
library_path <- paste(getwd(), "packages", sep="/")
dir.create(library_path, showWarnings = FALSE)
.libPaths(library_path)
if(!require(testthat)){
install.packages("testthat")
library(testthat)
}
if(!require(repr)){
install.packages("repr")
library(repr)
}
if(!require(MASS)){
install.packages("MASS")
library(MASS)
}
if(!require(gclus)){
install.packages("gclus")
library(gclus)
}
if(!require(GGally)){
install.packages("GGally")
library(GGally)
}
if(!require(ggcorrplot)){
install.packages("ggcorrplot")
library(ggcorrplot)
}
if(!require(tidyverse)){
install.packages("tidyverse")
library(tidyverse)
}
# Plot size depending on your screen resolution; set to 4 x 3
options(repr.plot.width=4, repr.plot.height=3)
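The repeated if(!require(...)) blocks above can be collapsed into a small helper; a minimal sketch (the function name load_packages and the package list are illustrative):
# Hypothetical helper: install (only if missing) and load each package in a vector
load_packages <- function(pkgs) {
  for (p in pkgs) {
    if (!require(p, character.only = TRUE)) {
      install.packages(p)
      library(p, character.only = TRUE)
    }
  }
}
load_packages(c("testthat", "repr", "MASS", "gclus", "GGally", "ggcorrplot", "tidyverse"))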
2. Reading data: reading and writing txt, csv, and xlsx files
Method 1: put the data file in the working directory and read it directly
rt <- read.table("3.txt", header=TRUE)
Method 2: set the working directory, then read
#read a txt file
setwd("D:\\data")
rt <- read.table("3.txt", header=TRUE)
Method 3: pick the file interactively in a dialog
data <- read.delim(file.choose())
data <- read.csv(file.choose())
Method 4: read directly from a full path
data <- read.csv("D:/data/xxx.csv")
Method 5: copy table contents from the clipboard
#after right-click copying the table, run the following to import the copied data from memory:
data = read.table("clipboard", sep="\t", header = TRUE)
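The section title also mentions Excel (.xlsx) files; a minimal sketch, assuming the readxl and writexl packages are installed (paths are illustrative):
#read the first sheet of an Excel workbook
library(readxl)
data <- read_excel("D:/data/xxx.xlsx", sheet = 1)
#write a data frame back out to .xlsx
library(writexl)
write_xlsx(data, "D:/data/out.xlsx")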
Creating data by hand
c1 <- 1:5
c2 <- c(2,5,1,8,6)
c3 <- c("w","q","r","t","p")
c4 <- c("i","w","u","z","v")
cite <- data.frame(c1,c2,c3,c4) ## build a data frame
Inspecting data
View(data)
str(data)
summary(data)
Check the numbers of rows and columns (rows first, then columns)
dim(data)
#e.g. a result of 1 2 means 1 row and 2 columns
View the distinct values of a column
unique(data$colname)
unique(data[,1])
table(data[,2])
View column & row names
names(data)
row.names(data)
Set column & row names
names(data) = c("A","B","C")
row.names(data)=c("A","B","C")
Add a "total" column
data$total=rowSums(data)
Add a "total" row
data[7,]=colSums(data)
#here 7 is the old row count + 1, so the following works too
#data[nrow(data)+1,]=colSums(data)
#or merge with rbind
#rbind(data,colSums(data))
3. Data processing
Convert column types
for (i in 2:ncol(df)){
  df[,i] = as.numeric(df[,i])
}
Turn off scientific notation and keep plain decimals
options(scipen = 200)
View deduplicated rows
unique(data)
Min-max normalization (results always lie in [0, 1], so never negative) vs. z-score standardization with scale() (results can be negative)
################# custom min-max normalization #####################################
normalize = function(x) {return ((x - min(x)) / (max(x) - min(x)))}
#apply column by column; calling normalize() on the whole data frame would use the global min and max
data[,-11] = as.data.frame(lapply(data[,-11], normalize))
head(data)
##or build a new, normalized data set
#data_Stand=as.data.frame(lapply(data[,-11],normalize))
#data_Stand$class=data$class
#head(data_Stand)
################# z-score standardization with scale() #####################################
#column 11 is the label (target) column
data[,-11]=scale(data[,-11])
#scale(x, center = TRUE, scale = TRUE)
View(data)
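To see the difference between the two transforms, a quick demo on a toy vector (values are illustrative):
x <- c(2, 5, 1, 8, 6)
normalize(x) #min-max: every value falls in [0, 1], never negative
scale(x)     #z-score: mean 0, sd 1, so negative values appear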
Inspecting and handling missing values
################## Processing NA values ##################
na_flag = apply(is.na(data), 2, sum)
View(na_flag)
#drop every row that contains an NA
dataset<- na.omit(data)
na_flag = apply(is.na(dataset), 2, sum)
View(na_flag)
Drop columns that contain too many missing values
##################drop columns with more than 30 NAs##################
flag <- apply(data, 2, function(x) sum(is.na(x)) <= 30)
newdata <- data[, which(flag)]
na_flag = apply(is.na(newdata ), 2, sum)
View(na_flag)
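Instead of dropping rows or columns, numeric NAs can also be imputed; a minimal sketch using the column mean (impute_mean is a made-up helper, and mean imputation is a deliberately simple strategy):
#replace NA in every numeric column with that column's mean
impute_mean <- function(x) {
  if (is.numeric(x)) x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
}
data[] <- lapply(data, impute_mean) #data[] keeps the data.frame structure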
Capping (winsorizing) outliers
############################### data preparation #####################
#frequency histograms
par(mfrow=c(2,5))
for(i in 1:10)
{
hist(data[,i],main=paste("Histogram of",names(data)[i]),xlab=NA)
}
#boxplots to check for outliers
par(mfrow=c(2,5))
for(i in 1:10)
{
boxplot(data[i],main=paste("Boxplot of",names(data)[i]))
}
#or check for outliers with all boxplots in a single panel
par(mfrow=c(1,1))
boxplot(data[,-11])
###### scatter plots #######
##par(mfrow=c(4,4))
##for(i in 2:17)
##{
##plot(data[,i],main=paste("Scatter plot of",names(data)[i]),xlab=NA,ylab=NA)
##}
#######################
#Cap the outliers: values below the 10th percentile are replaced with the 10th-percentile value, and values above the 90th percentile with the 90th-percentile value.
#These 10%/90% cutoffs are aggressive; treat them as a reference point and adjust to your data.
block <- function(x, lower=TRUE, upper=TRUE){
  if(lower){
    q10 <- quantile(x, 0.10)
    x[x <= q10] <- q10 #floor at the 10th percentile
  }
  if(upper){
    q90 <- quantile(x, 0.90)
    x[x > q90] <- q90 #cap at the 90th percentile
  }
  return(x)
}
data[,-11] = sapply(data[,-11],block)
par(mfrow=c(1,1))
boxplot(data[,-11],frame = T)
str(data)
head(data)
Correlation plots
##### correlation plots after replacing the outliers ##########
#correlation matrix with cor()
b=cor(data[,-11])
b
#install these packages for the plots if needed (ggpubr, corrplot)
#install.packages("ggpubr")
#install.packages("corrplot")
library("corrplot")
par(mfrow=c(1,1))
corrplot(b)
#or view the correlations as pie charts
corrplot(b,method="pie")
#color the cells and overlay the correlation coefficients in grey.
corrplot(b,method="color",addCoef.col="grey")
#overlay the numbers on the lower triangle to make the plot easier to read.
col=colorRampPalette(c("navy", "white", "firebrick3")) #color palette
corrplot(b,add=TRUE, type="lower", method="number",diag=FALSE,tl.pos="n", cl.pos="n",col=col(10))
#mixed panel of pairwise correlations: scatter plots, histograms and coefficients
par(mfrow=c(1,1))
library(PerformanceAnalytics)
chart.Correlation(data[,-11])
Working with dates
#pick the date range
start = as.Date("2020-1-22",'%Y-%m-%d') #start date
end = as.Date("2021-1-4",'%Y-%m-%d') #end date
ndays = as.numeric(end - start) + 1 #total number of days (coerce the difftime to a number)
#build a vector of daily dates
dates = seq(from=start, by=1, length.out=ndays)
View(dates)
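Date vectors can then be reformatted or decomposed with format() and related helpers, for example:
format(start, "%Y/%m/%d") #"2020/01/22"
weekdays(start) #name of the weekday (locale dependent)
months(start)   #name of the month (locale dependent)
as.numeric(format(start, "%Y")) #extract the year as a number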
Concatenating variables and strings
paste("xxx",i)
4. Splitting training and test sets
Method 1
data("iris")
data = iris #data() loads the data set by name; assign it to a variable explicitly
set.seed(2021)
train_sample = sample(nrow(data), 0.8*nrow(data)) #randomly draw 80% of the row indices
#for a 10000-row data set this would be sample(10000, 8000)
train=data[train_sample, ]
test=data[-train_sample, ]
Method 2
data("iris")
set.seed(2)
ind = sample(2,nrow(iris),replace = TRUE,prob = c(0.7,0.3))
trainset = iris[ind == 1,]
testset = iris[ind == 2,]
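Both methods above sample rows without regard to the class balance; for a stratified split that preserves the label proportions, caret::createDataPartition can be used (a sketch, assuming the caret package is installed; strat_train/strat_test are illustrative names):
library(caret)
set.seed(2)
idx <- createDataPartition(iris$Species, p = 0.7, list = FALSE) #sample 70% within each class
strat_train <- iris[idx, ]
strat_test  <- iris[-idx, ]
prop.table(table(strat_train$Species)) #proportions match the full data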
Inspect the training and test sets
#inspect the structure of the training and test sets
str(trainset)
str(testset)
#then check that the class-label proportions are similar in the full data, the training set, and the test set
prop.table(table(iris$Species))
prop.table(table(trainset$Species))
prop.table(table(testset$Species))
5. Conditionals, loops, functions
1 Conditionals
1.1 Select rows by condition into a new data frame
a = df[df$colname == condition, ]
Example
a=iris[iris$Species=="virginica",]
head(a)
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|
| 6.3 | 3.3 | 6.0 | 2.5 | virginica |
| 5.8 | 2.7 | 5.1 | 1.9 | virginica |
| 7.1 | 3.0 | 5.9 | 2.1 | virginica |
Filter rows by condition (tidyfst)
library(tidyfst)
library(survival) # package that provides the example data set
data("colon") # load the data set
# single-condition filter
filter_dt(colon, sex == 1) # male patients in the colon data set
# multi-condition filters
# male patients aged 50 or older
filter_dt(colon,sex==1 & age>=50)
# patients who are male or aged 50 or older
filter_dt(colon, sex == 1 | age >= 50)
# rows where age is above the mean age
colon %>% filter_dt(age > mean(age, na.rm = TRUE))
# male rows where age is above the mean age
colon %>% filter_dt(sex == 1 & age > mean(age, na.rm = TRUE))
# patients with 50 <= age < 60
filter_dt(colon, age >= 50 & age < 60)
# patients with 50 <= age <= 60
colon %>% filter_dt(between(age,50,60))
# rows where tumour differentiation (differ) is 1 or 2 and the patient is male
colon %>% filter_dt(differ %in% c(1,2), sex == 1)
# within each sex, rows where age is above that sex's mean age
colon %>%
group_dt(
by = sex,
filter_dt(age > mean(age, na.rm = TRUE))
)
iris %>% filter_dt(Sepal.Length == max(Sepal.Length))
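The same filters can also be written with dplyr::filter from the tidyverse loaded earlier; two equivalents of the examples above:
library(dplyr)
colon %>% filter(sex == 1 & age >= 50) #same rows as filter_dt(colon, sex==1 & age>=50)
colon %>% group_by(sex) %>% filter(age > mean(age, na.rm = TRUE)) %>% ungroup() #per-sex mean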
2 Loops
2.1 Append elements to an empty vector in a loop
Vector = c()
for (i in 1:5){
  Vector = c(Vector, i)
}
Vector
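Growing a vector with c() copies it on every iteration; for long loops, preallocating is faster:
n = 5
Vector = numeric(n) #preallocate the full length
for (i in 1:n){
  Vector[i] = i #assign by index instead of growing
}
Vector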
2.2 Append rows to an empty data frame in a loop
#f(i) is assumed to be a user-defined function that returns a one-row data frame
D = data.frame()
for (i in 1:5){
  D = rbind(D, f(i))
}
D
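Growing a data frame row by row has the same copying cost; the usual idiom is to build the pieces in a list and bind them once (f is the same assumed row-producing function):
rows = lapply(1:5, f)    #list of one-row data frames
D = do.call(rbind, rows) #a single rbind at the end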
6. Algorithm models
1 Classification
data = read.csv(file.choose())
View(data)
#histograms
par(mfrow=c(2,5))
for(i in 1:10)
{
hist(data[,i],main=paste(names(data)[i]),xlab=NA,col="pink")
}
################# min-max normalization #####################################
normalize = function(x) {return ((x - min(x)) / (max(x) - min(x)))}
data[,-11] = as.data.frame(lapply(data[,-11], normalize)) #apply column by column
head(data)
#boxplot
par(mfrow=c(2,5))
for(i in 1:10)
{
boxplot(data[i],main=paste("Boxplot of",names(data)[i]))
}
#capping (winsorizing) outliers, as in section 3
block <- function(x, lower=TRUE, upper=TRUE){
  if(lower){q10 <- quantile(x, 0.10); x[x <= q10] <- q10}
  if(upper){q90 <- quantile(x, 0.90); x[x > q90] <- q90}
  return(x)
}
data[,-11] = sapply(data[,-11],block)
par(mfrow=c(1,1))
boxplot(data[,-11],frame = T,col="purple")
str(data)
head(data)
Model_comparison = function(data){
#load required packages
if(!require(nnet)){
install.packages("nnet")
library(nnet)
}
if(!require(gmodels)){
install.packages("gmodels")
library(gmodels)
}
if(!require(MASS)){
install.packages("MASS")
library(MASS)
}
if(!require(C50)){
install.packages("C50")
library(C50)
}
if(!require(kernlab)){
install.packages("kernlab")
library(kernlab)
}
if(!require(randomForest)){
install.packages("randomForest")
library(randomForest)
}
#Partition data set
set.seed(2021)
train_sample = sample(nrow(data),0.8*nrow(data))
train=data[train_sample, ]
test =data[-train_sample, ]
#set the vectors of time and Accuracy
t=c()
Accuracy=c()
###################### logistic regression ######################
ptm <- proc.time()
model = glm(as.factor(class)~., data=train, family=binomial()) #refer to the response by name so "." excludes it from the predictors
a=proc.time()-ptm
t[1]=a[[1]]
prob = predict(model, test, type="response") #type="response" returns the predicted probabilities
pred = factor(prob > 0.5, levels=c(FALSE, TRUE), labels=c("g", "h")) #a probability above 0.5 is labelled as class "h"
agreement = pred == test$class
Accuracy[1]=prop.table(table(agreement))[[2]]
################ Linear discriminant analysis ##################
ptm <- proc.time()
model = lda(as.factor(class)~., data=train)
a=proc.time()-ptm
t[2]=a[[1]]
predictions=predict(model , newdata = test)
newGroup = predictions$class
agreement = newGroup == test$class
Accuracy[2]=prop.table(table(agreement))[[2]]
########### Quadratic discriminant analysis ####################
ptm <- proc.time()
model = qda(as.factor(class)~., data=train)
a=proc.time()-ptm
t[3]=a[[1]]
predictions=predict(model , newdata = test)
newGroup = predictions$class
agreement = newGroup == test$class
Accuracy[3]=prop.table(table(agreement))[[2]]
########### Bayes discriminant analysis ########################
ptm <- proc.time()
model = lda(as.factor(class)~., data=train, prior = c(1,1)/2)
a=proc.time()-ptm
t[4]=a[[1]]
predictions=predict(model , newdata = test)
newGroup = predictions$class
agreement = newGroup == test$class
Accuracy[4]=prop.table(table(agreement))[[2]]
############################## SVM ################################
ptm <- proc.time()
class_classifier = ksvm(as.factor(class)~., data = train, kernel = "vanilladot")
a=proc.time()-ptm
t[5]=a[[1]]
predictions = predict(class_classifier, test)
agreement <- predictions == test$class
Accuracy[5]=prop.table(table(agreement))[[2]]
########################## decision trees ##############################
#fit a decision tree with the C5.0 function
ptm <- proc.time()
data_model = C5.0(train[,-11], as.factor(train$class))
a=proc.time()-ptm
t[6]=a[[1]]
predictions = predict(data_model, test)
table(predictions, test$class)
agreement <- predictions == test$class
Accuracy[6]=prop.table(table(agreement))[[2]]
############################## Random Forest ################################
ptm <- proc.time()
data.rf = randomForest(as.factor(class) ~ ., data=train, importance=TRUE, proximity=TRUE)
a=proc.time()-ptm
t[7]=a[[1]]
predictions = predict(data.rf, test)
agreement <- predictions == test$class
Accuracy[7]=prop.table(table(agreement))[[2]]
############################ neural network ############################
ptm <- proc.time() #reset the timer for this model
model=nnet(as.factor(class)~.,size=20,data=train)
a=proc.time()-ptm
t[8]=a[[1]]
predictions =predict(model,test,type='class')
agreement = predictions == test$class
Accuracy[8]=prop.table(table(agreement))[[2]]
#Summary of statistical results
df=data.frame(rbind(Accuracy,t))
names(df) = c("logistic regression","Linear discriminant analysis","Quadratic discriminant analysis","Bayes discriminant analysis","SVM","decision trees","Random Forest","neural network")
df=t(df)
colnames(df) = c("Accuracy","Time") #df is a matrix after t(), so use colnames()
#Export results
write.csv(df,"D:/result.csv")
return(df)
}
result=Model_comparison(data)
View(result)
2 Regression
#install.packages("randomForest")
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("pROC")
#install.packages("nnet")
#install.packages("DMwR")
#install.packages("kernlab")
library(randomForest)
library(dplyr)
library(ggplot2)
library(nnet)
library(DMwR)
library(e1071)
################################ load data ###########################
data=read.csv("D:/data/cardekho.csv")
######################## plots ###################
##plot 1
library(ggplot2)
str(data)
ggplot(data,aes(x=brand,y=selling_price,fill=brand))+
geom_violin()+
geom_boxplot(width=0.3)+
labs(x = "",y="selling_price",Title="")+
facet_wrap(~brand,scale="free")+
theme_classic()
##plot 2
ggplot(data,aes(x=brand,y=selling_price,fill=brand))+
geom_bar(stat = 'identity')+
labs(x = "brand",y="selling_price",Title="")+
theme_classic() +
coord_flip()
##plot 3
library(ggridges)
ggplot(data) +
geom_density(aes(x = selling_price,fill=brand))+
facet_wrap(~brand,scales="free")+
theme_classic()
##plot 4
ggplot(data,aes(y=as.factor(brand) ,x=max_cost_price,fill=brand))+
geom_density_ridges()+
theme_classic()
################## Processing "NA" ##################
na_flag = apply(is.na(data), 2, sum)
View(na_flag)
#drop the rows that contain NA
dataset<- na.omit(data)
na_flag = apply(is.na(dataset), 2, sum)
View(na_flag)
str(dataset)
####################### data wrangling ################
df=dataset[-5] #drop the 5th column
df$car_name=as.numeric(as.factor(df$car_name))
df$brand=as.numeric(as.factor(df$brand))
df$model=as.numeric(as.factor(df$model))
df$seller_type=as.numeric(as.factor(df$seller_type))
df$fuel_type=as.numeric(as.factor(df$fuel_type))
df$transmission_type=as.numeric(as.factor(df$transmission_type))
str(df)
View(df)
########correlations###########
#install.packages("corrplot")
library("corrplot")
b=cor(df)
b
corrplot(b,method="color",addCoef.col="grey")
write.csv(b,"D:/data/cor.csv")
plot(df$max_power,df$selling_price,col='3')
plot(df$max_cost_price,df$selling_price,col='4')
plot(df$engine,df$selling_price,col='5')
## select the attributes to keep
testdata=df
ddf=testdata[,-1]
df=ddf[,-2]
str(df)
################# custom min-max normalization #####################################
normalize = function(x) {return ((x - min(x)) / (max(x) - min(x)))}
df[,-14] = as.data.frame(lapply(df[,-14], normalize)) #apply column by column; column 14 is excluded
str(df)
#do not use scientific notation for numbers with up to 200 digits
options(scipen = 200)
###################### model preparation ########################
#Partition data set
set.seed(2021)
train_sample = sample(nrow(df),0.99*nrow(df))
train=df[train_sample, ]
test =df[-train_sample, ]
### control model fitted on the raw, unprocessed data for comparison
#set.seed(2021)
#train_sample = sample(nrow(testdata),0.99*nrow(df))
#train=testdata[train_sample, ]
#test =testdata[-train_sample, ]
#####################################################################
#####################################################################
######################## linear regression ########################
#####################################################################
#####################################################################
model = lm(selling_price~., data=train) #refer to the response by name so "." excludes it from the predictors
pred=predict(model,newdata = test)
regr.eval(test$selling_price,pred)
print("R-squared");cor(test$selling_price,pred)^2
result=data.frame(实际=test$selling_price,线性回归=pred)
plot(test$selling_price,pred,main="linear regression")
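regr.eval() comes from the DMwR package, which has been archived on CRAN and may refuse to install; a small stand-in computing the same four metrics (reg_metrics is a made-up name) works here and for the later models:
#hypothetical replacement for DMwR::regr.eval: MAE, MSE, RMSE and MAPE
reg_metrics <- function(actual, predicted) {
  err <- actual - predicted
  c(mae  = mean(abs(err)),
    mse  = mean(err^2),
    rmse = sqrt(mean(err^2)),
    mape = mean(abs(err) / actual))
}
reg_metrics(test$selling_price, pred)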
#####################################################################
#####################################################################
############################# randomForest ########################
#####################################################################
#####################################################################
set.seed(2021)
model2 = randomForest(selling_price~., data=train, importance=TRUE)
pred2 = predict(model2, test)
regr.eval(test$selling_price,pred2)
print("R-squared");cor(test$selling_price,pred2)^2
result$随机森林回归=pred2
plot(test$selling_price,pred2,main="Random forest regression")
#####################################################################
#####################################################################
######################## SVM regression ########################
#####################################################################
#####################################################################
library(e1071)
model3 = svm(selling_price~., data=train)
pred3=predict(model3,newdata = test)
regr.eval(test$selling_price,pred3)
print("R-squared");cor(test$selling_price,pred3)^2
result$svm=pred3
plot(test$selling_price,pred3,main="svm regression")
#####################################################################
#####################################################################
######################## decision tree regression ########################
#####################################################################
#####################################################################
library(rpart)
library(rpart.plot)
model4=rpart(selling_price~., data=train)
pred4=predict(model4,newdata = test)
regr.eval(test$selling_price,pred4)
print("R-squared");cor(test$selling_price,pred4)^2
result$决策树=pred4
plot(test$selling_price,pred4,main="Decision tree regression")
library(rattle)
fancyRpartPlot(model4)
#####################################################################
#####################################################################
######################## xgboost ########################
#####################################################################
#####################################################################
#install.packages("xgboost")
library("xgboost")
library(Matrix)
traindata1 <- Matrix(data.matrix(train[,-16]),sparse=T) # convert to a sparse matrix via Matrix(..., sparse=TRUE)
traindata2 <- data.matrix(train[,16]) # convert the response to numeric
traindata3 <- list(data=traindata1,label=traindata2) # bundle the predictors and the response in a list
dtrain <- xgb.DMatrix(data = traindata3$data, label = traindata3$label) # build the xgb.DMatrix the model needs from the sparse matrix
dtest = xgb.DMatrix(data.matrix(test[,-16])) # the same for the test predictors
param <- list(max_depth=20,eta=0.03) # model parameters
model <- xgboost(params=param,data = dtrain,nrounds = 15)
pred5 <- predict(model,dtest)
regr.eval(test$selling_price,pred5)
print("R-squared");cor(test$selling_price,pred5)^2
result$xgboost=pred5
plot(test$selling_price,pred5,main="xgboost regression")
View(result)
#the exported result.csv in D:/data holds the full table; copy what you need from it
write.csv(result,"D:/data/result.csv")
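xgboost can also report which features drive the predictions; a short sketch using xgb.importance and xgb.plot.importance from the same package:
#feature importance of the trained booster
imp <- xgb.importance(feature_names = colnames(traindata1), model = model)
head(imp) #Gain/Cover/Frequency per feature
xgb.plot.importance(imp)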
#############################################################
#############################################################
######################## error curves ############################
#############################################################
result$随机森林误差 = abs(result$实际-result$随机森林回归)
result$线性回归误差 = abs(result$实际-result$线性回归)
result$svm误差 = abs(result$实际-result$svm)
result$决策树误差 = abs(result$实际-result$决策树)
result$xgboost误差 = abs(result$实际-result$xgboost)
result$随机森林相对误差 = abs(result$实际-result$随机森林回归)/result$实际
result$线性回归相对误差 = abs(result$实际-result$线性回归)/result$实际
result$svm相对误差 = abs(result$实际-result$svm)/result$实际
result$决策树相对误差 = abs(result$实际-result$决策树)/result$实际
result$xgboost相对误差 = abs(result$实际-result$xgboost)/result$实际
View(result)
#absolute errors
x = 1:length(result$线性回归误差)
plot(x, result$随机森林误差, type = "n", xlab = "", ylab = "", ylim = range(result$xgboost误差, result$随机森林误差) * c(10,1))
points(x, result$线性回归误差, type = "o", pch = 1, col = 1, lty = 1, lwd = 2)
points(x, result$随机森林误差, type = "o", pch = 2, col = 2, lty = 2, lwd = 2)
points(x, result$svm误差, type = "o", pch = 3, col = 3, lty = 3, lwd = 2)
points(x, result$决策树误差, type = "o", pch = 4, col = 4, lty = 4, lwd = 2)
points(x, result$xgboost误差, type = "o", pch = 5, col = 5, lty = 5, lwd = 2)
title(main = "Absolute errors", xlab = "test-set index", ylab = "error")
legend("topleft", inset=.05, c("linear regression","random forest","svm","decision tree","xgboost"),lty=c(1,2,3,4,5), pch=c(1,2,3,4,5),col=c(1,2,3,4,5))
#relative errors
x = 1:length(result$线性回归相对误差)
plot(x, result$随机森林相对误差, type = "n", xlab = "", ylab = "", ylim = range(result$线性回归相对误差, result$随机森林相对误差) * c(11,1))
points(x, result$线性回归相对误差, type = "o", pch = 1, col = 1, lty = 1, lwd = 2)
points(x, result$随机森林相对误差, type = "o", pch = 2, col = 2, lty = 2, lwd = 2)
points(x, result$svm相对误差, type = "o", pch = 3, col = 3, lty = 3, lwd = 2)
points(x, result$决策树相对误差, type = "o", pch = 4, col = 4, lty = 4, lwd = 2)
points(x, result$xgboost相对误差, type = "o", pch = 5, col = 5, lty = 5, lwd = 2)
title(main = "Relative errors", xlab = "test-set index", ylab = "relative error")
legend("topleft", inset=.05, c("linear regression","random forest","svm","decision tree","xgboost"),lty=c(1,2,3,4,5), pch=c(1,2,3,4,5),col=c(1,2,3,4,5))
#zoom in on part of the relative-error range
x = 1:length(result$线性回归相对误差)
plot(x, result$随机森林相对误差, type = "n", xlab = "", ylab = "", ylim = range(result$决策树相对误差, result$随机森林相对误差) * c(10,1))
points(x, result$随机森林相对误差, type = "o", pch = 2, col = 2, lty = 2, lwd = 2)
points(x, result$svm相对误差, type = "o", pch = 3, col = 3, lty = 3, lwd = 2)
points(x, result$决策树相对误差, type = "o", pch = 4, col = 4, lty = 4, lwd = 2)
title(main = "Relative errors (subset of models)", xlab = "test-set index", ylab = "relative error")
legend("topleft", inset=.05, c("random forest","svm","decision tree"),lty=c(2,3,4), pch=c(2,3,4),col=c(2,3,4))
##########random forest parameter tuning############
A=c()
#more iterations take longer (here "to" is the largest number of trees tried and "by" is the step)
ns=seq(from=50, to=350, by=50)
for (i in ns){
set.seed(2021)
pred=randomForest(selling_price~., data=train, importance=TRUE, ntree=i) %>% predict(test)
p_acc=regr.eval(test$selling_price,pred)
A=c(A,p_acc[[4]])
}
A
plot(ns,A,main="Number of trees vs. MAPE", ylab="MAPE", xlab="number of trees in the random forest", frame.plot = TRUE,axes=FALSE)
lines(ns,A)
axis(1, at = ns, labels = formatC(ns, format = "fg"))
axis(2, at = seq(from=0.25, to=0.30, by=0.005), labels = paste(round(seq(from=25, to=30, by=0.5),2),"%"))
text(ns,A,round(A,4),pos=1,cex=0.7,offset=-.8)
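Besides ntree, the mtry parameter (number of candidate variables per split) can also be tuned; randomForest::tuneRF searches it against the out-of-bag error (a sketch, assuming selling_price is still the response column):
set.seed(2021)
#double the mtry candidate each step; keep going while OOB error improves by at least 1%
tuned <- tuneRF(x = train[, names(train) != "selling_price"],
                y = train$selling_price,
                stepFactor = 2, improve = 0.01, ntreeTry = 300)
tuned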
#####################################################################
#####################################################################
############################# best randomForest ####################
#####################################################################
#####################################################################
set.seed(2021)
model2 = randomForest(selling_price~., data=train, importance=TRUE, ntree=300)
pred2 = predict(model2, test)
regr.eval(test$selling_price,pred2)
print("R-squared");cor(test$selling_price,pred2)^2
plot(test$selling_price,pred2,main="Random forest regression")
str(df)
ggplot(df) +
geom_density_ridges(aes(y = brand, x = selling_price, fill = brand))