1、先通过代码对数据进行预处理,并拟合(训练)模型
# Load the startup data set from the current working directory.
datasets <- read.csv("50_Startups.csv")
# Encode the categorical State column as a factor; the three state names
# are mapped, in this order, to the labels 1, 2 and 3.
datasets$State <- factor(
  datasets$State,
  levels = c("New York", "California", "Florida"),
  labels = c(1, 2, 3)
)
# Split the data into a training set and a test set.
# caTools provides sample.split().
library(caTools)
# Fix the random seed so that the random split below can be reproduced.
set.seed(123)
split <- sample.split(datasets$Profit, SplitRatio = 0.8)
# Rows flagged TRUE form the training set (SplitRatio = 0.8) ...
training_set <- subset(datasets, split)
# ... and the remaining rows form the test set.
test_set <- subset(datasets, !split)
# Feature scaling (currently disabled):
# training_set[, 2:3] = scale(training_set[, 2:3])
# test_set[, 2:3] = scale(test_set[, 2:3])
# Fit a multiple linear regression of Profit on the training set.
# Explicit form, listing every predictor:
# regression = lm(formula = Profit ~ R.D.Spend + Administration +
#                   Marketing.Spend + State, data = training_set)
# Shorthand form: "." stands for every other column of training_set.
regression <- lm(formula = Profit ~ ., data = training_set)
# Predict Profit for the rows of the test set.
y_pred <- predict(regression, newdata = test_set)
# Backward elimination for the multiple linear regression: start from the
# model with all predictors, inspect its summary, drop one predictor, refit,
# and repeat. Each summary() call must follow a fresh lm() fit so that it
# describes the reduced model rather than the previous one.

# Step 1: "all in" model with every predictor.
regressor <- lm(
  formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
  data = training_set
)
summary(regressor)

# Step 2: State removed.
regressor <- lm(
  formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
  data = training_set
)
summary(regressor)

# Step 3: Administration removed.
regressor <- lm(
  formula = Profit ~ R.D.Spend + Marketing.Spend,
  data = training_set
)
summary(regressor)

# Step 4: Marketing.Spend removed; only R.D.Spend remains.
regressor <- lm(formula = Profit ~ R.D.Spend, data = training_set)
summary(regressor)
注意:回归公式有两种写法(逐个列出自变量,或用 `Profit ~ .` 简写),当自变量很多时推荐用简写
# Explicit form, listing every predictor:
# regression = lm(formula = Profit ~ R.D.Spend + Administration +
#                   Marketing.Spend + State, data = training_set)
# Shorthand form: "." stands for every other column of training_set.
regression <- lm(formula = Profit ~ ., data = training_set)
偏差最小的数据
![](https://img.haomeiwen.com/i1327433/e4bf30cf791465d5.png)
偏差最大的数据
![](https://img.haomeiwen.com/i1327433/72c6b8fa4732003b.png)
运行代码,删除偏差最大的自变量
![](https://img.haomeiwen.com/i1327433/cbaf1b86a53f706f.png)
进一步删除偏差最大的自变量
![](https://img.haomeiwen.com/i1327433/04be4a99668d52a2.png)
最后得出:R.D.Spend(研发支出)对 Profit(利润)的影响最大
![](https://img.haomeiwen.com/i1327433/20f7a7da4ccb34d6.png)
网友评论