The caret package
liam
2017/9/7
1.介绍
caret 是一个完备的数据挖掘工具包,主要功能如下:1. 数据分割;2. 数据预处理;3. 特征选择;4. 模型训练与调优;5. 变量重要性评估。
另外,caret 包含了超过150个机器学习模型。
2.可视化
用caret中的featurePlot函数进行可视化分析:
散点图
# Attach the plotting helpers and caret, and set plot transparency to 0.4.
# One statement per line: calls written back-to-back without a newline or
# `;` do not parse in R.
library(AppliedPredictiveModeling)
transparentTheme(trans = .4)
library(caret)
Loading required package: ggplot2
# Scatterplot matrix of the four iris measurements, grouped by species.
featurePlot(
  x = iris[, 1:4],
  y = iris$Species,
  plot = "pairs",
  # Add a key at the top
  auto.key = list(columns = 3)
)
密度图
# Density plot of each iris measurement, one panel per variable,
# grouped by species.
transparentTheme(trans = .9)
featurePlot(
  x = iris[, 1:4],
  y = iris$Species,
  plot = "density",
  # Pass in options to xyplot() to make it prettier
  scales = list(
    x = list(relation = "free"),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  layout = c(4, 1),
  auto.key = list(columns = 3)
)
箱线图
# Box plot of each iris measurement by species, one panel per variable.
featurePlot(
  x = iris[, 1:4],
  y = iris$Species,
  plot = "box",
  # Pass in options to bwplot()
  scales = list(
    y = list(relation = "free"),
    x = list(rot = 90)
  ),
  layout = c(4, 1),
  auto.key = list(columns = 2)
)
3.数据预处理
1. 生成哑变量;2. 去除相关性变量;3. 线性依赖关系;4. 中心化标准化;5. 类别距离计算。
生成哑变量
数据中,pclass和sex是因子变量,R中的基础函数model.matrix可以生成哑变量。
library(earth)
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
# Load the etitanic data set and show its first rows.
data(etitanic)
head(etitanic)
##   pclass survived    sex     age sibsp parch
## 1    1st        1 female 29.0000     0     0
## 2    1st        1   male  0.9167     1     2
## 3    1st        0 female  2.0000     1     2
## 4    1st        0   male 30.0000     1     2
## 5    1st        0 female 25.0000     1     2
## 6    1st        1   male 48.0000     0     0
# Base R approach: build the design matrix for survived ~ . and show its
# first rows; the factor columns appear as indicator (dummy) columns.
head(model.matrix(survived ~ ., data = etitanic))
##   (Intercept) pclass2nd pclass3rd sexmale     age sibsp parch
## 1           1         0         0       0 29.0000     0     0
## 2           1         0         0       1  0.9167     1     2
## 3           1         0         0       0  2.0000     1     2
## 4           1         0         0       1 30.0000     1     2
## 5           1         0         0       0 25.0000     1     2
## 6           1         0         0       1 48.0000     0     0
利用caret中的dummyVars函数,可以更方便的完成此项工作:
# Build a dummy-variable encoder from the formula, then apply it to the
# data and show the first rows of the encoded matrix.
dummies <- dummyVars(survived ~ ., data = etitanic)
head(predict(dummies, newdata = etitanic))
##   pclass.1st pclass.2nd pclass.3rd sex.female sex.male     age sibsp parch
## 1          1          0          0          1        0 29.0000     0     0
## 2          1          0          0          0        1  0.9167     1     2
## 3          1          0          0          1        0  2.0000     1     2
## 4          1          0          0          0        1 30.0000     1     2
## 5          1          0          0          1        0 25.0000     1     2
## 6          1          0          0          0        1 48.0000     0     0
去除相关性变量
变量之间存在相关性说明变量存在冗余,并且会导致模型不稳定。
# Correlation matrix of the numeric iris columns (column 5, Species, is
# dropped), then the indices findCorrelation() flags at cutoff 0.75.
descrCor <- cor(iris[, -5])
highlyCorDescr <- findCorrelation(descrCor, cutoff = .75)
highlyCorDescr
## [1] 3 4
说明第3个和第4个变量(Petal.Length、Petal.Width)与其他变量高度相关(相关系数绝对值超过 0.75),findCorrelation 建议将其剔除。
线性依赖关系
函数findLinearCombos使用矩阵的QR分解来枚举一组线性组合(如果存在):
# Build a 6 x 6 design matrix whose columns are linearly dependent:
# column 1 is an intercept, columns 2-3 and columns 4-6 are indicator sets.
ltfrDesign <- matrix(0, nrow = 6, ncol = 6)
ltfrDesign[, 1] <- c(1, 1, 1, 1, 1, 1)
ltfrDesign[, 2] <- c(1, 1, 1, 0, 0, 0)
ltfrDesign[, 3] <- c(0, 0, 0, 1, 1, 1)
ltfrDesign[, 4] <- c(1, 0, 0, 1, 0, 0)
ltfrDesign[, 5] <- c(0, 1, 0, 0, 1, 0)
ltfrDesign[, 6] <- c(0, 0, 1, 0, 0, 1)

# Enumerate the linear combinations and print the result.
comboInfo <- findLinearCombos(ltfrDesign)
comboInfo
## $linearCombos
## $linearCombos[[1]]
## [1] 3 1 2
## 
## $linearCombos[[2]]
## [1] 6 1 4 5
## 
## 
## $remove
## [1] 3 6
# Drop the columns listed in comboInfo$remove.
# Guard the empty case: negating an empty index yields integer(0), which
# would select zero columns instead of keeping all of them.
if (length(comboInfo$remove) > 0) {
  ltfrDesign[, -comboInfo$remove]
} else {
  ltfrDesign
}
##      [,1] [,2] [,3] [,4]
## [1,]    1    1    1    0
## [2,]    1    1    0    1
## [3,]    1    1    0    0
## [4,]    1    0    1    0
## [5,]    1    0    0    1
## [6,]    1    0    0    0
中心化标准化
# Split the mdrr data in half at random (seeded), then estimate centering
# and scaling parameters on the training half.
set.seed(96)
data(mdrr)

# seq_along() replaces seq(along = ...), which relied on partial matching
# of the `along.with` argument; the index vector is the same.
inTrain <- sample(seq_along(mdrrClass), length(mdrrClass) / 2)

training <- mdrrDescr[inTrain, ]
test <- mdrrDescr[-inTrain, ]
trainMDRR <- mdrrClass[inTrain]
testMDRR <- mdrrClass[-inTrain]

preProcValues <- preProcess(training, method = c("center", "scale"))
## Warning in preProcess.default(training, method = c("center", "scale")):
## These variables have zero variances: nI, nR08, D.Dr08, T.N..I., T.O..I.,
## T.Cl..Br., T.I..I., G.N..I., G.O..I., G.Cl..Br., G.I..I.
# Apply the same fitted preProcess object to both the training and the
# test descriptors.
trainTransformed <- predict(preProcValues, training)
testTransformed <- predict(preProcValues, test)
PCA、ICA:主成分分析和独立成分分析(下面的示例演示的是空间符号变换 spatialSign)
library(AppliedPredictiveModeling)
transparentTheme(trans = .4)

# Scale the two descriptors and plot them, coloured by class.
plotSubset <- data.frame(scale(mdrrDescr[, c("nC", "X4v")]))
xyplot(
  nC ~ X4v,
  data = plotSubset,
  groups = mdrrClass,
  auto.key = list(columns = 2)
)

# Same plot after the spatial sign transformation; spatialSign()'s result
# is converted back to a data frame for xyplot().
transformed <- spatialSign(plotSubset)
transformed <- as.data.frame(transformed)
xyplot(
  nC ~ X4v,
  data = transformed,
  groups = mdrrClass,
  auto.key = list(columns = 2)
)
类别距离计算(classDist)
# Draw 100 random rows of iris for fitting; seq_len(nrow(iris)) avoids
# hard-coding the row count (iris has 150 rows, so the index vector is
# identical to 1:150).
trainSet <- sample(seq_len(nrow(iris)), 100)

# Fit class distances on the sampled rows, predict them for the held-out
# rows, and show the result as a scatterplot matrix grouped by species.
distData <- classDist(iris[trainSet, 1:4], iris$Species[trainSet])
newDist <- predict(distData, iris[-trainSet, 1:4])
splom(newDist, groups = iris$Species[-trainSet])
4.数据分割
简单的按比例分割
library(caret)

# Seeded 80% partition of iris on Species; list = FALSE returns a matrix
# of row indices (printed below with the column name Resample1).
set.seed(3456)
trainIndex <- createDataPartition(
  iris$Species,
  p = .8,
  list = FALSE,
  times = 1
)
head(trainIndex)
##      Resample1
## [1,]         1
## [2,]         2
## [3,]         4
## [4,]         5
## [5,]         6
## [6,]         8
# Rows in trainIndex form the training set; all remaining rows form the
# test set.
irisTrain <- iris[trainIndex, ]
irisTest <- iris[-trainIndex, ]
head(irisTrain)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
## 8          5.0         3.4          1.5         0.2  setosa
# Show the first rows of the held-out test set.
head(irisTest)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 3           4.7         3.2          1.3         0.2  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 12          4.8         3.4          1.6         0.2  setosa
## 14          4.3         3.0          1.1         0.1  setosa
## 21          5.4         3.4          1.7         0.2  setosa
## 23          4.6         3.6          1.0         0.2  setosa
5.模型训练
caret 中的train函数可以训练几乎所有的统计机器学习算法:
# Load the Sonar data set and inspect the structure of its first ten
# columns.
library(mlbench)
data(Sonar)
str(Sonar[, 1:10])
## 'data.frame':    208 obs. of  10 variables:
##  $ V1 : num  0.02 0.0453 0.0262 0.01 0.0762 0.0286 0.0317 0.0519 0.0223 0.0164 ...
##  $ V2 : num  0.0371 0.0523 0.0582 0.0171 0.0666 0.0453 0.0956 0.0548 0.0375 0.0173 ...
##  $ V3 : num  0.0428 0.0843 0.1099 0.0623 0.0481 ...
##  $ V4 : num  0.0207 0.0689 0.1083 0.0205 0.0394 ...
##  $ V5 : num  0.0954 0.1183 0.0974 0.0205 0.059 ...
##  $ V6 : num  0.0986 0.2583 0.228 0.0368 0.0649 ...
##  $ V7 : num  0.154 0.216 0.243 0.11 0.121 ...
##  $ V8 : num  0.16 0.348 0.377 0.128 0.247 ...
##  $ V9 : num  0.3109 0.3337 0.5598 0.0598 0.3564 ...
##  $ V10: num  0.211 0.287 0.619 0.126 0.446 ...
library(caret)

# Seeded 75% partition of Sonar on Class.
set.seed(998)
inTraining <- createDataPartition(Sonar$Class, p = .75, list = FALSE)
training <- Sonar[inTraining, ]
testing <- Sonar[-inTraining, ]

fitControl <- trainControl(
  # 10-fold CV
  method = "repeatedcv",
  number = 10,
  # repeated ten times
  repeats = 10
)

# Separate seed so the resampling used by train() is reproducible.
set.seed(825)
gbmFit1 <- train(
  Class ~ .,
  data = training,
  method = "gbm",
  trControl = fitControl,
  # This last option is actually one for gbm() that passes through
  verbose = FALSE
)
## Loading required package: gbm
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
## Loading required package: plyr
# Print the fitted train object (resampling results per tuning setting).
gbmFit1
## Stochastic Gradient Boosting 
## 
## 157 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 142, 142, 140, 142, 142, 141, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.7609191  0.5163703
##   1                  100      0.7934216  0.5817734
##   1                  150      0.7977230  0.5897796
##   2                   50      0.7858235  0.5667749
##   2                  100      0.8188897  0.6316548
##   2                  150      0.8194363  0.6329037
##   3                   50      0.7895686  0.5726290
##   3                  100      0.8130564  0.6195719
##   3                  150      0.8221348  0.6383441
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
##  interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Apply caret's theme, then plot the resampling profile of the fit.
trellis.par.set(caretTheme())
plot(gbmFit1)

# Level plot of the Kappa metric, with x-axis labels rotated 90 degrees.
plot(
  gbmFit1,
  metric = "Kappa",
  plotType = "level",
  scales = list(x = list(rot = 90))
)

# Same fit drawn through ggplot().
ggplot(gbmFit1)
## Warning: Ignoring unknown aesthetics: shape
网友评论