The caret package

caret 是一个完备的数据挖掘工具包,功能如下: 1. 数据分割 2. 数据预处理 3. 特征选择 4. 模型训练与调优 5. 变量重要性评估


# Load the plotting theme helper and caret. The three statements were fused
# onto one line, which R cannot parse; one statement per line restores them.
library(AppliedPredictiveModeling)
transparentTheme(trans = .4)
library(caret)

# Scatterplot matrix of the four iris measurements, grouped by species.
# The inline comment sits on its own line so it no longer swallows
# `auto.key` and the closing parenthesis.
featurePlot(
  x = iris[, 1:4],
  y = iris$Species,
  plot = "pairs",
  ## Add a key at the top
  auto.key = list(columns = 3)
)


# Per-variable density plots by species. The theme call and the plot call are
# separate statements, and the comments are on their own lines so the
# remaining arguments are parsed.
transparentTheme(trans = .9)
featurePlot(
  x = iris[, 1:4],
  y = iris$Species,
  plot = "density",
  ## Pass in options to xyplot() to
  ## make it prettier
  scales = list(
    x = list(relation = "free"),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  layout = c(4, 1),
  auto.key = list(columns = 3)
)


# Box plots of each iris measurement by species. The comment is on its own
# line so the `scales`, `layout` and `auto.key` arguments are parsed.
featurePlot(
  x = iris[, 1:4],
  y = iris$Species,
  plot = "box",
  ## Pass in options to bwplot()
  scales = list(
    y = list(relation = "free"),
    x = list(rot = 90)
  ),
  layout = c(4, 1),
  auto.key = list(columns = 2)
)

数据预处理包括: 1. 生成哑变量 2. 去除高度相关的变量 3. 检测并去除线性依赖 4. 中心化与标准化 5. 类别距离计算


## pclass survived sex age sibsp parch## 1 1st 1 female 29.0000 0 0## 2 1st 1 male 0.9167 1 2## 3 1st 0 female 2.0000 1 2## 4 1st 0 male 30.0000 1 2## 5 1st 0 female 25.0000 1 2## 6 1st 1 male 48.0000 0 0

# Show the first rows of the design matrix that base R builds for
# `survived ~ .`.
# NOTE(review): `etitanic` is not loaded anywhere in the visible file;
# presumably it comes from the earth package (data(etitanic)) -- confirm.
head(
  model.matrix(
    survived ~ .,
    data = etitanic
  )
)

## (Intercept) pclass2nd pclass3rd sexmale age sibsp parch## 1 1 0 0 0 29.0000 0 0## 2 1 0 0 1 0.9167 1 2## 3 1 0 0 0 2.0000 1 2## 4 1 0 0 1 30.0000 1 2## 5 1 0 0 0 25.0000 1 2## 6 1 0 0 1 48.0000 0 0

# Build a dummy-variable encoder from the formula, then apply it to the data.
# The two statements were fused on one line; they are now separate.
dummies <- dummyVars(survived ~ ., data = etitanic)
head(predict(dummies, newdata = etitanic))

## pclass.1st pclass.2nd pclass.3rd sex.female sex.male age sibsp parch## 1 1 0 0 1 0 29.0000 0 0## 2 1 0 0 0 1 0.9167 1 2## 3 1 0 0 1 0 2.0000 1 2## 4 1 0 0 0 1 30.0000 1 2## 5 1 0 0 1 0 25.0000 1 2## 6 1 0 0 0 1 48.0000 0 0

# Correlation matrix of the numeric iris columns (column 5, Species, dropped),
# then the column indices flagged by findCorrelation() at a 0.75 cutoff.
# The three statements were fused on one line; they are now separate.
descrCor <- cor(iris[, -5])
highlyCorDescr <- findCorrelation(descrCor, cutoff = .75)
highlyCorDescr

## [1] 3 4


# Build a 6x6 design matrix column by column and ask caret which columns are
# linear combinations of the others. The statements were fused on one line;
# they are now separate.
ltfrDesign <- matrix(0, nrow = 6, ncol = 6)
ltfrDesign[, 1] <- c(1, 1, 1, 1, 1, 1)
ltfrDesign[, 2] <- c(1, 1, 1, 0, 0, 0)
ltfrDesign[, 3] <- c(0, 0, 0, 1, 1, 1)
ltfrDesign[, 4] <- c(1, 0, 0, 1, 0, 0)
ltfrDesign[, 5] <- c(0, 1, 0, 0, 1, 0)
ltfrDesign[, 6] <- c(0, 0, 1, 0, 0, 1)
comboInfo <- findLinearCombos(ltfrDesign)
comboInfo

## $linearCombos## $linearCombos[[1]]## [1] 3 1 2## ## $linearCombos[[2]]## [1] 6 1 4 5## ## ## $remove## [1] 3 6

# Drop the columns flagged in comboInfo$remove.
# A keep-index is used instead of `-comboInfo$remove`: negative indexing with
# an empty `remove` would select zero columns rather than all of them.
# `drop = FALSE` keeps the result a matrix even if only one column survives.
ltfrDesign[
  ,
  setdiff(seq_len(ncol(ltfrDesign)), comboInfo$remove),
  drop = FALSE
]

## [,1] [,2] [,3] [,4]## [1,] 1 1 1 0## [2,] 1 1 0 1## [3,] 1 1 0 0## [4,] 1 0 1 0## [5,] 1 0 0 1## [6,] 1 0 0 0

# Split the mdrr descriptors and classes into random halves, then estimate
# centering/scaling parameters from the training half only.
# The statements were fused on one line; they are now separate.
set.seed(96)
data(mdrr)

# seq_along() replaces seq(along = ...): same indices, no partial matching.
inTrain <- sample(seq_along(mdrrClass), length(mdrrClass) / 2)

training <- mdrrDescr[inTrain, ]
test <- mdrrDescr[-inTrain, ]
trainMDRR <- mdrrClass[inTrain]
testMDRR <- mdrrClass[-inTrain]

preProcValues <- preProcess(training, method = c("center", "scale"))

## Warning in preProcess.default(training, method = c("center", "scale")):## These variables have zero variances: nI, nR08, D.Dr08, T.N..I., T.O..I.,## T.Cl..Br., T.I..I., G.N..I., G.O..I., G.Cl..Br., G.I..I.

# Apply the same fitted preProcess object to both the training and test sets.
# The two statements were fused on one line; they are now separate.
trainTransformed <- predict(preProcValues, training)
testTransformed <- predict(preProcValues, test)

PCA 与 ICA:主成分分析和独立成分分析(下面的示例演示的是空间符号变换 spatialSign)
# Scatter plot of two scaled mdrr descriptors, grouped by class.
# The statements were fused on one line; they are now separate.
library(AppliedPredictiveModeling)
transparentTheme(trans = .4)

plotSubset <- data.frame(scale(mdrrDescr[, c("nC", "X4v")]))
xyplot(
  nC ~ X4v,
  data = plotSubset,
  groups = mdrrClass,
  auto.key = list(columns = 2)
)

# Apply the spatial sign transformation to the two scaled descriptors and
# re-plot them.
# The original line was truncated in extraction ("transformed <- ~ X4v, ...");
# the data-frame conversion and the start of the xyplot() call are restored to
# mirror the plot of `plotSubset`.
transformed <- spatialSign(plotSubset)
# xyplot()'s `data` argument expects a data frame, so convert the result.
transformed <- as.data.frame(transformed)
xyplot(
  nC ~ X4v,
  data = transformed,
  groups = mdrrClass,
  auto.key = list(columns = 2)
)
# Fit class-distance models on 100 randomly chosen iris rows, compute the
# distances for the remaining rows, and plot them grouped by true species.
# The four statements shared one line with no separators; they are now
# separate.
trainSet <- sample(1:150, 100)
distData <- classDist(iris[trainSet, 1:4], iris$Species[trainSet])
newDist <- predict(distData, iris[-trainSet, 1:4])
splom(newDist, groups = iris$Species[-trainSet])



# Draw one partition of the iris rows with p = .8, returned as a matrix of
# row indices (list = FALSE).
# The statements were fused on one line; they are now separate.
library(caret)
set.seed(3456)
trainIndex <- createDataPartition(
  iris$Species,
  p = .8,
  list = FALSE,
  times = 1
)
head(trainIndex)

## Resample1## [1,] 1## [2,] 2## [3,] 4## [4,] 5## [5,] 6## [6,] 8

# Rows in trainIndex form the training set; all other rows form the test set.
# The statements were fused on one line; they are now separate.
irisTrain <- iris[trainIndex, ]
irisTest <- iris[-trainIndex, ]
head(irisTrain)

## Sepal.Length Sepal.Width Petal.Length Petal.Width Species## 1 5.1 3.5 1.4 0.2 setosa## 2 4.9 3.0 1.4 0.2 setosa## 4 4.6 3.1 1.5 0.2 setosa## 5 5.0 3.6 1.4 0.2 setosa## 6 5.4 3.9 1.7 0.4 setosa## 8 5.0 3.4 1.5 0.2 setosa


## Sepal.Length Sepal.Width Petal.Length Petal.Width Species## 3 4.7 3.2 1.3 0.2 setosa## 7 4.6 3.4 1.4 0.3 setosa## 12 4.8 3.4 1.6 0.2 setosa## 14 4.3 3.0 1.1 0.1 setosa## 21 5.4 3.4 1.7 0.2 setosa## 23 4.6 3.6 1.0 0.2 setosa

caret 中的train函数可以训练几乎所有的统计机器学习算法:

# Load the Sonar data set and inspect the structure of its first ten columns.
# The statements were fused on one line; they are now separate.
library(mlbench)
data(Sonar)
str(Sonar[, 1:10])

## 'data.frame': 208 obs. of 10 variables:## $ V1 : num 0.02 0.0453 0.0262 0.01 0.0762 0.0286 0.0317 0.0519 0.0223 0.0164 ...## $ V2 : num 0.0371 0.0523 0.0582 0.0171 0.0666 0.0453 0.0956 0.0548 0.0375 0.0173 ...## $ V3 : num 0.0428 0.0843 0.1099 0.0623 0.0481 ...## $ V4 : num 0.0207 0.0689 0.1083 0.0205 0.0394 ...## $ V5 : num 0.0954 0.1183 0.0974 0.0205 0.059 ...## $ V6 : num 0.0986 0.2583 0.228 0.0368 0.0649 ...## $ V7 : num 0.154 0.216 0.243 0.11 0.121 ...## $ V8 : num 0.16 0.348 0.377 0.128 0.247 ...## $ V9 : num 0.3109 0.3337 0.5598 0.0598 0.3564 ...## $ V10: num 0.211 0.287 0.619 0.126 0.446 ...

# Split Sonar with p = .75, configure repeated cross-validation, and fit a gbm
# model through caret::train().
# The statements were fused on one line and the inline comments swallowed
# arguments; statements and comments are now on their own lines.
library(caret)
set.seed(998)
inTraining <- createDataPartition(Sonar$Class, p = .75, list = FALSE)
training <- Sonar[inTraining, ]
testing <- Sonar[-inTraining, ]

fitControl <- trainControl(
  ## 10-fold CV
  method = "repeatedcv",
  number = 10,
  ## repeated ten times
  repeats = 10
)

set.seed(825)
gbmFit1 <- train(
  Class ~ .,
  data = training,
  method = "gbm",
  trControl = fitControl,
  ## This last option is actually one
  ## for gbm() that passes through
  verbose = FALSE
)

## Stochastic Gradient Boosting ## ## 157 samples## 60 predictor## 2 classes: 'M', 'R' ## ## No pre-processing## Resampling: Cross-Validated (10 fold, repeated 10 times) ## Summary of sample sizes: 142, 142, 140, 142, 142, 141, ... ## Resampling results across tuning parameters:## ## interaction.depth n.trees Accuracy Kappa ## 1 50 0.7609191 0.5163703## 1 100 0.7934216 0.5817734## 1 150 0.7977230 0.5897796## 2 50 0.7858235 0.5667749## 2 100 0.8188897 0.6316548## 2 150 0.8194363 0.6329037## 3 50 0.7895686 0.5726290## 3 100 0.8130564 0.6195719## 3 150 0.8221348 0.6383441## ## Tuning parameter 'shrinkage' was held constant at a value of 0.1## ## Tuning parameter 'n.minobsinnode' was held constant at a value of 10## Accuracy was used to select the optimal model using the largest value.## The final values used for the model were n.trees = 150,## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.

# Level plot of the Kappa metric from the fitted gbmFit1 object, with the
# x-axis labels rotated 90 degrees.
plot(
  gbmFit1,
  metric = "Kappa",
  plotType = "level",
  scales = list(x = list(rot = 90))
)

