Supervised machine learning
loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/"
ds <- "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
url <- paste(loc, ds, sep="")
breast <- read.table(url, sep=",", header=FALSE, na.strings="?")
names(breast) <- c("ID", "clumpThickness", "sizeUniformity",
                   "shapeUniformity", "marginalAdhesion",
                   "singleEpithelialCellSize", "bareNuclei",
                   "blandChromatin", "normalNucleoli", "mitosis", "class")
df <- breast[-1]                          # drop the ID column
df$class <- factor(df$class, levels=c(2,4),
                   labels=c("benign", "malignant"))
set.seed(1234)
train <- sample(nrow(df), 0.7*nrow(df))   # 70/30 train/validation split
df.train <- df[train,]
df.validate <- df[-train,]
table(df.train$class)
table(df.validate$class)
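The bareNuclei column contains missing values (the "?" entries converted by na.strings), which matters for several of the models below. A quick base-R check, not part of the original code:

sum(is.na(df$bareNuclei))   # how many bare-nuclei measurements are missing
colSums(is.na(df))          # NA counts for every column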
1. Logistic regression
fit.logit <- glm(class ~ ., data = df.train, family = binomial())
summary(fit.logit)
# logit.fit.reduced <- step(fit.logit)   # optional: stepwise variable selection
prob <- predict(fit.logit, df.validate, type = "response")
logit.pred <- factor(prob > .5, levels = c(FALSE, TRUE),
                     labels = c("benign", "malignant"))
logit.perf <- table(df.validate$class, logit.pred,
                    dnn = c("Actual", "Predicted"))
logit.perf
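The same confusion-matrix bookkeeping repeats for every classifier below, so a small helper may be worth having. This is a sketch of my own, not from the book: the function name perf.stats is hypothetical, and it assumes rows are actual classes and columns are predicted, with "malignant" as the positive class in cell [2,2].

perf.stats <- function(tbl) {
  tp <- tbl[2,2]; tn <- tbl[1,1]   # true positives / true negatives
  fp <- tbl[1,2]; fn <- tbl[2,1]   # false positives / false negatives
  c(sensitivity = tp/(tp+fn),
    specificity = tn/(tn+fp),
    accuracy    = (tp+tn)/sum(tbl))
}
perf.stats(logit.perf)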
2.1 Decision tree
library(rpart)
set.seed(1234)
head(df.train)
dtree <- rpart(class ~ ., data = df.train, method = "class",
               parms = list(split = "information"))
dtree
dtree$cptable                 # complexity-parameter table from cross-validation
plotcp(dtree)
dtree.pruned <- prune(dtree, cp = .0125)
library(rpart.plot)
prp(dtree.pruned, type = 2, extra = 104, fallen.leaves = TRUE,
    main = "Decision Tree")
dtree.pred <- predict(dtree.pruned, df.validate, type = "class")
dtree.perf <- table(df.validate$class, dtree.pred,
                    dnn = c("Actual", "Predicted"))
dtree.perf
# print(dtree); summary(dtree)
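The cp value of .0125 above is read off the plotcp() display by eye. If you'd rather automate the usual 1-SE rule from the cptable, a sketch (the names cpt, cutoff, and best.cp are mine):

cpt <- dtree$cptable
i.min  <- which.min(cpt[, "xerror"])                  # row with lowest cross-validated error
cutoff <- cpt[i.min, "xerror"] + cpt[i.min, "xstd"]   # 1-SE threshold
best.cp <- cpt[which(cpt[, "xerror"] <= cutoff)[1], "CP"]
best.cp   # could replace the hand-picked .0125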
2.2 Conditional inference tree
library(party)
fit.ctree <- ctree(class ~ ., data = df.train)   # renamed so the fit doesn't shadow ctree()
plot(fit.ctree, main = "Conditional Inference Tree")
ctree.pred <- predict(fit.ctree, df.validate, type = "response")
ctree.pred
ctree.perf <- table(df.validate$class, ctree.pred,
                    dnn = c("Actual", "Predicted"))
prop.table(ctree.perf)
# class distribution inside one terminal node of the tree
table(subset(df.train,
             normalNucleoli > 3 & sizeUniformity <= 3 & bareNuclei <= 5,
             select = class))
3. Random forest
library(randomForest)
fit.forest <- randomForest(class ~ ., data = df.train,
                           na.action = na.roughfix,   # impute NAs with column medians/modes
                           importance = TRUE)
fit.forest
importance(fit.forest, type = 2)   # mean decrease in node impurity (Gini)
forest.pred <- predict(fit.forest, df.validate)
forest.perf <- table(df.validate$class, forest.pred,
                     dnn = c("Actual", "Predicted"))
forest.perf
prop.table(forest.perf)
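importance() prints the numbers; randomForest also provides a plot of the same measure:

varImpPlot(fit.forest, type = 2,
           main = "Variable Importance (Mean Decrease in Gini)")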
4. Support vector machine
library(e1071)
fit.svm <- svm(class ~ ., data = df.train)
fit.svm
# svm() cannot predict cases with missing values, so drop them first
svm.pred <- predict(fit.svm, na.omit(df.validate))
svm.perf <- table(na.omit(df.validate)$class, svm.pred,
                  dnn = c("Actual", "Predicted"))
svm.perf
The RBF kernel used by svm() has two tuning parameters, gamma and cost; both must be greater than 0.
tuned <- tune.svm(class ~ ., data = df.train,
                  gamma = 10^(-6:1), cost = 10^(-10:10))
tuned
fit.svmtuned <- svm(class ~ ., data = df.train,
                    gamma = .01, cost = 1)   # values chosen by tune.svm()
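To check whether tuning actually helped, the tuned fit can be evaluated exactly like the default one. This mirrors the earlier SVM evaluation; the object names svmtuned.pred and svmtuned.perf are mine:

svmtuned.pred <- predict(fit.svmtuned, na.omit(df.validate))
svmtuned.perf <- table(na.omit(df.validate)$class, svmtuned.pred,
                       dnn = c("Actual", "Predicted"))
svmtuned.perf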
5. Rattle
loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/"
ds <- "pima-indians-diabetes/pima-indians-diabetes.data"
url <- paste(loc, ds, sep="")
diabetes <- read.table(url, sep=",", header=FALSE)
names(diabetes) <- c("npregnant", "plasma", "bp", "triceps",
                     "insulin", "bmi", "pedigree", "age", "class")
diabetes$class <- factor(diabetes$class, levels=c(0,1),
                         labels=c("normal", "diabetic"))
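With the diabetes data frame in the workspace, the rattle GUI can be started and the data picked up from there (assuming the rattle package is installed; rattle() opens an interactive window):

library(rattle)
rattle()   # in the GUI's Data tab, choose the "R Dataset" source and select diabetes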
All of the examples above are from R in Action, 2nd edition.