rminer: A Machine Learning Library for R

Author: Liam_ml | Published 2018-01-22 13:47

    Preface

    When doing data mining work, one often relies on algorithms that others have already packaged. R offers a great many ready-to-use algorithm packages: whenever you want to apply an algorithm, you can call the corresponding package.
    There is a catch, however: if you need many different algorithms, you have to call different functions from many different packages, which quickly becomes tedious.
    Hence this article. rminer integrates many algorithms under a single interface; switching the algorithm is just a matter of changing a parameter, which spares you the hassle of calling different packages.
    Here is a brief introduction.

    rminer

    In rminer, models are trained with the fit() function, and changing its model argument selects a different learning algorithm. Below are some examples of training models and setting their parameters.
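
    All of the examples below assume that rminer has been installed and loaded; a minimal setup sketch:

    # install the package from CRAN (only needed once):
    # install.packages("rminer")
    library(rminer) # provides fit(), predict() methods, mmetric(), mgraph(), holdout() and mparheuristic()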

    ### "Not run" marks examples whose execution requires some computational effort.
    
    ### simple regression (with a formula) example.
    x1=rnorm(200,100,20); x2=rnorm(200,100,20)
    y=0.7*sin(x1/(25*pi))+0.3*sin(x2/(25*pi))
    M=fit(y~x1+x2,model="mlpe")
    new1=rnorm(100,100,20); new2=rnorm(100,100,20)
    ynew=0.7*sin(new1/(25*pi))+0.3*sin(new2/(25*pi))
    P=predict(M,data.frame(x1=new1,x2=new2,y=rep(NA,100)))
    print(mmetric(ynew,P,"MAE"))
    
    ### simple classification example.
    ## Not run: 
    data(iris)
    M=fit(Species~.,iris,model="rpart")
    plot(M@object); text(M@object) # show model
    P=predict(M,iris)
    print(mmetric(iris$Species,P,"CONF"))
    print(mmetric(iris$Species,P,"ALL"))
    mgraph(iris$Species,P,graph="ROC",TC=2,main="versicolor ROC",
    baseline=TRUE,leg="Versicolor",Grid=10)
    
    M2=fit(Species~.,iris,model="ctree")
    plot(M2@object) # show model
    P2=predict(M2,iris)
    print(mmetric(iris$Species,P2,"CONF"))
    
    # ctree with different setup:
    # (ctree_control is from the party package)
    M3=fit(Species~.,iris,model="ctree",controls = party::ctree_control(testtype="MonteCarlo"))
    plot(M3@object) # show model
    
    ## End(Not run)
    
    ### simple binary classification example with cv.glmnet and xgboost
    ## Not run: 
    data(sa_ssin_2)
    H=holdout(sa_ssin_2$y,ratio=2/3)
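    # holdout returns a list with training (H$tr) and test (H$ts) row indices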
    # cv.glmnet:
    M=fit(y~.,sa_ssin_2[H$tr,],model="cv.glmnet",task="cla") # pure classes
    P=predict(M,sa_ssin_2[H$ts,])
    cat("1st prediction, class:",as.character(P[1]),"\n")
    cat("Confusion matrix:\n")
    print(mmetric(sa_ssin_2[H$ts,]$y,P,"CONF")$conf)
    
    M2=fit(y~.,sa_ssin_2[H$tr,],model="cv.glmnet") # probabilities
    P2=predict(M2,sa_ssin_2[H$ts,])
    L=M2@levels
    cat("1st prediction, prob:",L[1],"=",P2[1,1],",",L[2],"=",P2[1,2],"\n")
    cat("Confusion matrix:\n")
    print(mmetric(sa_ssin_2[H$ts,]$y,P2,"CONF")$conf)
    cat("AUC of ROC curve:\n")
    print(mmetric(sa_ssin_2[H$ts,]$y,P2,"AUC"))
    
    M3=fit(y~.,sa_ssin_2[H$tr,],model="cv.glmnet",nfolds=3) # use 3 folds instead of 10
    plot(M3@object) # show cv.glmnet object
    P3=predict(M3,sa_ssin_2[H$ts,])
    
    # xgboost:
    M4=fit(y~.,sa_ssin_2[H$tr,],model="xgboost",verbose=1) # nrounds=2, show rounds:
    P4=predict(M4,sa_ssin_2[H$ts,])
    print(mmetric(sa_ssin_2[H$ts,]$y,P4,"AUC"))
    M5=fit(y~.,sa_ssin_2[H$tr,],model="xgboost",nrounds=3,verbose=1) # nrounds=3, show rounds:
    P5=predict(M5,sa_ssin_2[H$ts,])
    print(mmetric(sa_ssin_2[H$ts,]$y,P5,"AUC"))
    
    ## End(Not run)
    
    ### classification example with discrete classes, probabilities and holdout
    ## Not run: 
    data(iris)
    H=holdout(iris$Species,ratio=2/3)
    M=fit(Species~.,iris[H$tr,],model="ksvm",task="class")
    M2=fit(Species~.,iris[H$tr,],model="ksvm",task="prob")
    P=predict(M,iris[H$ts,])
    P2=predict(M2,iris[H$ts,])
    print(mmetric(iris$Species[H$ts],P,"CONF"))
    print(mmetric(iris$Species[H$ts],P2,"CONF"))
    print(mmetric(iris$Species[H$ts],P,"CONF",TC=1))
    print(mmetric(iris$Species[H$ts],P2,"CONF",TC=1))
    print(mmetric(iris$Species[H$ts],P2,"AUC"))
    
    ### exploration of some rminer classification models:
    models=c("lda","naiveBayes","kknn","randomForest","cv.glmnet","xgboost")
    for(m in models)
     { cat("model:",m,"\n") 
       M=fit(Species~.,iris[H$tr,],model=m)
       P=predict(M,iris[H$ts,])
       print(mmetric(iris$Species[H$ts],P,"AUC")[[1]])
     }
    
    ## End(Not run)
    
    ### classification example with hyperparameter selection 
    ###    note: for regression, similar code can be used
    ### SVM 
    ## Not run: 
    data(iris)
    # large list of SVM configurations:
    # SVM with rbfdot kernel, kpar="automatic" sigma estimation and default C=1:
    #  note: each execution can lead to a different M@mpar due to the stochastic nature of sigest:
    M=fit(Species~.,iris,model="ksvm")
    print(M@mpar) # model hyperparameters/arguments
    # same thing, explicit use of mparheuristic:
    M=fit(Species~.,iris,model="ksvm",search=list(search=mparheuristic("ksvm")))
    print(M@mpar) # model hyperparameters
    
    # SVM with C=3, sigma=2^-7
    M=fit(Species~.,iris,model="ksvm",C=3,kpar=list(sigma=2^-7))
    print(M@mpar)
    # SVM with different kernels:
    M=fit(Species~.,iris,model="ksvm",kernel="polydot",kpar="automatic") 
    print(M@mpar)
    # fit already has a scale argument, thus the only way to fix scale of "tanhdot"
    # is to use the special search argument with the "none" method:
    s=list(smethod="none",search=list(scale=2,offset=2))
    M=fit(Species~.,iris,model="ksvm",kernel="tanhdot",search=s) 
    print(M@mpar)
    # heuristic: 10 grid search values for sigma, rbfdot kernel (fdebug only adds verbose output):
    s=list(search=mparheuristic("ksvm",10)) # advised "heuristic10" usage
    M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
    print(M@mpar)
    # same thing, using the older search="heuristic10", which works for fewer rminer models
    M=fit(Species~.,iris,model="ksvm",search="heuristic10",fdebug=TRUE)
    print(M@mpar)
    # identical search written as an explicit vector of values:
    s=list(search=2^seq(-15,3,2))
    M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
    print(M@mpar)
    
    # uniform design "UD" for sigma and C, rbfdot kernel, two levels of grid search,
    # under an exponential (2^x) search scale:
    M=fit(Species~.,iris,model="ksvm",search="UD",fdebug=TRUE)
    print(M@mpar)
    M=fit(Species~.,iris,model="ksvm",search="UD1",fdebug=TRUE)
    print(M@mpar)
    M=fit(Species~.,iris,model="ksvm",search=2^seq(-15,3,2),fdebug=TRUE)
    print(M@mpar)
    # now the more powerful search argument is used for modeling SVM:
    # grid 3 x 3 search:
    s=list(smethod="grid",search=list(sigma=2^c(-15,-5,3),C=2^c(-5,0,15)),convex=0,
                metric="AUC",method=c("kfold",3,12345))
    print(s)
    M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
    print(M@mpar)
    # identical search with different argument smethod="matrix" 
    s$smethod="matrix"
    s$search=list(sigma=rep(2^c(-15,-5,3),times=3),C=rep(2^c(-5,0,15),each=3))
    print(s)
    M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
    print(M@mpar)
    # search for best kernel (only works for kpar="automatic"):
    s=list(smethod="grid",search=list(kernel=c("rbfdot","laplacedot","polydot","vanilladot")),
           convex=0,metric="AUC",method=c("kfold",3,12345))
    print(s)
    M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
    print(M@mpar)
    # search for best parameters of "rbfdot" or "laplacedot" (which use same kpar):
    s$search=list(kernel=c("rbfdot","laplacedot"),sigma=2^seq(-15,3,5))
    print(s)
    M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
    print(M@mpar)
    
    ### randomForest
    # search for mtry and ntree
    s=list(smethod="grid",search=list(mtry=c(1,2,3),ntree=c(100,200,500)),
                convex=0,metric="AUC",method=c("kfold",3,12345))
    print(s)
    M=fit(Species~.,iris,model="randomForest",search=s,fdebug=TRUE)
    print(M@mpar)
    
    ### rpart
    # simpler way to tune cp in 0.01 to 0.9 (10 searches):
    s=list(search=mparheuristic("rpart",n=10,lower=0.01,upper=0.9),method=c("kfold",3,12345))
    M=fit(Species~.,iris,model="rpart",search=s,fdebug=TRUE)
    print(M@mpar)
    
    # same thing but with more lines of code
    # note: this code can be adapted to tune other rpart parameters,
    #       while mparheuristic only tunes cp
    # a vector list needs to be used for the search$search parameter
    lcp=vector("list",10) # 10 grid values for the complexity cp
    names(lcp)=rep("cp",10) # same cp name 
    scp=seq(0.01,0.9,length.out=10) # 10 values from 0.01 to 0.9
    for(i in 1:10) lcp[[i]]=scp[i] # loop needed due to the [[]] notation
    s=list(smethod="grid",search=list(control=lcp),
                convex=0,metric="AUC",method=c("kfold",3,12345))
    M=fit(Species~.,iris,model="rpart",search=s,fdebug=TRUE)
    print(M@mpar)
    
    ### ctree 
    # simpler way to tune mincriterion from 0.1 to 0.99 (8 searches):
    mint=c("kfold",3,123) # internal validation method
    s=list(search=mparheuristic("ctree",n=8,lower=0.1,upper=0.99),method=mint)
    M=fit(Species~.,iris,model="ctree",search=s,fdebug=TRUE)
    print(M@mpar)
    # similar search but with more lines of code (here using 9 grid values)
    # note: this code can be adapted to tune other ctree parameters,
    #       while mparheuristic only tunes mincriterion
    # a vector list needs to be used for the search$search parameter
    lmc=vector("list",9) # 9 grid values for the mincriterion
    smc=seq(0.1,0.99,length.out=9)
    for(i in 1:9) lmc[[i]]=party::ctree_control(mincriterion=smc[i]) 
    s=list(smethod="grid",search=list(controls=lmc),method=mint,convex=0)
    M=fit(Species~.,iris,model="ctree",search=s,fdebug=TRUE)
    print(M@mpar)
    
    ### some MLP fitting examples:
    # simplest use:
    M=fit(Species~.,iris,model="mlpe")  
    print(M@mpar)
    # same thing, with explicit use of mparheuristic:
    M=fit(Species~.,iris,model="mlpe",search=list(search=mparheuristic("mlpe")))
    print(M@mpar) # hidden nodes and number of ensemble mlps
    # setting some nnet parameters:
    M=fit(Species~.,iris,model="mlpe",size=3,decay=0.1,maxit=100,rang=0.9) 
    print(M@mpar) # mlpe hyperparameters
    # MLP, 5 grid searches (fdebug only adds some verbose output to the console):
    s=list(search=mparheuristic("mlpe",n=5)) # 5 searches for size
    print(s) # show search
    M=fit(Species~.,iris,model="mlpe",search=s,fdebug=TRUE)
    print(M@mpar)
    # previous searches used a random holdout (seed=NULL), now a fixed seed (123) is used:
    s=list(smethod="grid",search=mparheuristic("mlpe",n=5),convex=0,metric="AUC",
                method=c("holdout",2/3,123))
    print(s)
    M=fit(Species~.,iris,model="mlpe",search=s,fdebug=TRUE)
    print(M@mpar)
    # faster and greedy grid search:
    s$convex=1;s$search=list(size=0:9)
    print(s)
    M=fit(Species~.,iris,model="mlpe",search=s,fdebug=TRUE)
    print(M@mpar)
    # 2-level grid with a total of 5 searches
    #  note of caution: some "2L" ranges may lead to non-integer (e.g. 1.3) values at
    #  the 2nd level search, and some R functions crash if non-integer values are used
    #  for integer parameters.
    s$smethod="2L";s$convex=0;s$search=list(size=c(4,8,12))
    print(s)
    M=fit(Species~.,iris,model="mlpe",search=s,fdebug=TRUE)
    print(M@mpar)
    
    ## End(Not run)
    
    ### example of an error (warning) generated using fit:
    ## Not run: 
    data(iris)
    # size needs to be a positive integer, thus 0.1 leads to an error:
    M=fit(Species~.,iris,model="mlp",size=0.1)  
    print(M@object)
    
    ## End(Not run)
    
    ### exploration of some rminer regression models:
    ## Not run: 
    data(sa_ssin)
    H=holdout(sa_ssin$y,ratio=2/3,seed=12345)
    models=c("lm","mr","ctree","mars","cubist","cv.glmnet","xgboost","rvm")
    for(m in models)
     { cat("model:",m,"\n") 
       M=fit(y~.,sa_ssin[H$tr,],model=m)
       P=predict(M,sa_ssin[H$ts,])
       print(mmetric(sa_ssin$y[H$ts],P,"MAE"))
     }
    
    ## End(Not run)
    
    ### regression example with hyperparameter selection:
    ## Not run: 
    data(sa_ssin)
    # some SVM experiments:
    # default SVM:
    M=fit(y~.,data=sa_ssin,model="svm")
    print(M@mpar)
    # SVM with (Cherkassky and Ma, 2004) heuristics to set C and epsilon:
    M=fit(y~.,data=sa_ssin,model="svm",C=NA,epsilon=NA)
    print(M@mpar)
    # SVM with Uniform Design set sigma, C and epsilon:
    M=fit(y~.,data=sa_ssin,model="ksvm",search="UD",fdebug=TRUE)
    print(M@mpar)
    
    # sensitivity analysis feature selection
    M=fit(y~.,data=sa_ssin,model="ksvm",search=list(search=mparheuristic("ksvm",n=5)),feature="sabs") 
    print(M@mpar)
    print(M@attributes) # selected attributes (1, 2 and 3 are the relevant inputs)
    
    # example that shows how transform works:
    M=fit(y~.,data=sa_ssin,model="mr") # linear regression
    P=predict(M,data.frame(x1=-1000,x2=0,x3=0,x4=0,y=NA)) # P should be negative
    print(P)
    M=fit(y~.,data=sa_ssin,model="mr",transform="positive")
    P=predict(M,data.frame(x1=-1000,x2=0,x3=0,x4=0,y=NA)) # P is not negative
    print(P)
    
    ## End(Not run)
    
    ### pure classification example with a generic R model ###
    ## Not run: 
    ### nnet is adopted here but virtually ANY fitting function/package could be used:
    
    # since nnet's default prediction provides probabilities, a "wrapping"
    # function is needed to return classes instead:
    predictprob=function(object,newdata)
    { predict(object,newdata,type="class") }
    # list with a fit and predict function:
    # nnet::nnet (package::function)
    model=list(fit=nnet::nnet,predict=predictprob,name="nnet")
    data(iris)
    # note that size is not a fit parameter and it is sent directly to nnet:
    M=fit(Species~.,iris,model=model,size=3,task="class") 
    P=predict(M,iris)
    print(P)
    
    ## End(Not run)
    

    After reading this article, you should be able to train all kinds of models with rminer.
