美文网首页
机器学习学习笔记--随机森林算法

机器学习学习笔记--随机森林算法

作者: 松爱家的小秦 | 来源:发表于2017-12-07 19:34 被阅读0次

    1.Hello 随机森林

    #-*- coding:utf-8 -*-

    from sklearn.model_selection import cross_val_score

    from sklearn.datasets import make_blobs

    from sklearn.ensemble import RandomForestClassifier

    from sklearn.ensemble import ExtraTreesClassifier

    from sklearn.tree import DecisionTreeClassifier

    # Synthetic data set: 10000 samples, 10 features, 100 blob centres.
    x, y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)

    # Baseline: a single decision tree with no depth limit.
    clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)
    scores = cross_val_score(clf, x, y)
    # Single-argument print() behaves the same on Python 2 and Python 3,
    # whereas the bare `print x` statement is a SyntaxError on Python 3.
    print(scores.mean())

    # Everything above is the decision tree run.
    # Everything below is the random forest run (10 trees, same per-tree settings).
    clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
    scores = cross_val_score(clf, x, y)
    print(scores.mean())

    输出:

    0.979408793821

    0.999607843137

    从输出可以看出,在该数据集上随机森林(10棵树)的交叉验证平均得分(约0.9996)高于单棵决策树(约0.9794),即随机森林的判决能力优于决策树。

    2.对比随机森林和决策树 检测FTP暴力破解

    # -*- coding:utf-8 -*-

    import re

    import matplotlib.pyplot as plt

    import os

    from sklearn.feature_extraction.text import CountVectorizer

    from sklearn import cross_validation

    import os

    from sklearn.datasets import load_iris

    from sklearn import tree

    from sklearn.ensemble import RandomForestClassifier

    import numpy as np

    def load_one_file(filename):
        """Return the first line of *filename* with newline characters stripped.

        Only the first line is read; any further lines in the file are ignored.
        An empty file yields an empty string.

        Args:
            filename: path of the text file to read.

        Returns:
            The first line as a string, without its trailing newline.
        """
        with open(filename) as f:
            # strip('\n') removes only newlines; other whitespace is kept.
            return f.readline().strip('\n')

    def load_adfa_training_files(rootdir):
        """Load one sample per regular file found directly inside *rootdir*.

        Sub-directories are not descended into. Every loaded sample is given
        the label 0.

        Args:
            rootdir: directory whose files are read.

        Returns:
            A tuple ``(x, y)``: ``x`` holds the first line of each file (as
            returned by ``load_one_file``) and ``y`` holds a 0 for each entry
            of ``x``.
        """
        x = []
        y = []
        # Iterate the entries directly instead of indexing a variable named
        # `list`, which shadowed the builtin.
        for name in os.listdir(rootdir):
            path = os.path.join(rootdir, name)
            if os.path.isfile(path):
                x.append(load_one_file(path))
                y.append(0)
        return x, y

    def dirlist(path, allfile):
        """Recursively collect the paths of all non-directory entries under *path*.

        Args:
            path: directory to scan.
            allfile: list that discovered paths are appended to. It is mutated
                in place, so callers normally pass a fresh ``[]``.

        Returns:
            The same *allfile* list object, after appending.
        """
        for filename in os.listdir(path):
            filepath = os.path.join(path, filename)
            if os.path.isdir(filepath):
                # Descend; the recursive call appends into the shared list.
                dirlist(filepath, allfile)
            else:
                allfile.append(filepath)
        return allfile

    def load_adfa_hydra_ftp_files(rootdir):
        """Load one sample per Hydra-FTP file found anywhere under *rootdir*.

        A file is selected when its path contains a ``Hydra_FTP_<digits>``
        directory immediately followed by a file name starting with
        ``UAD-Hydra-FTP``. Every loaded sample is given the label 1.

        The original pattern embedded one machine's absolute directory, so the
        function found nothing when the data lived elsewhere; matching on the
        path component keeps the same selection for that layout while working
        for any *rootdir*. The original also ended in ``FTP*``, which in a
        regex means "FT" plus zero or more "P" rather than a glob wildcard.

        Args:
            rootdir: directory that is searched recursively.

        Returns:
            A tuple ``(x, y)``: ``x`` holds the first line of each selected file
            (as returned by ``load_one_file``) and ``y`` holds a 1 for each
            entry of ``x``.
        """
        # Compiled once, outside the loop; accepts either path separator.
        wanted = re.compile(r"(?:^|[/\\])Hydra_FTP_\d+[/\\]UAD-Hydra-FTP")
        x = []
        y = []
        for filepath in dirlist(rootdir, []):
            if wanted.search(filepath):
                x.append(load_one_file(filepath))
                y.append(1)
        return x, y

    if __name__ == "__main__":
        # NOTE: the data set locations below are absolute paths on the
        # author's machine; adjust them before running elsewhere.
        x1, y1 = load_adfa_training_files("/home/qin/code/python/web-ml/1book-master/data/ADFA-LD/Training_Data_Master/")
        x2, y2 = load_adfa_hydra_ftp_files("/home/qin/code/python/web-ml/1book-master/data/ADFA-LD/Attack_Data_Master/")

        # Combine the 0-labelled and 1-labelled samples into one data set.
        x = x1 + x2
        y = y1 + y2

        # Turn each sample line into a vector of token counts.
        # NOTE(review): CountVectorizer's default token_pattern only keeps
        # tokens of two or more characters -- confirm that is intended for
        # this trace format.
        vectorizer = CountVectorizer(min_df=1)
        x = vectorizer.fit_transform(x)
        x = x.toarray()

        # NOTE(review): sklearn.cross_validation was removed in newer
        # scikit-learn releases; sklearn.model_selection.cross_val_score is
        # the replacement once the file-level import is migrated.
        clf1 = tree.DecisionTreeClassifier()
        score = cross_validation.cross_val_score(clf1, x, y, n_jobs=-1, cv=10)
        # Single-argument print() works on both Python 2 and Python 3.
        print(np.mean(score))

        clf2 = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
        score = cross_validation.cross_val_score(clf2, x, y, n_jobs=-1, cv=10)
        print(np.mean(score))

    输出:

    0.962736573657

    0.986898789879

    在该数据集上,随机森林的10折交叉验证平均得分(约0.987)高于决策树(约0.963),即随机森林的检测效果好于决策树。

    相关文章

      网友评论

          本文标题:机器学习学习笔记--随机森林算法

          本文链接:https://www.haomeiwen.com/subject/pfpjixtx.html