
Machine Learning Notes: Naive Bayes in Practice

Author: 松爱家的小秦 | Published 2017-11-30 22:40

Naive Bayes (NB for short) is one of the most widely used classification algorithms. In security applications it can be used to detect anomalous operations, DGA domains, and DDoS attacks against Apache, and to recognize verification codes (CAPTCHAs) based on the MNIST dataset.

The main Naive Bayes variants are the following (a minimal sketch of how each is instantiated appears after the list):

Gaussian Naive Bayes

Multinomial Naive Bayes

Bernoulli Naive Bayes
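
All three variants live in scikit-learn's sklearn.naive_bayes module. A minimal sketch of instantiating each one; the feature matrix and labels are toy placeholders, not data from this post:

    # Minimal sketch: the three Naive Bayes variants in scikit-learn.
    # X and y below are toy placeholders.
    import numpy as np
    from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

    X = np.array([[1, 0], [2, 1], [0, 3], [4, 0]])  # toy feature matrix
    y = np.array([0, 0, 1, 1])                      # toy labels

    for clf in (GaussianNB(),      # continuous features, Gaussian likelihood
                MultinomialNB(),   # count features, e.g. word / 2-gram counts
                BernoulliNB()):    # binary (present / absent) features
        clf.fit(X, y)
        print(clf.__class__.__name__, clf.predict([[1, 1]]))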

1. Hello Bayes (GaussianNB on the iris dataset)

    # coding: utf-8
    from sklearn import datasets
    from sklearn.naive_bayes import GaussianNB

    iris = datasets.load_iris()
    gnb = GaussianNB()
    # Train on the iris data and predict on the same data (training-set error only).
    y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
    print("Number of mislabeled points out of a total %d points : %d"
          % (iris.data.shape[0], (iris.target != y_pred).sum()))

2. Detecting anomalous operations

Workflow:

1. Data collection and cleaning

2. Featurization (see the small sketch after this list)

3. Model training

4. Validation
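
Before the full script, a minimal sketch of step 2, assuming toy command names and the same 0/1 bag-of-commands encoding that get_user_cmd_feature_new uses below:

    # Minimal sketch of featurization: encode a block of shell commands as a
    # 0/1 vector over a fixed command vocabulary (toy data, not the real dataset).
    vocabulary = ["ls", "cat", "ps", "netstat"]
    cmd_block = ["ls", "cat", "ls", "ps"]

    v = [0] * len(vocabulary)
    for i, cmd in enumerate(vocabulary):
        if cmd in cmd_block:
            v[i] += 1   # presence indicator, mirroring get_user_cmd_feature_new
    print(v)  # [1, 1, 1, 0]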

Code for detecting anomalous operations:

    # -*- coding:utf-8 -*-
    import numpy as np
    from nltk.probability import FreqDist   # command frequency statistics
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB

    # Use the first N of the 150 command blocks for training, the rest for testing.
    N = 90

    def load_user_cmd_new(filename):
        """Read a user's command log as blocks of 100 commands, plus the full
        command vocabulary ordered by frequency."""
        cmd_list = []
        dist = []
        with open(filename) as f:
            i = 0
            x = []
            for line in f:
                line = line.strip('\n')
                x.append(line)
                dist.append(line)
                i += 1
                if i == 100:
                    cmd_list.append(x)
                    x = []
                    i = 0
        # most_common() keeps the frequency ordering and returns an indexable list
        fdist = [cmd for cmd, _ in FreqDist(dist).most_common()]
        return cmd_list, fdist

    def load_user_cmd(filename):
        """Same as above, but also return the 50 most and 50 least frequent commands."""
        cmd_list = []
        dist = []
        with open(filename) as f:
            i = 0
            x = []
            for line in f:
                line = line.strip('\n')
                x.append(line)
                dist.append(line)
                i += 1
                if i == 100:
                    cmd_list.append(x)
                    x = []
                    i = 0
        fdist = [cmd for cmd, _ in FreqDist(dist).most_common()]
        dist_max = set(fdist[0:50])
        dist_min = set(fdist[-50:])
        return cmd_list, dist_max, dist_min

    # Feature extraction
    def get_user_cmd_feature(user_cmd_list, dist_max, dist_min):
        """Three features per block: number of distinct commands, overlap of the
        block's 10 most frequent commands with the global top 50, and overlap of
        its 10 least frequent commands with the global bottom 50."""
        user_cmd_feature = []
        for cmd_block in user_cmd_list:
            f1 = len(set(cmd_block))
            fdist = [cmd for cmd, _ in FreqDist(cmd_block).most_common()]
            f2 = len(set(fdist[0:10]) & dist_max)
            f3 = len(set(fdist[-10:]) & dist_min)
            user_cmd_feature.append([f1, f2, f3])
        return user_cmd_feature

    def get_user_cmd_feature_new(user_cmd_list, dist):
        """Bag-of-commands feature: a 0/1 vector over the full command vocabulary."""
        user_cmd_feature = []
        for cmd_list in user_cmd_list:
            v = [0] * len(dist)
            for i in range(0, len(dist)):
                if dist[i] in cmd_list:
                    v[i] += 1
            user_cmd_feature.append(v)
        return user_cmd_feature

    def get_label(filename, index=0):
        """Read one column of the label file as a list of integers."""
        x = []
        with open(filename) as f:
            for line in f:
                line = line.strip('\n')
                x.append(int(line.split()[index]))
        return x

    if __name__ == '__main__':
        user_cmd_list, dist = load_user_cmd_new("/home/qin/code/python/web-ml/1book-master/data/MasqueradeDat/User3")
        user_cmd_feature = get_user_cmd_feature_new(user_cmd_list, dist)
        labels = get_label("/home/qin/code/python/web-ml/1book-master/data/MasqueradeDat/label.txt", 2)
        # The first 50 blocks are known to be normal; the label file covers the rest.
        y = [0] * 50 + labels

        x_train = user_cmd_feature[0:N]
        y_train = y[0:N]
        x_test = user_cmd_feature[N:150]
        y_test = y[N:150]

        neigh = KNeighborsClassifier(n_neighbors=3)
        neigh.fit(x_train, y_train)
        y_predict_knn = neigh.predict(x_test)
        print(y_train)

        clf = GaussianNB().fit(x_train, y_train)
        y_predict_nb = clf.predict(x_test)

        score = np.mean(y_test == y_predict_knn) * 100
        print("KNN %d" % score)
        score = np.mean(y_test == y_predict_nb) * 100
        print("NB %d" % score)

Results:

    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

    KNN 83

    NB 83

3. Detecting WebShell

    # -*- coding:utf-8 -*-
    import os
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import cross_val_score
    from sklearn.naive_bayes import GaussianNB

    def load_file(file_path):
        """Read one PHP file into a single string with newlines stripped."""
        t = ""
        with open(file_path) as f:
            for line in f:
                line = line.strip('\n')
                t += line
        return t

    def load_files(path):
        """Collect the contents of all .php files under path."""
        files_list = []
        for r, d, files in os.walk(path):
            for file in files:
                if file.endswith('.php'):
                    file_path = os.path.join(r, file)
                    print("Load %s" % file_path)
                    t = load_file(file_path)
                    files_list.append(t)
        return files_list

    if __name__ == '__main__':
        # ngram_range=(2,2): word-level 2-grams; decode_error="ignore": skip
        # characters that cannot be decoded; token_pattern: split on word boundaries.
        webshell_bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                                                     token_pattern=r'\b\w+\b', min_df=1)
        webshell_files_list = load_files("/home/qin/code/python/web-ml/1book-master/data/PHP-WEBSHELL/xiaoma/")
        x1 = webshell_bigram_vectorizer.fit_transform(webshell_files_list).toarray()
        y1 = [1] * len(x1)

        # Reuse the webshell vocabulary so normal samples map onto the same features.
        vocabulary = webshell_bigram_vectorizer.vocabulary_
        wp_bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                                               token_pattern=r'\b\w+\b', min_df=1,
                                               vocabulary=vocabulary)
        wp_files_list = load_files("/home/qin/code/python/web-ml/1book-master/data/wordpress/")
        x2 = wp_bigram_vectorizer.transform(wp_files_list).toarray()
        y2 = [0] * len(x2)

        x = np.concatenate((x1, x2))
        y = np.concatenate((y1, y2))

        clf = GaussianNB()
        print(cross_val_score(clf, x, y, n_jobs=-1, cv=3))
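
To make the vectorizer settings concrete, here is a minimal sketch of what word-level 2-gram tokenization produces; the input strings are toy examples, not real PHP samples:

    # Minimal sketch: word 2-grams with CountVectorizer, configured as above.
    from sklearn.feature_extraction.text import CountVectorizer

    docs = ["eval base64_decode request data", "echo hello world"]
    vec = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                          token_pattern=r'\b\w+\b', min_df=1)
    x = vec.fit_transform(docs)
    print(sorted(vec.vocabulary_))  # 2-grams such as 'eval base64_decode'
    print(x.toarray())              # one count vector per document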

4. Detecting WebShell, variant 2 (function-call based features)

    # -*- coding:utf-8 -*-
    import os
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import cross_val_score
    from sklearn.naive_bayes import GaussianNB

    # Tokens are function calls such as "eval(" and single-quoted words such as "'cmd'".
    r_token_pattern = r'\b\w+\b\(|\'\w+\''

    def load_file(file_path):
        t = ""
        with open(file_path) as f:
            for line in f:
                line = line.strip('\n')
                t += line
        return t

    def load_files(path):
        files_list = []
        for r, d, files in os.walk(path):
            for file in files:
                if file.endswith('.php'):
                    file_path = os.path.join(r, file)
                    # print("Load %s" % file_path)
                    t = load_file(file_path)
                    files_list.append(t)
        return files_list

    if __name__ == '__main__':
        # 1-grams over the custom token pattern defined above.
        webshell_bigram_vectorizer = CountVectorizer(ngram_range=(1, 1), decode_error="ignore",
                                                     token_pattern=r_token_pattern, min_df=1)
        webshell_files_list = load_files("/home/qin/code/python/web-ml/1book-master/data/PHP-WEBSHELL/xiaoma")
        x1 = webshell_bigram_vectorizer.fit_transform(webshell_files_list).toarray()
        y1 = [1] * len(x1)

        vocabulary = webshell_bigram_vectorizer.vocabulary_
        wp_bigram_vectorizer = CountVectorizer(ngram_range=(1, 1), decode_error="ignore",
                                               token_pattern=r_token_pattern, min_df=1,
                                               vocabulary=vocabulary)
        wp_files_list = load_files("/home/qin/code/python/web-ml/1book-master/data/wordpress/")
        x2 = wp_bigram_vectorizer.transform(wp_files_list).toarray()
        y2 = [0] * len(x2)

        x = np.concatenate((x1, x2))
        y = np.concatenate((y1, y2))

        clf = GaussianNB()
        print(vocabulary)
        print(cross_val_score(clf, x, y, n_jobs=-1, cv=3))
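
To see what the custom token pattern keeps, a minimal sketch using re.findall on a made-up PHP line (not one of the collected samples):

    # Minimal sketch: tokens extracted by r_token_pattern from a toy PHP line.
    import re

    r_token_pattern = r'\b\w+\b\(|\'\w+\''
    line = "<?php eval(base64_decode($_POST['cmd'])); ?>"
    print(re.findall(r_token_pattern, line))  # ["eval(", "base64_decode(", "'cmd'"]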
