美文网首页
机器学习学习笔记--SVM区分僵尸网络

机器学习学习笔记--SVM区分僵尸网络

作者: 松爱家的小秦 | 来源:发表于2017-12-08 23:45 被阅读0次

    引入一个概念:杰卡德距离(Jaccard Distance),是用来衡量两个集合差异性的一种指标,等于 1 减去杰卡德相似系数;而杰卡德相似系数是两个集合交集的元素个数除以并集的元素个数,即 $J(A,B)=\frac{|A\cap B|}{|A\cup B|}$,$d_J(A,B)=1-J(A,B)$。

    Jaccard index[1], 又称为Jaccard相似系数(Jaccard similarity coefficient)用于比较有限样本集之间的相似性与差异性。Jaccard系数值越大,样本相似度越高。

    隐马尔可夫模型(Hidden Markov Model,HMM)是统计模型,它用来描述一个含有隐含未知参数的马尔可夫过程。其难点是从可观察的参数中确定该过程的隐含参数,然后利用这些参数来作进一步的分析,例如模式识别。

    #-*- coding:utf-8 -*-

    import sys

    import urllib

    import urlparse

    import re

    from hmmlearn import hmm  #导出隐马尔可夫模型

    import numpy as np

    from sklearn.externals import joblib

    import HTMLParser

    import nltk

    import csv

    import matplotlib.pyplot as plt

    import os

    # Minimum domain length to process; the loaders keep only domains at least this long

    MIN_LEN=10

    # Number of hidden states (passed to the HMM as n_components)

    N=8

    # Maximum-likelihood probability threshold (not referenced by the code visible here)

    T=-50

    # File the trained HMM is dumped to and loaded back from

    FILE_MODEL="9-2.m"

    def load_alexa(filename):
        """Read domain names from the second column of a CSV file.

        Rows are read one at a time; the value in column index 1 is taken as
        the domain and kept only when it is at least MIN_LEN characters long.
        Rows with fewer than two columns (for example blank lines) are skipped.

        :param filename: path of the CSV file to read.
        :return: list of the domains that passed the length filter, in file order.
        """
        domain_list = []
        # The with-block closes the file; previously the handle given to
        # csv.reader was never closed.
        with open(filename) as csv_file:
            for row in csv.reader(csv_file):
                if len(row) < 2:
                    # No second column to read, so there is no domain on this row.
                    continue
                domain = row[1]
                if len(domain) >= MIN_LEN:
                    domain_list.append(domain)
        return domain_list

    # Background: ord() returns the integer code of a one-character string;
    # chr() (unichr() for Unicode objects on Python 2) is the inverse mapping.
    def domain2ver(domain):
        """Encode a domain as a list of one-element lists of character codes.

        Each character becomes ``[ord(character)]``, so the result has one row
        per character and a single column.

        :param domain: the domain string to encode.
        :return: list such as ``[[97], [98]]`` for ``"ab"``; empty for ``""``.
        """
        return [[ord(character)] for character in domain]

    # numpy.concatenate((a1, a2, ...), axis=0) joins several arrays in one call.
    # Tutorial: http://blog.csdn.net/zyl1042635242/article/details/43162031
    def train_hmm(domain_list):
        """Fit a Gaussian HMM on the given domains and save it to FILE_MODEL.

        Every domain is encoded with domain2ver(), the encoded domains are
        stacked into one array, and the per-domain row counts are passed to
        fit() alongside it.

        :param domain_list: iterable of domain strings to train on.
        :return: the fitted hmm.GaussianHMM instance (also dumped to FILE_MODEL).
        """
        # NOTE(review): the training data starts with a dummy one-row sequence
        # [[0]] (length 1). It is kept so the data handed to fit() is exactly
        # what it was before; it is not a real domain, so consider removing it.
        sequences = [np.array([[0]])]
        sequences.extend(np.array(domain2ver(domain)) for domain in domain_list)
        x_lens = [len(sequence) for sequence in sequences]

        # Concatenate once; doing it inside the loop re-copied the whole
        # accumulated array for every domain.
        x = np.concatenate(sequences)

        remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
        remodel.fit(x, x_lens)
        joblib.dump(remodel, FILE_MODEL)
        return remodel

    def load_dga(filename):
        """Read domains from a text file, taking the text before the first comma.

        Each line is split on "," and the first piece is treated as the domain;
        only pieces of at least MIN_LEN characters are returned.

        :param filename: path of the text file to read.
        :return: list of the domains that passed the length filter, in file order.
        """
        with open(filename) as handle:
            first_fields = [record.split(",")[0] for record in handle]
        return [name for name in first_fields if len(name) >= MIN_LEN]

    def test_dga(remodel, filename):
        """Score every domain loaded by load_dga() with the given model.

        :param remodel: object exposing ``score(array)``; called once per domain
            with the array built from domain2ver().
        :param filename: path handed to load_dga().
        :return: ``(lengths, scores)`` - two parallel lists holding each
            domain's length and the value returned by ``remodel.score``.
        """
        lengths = []
        scores = []
        for name in load_dga(filename):
            observations = np.array(domain2ver(name))
            score = remodel.score(observations)
            lengths.append(len(name))
            scores.append(score)
        return lengths, scores

    def test_alexa(remodel, filename):
        """Score every domain loaded by load_alexa() with the given model.

        :param remodel: object exposing ``score(array)``; called once per domain
            with the array built from domain2ver().
        :param filename: path handed to load_alexa().
        :return: ``(lengths, scores)`` - two parallel lists holding each
            domain's length and the value returned by ``remodel.score``.
        """
        lengths = []
        scores = []
        for name in load_alexa(filename):
            observations = np.array(domain2ver(name))
            score = remodel.score(observations)
            lengths.append(len(name))
            scores.append(score)
        return lengths, scores

    def show_hmm():
        """Plot HMM score against domain length for the three data sets.

        A model saved at FILE_MODEL is loaded when the file exists; otherwise a
        new model is trained on the top-1000 list (train_hmm also saves it).
        The model then scores the two DGA files and the Alexa test file, and
        the results are shown as a scatter plot.
        """
        if os.path.exists(FILE_MODEL):
            # NOTE: joblib.load unpickles the file; only load model files you trust.
            remodel = joblib.load(FILE_MODEL)
        else:
            # The training list is only read when a model actually has to be
            # trained, and the freshly fitted model is used directly instead of
            # being read back from disk.
            domain_list = load_alexa("/home/qin/code/python/web-ml/1book-master/data/top-1000.csv")
            remodel = train_hmm(domain_list)

        x_3, y_3 = test_dga(remodel, "/home/qin/code/python/web-ml/1book-master/data/dga-post-tovar-goz-1000.txt")
        x_2, y_2 = test_dga(remodel, "/home/qin/code/python/web-ml/1book-master/data/dga-cryptolocke-1000.txt")
        x_1, y_1 = test_alexa(remodel, "/home/qin/code/python/web-ml/1book-master/data/test-top-1000.csv")

        fig, ax = plt.subplots()
        ax.set_xlabel('Domain Length')
        ax.set_ylabel('HMM Score')
        ax.scatter(x_3, y_3, color='b', label="dga_post-tovar-goz", marker='o')
        ax.scatter(x_2, y_2, color='g', label="dga_cryptolock", marker='v')
        ax.scatter(x_1, y_1, color='r', label="alexa", marker='*')
        ax.legend(loc='best')
        plt.show()

    def get_aeiou(domain_list):
        """Compute each domain's length and the share of its characters that are vowels.

        The domain is lower-cased before counting, so both cases of a, e, i, o
        and u are counted. The count is divided by the domain's length.

        :param domain_list: iterable of domain strings.
        :return: ``(x, y)`` - parallel lists of lengths and vowel ratios. An
            empty domain yields a ratio of 0.0 rather than a division by zero.
        """
        vowels = frozenset('aeiou')
        x = []
        y = []
        for domain in domain_list:
            x.append(len(domain))
            if not domain:
                # Nothing to take a ratio over; avoid ZeroDivisionError.
                y.append(0.0)
                continue
            count = sum(1 for character in domain.lower() if character in vowels)
            # float() keeps true division on Python 2 as well.
            y.append(float(count) / len(domain))
        return x, y

    def show_aeiou():
        """Plot the get_aeiou() value against domain length for the three data sets."""
        alexa_domains = load_alexa("/home/qin/code/python/web-ml/1book-master/data/top-1000.csv")
        alexa_x, alexa_y = get_aeiou(alexa_domains)

        cryptolocker_domains = load_dga("/home/qin/code/python/web-ml/1book-master/data/dga-cryptolocke-1000.txt")
        cryptolocker_x, cryptolocker_y = get_aeiou(cryptolocker_domains)

        tovar_domains = load_dga("/home/qin/code/python/web-ml/1book-master/data/dga-post-tovar-goz-1000.txt")
        tovar_x, tovar_y = get_aeiou(tovar_domains)

        fig, ax = plt.subplots()
        ax.set_xlabel('Domain Length')
        ax.set_ylabel('AEIOU Score')

        # Series are drawn in this order: post-tovar-goz, cryptolock, alexa.
        series = (
            (tovar_x, tovar_y, 'b', "dga_post-tovar-goz", 'o'),
            (cryptolocker_x, cryptolocker_y, 'g', "dga_cryptolock", 'v'),
            (alexa_x, alexa_y, 'r', "alexa", '*'),
        )
        for xs, ys, colour, caption, shape in series:
            ax.scatter(xs, ys, color=colour, label=caption, marker=shape)

        ax.legend(loc='best')
        plt.show()

    def get_uniq_char_num(domain_list):
        """Compute each domain's length and its ratio of distinct characters.

        The ratio is the number of different characters in the domain divided
        by the domain's length.

        :param domain_list: iterable of domain strings.
        :return: ``(x, y)`` - parallel lists of lengths and distinct-character
            ratios. An empty domain yields 0.0 rather than a division by zero.
        """
        x = []
        y = []
        for domain in domain_list:
            x.append(len(domain))
            if not domain:
                # Nothing to take a ratio over; avoid ZeroDivisionError.
                y.append(0.0)
                continue
            # float() keeps true division on Python 2 as well.
            y.append(float(len(set(domain))) / len(domain))
        return x, y

    def show_uniq_char_num():
        """Plot the get_uniq_char_num() value against domain length for the three data sets."""
        alexa_domains = load_alexa("/home/qin/code/python/web-ml/1book-master/data/top-1000.csv")
        alexa_x, alexa_y = get_uniq_char_num(alexa_domains)

        cryptolocker_domains = load_dga("/home/qin/code/python/web-ml/1book-master/data/dga-cryptolocke-1000.txt")
        cryptolocker_x, cryptolocker_y = get_uniq_char_num(cryptolocker_domains)

        tovar_domains = load_dga("/home/qin/code/python/web-ml/1book-master/data/dga-post-tovar-goz-1000.txt")
        tovar_x, tovar_y = get_uniq_char_num(tovar_domains)

        fig, ax = plt.subplots()
        ax.set_xlabel('Domain Length')
        ax.set_ylabel('UNIQ CHAR NUMBER')

        # Series are drawn in this order: post-tovar-goz, cryptolock, alexa.
        series = (
            (tovar_x, tovar_y, 'b', "dga_post-tovar-goz", 'o'),
            (cryptolocker_x, cryptolocker_y, 'g', "dga_cryptolock", 'v'),
            (alexa_x, alexa_y, 'r', "alexa", '*'),
        )
        for xs, ys, colour, caption, shape in series:
            ax.scatter(xs, ys, color=colour, label=caption, marker=shape)

        ax.legend(loc='best')
        plt.show()

    def _padded_bigrams(text):
        """Return the set of adjacent character pairs of ``text`` padded with one space per side."""
        padded = ' ' + text + ' '
        return set(padded[i:i + 2] for i in range(len(padded) - 1))


    def count2string_jarccard_index(a, b):
        """Return the Jaccard index of the character bigrams of two strings.

        Each string is padded with a leading and a trailing space and split
        into overlapping two-character pieces; the result is the number of
        pieces the two strings share divided by the number of distinct pieces
        across both.

        Fixes relative to the previous version:
        * the leading piece is the bigram ``' ' + first_char``; ``set(' ' + a[0])``
          used to add two single characters instead;
        * the numerator is the intersection ``x & y``; it used to be the
          difference ``x - y``, which is not the Jaccard index;
        * empty strings no longer raise IndexError.

        :param a: first string.
        :param b: second string.
        :return: float in [0.0, 1.0]; 1.0 when both strings have the same bigram set.
        """
        x = _padded_bigrams(a)
        y = _padded_bigrams(b)
        # The union is never empty: padding gives every string at least one bigram.
        return float(len(x & y)) / len(x | y)

    def get_jarccard_index(a_list, b_list):
        """Average count2string_jarccard_index() of each ``a_list`` entry over ``b_list``.

        For every string in ``a_list`` the pairwise value against each string
        in ``b_list`` is summed and divided by ``len(b_list)``.

        :param a_list: strings to evaluate.
        :param b_list: reference strings each entry is compared with.
        :return: ``(lengths, averages)`` - parallel lists with each ``a_list``
            entry's length and its mean pairwise value.
        """
        lengths = []
        averages = []
        for candidate in a_list:
            # Plain running total, added in b_list order.
            running_total = 0.0
            for reference in b_list:
                running_total = running_total + count2string_jarccard_index(candidate, reference)
            mean_value = running_total / len(b_list)
            lengths.append(len(candidate))
            averages.append(mean_value)
        return lengths, averages

    def show_jarccard_index():
        """Plot the get_jarccard_index() value against domain length for the three data sets.

        Every data set, including the Alexa list itself, is compared against
        the Alexa top-1000 list.
        """
        alexa_domains = load_alexa("/home/qin/code/python/web-ml/1book-master/data/top-1000.csv")
        alexa_x, alexa_y = get_jarccard_index(alexa_domains, alexa_domains)

        cryptolocker_domains = load_dga("/home/qin/code/python/web-ml/1book-master/data/dga-cryptolocke-1000.txt")
        cryptolocker_x, cryptolocker_y = get_jarccard_index(cryptolocker_domains, alexa_domains)

        tovar_domains = load_dga("/home/qin/code/python/web-ml/1book-master/data/dga-post-tovar-goz-1000.txt")
        tovar_x, tovar_y = get_jarccard_index(tovar_domains, alexa_domains)

        fig, ax = plt.subplots()
        ax.set_xlabel('Domain Length')
        ax.set_ylabel('JARCCARD INDEX')

        # Series are drawn in this order: post-tovar-goz, cryptolock, alexa.
        series = (
            (tovar_x, tovar_y, 'b', "dga_post-tovar-goz", 'o'),
            (cryptolocker_x, cryptolocker_y, 'g', "dga_cryptolock", 'v'),
            (alexa_x, alexa_y, 'r', "alexa", '*'),
        )
        for xs, ys, colour, caption, shape in series:
            ax.scatter(xs, ys, color=colour, label=caption, marker=shape)

        ax.legend(loc='lower right')
        plt.show()

    # Script entry point: when the file is run directly, show the plot built by
    # show_jarccard_index(). The call is indented under the guard; at the same
    # column as the `if`, the statement had no body and was a syntax error.
    if __name__ == '__main__':
        show_jarccard_index()

    相关文章

      网友评论

          本文标题:机器学习学习笔记--SVM区分僵尸网络

          本文链接:https://www.haomeiwen.com/subject/glkyixtx.html