# coding: utf-8
import re
import urllib
import urllib.parse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
def _load_queries(path):
    """Return every line of *path* with URL percent-escapes decoded.

    Lines are kept exactly as read, including the trailing newline, so the
    tokenizer receives the same text it would from a plain ``readlines()``.
    """
    with open(path) as handle:
        return [urllib.parse.unquote(line) for line in handle]


def train():
    """Fit a TF-IDF + logistic-regression classifier and print a smoke test.

    Benign samples are read from ``goodcase.csv`` (label 0) and XSS samples
    from ``xss.csv`` (label 1), one sample per line.  Each sample is
    URL-decoded, tokenized with the regex ``\\w+\\W``, vectorized with TF-IDF,
    and used to fit a ``LogisticRegression`` on the training part of a
    ``train_test_split``.  Finally the predicted labels for three hard-coded
    probe strings are printed.

    Returns:
        None.  The predictions are only printed.

    Raises:
        OSError: if either sample file cannot be opened.
    """
    good_case_file = 'goodcase.csv'
    bad_case_file = 'xss.csv'

    good_cases = _load_queries(good_case_file)
    bad_cases = _load_queries(bad_case_file)

    queries = good_cases + bad_cases
    labels = [0] * len(good_cases) + [1] * len(bad_cases)

    # A token is a run of word characters plus the single non-word character
    # that follows it (e.g. 'onerror=', 'alert(').  A word run at the very end
    # of a string, with nothing after it, yields no token.
    regex = re.compile(r"\w+\W")
    vectorizer = TfidfVectorizer(tokenizer=regex.findall)
    X = vectorizer.fit_transform(queries)

    # The held-out part of the split is not evaluated here; it is unpacked
    # into underscore names to make that explicit.
    X_train, _X_test, Y_train, _Y_test = train_test_split(X, labels)

    log_reg = LogisticRegression()
    log_reg.fit(X_train, Y_train)

    # Smoke test: two XSS payloads and one harmless string.
    probes = ["<img onerror=alert(1)>", "dddaaa", "<script src=\"http://t.c\">"]
    result = log_reg.predict(vectorizer.transform(probes))
    print(result)
# Run the training demo only when this file is executed as a script.
if __name__ == '__main__':
    train()
结果:
[root@kali xiaokui]# python3 check.py
[1 0 1]
关键点:
使用正则处理所有请求。
regex = re.compile(r"\w+\W")
tok = lambda x: regex.findall(x)
本打算把正样本 "<img onerror=alert(1)>" 处理为 "['<', 'img', 'onerror', '=', 'alert', '(', '1', ')', '>']",但是觉得这样处理不合适:机器学习的特征维度不是越多越好,而是越精简、越有效越好。
所以处理成 ['img ', 'onerror=', 'alert(', '1)'] 这样的形式,然后使用逻辑回归进行训练。
网友评论