朴素贝叶斯

作者: 梦vctor | 来源:发表于2018-10-29 18:42 被阅读0次

训练算法：从词向量计算概率

# Naive Bayes classifier training function.
# trainMatrix is the document matrix (one word vector per document);
# trainCategory is the vector of class labels, one label per document.
def trainNB0(trainMatrix, trainCategory, alpha=0.0, useLog=False):
    """Estimate per-class word probabilities and the class-1 prior.

    With the default arguments this returns plain relative frequencies,
    exactly as the unsmoothed trainer did. Note that classifyNB adds
    log(prior) to sum(vec * pVec), which is only a log-posterior when the
    vectors hold log-probabilities; pass alpha=1.0, useLog=True to obtain
    vectors suited to that use.

    Args:
        trainMatrix: 2-D sequence or array with one row per document and
            one column per vocabulary word (word indicators or counts).
        trainCategory: 1-D sequence with one label per row of trainMatrix.
            Label 1 means "abusive"; every other label is counted as
            "not abusive".
        alpha: Additive smoothing constant added to every word count; the
            class total grows by alpha * numWords so each vector still
            sums to 1. The default 0.0 disables smoothing.
        useLog: When true, return natural-log probabilities. Combine with
            alpha > 0, otherwise words unseen in a class yield -inf.

    Returns:
        Tuple (p0Vect, p1Vect, pAbusive): the word-probability vector for
        "not abusive", the one for "abusive", and sum(labels) / number of
        documents as a float. If a class has no words and alpha is 0, its
        vector is NaN (0/0), as before.

    Raises:
        ValueError: If trainMatrix is not a non-empty 2-D matrix or
            trainCategory does not hold exactly one label per document.
    """
    import numpy as np

    docs = np.asarray(trainMatrix, dtype=float)
    labels = np.asarray(trainCategory)
    if docs.ndim != 2 or docs.shape[0] == 0:
        raise ValueError('trainMatrix must be a non-empty 2-D matrix')
    numTrainDocs, numWords = docs.shape
    if labels.shape != (numTrainDocs,):
        raise ValueError('trainCategory must hold one label per document')

    # Prior of the abusive class (labels are expected to be 0 or 1).
    pAbusive = float(labels.sum()) / numTrainDocs

    def wordProbabilities(classDocs):
        # Per-word counts divided by the total word count of the class.
        wordCounts = classDocs.sum(axis=0) + alpha
        totalCount = classDocs.sum() + alpha * numWords
        probs = wordCounts / totalCount
        return np.log(probs) if useLog else probs

    isAbusive = labels == 1
    p1Vect = wordProbabilities(docs[isAbusive])
    p0Vect = wordProbabilities(docs[~isAbusive])
    # The probability of "not abusive" is 1 - pAbusive.
    return p0Vect, p1Vect, pAbusive

# Driver snippet: turn every post into a word vector, train, print results.
trainMat = [
    bayes.setOfWords2Vec(myVocabList, postinDoc)
    for postinDoc in listOPosts
]
print(trainMat)

# trainNB0 returns the two per-class word vectors and the class-1 prior.
p0v, p1v, pAb = bayes.trainNB0(trainMat, listClasses)
print(pAb)
print(p0v)
print(p1v)

输出：

[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1], [1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]
0.5
[0.         0.04166667 0.04166667 0.04166667 0.         0.04166667
 0.04166667 0.04166667 0.         0.04166667 0.         0.
 0.04166667 0.         0.         0.04166667 0.         0.04166667
 0.         0.04166667 0.04166667 0.04166667 0.08333333 0.04166667
 0.         0.04166667 0.04166667 0.04166667 0.125      0.04166667
 0.         0.04166667]
[0.05263158 0.05263158 0.         0.         0.15789474 0.
 0.05263158 0.         0.10526316 0.         0.05263158 0.05263158
 0.         0.05263158 0.05263158 0.         0.05263158 0.
 0.05263158 0.         0.         0.         0.05263158 0.
 0.05263158 0.         0.         0.         0.         0.
 0.05263158 0.10526316]

测试算法：根据现实情况修改分类器

# Naive Bayes classification function.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if the class-1 score is strictly larger, otherwise 0.

    Each class score is the sum of the element-wise product of the
    document vector with that class's word vector, plus the log of the
    class prior.

    Args:
        vec2Classify: Word vector of the document to classify (an array,
            so that ``*`` is element-wise).
        p0Vec: Per-word vector for class 0 (not abusive); presumably
            log-probabilities, since the prior is added in log space.
        p1Vec: Per-word vector for class 1 (abusive); same caveat.
        pClass1: Prior probability of class 1; class 0 uses 1 - pClass1.

    Returns:
        1 when the class-1 score exceeds the class-0 score, else 0
        (ties go to class 0).
    """
    abusiveScore = sum(vec2Classify * p1Vec) + log(pClass1)
    cleanScore = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if abusiveScore > cleanScore else 0

# Smoke test: train on the sample posts, then classify two fixed entries.
def testingNB():
    """Train on loadDataSet() and print the predicted class of two entries.

    Builds the vocabulary from the loaded posts, converts each post to a
    set-of-words vector, trains with trainNB0, and prints the classifyNB
    result for ['love', 'my', 'dalmation'] and ['stupid', 'garbage'].
    Returns nothing.
    """
    # Load the data set and build the vocabulary from it.
    posts, labels = loadDataSet()
    vocabulary = createVocabList(posts)
    docVectors = [setOfWords2Vec(vocabulary, post) for post in posts]
    p0V, p1V, pAb = trainNB0(array(docVectors), array(labels))
    # Both entries go through the same vectorize -> classify -> print steps.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(vocabulary, testEntry))
        print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))

# Run the demo. testingNB() prints its own results and has no return value,
# so it is called directly; wrapping it in print() would emit an extra "None".
bayes.testingNB()

输出：
['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as: 1

朴素贝叶斯最著名的应用之一：垃圾邮件过滤

image.png

准备数据

def textParse(bigString):
    """Split raw text into lowercase tokens longer than two characters.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance, obtained by
        splitting on runs of non-word characters and dropping tokens of
        length 2 or less (which also removes the empty strings that
        re.split yields at the edges).
    """
    # Function-local import, kept as in the original snippet.
    import re
    # Use '\W+' (one or more non-word characters), not '\W*': since
    # Python 3.7 re.split also splits on empty matches, so '\W*' cuts the
    # text into single characters and the length filter discards them all.
    listOfTokens = re.split(r'\W+', bigString)
    # Return the list of lowercase words.
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

# Spam-filter test: hold out 10 random e-mails and print the error rate.
def spamTest():
    """Train on email/spam and email/ham, then test on a random hold-out.

    Reads email/spam/1.txt..25.txt (class 1) and email/ham/1.txt..25.txt
    (class 0), tokenizes them with textParse, picks 10 documents at random
    as the test set, trains trainNB0 on the remaining ones using
    bag-of-words vectors, and prints the fraction of test documents that
    classifyNB labels incorrectly. Returns nothing.

    Raises:
        OSError: If one of the e-mail files cannot be opened.
    """
    # Standard-library module, imported locally so that a module-level
    # `from numpy import *` (which also exports a `random`) cannot shadow it.
    import random

    def loadTokens(path):
        # latin-1 maps every byte to a character, so a stray non-ASCII byte
        # cannot raise UnicodeDecodeError under a platform default such as
        # gbk; the `with` block also closes the file.
        with open(path, encoding='latin-1') as handle:
            return textParse(handle.read())

    # docList[k] is the token list of document k; classList[k] is its label.
    docList = []
    classList = []
    for i in range(1, 26):
        docList.append(loadTokens('email/spam/%d.txt' % i))
        classList.append(1)
        docList.append(loadTokens('email/ham/%d.txt' % i))
        classList.append(0)
    # Build the vocabulary from the documents only (no stray word entries).
    vocabList = createVocabList(docList)

    # Randomly pick 10 document indices for testing; train on the rest.
    allIndices = range(len(docList))
    testSet = random.sample(allIndices, 10)
    heldOut = set(testSet)
    trainingSet = [idx for idx in allIndices if idx not in heldOut]

    # Vectorize the training documents and collect their labels.
    trainMat = [bagOfWords2Vec(vocabList, docList[idx]) for idx in trainingSet]
    trainClasses = [classList[idx] for idx in trainingSet]
    # Word vectors per class and the spam prior.
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Classify each held-out document and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    # Print the error rate.
    print('the error rate is :', float(errorCount) / len(testSet))

Debug1：
RuntimeWarning: invalid value encountered in multiply
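解决方案：
该警告通常是因为概率向量中存在 0 元素：对 0 取对数得到 -inf，再与词向量中的 0 相乘就得到 nan。更彻底的做法是使用拉普拉斯平滑（例如将词频计数初始化为 1、分母初始化为 2），使概率不再为 0。下列代码只是屏蔽该警告，并不会消除 nan：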

import numpy as np
from numpy import *
np.seterr(divide='ignore',invalid='ignore')

Debug2:
UnicodeDecodeError: 'gbk' codec can't decode byte 0xae in position 199: illegal multibyte sequence
解决方案：
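email\ham 中的 23.txt 第二段多了一个无法按 gbk 解码的字符（显示为‘？’），导致解码失败。删除‘？’之后便可以继续执行；也可以不修改数据文件，而是在 open() 中显式指定编码，例如 open(path, encoding='latin-1')，或加上 errors='ignore'。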

Debug3:
TypeError: 'range' object doesn't support item deletion
解决方案：
Python 3 中 range() 返回的不是列表，而是不可变的 range 对象，不支持删除元素。将 trainingSet = range(50) 替换为 trainingSet = list(range(50)) 即可。

Debug4:
NameError: name 'pSpam' is not defined
解决方案:
变量名前后不一致：trainNB0 的返回值赋给了 pSam，而调用 classifyNB 时使用的是 pSpam。将两处统一为同一个名字即可（例如把 pSpam 改为 pSam）。

随机选择数据的一部分作为训练集，而剩余部分作为测试集的过程称为留存交叉验证。

网友评论

本文标题：朴素贝叶斯

本文链接：https://www.haomeiwen.com/subject/iooctqtx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！