
Machine Learning in Action, Chapter 4: Naive Bayes

Author: 异想派 | Published 2017-03-18 17:52
    • Building word vectors from text
    def loaddataset():
        postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],\
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        classvec=[0,1,0,1,0,1]   # 1 = abusive post, 0 = not abusive
        return postinglist,classvec
    
    def createvocablist(dataset):
        vocabset=set([])
        for document in dataset:
            vocabset=vocabset|set(document)   #union of the two sets
        return list(vocabset)
    
    def setofwords2vec(vocablist,inputset):
        returnvec=[0]*len(vocablist)   # create a vector of all zeros
        for word in inputset:
            if word in vocablist:
                returnvec[vocablist.index(word)]=1
            else:
                print "the word:%s is not in my vocabulary" %word
        return returnvec
    
    if __name__=='__main__':
        listoposts,listclasses=loaddataset()  # load the training posts and their class labels
        myvocablist=createvocablist(listoposts)
        a=setofwords2vec(myvocablist,listoposts[3])
        print myvocablist
        print a
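
    Because setofwords2vec records only presence or absence, a repeated word still contributes a single 1 to its slot. A quick sanity check (a minimal sketch; it assumes the three functions above are in scope):

    posts,labels=loaddataset()
    vocab=createvocablist(posts)
    vec=setofwords2vec(vocab,['stupid','stupid','garbage'])
    print sum(vec)   # prints 2: two distinct vocabulary words were seen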
    

    • Computing probabilities from word vectors
    from numpy import *
    def loaddataset():
        postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],\
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        classvec=[0,1,0,1,0,1]
        return postinglist,classvec
    
    def createvocablist(dataset):
        vocabset=set([])
        for document in dataset:
            vocabset=vocabset|set(document)   #union of the two sets
        return list(vocabset)
    
    def setofwords2vec(vocablist,inputset):
        returnvec=[0]*len(vocablist)   # create a vector of all zeros
        for word in inputset:
            if word in vocablist:
                returnvec[vocablist.index(word)]=1
            else:
                print "the word:%s is not in my vocabulary" %word
        return returnvec
    
    def trainnb0(trainmatrix,traincategory):
        numtraindocs=len(trainmatrix)   # number of training documents
        numwords=len(trainmatrix[0])    # vocabulary size (length of each document vector)
        pabusive=sum(traincategory)/float(numtraindocs)   # prior probability of the abusive class
        p0num=zeros(numwords)   # per-word counts for class 0
        p1num=zeros(numwords)   # per-word counts for class 1
        p0denom=0.0 ; p1denom=0.0   # total word counts per class
        for i in range(numtraindocs):
            if traincategory[i]==1:
                p1num+=trainmatrix[i]
                p1denom+=sum(trainmatrix[i])
            else:
                p0num+=trainmatrix[i]
                p0denom+=sum(trainmatrix[i])
        p1vect=p1num/p1denom   # p(w_j|c=1) for every word j
        p0vect=p0num/p0denom   # p(w_j|c=0) for every word j
        return p0vect,p1vect,pabusive
    
    if __name__=='__main__':
        listoposts,listclasses=loaddataset()  # load the training posts and their class labels
        myvocablist=createvocablist(listoposts)   # list of the unique words across all documents
        trainmat=[]
        for postindoc in listoposts:
            trainmat.append(setofwords2vec(myvocablist,postindoc)) # build the 0/1 presence vector for each document
        p0v,p1v,pab=trainnb0(trainmat,listclasses)
        print myvocablist
        print trainmat
        print pab
        print p0v
        print p1v
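
    In formula form, what trainnb0 estimates: by Bayes' rule, the probability that a document with word vector $\mathbf{w}$ belongs to class $c_i$ is

    $$p(c_i \mid \mathbf{w}) = \frac{p(\mathbf{w} \mid c_i)\,p(c_i)}{p(\mathbf{w})}$$

    and the "naive" part is the assumption that words are conditionally independent given the class:

    $$p(\mathbf{w} \mid c_i) = \prod_j p(w_j \mid c_i)$$

    Here p0vect and p1vect hold the per-word estimates $p(w_j \mid c_i)$ (count of word $j$ in class $i$ divided by the total word count of class $i$), and pabusive is the prior $p(c_1)$. The denominator $p(\mathbf{w})$ is the same for both classes, so a classifier can ignore it when comparing the two.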
    

    • Testing the algorithm: modifying the classifier for real-world conditions
    '''
    Created in March 2017
    
    @author: yang
    '''
    
    from numpy import *
    def loaddataset():
        postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],\
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        classvec=[0,1,0,1,0,1]
        return postinglist,classvec
    
    def createvocablist(dataset):
        vocabset=set([])
        for document in dataset:
            vocabset=vocabset|set(document)   #union of the two sets
        return list(vocabset)
    
    def setofwords2vec(vocablist,inputset):
        returnvec=[0]*len(vocablist)   # create a vector of all zeros
        for word in inputset:
            if word in vocablist:
                returnvec[vocablist.index(word)]=1
            else:
                print "the word:%s is not in my vocabulary" %word
        return returnvec
    
    def trainnb0(trainmatrix,traincategory):
        numtraindocs=len(trainmatrix)   # number of training documents
        numwords=len(trainmatrix[0])    # vocabulary size (length of each document vector)
        pabusive=sum(traincategory)/float(numtraindocs)   # prior probability of the abusive class
        p0num=ones(numwords)   # initialize counts to 1 (a simple form of Laplace smoothing)
        p1num=ones(numwords)
        p0denom=2.0 ; p1denom=2.0   # denominators start at 2 so no probability is ever zero
        for i in range(numtraindocs):
            if traincategory[i]==1:
                p1num+=trainmatrix[i]
                p1denom+=sum(trainmatrix[i])
            else:
                p0num+=trainmatrix[i]
                p0denom+=sum(trainmatrix[i])
        p1vect=log(p1num/p1denom)   # log probabilities, to avoid floating-point underflow
        p0vect=log(p0num/p0denom)
        return p0vect,p1vect,pabusive
    
    def classifynb(vec2classify,p0vec,p1vec,pclass1):
        p1=sum(vec2classify*p1vec)+log(pclass1)     # log p(w|c=1) + log p(c=1)
        p0=sum(vec2classify*p0vec)+log(1-pclass1)   # log p(w|c=0) + log p(c=0)
        if p1>p0:
            return 1
        else:
            return 0
    
    def testingnb():
        listoposts,listclasses=loaddataset()  # load the training posts and their class labels
        myvocablist=createvocablist(listoposts)   # list of the unique words across all documents
        trainmat=[]
        for postindoc in listoposts:
            trainmat.append(setofwords2vec(myvocablist,postindoc)) # build the 0/1 presence vector for each document
        p0v,p1v,pab=trainnb0(trainmat,listclasses)  
        testentry=['love','my','dalmation']
        thisdoc=array(setofwords2vec(myvocablist,testentry))
        print testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab)
        testentry=['stupid','garbage']
        thisdoc=array(setofwords2vec(myvocablist,testentry))
        print testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab)
    
    if __name__=='__main__':
        testingnb()
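
    The two changes relative to the previous listing fix real numerical problems. Initializing the counts to ones and the denominators to 2.0 is a simple form of Laplace smoothing: a word never seen in a class would otherwise get probability 0 and zero out the whole product. Taking logs avoids underflow, since multiplying many small probabilities quickly rounds to 0.0 in floating point; in log space the product becomes a sum, which is exactly what classifynb computes:

    $$\log\big(p(\mathbf{w} \mid c_i)\,p(c_i)\big) = \sum_j \log p(w_j \mid c_i) + \log p(c_i)$$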
    

    • Testing the algorithm: cross-validation with naive Bayes
    from numpy import *
    def loaddataset():
        postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],\
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        classvec=[0,1,0,1,0,1]
        return postinglist,classvec
    
    def createvocablist(dataset):
        vocabset=set([])
        for document in dataset:
            vocabset=vocabset|set(document)   #union of the two sets
        return list(vocabset)
    
    def setofwords2vec(vocablist,inputset):
        returnvec=[0]*len(vocablist)   # create a vector of all zeros
        for word in inputset:
            if word in vocablist:
                returnvec[vocablist.index(word)]+=1  # bag-of-words model: count every occurrence, not just presence
            else:
                print "the word:%s is not in my vocabulary" %word
        return returnvec
    
    def trainnb0(trainmatrix,traincategory):
        numtraindocs=len(trainmatrix)   # number of training documents
        numwords=len(trainmatrix[0])    # vocabulary size (length of each document vector)
        pabusive=sum(traincategory)/float(numtraindocs)   # prior probability of the spam class
        p0num=ones(numwords)   # initialize counts to 1 (a simple form of Laplace smoothing)
        p1num=ones(numwords)
        p0denom=2.0 ; p1denom=2.0   # denominators start at 2 so no probability is ever zero
        for i in range(numtraindocs):
            if traincategory[i]==1:
                p1num+=trainmatrix[i]
                p1denom+=sum(trainmatrix[i])
            else:
                p0num+=trainmatrix[i]
                p0denom+=sum(trainmatrix[i])
        p1vect=log(p1num/p1denom)   # log probabilities, to avoid floating-point underflow
        p0vect=log(p0num/p0denom)
        return p0vect,p1vect,pabusive
    
    def classifynb(vec2classify,p0vec,p1vec,pclass1):
        p1=sum(vec2classify*p1vec)+log(pclass1)     # log p(w|c=1) + log p(c=1)
        p0=sum(vec2classify*p0vec)+log(1-pclass1)   # log p(w|c=0) + log p(c=0)
        if p1>p0:
            return 1
        else:
            return 0
    
    def testingnb():
        listoposts,listclasses=loaddataset()  # load the training posts and their class labels
        myvocablist=createvocablist(listoposts)   # list of the unique words across all documents
        trainmat=[]
        for postindoc in listoposts:
            trainmat.append(setofwords2vec(myvocablist,postindoc)) # build the word-count vector for each document
        p0v,p1v,pab=trainnb0(trainmat,listclasses)  
        testentry=['love','my','dalmation']
        thisdoc=array(setofwords2vec(myvocablist,testentry))
        print testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab)
        testentry=['stupid','garbage']
        thisdoc=array(setofwords2vec(myvocablist,testentry))
        print testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab)
    
    def textparse(bigstring):
        import re
        listoftokens=re.split(r'\W+',bigstring)   # split on runs of non-alphanumeric characters (r'\W*' would also match empty strings)
        return [tok.lower() for tok in listoftokens if len(tok)>2]   # lowercase, drop tokens shorter than 3 characters
    
    def spamtest():
        doclist=[]
        classlist=[]
        fulltext=[]
        for i in range(1,26):   # 25 spam and 25 ham emails
            wordlist=textparse(open('/Users/enniu/Desktop/jqxx/machinelearninginaction/Ch04/email/spam/%d.txt' % i).read())
            doclist.append(wordlist)
            fulltext.extend(wordlist)
            classlist.append(1)
            wordlist=textparse(open("/Users/enniu/Desktop/jqxx/machinelearninginaction/Ch04/email/ham/%d.txt" % i).read())
            doclist.append(wordlist)
            fulltext.extend(wordlist)
            classlist.append(0)
        vocablist=createvocablist(doclist)
        trainingset=range(50)   # indices of all 50 documents (a list in Python 2, so del below works)
        testset=[]
        for i in range(10):   # randomly hold out 10 documents as the test set
            randindex=int(random.uniform(0,len(trainingset)))
            testset.append(trainingset[randindex])
            del (trainingset[randindex])
        trainmat=[]
        trainclasses=[]
        for docindex in trainingset:
            trainmat.append(setofwords2vec(vocablist,doclist[docindex]))
            trainclasses.append(classlist[docindex])
        p0v,p1v,pspam=trainnb0(array(trainmat),array(trainclasses))
        errorcount=0
        for docindex in testset:
            wordvector=setofwords2vec(vocablist,doclist[docindex])
            if classifynb(array(wordvector),p0v,p1v,pspam)!=classlist[docindex]:
                errorcount+=1
        print 'the error rate is: ',float(errorcount)/len(testset) 
    
    if __name__=='__main__':
        spamtest()
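
    Because the ten test documents are drawn at random, a single run of spamtest gives a noisy estimate of the error rate. A minimal sketch of averaging over repeated runs (it assumes spamtest is modified to return float(errorcount)/len(testset) instead of only printing it):

    numruns=10
    total=0.0
    for _ in range(numruns):
        total+=spamtest()   # each call re-splits the data at random
    print 'average error rate over %d runs: %f' % (numruns,total/numruns)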
    
