美文网首页
python过滤敏感字的算法(dfa)

python过滤敏感字的算法(dfa)

作者: 不懒狮Blaise | 来源:发表于2018-07-25 09:44 被阅读0次
    class cNode(object):
        """One node of the trie that cDfa builds from its word list."""

        def __init__(self):
            # Starts as None (no outgoing edges). cDfa.addWord replaces it with a
            # dict mapping a character to a (child cNode, is_word_end) tuple the
            # first time a character is inserted below this node.
            self.children = None
            
    # The encoding of each word is UTF-8
    # The encoding of each message is UTF-8
    class cDfa(object):
        """Sensitive-word matcher backed by a character trie.

        Every edge is stored in the parent node's ``children`` dict as
        ``char -> (child_node, is_word_end)``. ``is_word_end`` is True when the
        path from the root through that edge spells a complete word.

        Matching is done element by element, so words and messages must be the
        same kind of string (the original author's note says UTF-8 text).
        """

        def __init__(self, lWords):
            """Build the trie from ``lWords``, an iterable of words."""
            self.root = cNode()
            for sWord in lWords:
                self.addWord(sWord)

        def addWord(self, word):
            """Insert ``word`` into the trie.

            An empty ``word`` adds nothing. Inserting a word that is a prefix of
            an existing one only flips the end-of-word flag on the existing edge.
            """
            node = self.root
            iEnd = len(word) - 1
            for i, ch in enumerate(word):
                if node.children is None:
                    # Child dicts are created lazily so leaves stay cheap.
                    node.children = {}
                entry = node.children.get(ch)
                if entry is None:
                    entry = (cNode(), i == iEnd)
                    node.children[ch] = entry
                elif i == iEnd:
                    # Edge already exists; mark it as terminating a word while
                    # keeping the subtree that hangs below it.
                    entry = (entry[0], True)
                    node.children[ch] = entry
                node = entry[0]

        def _matchEnd(self, sMsg, iStart):
            """Return the index just past the shortest word at ``iStart``, or -1.

            Walks the trie along ``sMsg[iStart:]`` and stops at the first edge
            flagged as a word end, so the shortest matching word wins.
            """
            p = self.root
            for j in range(iStart, len(sMsg)):
                if p.children is None:
                    return -1
                entry = p.children.get(sMsg[j])
                if entry is None:
                    return -1
                p, bWord = entry
                if bWord:
                    return j + 1
            return -1

        def isContain(self, sMsg):
            """Return True if any stored word occurs anywhere in ``sMsg``."""
            for i in range(len(sMsg)):
                if self._matchEnd(sMsg, i) >= 0:
                    return True
            return False

        def filter(self, sMsg):
            """Return ``sMsg`` with every matched word replaced by ``*`` characters.

            The message is scanned left to right. At each position the shortest
            stored word starting there is masked with one ``*`` per character and
            scanning resumes right after it; otherwise the character is copied.
            """
            lNew = []
            iLen = len(sMsg)
            i = 0
            while i < iLen:
                iNext = self._matchEnd(sMsg, i)
                if iNext < 0:
                    lNew.append(sMsg[i])
                    i += 1
                else:
                    # Keyword replacement: mask the whole matched span.
                    lNew.append(u'*' * (iNext - i))
                    i = iNext
            return ''.join(lNew)
    
    

    参考
    https://blog.csdn.net/gamesofsailing/article/details/36421539

    相关文章

      网友评论

          本文标题:python过滤敏感字的算法(dfa)

          本文链接:https://www.haomeiwen.com/subject/mbnumftx.html