美文网首页
Python 过滤敏感字的算法(DFA)

Python 过滤敏感字的算法(DFA)

作者: 不懒狮Blaise | 来源:发表于2018-07-25 09:44 被阅读0次
class cNode(object):
    """A single trie node.

    ``children`` starts out as ``None`` and is only turned into a dict once
    the first outgoing edge is attached to this node.
    """

    def __init__(self):
        # None (rather than an empty dict) marks a node with no edges yet.
        self.children = None
        
# The encoding of each word is UTF-8
# The encoding of each message is UTF-8
class cDfa(object):
    """Sensitive-word matcher backed by a character trie.

    Every edge is stored in its parent's ``children`` dict as
    ``element -> (child_node, is_word_end)``. ``is_word_end`` is True when the
    path from the root through that edge spells a complete registered word.

    Matching always stops at the *shortest* registered word that starts at a
    given position (e.g. with words "ab" and "abc", only "ab" is matched).

    NOTE(review): the original comments state that words and messages are
    UTF-8 encoded. Matching is done element by element, so words and messages
    must be the same kind of sequence -- confirm against callers.
    """

    def __init__(self, lWords):
        """Build the trie from ``lWords``, an iterable of words."""
        self.root = cNode()
        for sWord in lWords:
            self.addWord(sWord)

    def addWord(self, word):
        """Insert ``word`` into the trie.

        An empty ``word`` leaves the trie untouched. Re-adding an existing
        word, or adding a prefix of an existing word, only sets the
        end-of-word flag on the already existing edge.
        """
        node = self.root
        iEnd = len(word) - 1
        for i, ch in enumerate(word):
            if node.children is None:
                node.children = {}
            entry = node.children.get(ch)
            if entry is None:
                # New edge: flag it as a word end only for the last element.
                entry = (cNode(), i == iEnd)
                node.children[ch] = entry
            elif i == iEnd:
                # Existing edge now also terminates a word; keep its subtree.
                entry = (entry[0], True)
                node.children[ch] = entry
            node = entry[0]

    def _matchEnd(self, sMsg, iStart):
        """Return the index just past the shortest word starting at ``iStart``.

        Returns -1 when no registered word starts at ``iStart``.
        """
        node = self.root
        for j in range(iStart, len(sMsg)):
            children = node.children
            if children is None or sMsg[j] not in children:
                return -1
            node, bWord = children[sMsg[j]]
            if bWord:
                return j + 1
        return -1

    def isContain(self, sMsg):
        """Return True if any registered word occurs anywhere in ``sMsg``."""
        return any(self._matchEnd(sMsg, i) != -1 for i in range(len(sMsg)))

    def filter(self, sMsg):
        """Return ``sMsg`` with every matched word replaced by ``*`` characters.

        The message is scanned left to right. At each position the shortest
        registered word starting there is masked with one ``*`` per element
        and scanning resumes right after it; otherwise the element is copied
        unchanged.
        """
        lNew = []
        iLen = len(sMsg)
        i = 0
        while i < iLen:
            iStop = self._matchEnd(sMsg, i)
            if iStop == -1:
                lNew.append(sMsg[i])
                i += 1
            else:
                # Keyword replacement: mask the whole matched span.
                lNew.append(u'*' * (iStop - i))
                i = iStop
        return ''.join(lNew)

参考
https://blog.csdn.net/gamesofsailing/article/details/36421539

相关文章

网友评论

      本文标题:python过滤敏感字的算法(dfa)

      本文链接:https://www.haomeiwen.com/subject/mbnumftx.html