美文网首页
拼写自动矫正

拼写自动矫正

作者: 梧青 | 来源:发表于2016-08-11 11:36 被阅读0次

    R语言2行代码实现

    sorted_words<-names(sort(table(strsplit(tolower(paste(readLines("http://www.norvig.com/big.txt"),collapse=" ")),"[^a-z]+")),decreasing=TRUE))
    correct<-function(word){c(sorted_words[adist(word,sorted_words)<=min(adist(word,sorted_words),2)],word)[1]}
    

    代码解释

    # Read in big.txt, a 6.5 mb collection of different English texts.
    raw_text<-paste(readLines("http://www.norvig.com/big.txt"),collapse=" ")
    # Make the text lowercase and split it up creating a huge vector of word tokens.
    split_text<-strsplit(tolower(raw_text),"[^a-z]+")
    # Count the number of different type of words.
    word_count<-table(split_text)
    # Sort the words and create an ordered vector with the most common type of words first.
    sorted_words<-names(sort(word_count,decreasing=TRUE))
    correct<-function(word)
    # Calculate the edit distance between the word and all other words in sorted_words.
    edit_dist<-adist(word,sorted_words)
    # Calculate the minimum edit distance to find a word that exists in big.txt
    # with a limit of two edits.
    min_edit_dist<-min(edit_dist,2)
    # Generate a vector with all words with this minimum edit distance.
    # Since sorted_words is ordered from most common to least common, the resulting
    # vector will have the most common / probable match first.
    proposals_by_prob<-c(sorted_words[edit_dist<=min(edit_dist,2)])
    # In case proposals_by_prob would be empty we append the word to be corrected...
    proposals_by_prob<-c(proposals_by_prob,word)
    # ... and return the first / most probable word in the vector.
    proposals_by_prob[1]
    

    Python 21行代码实现

    """Spelling Corrector in Python 3; see http://norvig.com/spell-correct.html
    
    Copyright (c) 2007-2016 Peter Norvig
    MIT license: www.opensource.org/licenses/mit-license.php
    """
    
    ################ Spelling Corrector 
    
    import re
    from collections import Counter
    
    def words(text): return re.findall(r'\w+', text.lower())
    
    WORDS = Counter(words(open('big.txt').read()))
    
    def P(word, N=sum(WORDS.values())): 
        "Probability of `word`."
        return WORDS[word] / N
    
    def correction(word): 
        "Most probable spelling correction for word."
        return max(candidates(word), key=P)
    
    def candidates(word): 
        "Generate possible spelling corrections for word."
        return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
    
    def known(words): 
        "The subset of `words` that appear in the dictionary of WORDS."
        return set(w for w in words if w in WORDS)
    
    def edits1(word):
        "All edits that are one edit away from `word`."
        letters    = 'abcdefghijklmnopqrstuvwxyz'
        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
        deletes    = [L + R[1:]               for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
        replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
        inserts    = [L + c + R               for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)
    
    def edits2(word): 
        "All edits that are two edits away from `word`."
        return (e2 for e1 in edits1(word) for e2 in edits1(e1))
    

    测试代码

    ################ Test Code 
    
    def unit_tests():
        assert correction('speling') == 'spelling'              # insert
        assert correction('korrectud') == 'corrected'           # replace 2
        assert correction('bycycle') == 'bicycle'               # replace
        assert correction('inconvient') == 'inconvenient'       # insert 2
        assert correction('arrainged') == 'arranged'            # delete
        assert correction('peotry') =='poetry'                  # transpose
        assert correction('peotryy') =='poetry'                 # transpose + delete
        assert correction('word') == 'word'                     # known
        assert correction('quintessential') == 'quintessential' # unknown
        assert words('This is a TEST.') == ['this', 'is', 'a', 'test']
        assert Counter(words('This is a test. 123; A TEST this is.')) == (
               Counter({'123': 1, 'a': 2, 'is': 2, 'test': 2, 'this': 2}))
        assert len(WORDS) == 32192
        assert sum(WORDS.values()) == 1115504
        assert WORDS.most_common(10) == [
         ('the', 79808),
         ('of', 40024),
         ('and', 38311),
         ('to', 28765),
         ('in', 22020),
         ('a', 21124),
         ('that', 12512),
         ('he', 12401),
         ('was', 11410),
         ('it', 10681)]
        assert WORDS['the'] == 79808
        assert P('quintessential') == 0
        assert 0.07 < P('the') < 0.08
        return 'unit_tests pass'
    
    def spelltest(tests, verbose=False):
        "Run correction(wrong) on all (right, wrong) pairs; report results."
        import time
        start = time.clock()
        good, unknown = 0, 0
        n = len(tests)
        for right, wrong in tests:
            w = correction(wrong)
            good += (w == right)
            if w != right:
                unknown += (right not in WORDS)
                if verbose:
                    print('correction({}) => {} ({}); expected {} ({})'
                          .format(wrong, w, WORDS[w], right, WORDS[right]))
        dt = time.clock() - start
        print('{:.0%} of {} correct ({:.0%} unknown) at {:.0f} words per second '
              .format(good / n, n, unknown / n, n / dt))
        
    def Testset(lines):
        "Parse 'right: wrong1 wrong2' lines into [('right', 'wrong1'), ('right', 'wrong2')] pairs."
        return [(right, wrong)
                for (right, wrongs) in (line.split(':') for line in lines)
                for wrong in wrongs.split()]
    
    if __name__ == '__main__':
        print(unit_tests())
        spelltest(Testset(open('spell-testset1.txt')))
        spelltest(Testset(open('spell-testset2.txt')))
    

    致谢:

    1. norvig大大 http://norvig.com/spell-correct.html

    相关文章

      网友评论

          本文标题:拼写自动矫正

          本文链接:https://www.haomeiwen.com/subject/pizjsttx.html