美文网首页
英文文本预处理

英文文本预处理

作者: _龙雀 | 来源:发表于2019-06-12 18:55 被阅读0次
    # NLTK provides the English stop-word corpus that text_to_list reads.
    import nltk
    # Runs at import time: requests the 'stopwords' corpus so that
    # nltk.corpus.stopwords.words('english') can be loaded later.
    nltk.download('stopwords')
    
    def text_to_list(text, stop_words=None):
        """Normalize English text and return its tokens without stop words.

        The input is coerced to ``str``, lower-cased, stripped of characters
        outside a small allow-list, has common contractions expanded, gets
        ``!`` and ``:`` isolated as separate tokens, and is finally split on
        whitespace.

        Args:
            text: Value to tokenize; anything accepted by ``str()``.
            stop_words: Optional iterable of tokens to drop. When ``None``
                (the default) NLTK's English stop-word list is used, which
                requires the ``stopwords`` corpus to be available.

        Returns:
            list: The remaining tokens, in their original order.
        """
        # Imported locally because the surrounding snippet never imports ``re``.
        import re

        text = str(text).lower()

        # Replace every character outside the allow-list with a space.
        # NOTE: ``+-=`` inside the class is a *range* ('+' through '='), so
        # ',', '-', '.', '/', digits, ':', ';', '<' and '=' are all kept.
        # The ':' rule further down relies on ':' surviving this step.
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)

        # Order matters: specific contractions ("what's", "can't") must be
        # rewritten before the generic suffix rules ("'s", "n't") see them.
        substitutions = (
            (r"what's", "what is "),
            (r"\'s", " "),
            (r"\'ve", " have "),
            (r"can't", "cannot "),
            (r"n't", " not "),
            (r"i'm", "i am "),
            (r"\'re", " are "),
            (r"\'d", " would "),
            (r"\'ll", " will "),
            (r",", " "),
            (r"!", " ! "),
            (r":", " : "),
            (r"e - mail", "email"),
        )
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)

        # Remove stop words.
        if stop_words is None:
            from nltk.corpus import stopwords
            stop_words = stopwords.words('english')
        # A set gives O(1) membership tests instead of scanning a list per token.
        stop_words = frozenset(stop_words)

        return [token for token in text.split() if token not in stop_words]
    

    相关文章

      网友评论

          本文标题:英文文本预处理

          本文链接:https://www.haomeiwen.com/subject/ferrfctx.html