英文文本预处理

作者: _龙雀 | 来源:发表于2019-06-12 18:55 被阅读0次

import re

import nltk
# Fetch the NLTK 'stopwords' resource; text_to_list reads it via nltk.corpus.stopwords.
nltk.download('stopwords')

# Ordered (pattern, replacement) pairs applied one after another by
# text_to_list. Order matters: e.g. "what's" and "can't" must be expanded
# before the generic "'s" and "n't" rules would otherwise consume them.
_CLEANUP_RULES = (
    # Replace every character outside the whitelist with a space.
    # NOTE: inside the class, `+-=` is a *range* from '+' to '=' in ASCII,
    # so it also keeps , - . / digits : ; and <. The ':' rule further down
    # relies on ':' surviving this step, so the range is kept as written.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"!", " ! "),
    (r":", " : "),
    (r"e - mail", "email"),
)


def text_to_list(text, stop_words=None):
    """Normalize English text and return its tokens without stop words.

    The input is converted with ``str()``, lower-cased, cleaned with the
    substitutions in ``_CLEANUP_RULES`` (applied in order), split on
    whitespace, and finally filtered against a stop-word collection.

    Args:
        text: The value to process; anything accepted by ``str()``.
        stop_words: Optional iterable of lower-case words to drop. When
            ``None`` (the default), the NLTK English stop-word list is
            used, which requires the NLTK ``stopwords`` resource to be
            available.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    text = str(text).lower()

    # Clean the text
    for pattern, replacement in _CLEANUP_RULES:
        text = re.sub(pattern, replacement, text)
    tokens = text.split()

    # Remove stop words
    if stop_words is None:
        # Imported lazily so that callers who pass their own stop words
        # do not need NLTK at all.
        from nltk.corpus import stopwords
        stop_words = stopwords.words('english')
    # A set gives O(1) membership checks instead of scanning a list per token.
    stop_set = set(stop_words)

    return [token for token in tokens if token not in stop_set]