文本清理步骤
屏幕快照 2019-04-28 下午21.32.08 下午.png
语句分离器
from nltk.tokenize import sent_tokenize
import nltk.tokenize.punkt
# Split the raw text into a list of sentences.
# NOTE: `inputstring` is the text to clean; it is not defined in this snippet.
splitlist = sent_tokenize(inputstring)
# Instantiate the Punkt tokenizer: without the call parentheses the name is
# bound to the class itself rather than to a usable tokenizer object.
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
标识化处理
# word_tokenize must be imported before use; no other snippet imports it.
from nltk.tokenize import word_tokenize
# Split the string into word and punctuation tokens.
# NOTE: `s` is the string to tokenize; it is not defined in this snippet.
word_tokenize(s)
词干提取
from nltk.stem import PorterStemmer  # Porter stemmer
from nltk.stem.lancaster import LancasterStemmer  # Lancaster stemmer
# The module name is lowercase `snowball`; `nltk.stem.Snowball` is not importable.
from nltk.stem.snowball import SnowballStemmer  # Snowball stemmer
pst = PorterStemmer()
lst = LancasterStemmer()
# Both stemmers reduce "eating" to the same stem.
lst.stem("eating")
# eat
pst.stem("eating")
# eat
词形还原
from nltk.stem import WordNetLemmatizer
wlem = WordNetLemmatizer()
# lemmatize() treats the word as a noun unless told otherwise, which leaves
# "ate" unchanged; pass pos="v" so it is looked up as a verb.
wlem.lemmatize("ate", pos="v")
# eat
停用词移除
from nltk.corpus import stopwords
# English stop-word list from the NLTK stopwords corpus.
stoplist = stopwords.words('english')
text = "this is just a test"
# Keep only the whitespace-separated tokens that are not stop words.
cleanwordlist = list(filter(lambda candidate: candidate not in stoplist, text.split()))
# ['test']
罕见词移除
import nltk
# Count how often each token occurs.
# NOTE: `token` is the list of word tokens; it is not defined in this snippet.
freq_dist = nltk.FreqDist(token)
# most_common() orders entries from most to least frequent, so the last 50
# entries are the rarest words. (Slicing keys() directly fails on Python 3
# and is not ordered by frequency.)
rarewords = [word for word, _count in freq_dist.most_common()[-50:]]
# Set copy of the rare words for fast membership tests while filtering.
rare_lookup = set(rarewords)
# Keep every token that is not one of the rare words. With fewer than 50
# distinct tokens, every word counts as rare and the result is empty.
chuliwords = [word for word in token if word not in rare_lookup]
拼写纠错
from nltk.metrics import edit_distance
# edit_distance() requires the two strings to compare; calling it with no
# arguments raises TypeError. It returns the Levenshtein distance, i.e. the
# number of single-character edits needed to turn one string into the other.
edit_distance("rain", "shine")
# 3
网友评论