文本预处理——去停用词
停用词文本可以从 https://pan.baidu.com/s/1q21hIK95QU9qDstptd8V8g 自行下载。
该停用词文本转自 https://blog.csdn.net/FontThrone/article/details/74200026 ,本人尚未创建新的停用词表,后续会更新。
# -*- coding: utf-8 -*-
import sys
# Load the stop words of one file into a list
def GetListOfStopWords(filepath):
    """Read a stop-word file and return its lines as a list of strings.

    The file is decoded as UTF-8. A leading byte-order mark, which some
    editors prepend to UTF-8 files, is discarded so that it cannot end up
    attached to the first stop word and prevent it from ever matching.

    Args:
        filepath: Path of the stop-word file, one entry per line.

    Returns:
        The file content split on newline characters. A file that ends
        with a newline therefore yields a trailing empty string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # 'utf-8-sig' decodes exactly like 'utf-8' but skips a BOM if present.
    with open(filepath, encoding='utf-8-sig') as f_stop:
        f_stop_text = f_stop.read()
    return f_stop_text.split('\n')
# Save a list to a file (disabled; the code below is kept commented out)
# def SaveFile(list, filename):
# f_stop = open(filename, 'w', encoding='utf-8')
# for item in range(len(list)):
# if item != len(list):
# f_stop.writelines((list[item].encode('utf-8')) + '\n')
# else:
# f_stop.writelines(list[item].encode('utf-8'))
# f_stop.close()
# Compute the union of several stop-word lists
def GetListUnion(listName):
    """Merge the stop words of several files into one duplicate-free list.

    Args:
        listName: Iterable of stop-word file paths; each one is read with
            GetListOfStopWords.

    Returns:
        A list holding every distinct entry found across the files, plus
        '!', which is always included. The result passes through a set,
        so its order is arbitrary.
    """
    # Start from '!' and append the entries of every file in turn.
    collected = ['!'] + [
        word
        for path in listName
        for word in GetListOfStopWords(path)
    ]
    return list(set(collected))
def GetStopWords(listOfFileName, FileName='CNstopwords.txt', keynumber=1):
    """Merge the given stop-word files with the selected default tables.

    ``keynumber`` is a bit mask choosing which default master tables,
    located under the module-level ``stop_dir``, are read in addition to
    the files in ``listOfFileName``:

        1 -> CNstopwords.txt    (Chinese master table)
        2 -> ENstopwords.txt    (English master table)
        4 -> CNENstopwords.txt  (mixed Chinese/English master table)

    The values can be combined (3, 5, 6, 7). Any other value is reported
    on stdout and handled as 1.

    Args:
        listOfFileName: Paths of the extra stop-word files to merge. The
            list is copied, so the caller's list is left unchanged and
            repeated calls do not accumulate default paths.
        FileName: Unused. Kept so existing callers keep working; it was
            the target of a save step that is currently disabled.
        keynumber: Bit mask described above.

    Returns:
        The list of distinct stop words produced by GetListUnion.
    """
    # (bit, path) pairs, in the order the tables are appended.
    default_tables = (
        (1, stop_dir + 'CNstopwords.txt'),
        (2, stop_dir + 'ENstopwords.txt'),
        (4, stop_dir + 'CNENstopwords.txt'),
    )

    # Membership in range() compares with ==, so values such as 1.0 are
    # accepted the same way the former ``keynumber == 1`` tests did.
    if keynumber in range(1, 8):
        mask = int(keynumber)
    else:
        print('The keynumber is wrong,change keynumber to 1 ')
        mask = 1

    # Work on a copy: appending to the argument would leak the default
    # paths back into the caller's list.
    paths = list(listOfFileName)
    paths.extend(path for bit, path in default_tables if mask & bit)

    # NOTE: writing the merged list to FileName is currently disabled.
    return GetListUnion(paths)
# Directory holding the stop-word files; GetStopWords also reads this name.
stop_dir = "./stopwords/"

# Chinese stop-word lists to merge
stopwords_path1 = stop_dir + 'stopwords1893.txt'
stopwords_path2 = stop_dir + 'stopwords1229.txt'
stopwords_path3 = stop_dir + 'stopwordshagongdakuozhan.txt'
stopwords_path4 = stop_dir + 'stop_words_zh.txt'

# English stop-word lists to merge
stopwords_path5 = stop_dir + 'stop_words_eng.txt'
stopwords_path6 = stop_dir + 'ENstopwords891.txt'

# All files to merge: the Chinese lists first, then the English ones.
listOfFileName = [
    stopwords_path1,
    stopwords_path2,
    stopwords_path3,
    stopwords_path4,
    stopwords_path5,
    stopwords_path6,
]

# Merge the lists above with the default table(s) selected by keynumber=2.
res = GetStopWords(
    listOfFileName,
    FileName=stop_dir + 'ENstopwords.txt',
    keynumber=2,
)
网友评论