美文网首页机器学习+深度学习
NLP入门:文本预处理(一)停用词

NLP入门:文本预处理(一)停用词

作者: Gary_sun | 来源:发表于2019-08-21 16:37 被阅读0次
文本预处理——去停用词
停用词文本可以从https://pan.baidu.com/s/1q21hIK95QU9qDstptd8V8g 自提,不谢
该停用词文本转自 https://blog.csdn.net/FontThrone/article/details/74200026 ,本人尚未创建新的停用词表,后续会继续更新。
# -*- coding: utf-8 -*-
import sys

# Load the stop words stored in one file as a list.
def GetListOfStopWords(filepath):
    """Read a stop-word file and return its lines as a list of strings.

    Args:
        filepath: Path of a UTF-8 encoded text file with one stop word
            per line.

    Returns:
        The file content split on newline characters. Lines are neither
        stripped nor filtered, so a file that ends with a newline yields
        a trailing empty string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' decodes plain UTF-8 like 'utf-8' does, but also drops a
    # leading byte-order mark; otherwise the first stop word of a BOM-prefixed
    # file would start with '\ufeff' and never match a token.
    with open(filepath, encoding='utf-8-sig') as f_stop:
        f_stop_text = f_stop.read()

    return f_stop_text.split('\n')


# 保存List
# def SaveFile(list, filename):
#     f_stop = open(filename, 'w', encoding='utf-8')
#     for item in range(len(list)):
#         if item != len(list):
#             f_stop.writelines((list[item].encode('utf-8')) + '\n')
#         else:
#             f_stop.writelines(list[item].encode('utf-8'))
#     f_stop.close()


# Union of the stop words found in several files.
def GetListUnion(listName):
    """Return the de-duplicated stop words read from every file in listName.

    Args:
        listName: Iterable of file paths, each passed to GetListOfStopWords.

    Returns:
        A list of unique words. It always contains '!' because the
        collection is seeded with it before any file is read. The order is
        the iteration order of a set, i.e. unspecified.
    """
    collected = ['!'] + [
        word
        for path in listName
        for word in GetListOfStopWords(path)
    ]
    # Going through a set removes duplicates across (and within) the files.
    return list(set(collected))


def GetStopWords(listOfFileName, FileName='CNstopwords.txt', keynumber=1):
    """Merge custom stop-word files with the selected default master tables.

    ``keynumber`` is a bit mask that selects which default tables (looked up
    under the module-level ``stop_dir``) are merged in addition to the files
    named in ``listOfFileName``:

        1 -> CNstopwords.txt    (Chinese master table)
        2 -> ENstopwords.txt    (English master table)
        4 -> CNENstopwords.txt  (mixed Chinese/English master table)

    The values may be summed (3, 5, 6, 7). Any other value prints a warning
    and falls back to 1, i.e. only the Chinese master table is added.

    Args:
        listOfFileName: Paths of additional stop-word files. The list is
            copied and never modified.
        FileName: Accepted for backward compatibility; this function does
            not use it.
        keynumber: Bit mask described above.

    Returns:
        The list of unique stop words produced by GetListUnion.
    """
    default_tables = (
        (1, stop_dir + 'CNstopwords.txt'),
        (2, stop_dir + 'ENstopwords.txt'),
        (4, stop_dir + 'CNENstopwords.txt'),
    )

    if keynumber in (1, 2, 3, 4, 5, 6, 7):
        flags = int(keynumber)
    else:
        print('The keynumber is wrong,change keynumber to 1 ')
        flags = 1

    # Work on a copy so repeated calls with the same list do not keep
    # accumulating default tables in the caller's list.
    paths = list(listOfFileName)
    paths.extend(path for bit, path in default_tables if flags & bit)

    return GetListUnion(paths)
#     SaveFile(ListUnion, FileName)

# Directory that holds every stop-word file.
stop_dir = "./stopwords/"

# Additional Chinese stop-word lists.
stopwords_path1 = stop_dir + 'stopwords1893.txt'
stopwords_path2 = stop_dir + 'stopwords1229.txt'
stopwords_path3 = stop_dir + 'stopwordshagongdakuozhan.txt'
stopwords_path4 = stop_dir + 'stop_words_zh.txt'

# Additional English stop-word lists.
stopwords_path5 = stop_dir + 'stop_words_eng.txt'
stopwords_path6 = stop_dir + 'ENstopwords891.txt'

# Every additional list to merge: the Chinese ones first, then the English.
listOfFileName = [
    stopwords_path1,
    stopwords_path2,
    stopwords_path3,
    stopwords_path4,
    stopwords_path5,
    stopwords_path6,
]

# Merge the lists above with the default English master table (keynumber=2).
res = GetStopWords(
    listOfFileName,
    FileName=stop_dir + 'ENstopwords.txt',
    keynumber=2,
)


相关文章

网友评论

    本文标题:NLP入门:文本预处理(一)停用词

    本文链接:https://www.haomeiwen.com/subject/qedisctx.html