分词

作者: 月夜星空下 | 来源:发表于2020-02-06 17:35 被阅读0次
    import jieba
    import re
    # Read the raw text to be segmented from standard input (the prompt asks the user to type it in).
    word = input("请输入:")
    # Build the stop-word list.
    def stopwordslist(path='/Users/lilong/Desktop/stop_words'):
        """Return the stop words stored in a UTF-8 text file, one per line.

        Args:
            path: Location of the stop-word file. Defaults to the path the
                script originally hard-coded, so existing zero-argument calls
                behave as before.

        Returns:
            A list with one entry per line of the file, each stripped of
            leading and trailing whitespace. Blank lines are kept as empty
            strings.

        Raises:
            OSError: If the file cannot be opened (e.g. it does not exist).
        """
        # The context manager guarantees the file handle is closed, even if
        # reading or decoding fails part-way through.
        with open(path, encoding='UTF-8') as stop_file:
            return [line.strip() for line in stop_file]
    
    # Keep only the runs of CJK characters (U+4E00..U+9FA5) found in the input;
    # everything else (Latin letters, digits, punctuation, whitespace) is dropped.
    cleaned_data = re.findall(u"[\u4e00-\u9fa5]+", word)
    # Glue the runs back together into one string for the segmenter.
    r = ''.join(cleaned_data)
    # Announce the work before it starts rather than after it has finished.
    print("正在分词...")
    # jieba.lcut returns the segmentation result as a list of tokens.
    a = jieba.lcut(r)
    # A set gives O(1) membership tests instead of scanning the whole list
    # once per token.
    stopwords = set(stopwordslist())
    # Remove stop words. The tab guard is carried over from the original
    # script; the text is CJK-only at this point, so it is purely defensive.
    # A comprehension is used so the loop variable does not overwrite `word`.
    kept = [token for token in a if token not in stopwords and token != '\t']
    # The result is outstr: every kept token followed by a single space
    # (so a non-empty result ends with a trailing space, as before).
    outstr = ''.join(token + ' ' for token in kept)
    print(outstr)
    print(type(outstr))
    

    相关文章

      网友评论

          本文标题:分词

          本文链接:https://www.haomeiwen.com/subject/sjyuxhtx.html