Text classification with sklearn + jieba word segmentation

Author: 碎冰op | Published 2017-06-21 23:38

    jieba (结巴分词) is a Python module for Chinese word segmentation.

    import jieba
    luca = jieba.cut('遇见你真的是太好了')
    print(list(luca))  # jieba.cut() returns a generator rather than a list; materialise it with list(), set(), ''.join(), etc., or use jieba.lcut() to get a list directly

    import jieba.analyse
    for x, w in jieba.analyse.extract_tags(text, withWeight=True):  # 'text' is any document string; pass topK=n to limit the number of keywords, withWeight=True returns (keyword, weight) pairs
        print(x, w)  # prints each keyword with its TF-IDF weight
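
    For reference, jieba.analyse can also be pointed at a custom stop-word file before extracting keywords (a minimal sketch, reusing the stop_words.txt path from the next section; adjust it to your own setup):

    import jieba.analyse

    jieba.analyse.set_stop_words(r'F:\jieba\stop_words.txt')  # one stop word per line
    for x, w in jieba.analyse.extract_tags('遇见你真的是太好了', topK=5, withWeight=True):
        print(x, w)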
    

    The high-frequency keywords returned by this jieba function were not very satisfying in practice, so I did not end up using it; I also skipped analyse's built-in stop-word handling and removed stop words by hand instead:

    import re
    import jieba

    def rm_char1(text1):
        text1 = re.sub('\u3000', '', text1)    # strip full-width (ideographic) spaces
        return text1

    def rm_char2(text2):
        text2 = re.sub('\xa0', '', text2)      # strip non-breaking spaces
        return text2

    def get_stop_words():
        # stop_words.txt holds one stop word per line, separated by \n
        with open(r'F:\jieba\stop_words.txt', 'r', encoding='utf8') as f:
            file = f.read().split('\n')
        return set(file)

    def rm_tokens(words):  # drop stop words and pure digits
        words_list = list(words)
        stop_words = get_stop_words()
        for i in range(len(words_list) - 1, -1, -1):  # iterate backwards so pop() does not shift unseen indices
            if words_list[i] in stop_words:    # remove stop words
                words_list.pop(i)
            elif words_list[i].isdigit():      # remove digits
                words_list.pop(i)
        return words_list

    def convert_text_to_wordlist(str_doc):
        # main segmentation routine
        sent_list = str_doc.split('\n')
        sent_list = map(rm_char1, sent_list)  # strip characters such as \u3000
        sent_list = map(rm_char2, sent_list)  # strip \xa0
        word_2dlist = [rm_tokens(jieba.cut(part)) for part in sent_list]  # segment each line
        word_list = sum(word_2dlist, [])      # flatten the list of lists
        return word_list
    

    To use it, simply call:

    luca = convert_text_to_wordlist('遇见你真的是太好了')
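
    Depending on what stop_words.txt contains, this returns the segmented tokens with stop words and digits stripped. The rest of the post builds a small classifier with sklearn; its vectorizers expect each document as a single space-separated string, so a token list only needs to be joined first (a small sketch; the loaders below do the same with the raw jieba.cut output):

    doc = '遇见你真的是太好了'
    words = ' '.join(convert_text_to_wordlist(doc))  # space-separated tokens, ready for TfidfVectorizer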
    
    import jieba
    import jieba.analyse
    import csv

    def get_dataset():
        # read the training CSV: column 1 holds a T/F label, column 2 the text
        data, targetdata = [], []
        with open(r'D:\datatrain.csv', 'r', encoding='gb18030') as file:
            f = csv.reader(file)
            for line in f:
                seglist = jieba.cut(line[2])
                words = ' '.join(seglist)    # space-separated tokens for the vectorizer
                data.append(words)
                targetdata.append(1 if 'T' in line[1] or 't' in line[1] else 0)
        return data, targetdata

    def get_testset():
        # same layout as the training file
        testdata, targettest = [], []
        with open(r'D:\datatest.csv', 'r', encoding='gb18030') as file:
            f = csv.reader(file)
            for line in f:
                seglist = jieba.cut(line[2])
                words = ' '.join(seglist)
                testdata.append(words)
                targettest.append(1 if 'T' in line[1] or 't' in line[1] else 0)
        return testdata, targettest
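
    These loaders assume a CSV layout where column 1 holds a T/F label and column 2 the text; the sample rows below are made up purely for illustration:

    # hypothetical rows of D:\datatrain.csv / D:\datatest.csv (gb18030-encoded)
    # id,label,text
    #   001,T,遇见你真的是太好了
    #   002,F,今天没什么想说的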
    
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn import svm

    def data_pro():
        data_, target_train = get_dataset()
        testdata, target_test = get_testset()

        # fit the TF-IDF vocabulary on the training texts only, then reuse it for the test set
        v = TfidfVectorizer()
        train_data = v.fit_transform(data_)
        test_data = v.transform(testdata)

        return train_data, target_train, test_data, target_test
    
    train_data, target_train, test_data, target_test = data_pro()
    clf = MultinomialNB(alpha=0.01)
    clf.fit(train_data, target_train)
    pred = clf.predict(test_data)
    '''# not as good as Naive Bayes on this data
    svc = svm.SVC(kernel='linear')
    svc.fit(train_data, target_train)
    pred = svc.predict(test_data)
    '''
    count = 0
    for l, r in zip(pred, target_test):
        if l == r:
            count += 1
    print(count / len(target_test))  # print the accuracy
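
    Equivalently, sklearn's own metrics give the accuracy (plus a per-class breakdown) without the manual loop; a small sketch:

    from sklearn.metrics import accuracy_score, classification_report

    print(accuracy_score(target_test, pred))         # same number as the manual count above
    print(classification_report(target_test, pred))  # precision / recall / F1 per class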
    
