Patent Text Classification

Author: 卷心菜呀 | Published 2018-04-24 21:06

    Task description:
    The three files tmt.txt, food.txt and eng.txt form the training set. Each line in a file is one sample: a patent from the TMT industry, the food industry, or the construction ('eng') industry, respectively. test.txt is the test set and contains patents from all three industries. Write a Python program that computes the proportion of each industry's patents in test.txt, e.g. {'tmt': 0.333, 'food': 0.333, 'eng': 0.333}.

    Dataset download: https://pan.baidu.com/s/1P1_w97Gc1kCedcnsC1kc9A (password: o4g0)

    Approach: split each txt file into many small txt files and put them into one folder per class, so that the folder name serves as the class label; a Bunch object can then carry the labels. A minimal sketch of this data-preparation step follows.
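
    The original files have one sample per line. Below is a hedged sketch of the splitting step, assuming the three training files sit in the working directory; the output layout (e.g. train_corpus/tmt/tmt_0.txt) follows the folder convention described above, but the exact per-sample file names are an assumption, not fixed by the dataset.

    # split_corpus.py -- a minimal sketch of the data-preparation step (file names are assumptions)
    import os

    def split_file(src_path, label, out_root="./train_corpus/"):
        '''Write every non-empty line of src_path into its own file under out_root/label/.'''
        out_dir = os.path.join(out_root, label)
        os.makedirs(out_dir, exist_ok=True)
        with open(src_path, "r", encoding="utf-8") as fp:
            for i, line in enumerate(fp):
                line = line.strip()
                if not line:
                    continue
                with open(os.path.join(out_dir, "%s_%d.txt" % (label, i)), "w", encoding="utf-8") as out:
                    out.write(line)

    if __name__ == "__main__":
        for fname, label in [("tmt.txt", "tmt"), ("food.txt", "food"), ("eng.txt", "eng")]:
            split_file(fname, label)

    test.txt can be split the same way into a single placeholder folder under ./test_corpus/, since its true labels are unknown.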

    1. Preprocessing: word segmentation

    #!/usr/bin/env python
    # -*- coding: UTF-8 -*-
    import os
    import jieba

    # Save segmented text to savepath
    def savefile(savepath, content):
        with open(savepath, "w", encoding='utf-8') as fp:
            fp.write(content)
    
    # Read a file that is to be segmented
    def readfile(path):
        with open(path, "r", encoding='utf-8') as fp:
            content = fp.read()
        return content
    
    def corpus_segment(corpus_path, seg_path):
        '''
        corpus_path: path of the raw corpus
        seg_path: path where the segmented corpus is written
        '''
        catelist = os.listdir(corpus_path)  # all subdirectories of corpus_path
        '''
        Each subdirectory name is a class name. For example, in
        train_corpus/tmt/tmt_21.txt, 'train_corpus/' is corpus_path and 'tmt' is one member of catelist.
        '''
    
        # Walk every directory (i.e. every class) and process its files
        for mydir in catelist:
            '''
            Here mydir is the 'tmt' part of train_corpus/tmt/21.txt, i.e. one class from catelist
            '''
            class_path = corpus_path + mydir + "/"  # class subdirectory, e.g. train_corpus/tmt/
            seg_dir = seg_path + mydir + "/"  # matching output directory for the segmented text, e.g. train_corpus_seg/tmt/

            if not os.path.exists(seg_dir):  # create the output directory if it does not exist
                os.makedirs(seg_dir)

            file_list = os.listdir(class_path)  # all raw documents of this class
            '''
            For example, train_corpus/tmt/ contains
            tmt_21.txt,
            tmt_22.txt,
            tmt_23.txt
            ...
            so file_list = [..., 'tmt_21.txt', 'tmt_22.txt', ...]
            '''
            for file_path in file_list:  # iterate over every file in this class directory
                fullname = class_path + file_path  # full path of the file, e.g. train_corpus/tmt/tmt_21.txt
                content = readfile(fullname)  # read the file content

                content = content.replace("\r\n", "").replace("\n", "")  # remove line breaks
                content = content.replace(" ", "")  # remove spaces
                content_seg = jieba.cut(content, HMM=True)  # segment the content with jieba
                savefile(seg_dir + file_path, " ".join(content_seg))  # save the segmented text to the output corpus directory
    
    
    if __name__ == "__main__":
        # Segment the training corpus
        corpus_path = "./train_corpus/"
        seg_path = "./train_corpus_seg/"
        corpus_segment(corpus_path, seg_path)
        print("Training corpus segmentation finished!")

        # Segment the test corpus
        corpus_path = "./test_corpus/"
        seg_path = "./test_corpus_seg/"
        corpus_segment(corpus_path, seg_path)
        print("Test corpus segmentation finished!")
    

    2. Labeling

    #!/usr/bin/env python
    # -*- coding: UTF-8 -*-
    
    import os  # standard-library module for file and directory operations; os.listdir is used below
    import pickle  # standard pickle module, used to serialize the Bunch
    
    from sklearn.utils import Bunch  # in older scikit-learn versions: from sklearn.datasets.base import Bunch
    
    def _readfile(path):
        '''Read a file as raw bytes'''
        with open(path, "rb") as fp:
            content = fp.read()
        return content
    
    def corpus2Bunch(wordbag_path, seg_path):
        catelist = os.listdir(seg_path)  # subdirectories of seg_path, i.e. the class names
        # Create a Bunch instance
        bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
        bunch.target_name.extend(catelist)
        '''
        extend(addlist) is a Python list method:
        it appends every element of addlist to the original list
        '''
    
        # Collect every file under every class directory
        for mydir in catelist:
            class_path = seg_path + mydir + "/"  # class subdirectory
            file_list = os.listdir(class_path)  # all files under class_path
            for file_path in file_list:  # iterate over the files of this class
                fullname = class_path + file_path  # full path of the file
                bunch.label.append(mydir)
                bunch.filenames.append(fullname)
                bunch.contents.append(_readfile(fullname))  # read the file content
    
        # Serialize the Bunch to wordbag_path (create the output directory if it does not exist yet)
        os.makedirs(os.path.dirname(wordbag_path), exist_ok=True)
        with open(wordbag_path, "wb") as file_obj:
            pickle.dump(bunch, file_obj)
    
    if __name__ == "__main__":
        # Build the Bunch for the training set
        wordbag_path = "train_word_bag/train_set.dat"  # path where the Bunch is stored
        seg_path = "train_corpus_seg/"
        corpus2Bunch(wordbag_path, seg_path)
        print("train Bunch created!")
    
        # Build the Bunch for the test set
        wordbag_path = "test_word_bag/test_set.dat"  # path where the Bunch is stored
        seg_path = "test_corpus_seg/"
        corpus2Bunch(wordbag_path, seg_path)
        print("test Bunch created")
    

    3. Vectorization and feature extraction

    #!/usr/bin/env python
    # -*- coding: UTF-8 -*-
    
    # Bunch container class
    from sklearn.utils import Bunch  # in older scikit-learn versions: from sklearn.datasets.base import Bunch
    import pickle
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    # Read a file as raw bytes
    def _readfile(path):
        with open(path, "rb") as fp:
            content = fp.read()
        return content
    
    # Read a pickled Bunch object
    def _readbunchobj(path):
        with open(path, "rb") as file_obj:
            bunch = pickle.load(file_obj)
        return bunch
    
    # Write a Bunch object with pickle
    def _writebunchobj(path, bunchobj):
        with open(path, "wb") as file_obj:
            pickle.dump(bunchobj, file_obj)
    
    # Build the TF-IDF vector space
    def vector_space(stopword_path,bunch_path,space_path,train_tfidf_path=None):
    
        # Read the stop-word list; decode it so the stop words are str and match the tokens produced by the analyzer
        stpwrdlst = _readfile(stopword_path).decode('utf-8').splitlines()
        bunch = _readbunchobj(bunch_path)  # load the Bunch
        # Bunch that will hold the TF-IDF vector space
        tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})
        # tdm: the term-document weight matrix
        # vocabulary: the mapping from each term to its column index
    
        if train_tfidf_path is not None:
            # Test set: reuse the training vocabulary so both matrices share the same columns
            trainbunch = _readbunchobj(train_tfidf_path)
            tfidfspace.vocabulary = trainbunch.vocabulary
            vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, vocabulary=trainbunch.vocabulary)
            tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

        else:
            # Training set: build the vector space model with TfidfVectorizer
            vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
            # sublinear_tf=True: sublinear tf scaling, i.e. use 1 + log(tf) instead of the raw term frequency
            # max_df=0.5: treat terms that appear in more than 50% of the documents as corpus-specific stop words;
            # a term that occurs in over half of the documents has little discriminative power for classification
            tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)  # tdm holds the weight matrix
            tfidfspace.vocabulary = vectorizer.vocabulary_
    
        _writebunchobj(space_path, tfidfspace)
    
    if __name__ == '__main__':
        stopword_path = "train_word_bag/hlt_stop_words.txt"
        bunch_path = "train_word_bag/train_set.dat"
        space_path = "train_word_bag/tfdifspace.dat"
        vector_space(stopword_path,bunch_path,space_path)
        print("train TF-IDF created")
    
        bunch_path = "test_word_bag/test_set.dat"
        space_path = "test_word_bag/testspace.dat"
        train_tfidf_path="train_word_bag/tfdifspace.dat"
        vector_space(stopword_path,bunch_path,space_path,train_tfidf_path)
        print("test TF-IDF created")
    

    4. Classification with Naive Bayes

    #!/usr/bin/env python
    # -*- coding: UTF-8 -*-
    
    
    import pickle
    from sklearn.naive_bayes import MultinomialNB  # multinomial Naive Bayes classifier
    
    
    # Read a pickled Bunch object
    def _readbunchobj(path):
        with open(path, "rb") as file_obj:
            bunch = pickle.load(file_obj)
        return bunch
    
    # Load the training set
    trainpath = "train_word_bag/tfdifspace.dat"  # a Bunch holding the tdm matrix and the vocabulary
    train_set = _readbunchobj(trainpath)
    
    # Load the test set
    testpath = "test_word_bag/testspace.dat"
    test_set = _readbunchobj(testpath)
    
    # Train the classifier on the tdm matrix (document vectors) and the class labels;
    # alpha is the additive (Laplace/Lidstone) smoothing parameter, here set to 0.01
    clf = MultinomialNB(alpha=0.01).fit(train_set.tdm, train_set.label)
    
    # Predict the class of every test document
    predicted = clf.predict(test_set.tdm)
    tmt_count = 0
    food_count = 0
    eng_count = 0

    # Count every prediction. The test documents carry no true labels (test.txt mixes the three
    # industries), so all predictions are counted rather than only label/prediction mismatches.
    for file_name, expct_cate in zip(test_set.filenames, predicted):
        # print(file_name, " --> predicted class:", expct_cate)
        if expct_cate == "tmt":
            tmt_count += 1
        elif expct_cate == "food":
            food_count += 1
        elif expct_cate == "eng":
            eng_count += 1
    
    total = eng_count + food_count + tmt_count
    tmt_ratio = float(tmt_count) / float(total)
    food_ratio = float(food_count) / float(total)
    eng_ratio = float(eng_count) / float(total)
    
    print("predict result: ")
    print("tmt_ratio = %r" %(tmt_ratio))
    print("food_ratio = %r" %(food_ratio))
    print("eng_ratio = %r" %(eng_ratio))
    
    

