美文网首页
Python: Word(docx)文档词频统计

Python: Word(docx)文档词频统计

作者: autumn1919 | 来源:发表于2019-07-22 09:07 被阅读0次
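    # Reads the Word (.docx) documents under D:/data_temp and counts word frequencies.
    # Prints, for each document, how often every tracked word occurs, and plots the counts.
    # Documents are opened with the `docx` module imported below.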
    import os 
    import docx
    from pyecharts.charts import Bar
    from pyecharts import options as opts
    # Words whose per-document frequency is printed, plotted and written out.
    words = [
        'security',
        'as',
        'nation',
        '百度',
        'law',
    ]
    
    def getFileName(path):
        """Return the names of the entries in *path* that end in '.docx'.

        Only bare entry names are returned (they are not joined with *path*),
        in the order os.listdir yields them. The extension comparison is
        case-sensitive, so 'REPORT.DOCX' is not included.
        """
        return [entry for entry in os.listdir(path)
                if os.path.splitext(entry)[1] == '.docx']
    # Batch driver: for every Word document in the input directory, count how
    # often each entry of `words` occurs, print the counts, render a bar chart
    # ("<document>-汇总词频统计.html") and write the same numbers to a text file
    # ("<document>-汇总词频统计.txt") in the current working directory.
    data_dir = 'd:/data_temp'
    # Keep only .docx entries (docx.Document is handed nothing else) and skip
    # the "~$..." owner files Word leaves next to a document while it is open.
    lsdir = [f for f in getFileName(data_dir) if not f.startswith('~$')]
    # Line breaks and basic punctuation are treated as word separators.
    separators = str.maketrans(dict.fromkeys('\r\n(),.', ' '))

    for file in lsdir:
        document = docx.Document(os.path.join(data_dir, file))

        # Tally every lower-cased token in the document. split() with no
        # argument splits on any whitespace run, so repeated separators do not
        # produce empty-string "words".
        new_dict = {}
        for paragraph in document.paragraphs:
            for token in paragraph.text.translate(separators).lower().split():
                new_dict[token] = new_dict.get(token, 0) + 1

        # Every tracked word is reported, with 0 for words that never occur.
        plot_name = list(words)
        plot_value = [new_dict.get(k, 0) for k in plot_name]
        for k, count in zip(plot_name, plot_value):
            if count:
                print("%s 单词 %s 的出现频数为 %s 次" % (file, k, count))
            else:
                print("%s 单词 %s 未出现!" % (file, k))

        bar = Bar()
        bar.add_xaxis(plot_name)
        bar.add_yaxis("词语出现次数", plot_value)
        bar.set_global_opts(title_opts=opts.TitleOpts(title="词频统计"))

        # str.replace is not a regex substitution, so the extension has to be
        # removed with splitext to get "<document>-..." output names.
        file_abb = os.path.splitext(file)[0]
        name = "%s-汇总词频统计" % file_abb
        bar.render('%s.html' % name)

        # Explicit UTF-8 so the Chinese header is written identically
        # regardless of the platform's default encoding.
        with open('%s.txt' % name, 'w', encoding='utf-8') as f:
            f.write('词语,频数' + '\n')
            for k, count in zip(plot_name, plot_value):
                f.write('%s, %s\n' % (k, count))
    

    相关文章

      网友评论

          本文标题:Python: Word(docx)文档词频统计

          本文链接:https://www.haomeiwen.com/subject/ntxilctx.html