# -*- coding: utf-8 -*-
"""
Created on Fri Aug 31 08:57:45 2018
@author: Shirley
"""
import xlrd
import jieba
from collections import defaultdict
from openpyxl import load_workbook
# Load the stop-word list: one entry per line of the text file.
# Only the newline character is removed (rather than calling .strip()) so that
# any spaces belonging to an entry are kept.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm it matches the encoding stopwords.txt was saved with.
with open("D:/anaconda/shirleylearn/cipintongji/stopwords.txt", "r") as stopword_file:
    stopwords = [line.replace("\n", "") for line in stopword_file]
# Per-comment word-frequency statistics.
#
# Reads every comment from column B of "Sheet1" (row 1 is the header),
# segments it with jieba, counts the words that are not stop words, and
# writes a summary such as " word(3) other(1)" into column C of the same row.
path = "D:/anaconda/shirleylearn/cipintongji/wordexample2.xlsx"


def _word_frequencies(words, excluded):
    """Count each word in *words*, skipping the ones contained in *excluded*.

    A fresh mapping is returned on every call, so counts never accumulate
    across comments.
    """
    counts = defaultdict(int)
    for word in words:
        if word not in excluded:
            counts[word] += 1
    return counts


def _format_frequencies(counts):
    """Render *counts* as " word(n) word(n) ...", most frequent word first.

    Returns an empty string when *counts* is empty.
    """
    # sorted() is stable even with reverse=True, so words that share a count
    # keep the order in which they first appeared in the comment.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Every entry carries its own leading space, which reproduces the layout
    # of previously generated result cells exactly.
    return "".join(" {}({})".format(word, count) for word, count in ranked)


# Build the set once: membership tests against a list rescan it for every
# single token of every comment.
stopword_set = set(stopwords)

# xlrd 2.0+ refuses to open .xlsx files, so the workbook is both read and
# written through openpyxl; this also means it is only loaded once.
loadfile = load_workbook(path)
sheet = loadfile["Sheet1"]

cutlist = []
for row_index in range(2, sheet.max_row + 1):
    cell_value = sheet.cell(row=row_index, column=2).value
    # Empty cells come back as None and numeric cells as numbers; jieba needs
    # text, so normalise both instead of crashing on them.
    comment = "" if cell_value is None else str(cell_value)
    cutlist.append(_format_frequencies(_word_frequencies(jieba.lcut(comment), stopword_set)))

# Store the summaries next to their comments, under a "result" header.
sheet["C1"] = "result"
for row_index, summary in enumerate(cutlist, start=2):
    sheet.cell(row=row_index, column=3, value=summary)
loadfile.save(path)
# 这样的统计结果方便回到原文中挑新词
# 网友评论