import ast  # safe parsing of the coordinate literal
import os  # portable lexicon paths
import re  # regular expressions

import jieba  # word segmentation
import matplotlib.pyplot as plt  # plotting
from pyecharts import options as opts
from pyecharts.charts import Geo
from pyecharts.globals import GeoType
# Register the emotion lexicons as custom jieba dictionaries.
def addword(filename):
    """Load every emotion lexicon into jieba as a user dictionary.

    Args:
        filename: Iterable of lexicon file names. Each one is looked up in
            the ``emotion_lexicon`` directory under the current working
            directory and passed to ``jieba.load_userdict``.
    """
    for name in filename:
        # os.path.join yields the same '.\emotion_lexicon\<name>' string on
        # Windows as the old hard-coded prefix, but also works on POSIX and
        # avoids the invalid '\e' escape sequence of the original literal.
        jieba.load_userdict(os.path.join('.', 'emotion_lexicon', name))
# Comment cleaning: strip URLs, @mentions, emoticon codes and filler text.
# The patterns are compiled once at import time rather than once per line.

# "@user" mentions, including the "回复@user:" and "//@user:" reply/repost forms.
_MENTION_RE = re.compile(r"(回复)?(//)?\s*@\S*?\s*(:| |$)")
# Bracketed emoticon codes such as "[偷笑]".
_EMOTICON_RE = re.compile(r"\[\S+\]")
# Web addresses.
_URL_RE = re.compile(
    r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
    re.IGNORECASE)
# Any run of whitespace (including the trailing newline of a raw line).
_WHITESPACE_RE = re.compile(r"\s+")


def _clean_line(text):
    """Return the cleaned form of one raw line, or None if the line is dropped."""
    if '分享图片' in text:
        # Picture-share posts carry no comment text worth analysing.
        return None
    text = _MENTION_RE.sub(" ", text)
    text = _EMOTICON_RE.sub("", text)
    # Topic tags (#...#) are deliberately kept.
    text = _URL_RE.sub("", text)
    # Remove the filler prefixes that precede the (already removed) URL.
    text = text.replace("我在:", "")
    text = text.replace("我在这里:", "")
    # Collapse whitespace runs; the line's own newline becomes a single space.
    return _WHITESPACE_RE.sub(" ", text)


def cleanword():
    """Clean ``weibo.txt`` line by line and write the result to ``cleanword.txt``.

    Lines containing '分享图片' are skipped. Every other line has mentions,
    emoticon codes, URLs and the filler prefixes removed and its whitespace
    collapsed, then is written followed by a newline.

    The output file is opened once and truncated, so running the function
    again regenerates ``cleanword.txt`` instead of appending a second copy
    of every row (the original re-opened the file in append mode for each
    line and never closed it).

    Raises:
        OSError: If ``weibo.txt`` cannot be read or ``cleanword.txt`` cannot
            be written.
    """
    with open('weibo.txt', 'r', encoding='utf-8') as source:
        with open('cleanword.txt', 'w', encoding='utf-8') as target:
            for raw_line in source:
                cleaned = _clean_line(raw_line)
                if cleaned is not None:
                    target.write(cleaned + '\n')
# Build a closure over the emotion lexicons that classifies every cleaned comment.
def emodict(filename):
    """Load the emotion lexicons and return a comment classifier.

    Args:
        filename: Lexicon file names inside the ``emotion_lexicon``
            directory, in the order anger, disgust, fear, joy, sadness.
            Each file holds one word per line.

    Returns:
        A zero-argument function ``splitword`` that reads ``cleanword.txt``
        and returns ``(emotion_list, time_list, address_list)``.

    Raises:
        OSError: If a lexicon file cannot be opened.
    """
    labels = ('anger', 'disgust', 'fear', 'joy', 'sadness')
    # word -> emotion lookup. setdefault keeps the first lexicon that lists
    # a word, matching the precedence of the original if/elif chain while
    # making each token lookup O(1) instead of a scan over five lists.
    word_emotion = {}
    for label, name in zip(labels, filename):
        lexicon_path = os.path.join('.', 'emotion_lexicon', name)
        with open(lexicon_path, 'r', encoding='utf-8') as lexicon:
            for entry in lexicon:
                word_emotion.setdefault(entry.strip(), label)

    def splitword():  # segment each comment and collect emotion, time and location
        """Classify every line of ``cleanword.txt``.

        Each line is expected to look like
        ``<text> <weekday> <month> <day> <HH:MM:SS> +0800 2013 [lat, lng]``.
        The separator ' +0800 2013 ' is hard-coded, so only records with
        that timezone and year are parsed as intended.

        Returns:
            Three parallel lists: the dominant emotion of each comment
            ('no' when it contains no lexicon word), the four time tokens
            before the separator, and the parsed coordinate literal.
        """
        emotion_list, time_list, address_list = [], [], []
        with open('cleanword.txt', 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines carry no record; the original crashed on
                    # them while evaluating an empty address string.
                    continue
                parts = line.split(' +0800 2013 ')
                address = parts[-1]
                tokens = parts[0].split()
                time = tokens[-4:]
                sentence = ''.join(tokens[:-4])
                # Key order doubles as the tie-break order used by max().
                emotion_dict = dict.fromkeys(labels, 0)
                for word in jieba.lcut(sentence):
                    label = word_emotion.get(word)
                    if label is not None:
                        emotion_dict[label] += 1
                dominant = max(emotion_dict, key=emotion_dict.get)
                emotion = dominant if emotion_dict[dominant] else 'no'
                emotion_list.append(emotion)
                time_list.append(time)
                # SECURITY: the original called eval() on text read from a
                # file. literal_eval parses the same "[lat, lng]" literal
                # but cannot execute arbitrary code.
                address_list.append(ast.literal_eval(address))
        print(len(emotion_list))
        print(len(time_list))
        print(len(address_list))
        return emotion_list, time_list, address_list

    return splitword
# Plot how often one emotion occurs across a chosen time mode.
def plotime(emotion, time, emotion_list, time_list):
    """Plot the number of comments with ``emotion`` per weekday, month or hour.

    The figure is saved as ``<time>_<emotion>.png`` and then shown.

    Args:
        emotion: Emotion label to count.
        time: Time mode, one of 'week', 'month' or 'hour'. Any other value
            prints 'enter error!' and returns without plotting.
        emotion_list: Emotion label of each comment.
        time_list: Time tokens of each comment, parallel to
            ``emotion_list``; each entry looks like
            ``['Fri', 'Oct', '11', '21:25:07']``.

    Raises:
        KeyError: If a matching comment carries a weekday, month or hour
            token that is not one of the expected labels.
    """
    # Labels must match the tokens in the timestamps exactly; the original
    # 'Tus'/'Ths' spellings raised KeyError for Tuesday and Thursday.
    week = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    hour = ['{:0>2d}'.format(i) for i in range(24)]
    # mode -> (x-axis labels, bucket extractor, line colour)
    modes = {
        'week': (week, lambda tm: tm[0], 'r'),
        'month': (month, lambda tm: tm[1], 'b'),
        'hour': (hour, lambda tm: tm[-1].split(':')[0], 'y'),
    }
    if time not in modes:
        print('enter error!')
        # Nothing was drawn, so do not save or show an empty figure.
        return
    labels, bucket_of, color = modes[time]
    counts = dict.fromkeys(labels, 0)
    # Walk both lists in lockstep. Looking the emotion up through
    # time_list.index(tm) returned the first comment with an identical
    # timestamp, which miscounts duplicates and is quadratic.
    for label, tm in zip(emotion_list, time_list):
        if label == emotion:
            counts[bucket_of(tm)] += 1
    values = [counts[name] for name in labels]
    plt.plot(labels, values, 'o-', color=color,
             label='{}_{}'.format(time, emotion))
    plt.xlabel(time)  # x-axis title
    plt.ylabel("times")  # y-axis title
    plt.legend(loc="best")
    # Write each count just above its point.
    for a, b in zip(labels, values):
        plt.text(a, b + 1, b, ha='center', va='bottom', fontsize=10)
    plt.savefig('{}_{}.png'.format(time, emotion), dpi=800)
    plt.show()
# Share of each emotion inside a circle of radius r around one comment's location.
def distance(emotion_list, address_list, k=50, r=0.3):
    """Draw a pie chart of the emotions found near ``address_list[k]``.

    A comment counts when its emotion is not 'no' and the Euclidean
    distance between its two coordinates and the centre's is at most ``r``.
    The distance is computed directly on the raw coordinate values, so
    ``r`` is in the same unit as the coordinates. The counts are printed;
    when at least one comment matches, the chart is saved as ``pie.png``
    and shown.

    Args:
        emotion_list: Emotion label of each comment.
        address_list: Coordinate pair of each comment, parallel to
            ``emotion_list``.
        k: Index of the comment used as the centre.
        r: Radius of the area.

    Raises:
        IndexError: If ``k`` is not a valid index into ``address_list``.
    """
    emo1 = {'sadness': 0, 'joy': 0, 'fear': 0, 'disgust': 0, 'anger': 0}
    center = address_list[k]
    limit = r ** 2
    for emotion, address in zip(emotion_list, address_list):
        if emotion == 'no':
            continue
        if sum((center[j] - address[j]) ** 2 for j in range(2)) <= limit:
            emo1[emotion] += 1
    print(emo1)
    if not any(emo1.values()):
        print('No message in the area!')
        # All wedge sizes are zero: there is nothing to draw, and a pie
        # chart cannot be normalised from an all-zero input.
        return
    plt.figure(figsize=(6, 9))  # figure size
    labels = list(emo1)  # wedge labels, in the same order as the sizes
    sizes = list(emo1.values())
    colors = ['red', 'yellow', 'green', 'blue', 'pink']  # one colour per wedge
    explode = (0, 0, 0, 0, 0)  # offset of each wedge from the centre
    plt.pie(sizes, explode=explode, labels=labels, colors=colors,
            autopct='%3.2f%%',  # percentage text with two decimals
            shadow=False,  # no shadow
            startangle=90,  # counter-clockwise start angle
            pctdistance=0.6)  # percentage text distance as a fraction of the radius
    # Equal axis scaling keeps the pie circular.
    plt.axis('equal')
    plt.savefig('pie.png', dpi=800)
    plt.show()
# Mark the spatial distribution of the emotions on a map of Beijing.
def test_geo(emotion_list, address_list):
    """Build a pyecharts ``Geo`` scatter chart of the comments' emotions.

    Comments labelled 'no' are left out. Every other comment becomes one
    point named ``<emotion><index>`` whose value selects the colour bucket
    of its emotion.

    Args:
        emotion_list: Emotion label of each comment.
        address_list: Coordinate pair of each comment, parallel to
            ``emotion_list``. Element 1 is passed as the longitude and
            element 0 as the latitude.

    Returns:
        The configured ``Geo`` chart, not yet rendered.
    """
    # One representative value per emotion; each falls inside the bucket
    # with the same label in the piecewise visual map below.
    emotion_value = {'sadness': 5, 'joy': 15, 'fear': 25, 'disgust': 35, 'anger': 45}
    chart = Geo()
    points = []
    chart.add_schema(maptype='北京')
    for idx, emotion in enumerate(emotion_list):
        if emotion == 'no':
            continue
        point_name = emotion + str(idx)
        points.append((point_name, emotion_value[emotion]))
        coords = address_list[idx]
        # Register the point in the coordinate table: add_coordinate(name, lng, lat).
        chart.add_coordinate(point_name, coords[1], coords[0])
    # Put the data on the map.
    chart.add('', points, type_=GeoType.EFFECT_SCATTER, symbol_size=5)
    # Hide the per-point labels.
    chart.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    # Custom value buckets, one colour per emotion.
    buckets = (
        (1, 10, 'sadness', '#3700A4'),
        (10, 20, 'joy', '#81AE9F'),
        (20, 30, 'fear', '#E2C568'),
        (30, 40, 'disgust', '#FCF84D'),
        (40, 50, 'anger', '#DD0200'),
    )
    pieces = [
        {'min': low, 'max': high, 'label': label, 'color': color}
        for low, high, label, color in buckets
    ]
    # is_piecewise must be True for the custom buckets to take effect.
    visual_map = opts.VisualMapOpts(is_piecewise=True, pieces=pieces)
    title = opts.TitleOpts(title="北京-情绪分布")
    chart.set_global_opts(visualmap_opts=visual_map, title_opts=title)
    return chart
# Entry point: run the whole pipeline step by step.
def main():
    """Clean the comments, classify them and produce all three outputs.

    Registers the lexicons with jieba, regenerates the cleaned comment
    file, classifies each comment, asks the user for an emotion and a time
    mode, then draws the time trend, the pie chart of the area around the
    default centre, and the map, which is rendered to ``test_render.html``.
    """
    lexicon_files = ['anger.txt', 'disgust.txt', 'fear.txt', 'joy.txt', 'sadness.txt']
    addword(lexicon_files)
    cleanword()
    classify = emodict(lexicon_files)
    emotion_list, time_list, address_list = classify()
    chosen_emotion = input('please enter the emotion:')
    chosen_time = input('please enter the time:')
    plotime(chosen_emotion, chosen_time, emotion_list, time_list)
    distance(emotion_list, address_list)
    # Render to an HTML file that can be opened directly in a browser.
    test_geo(emotion_list, address_list).render('test_render.html')


# Run the pipeline only when the file is executed as a script.
if __name__ == '__main__':
    main()
原始评论 weibo.txt
分享图片 我在这里:http://t.cn/z8L6aJV Fri Oct 11 21:25:07 +0800 2013 [39.88293, 116.37024]
@高娅洁 是黑妹吗? 我在:http://t.cn/zRGIa79 Fri Oct 11 19:44:31 +0800 2013 [39.964324, 116.354873]
男士秋冬新款小脚裤! 我在:http://t.cn/zRq5Uhl Sat Oct 12 21:05:40 +0800 2013 [39.83868, 116.37965]
楚国的灭亡皆是因接受了贿赂,让她的小伎俩离自己远一点 我在:http://t.cn/zRGqrYC Fri Oct 11 17:39:22 +0800 2013 [39.929925, 116.42866]
我们的志愿者在全国盲人柔道锦标赛现场 我在:http://t.cn/zRbkKjP Fri Oct 11 10:45:42 +0800 2013 [39.935349, 116.283485]
还没睡醒就被抓来补课 @悠长的 [偷笑]请问新店是只对理工科开放么?文科生在此好惶恐@jimmyzhuang [思考] Fri Oct 11 12:44:07 +0800 2013 [39.926456, 116.450493]
cx. : 说我不真实,我活在现实,0K? 我在这里:http://t.cn/z8AtYtq Fri Oct 11 20:43:56 +0800 2013 [39.84068, 116.31896]
清洗之后的cleanword.txt
清洗后去掉了无意义的评论(如"分享图片"),并删除了网址、@用户名、表情符号等噪声内容
是黑妹吗? Fri Oct 11 19:44:31 +0800 2013 [39.964324, 116.354873]
男士秋冬新款小脚裤! Sat Oct 12 21:05:40 +0800 2013 [39.83868, 116.37965]
楚国的灭亡皆是因接受了贿赂,让她的小伎俩离自己远一点 Fri Oct 11 17:39:22 +0800 2013 [39.929925, 116.42866]
我们的志愿者在全国盲人柔道锦标赛现场 Fri Oct 11 10:45:42 +0800 2013 [39.935349, 116.283485]
还没睡醒就被抓来补课 请问新店是只对理工科开放么?文科生在此好惶恐 Fri Oct 11 12:44:07 +0800 2013 [39.926456, 116.450493]
cx. : 说我不真实,我活在现实,0K? Fri Oct 11 20:43:56 +0800 2013 [39.84068, 116.31896]
emotion_list (如果没有情绪词定义为'no',其他五种情绪正常)
['no', 'no', 'disgust', 'joy', 'fear', 'no', 'no', 'joy', 'anger', 'joy', 'no', 'no', 'no', 'disgust', 'no', 'joy', 'sadness', 'joy', 'sadness', 'joy', 'joy', 'sadness', 'no', 'joy', 'no', 'joy', 'joy', 'sadness', 'sadness', 'joy', 'no', 'no', 'joy', 'joy', 'no', 'no', 'no', 'no', 'no', 'sadness', 'no', 'joy', 'no', 'no', 'joy', 'fear', 'no', 'no', 'joy', 'disgust', 'no', 'no', 'anger', 'joy', 'no', 'no', 'joy', 'no', 'sadness', 'joy', 'joy', 'no', 'no', 'no', 'joy', 'joy', 'joy', 'no', 'no', 'no', 'joy', 'no', 'joy',
'no', 'joy', 'no', 'no', 'joy', 'no', 'no', 'sadness', 'fear', 'no', 'anger', 'no', 'no', 'no', 'joy', 'disgust', 'no', 'no', 'joy', 'sadness', 'no', 'fear', 'joy', 'no', 'sadness', 'sadness', 'no', 'no', 'no', 'no', 'joy', 'joy', 'no', 'sadness', 'no', 'no', 'no', 'sadness', 'joy', 'fear', 'no', 'sadness', 'joy', 'no', 'no', 'no', 'sadness',
情绪的时间变化趋势
五种情绪搭配三种时间模式(周、月、小时),共可生成15种图像
情绪的空间比例分布
以某一坐标为原点,某一距离为半径的范围内各种情绪的分布比例图
网友评论