发姐, that "pure and unpretentious" streamer, is done for... 😔
Let's scrape the comments, see what everyone makes of the incident, and pick up a few skills along the way.
Disclaimer: this write-up borrows code from a few experts. Please bear with me, and I'll take it down if it infringes.
Open one answer under the question, find the XHR request that loads its comments, and analyze the JSON it returns.
Import the libraries:
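Before writing the crawler it helps to probe one page of that endpoint and look at the shape of the response. A minimal sketch, using the answer id and query parameters from this post (the cookie and headers used in get_json below may be needed if Zhihu rejects anonymous requests):

from urllib.parse import urlencode
import requests

params = {
    'include': 'data[*].author,collapsed,reply_to_author,disliked,content,'
               'voting,vote_count,is_parent_author,is_author',
    'order': 'normal', 'limit': '20', 'offset': 0, 'status': 'open',
}
url = 'https://www.zhihu.com/api/v4/answers/457413146/comments?' + urlencode(params)
page = requests.get(url, headers={'user-agent': 'Mozilla/5.0'}).json()
print(list(page))                    # expect a 'data' key holding the list of comments
print(page['data'][0]['content'])    # text of the first comment on this page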
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import json
import jieba
import binascii
from urllib.parse import urlencode
import sys
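Of these, only requests, jieba, wordcloud and matplotlib are third-party packages; the rest ship with Python 3. Any missing ones can be installed with pip install requests jieba wordcloud matplotlib.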
Get the JSON data:
def get_json(url):
    headers = {
        'cookie': 'd_c0="AHBkqIQryw2PTouoMHWcxeuH1TbpVnfySVU=|1529752806"; _zap=83268240-1571-4dd8-a7d0-52d2c5148795; _xsrf=PB8R3d9skggRQW2MQBu2AfAOL9g5IETF; q_c1=3b96c96861f3458db95a2fa70fc3fb65|1532427518000|1529752806000; tgw_l7_route=1c2b7f9548c57cd7d5a535ac4812e20e; l_n_c=1; l_cap_id="YzVmZjE3NDNjZDY1NDA1YWJhZTVlOTTMwOTFiYjQ=|1533105697|aa6d722a350fe7fd049fadfb92dcdc363946f1e9"; r_cap_id="NzE5OTI0OGFmMDQ2NGExNjllOWEzODhiN2YxZDM4MDk=|1533105697|3fb366cf6109df0d32807ebb2acc2997edb293c6"; cap_id="Zjg3MmJhYTRkMjE4NGU4NmE5MGM4ZDBhMDQxMjc3ZDM=|1533105697|279da2af57248cc79a4fa522c0343e4f999c2a25"; n_c=1',
        'referer': 'https://www.zhihu.com/question/287656023',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # Let requests pick the apparent encoding so the Chinese text decodes correctly
    response.encoding = response.apparent_encoding
    return response.text
Process the data:
def get_comments(code_json):
    json_dict = json.loads(code_json)
    for item in json_dict['data']:
        # Round-trip the text through hex so the Chinese characters decode as UTF-8
        comment = item['content'].encode('utf-8')
        comment = binascii.b2a_hex(comment)
        comment = binascii.a2b_hex(comment).decode("utf8")
        yield comment
Why so much fiddling with comment here? Because the Chinese characters come back in hexadecimal form, and these lines handle that. I dug around online for ages; Baidu's search relevance really is worrying...
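If you want to see what those binascii calls actually do to a Chinese string, here is a quick check you can run in any Python 3 shell (a minimal standalone sketch, independent of the crawler):

import binascii

s = '凉了'                                        # sample Chinese text
raw = s.encode('utf-8')                           # b'\xe5\x87\x89\xe4\xba\x86'
hexed = binascii.b2a_hex(raw)                     # b'e58789e4ba86', the hex form
back = binascii.a2b_hex(hexed).decode('utf-8')    # '凉了' again after the round trip
print(hexed, back)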
Main function & word cloud generation:
def wordcloud(all_comments):
    # Segment each comment and filter out stop words.
    # Remember encoding='utf-8' when opening and saving files, otherwise it errors out.
    def seg_sentence(sentence):
        sentence_seged = jieba.cut(sentence.strip(), cut_all=False)  # precise mode
        stopwords = [line.strip() for line in open('stopwords.txt', 'r', encoding='utf-8').readlines()]  # path to the stop-words file
        outstr = ''
        for word in sentence_seged:
            if word not in stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        return outstr

    for line in all_comments:
        line_seg = seg_sentence(line)  # returns a string
        with open('outputs.txt', 'a', encoding='utf-8') as f:
            f.write(line_seg + '\n')

    data = open('outputs.txt', 'r', encoding='utf-8').read()
    my_wordcloud = WordCloud(
        background_color='white',  # background color
        max_words=200,             # maximum number of words to render
        font_path=r'SimHei.ttf',   # a Chinese font; without it Chinese characters will not render
    ).generate(data)
    plt.figure()
    plt.imshow(my_wordcloud)
    plt.axis('off')
    plt.show()  # show the word cloud
def main():
    comment_list = []
    for i in range(0, 800, 20):
        url = "https://www.zhihu.com/api/v4/answers/457413146/comments?"
        data = {
            'include': 'data[*].author,collapsed,reply_to_author,disliked,content,voting,vote_count,is_parent_author,is_author',
            'order': 'normal',
            'limit': '20',
            'offset': i,
            'status': 'open'
        }
        data = urlencode(data)
        url = url + data
        code_json = get_json(url)
        sys.stdout.write(" Downloaded: %.3f%%" % float(i / 800 * 100) + '\r')  # \r refreshes a single line, so only one line of progress
        sys.stdout.flush()
        for result in get_comments(code_json):
            # print(result)
            comment_list.append(result)
    wordcloud(comment_list)

if __name__ == '__main__':
    main()
The Chinese font (SimHei.ttf) and the stop-words file used for the word cloud were both found online.
I grabbed 800 comments; the word cloud came out like this:
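The stop-words file is simply one word per line. If you also want the cloud written to disk (the PNG shown below), WordCloud can save it directly; assuming the same my_wordcloud object from above and an output name of your choice:

my_wordcloud.to_file('陈一发.png')   # save the rendered cloud next to the script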
[Images: the generated word cloud (陈一发.png), "你是我心中的禅", and a screenshot of netizen comments]