前言
嗨喽,大家好呀~这里是爱看美女的茜茜呐
又到了学Python时刻~今天我们来采集一下评论数据!
微博动态数据抓包+所有的数据提取方式+词云图可视化
开发环境:
-
python 3.8: 解释器
-
pycharm: 代码编辑器
-
requests 第三方模块
采集评论代码
# 导入模块
import requests
import parsel
import re
import csv
import time
# NOTE(review): the original paste lost several lines to blog extraction —
# the closing brace of `headers`, the `url` assignment, the `def get_next(...)`
# header (the bare `return 0` and the trailing `get_next(sub_)` call prove the
# function wrapper existed) and the csv writer setup. They are reconstructed
# below so the script parses and runs; unrecoverable details are marked.

# Request headers for weibo.com; the cookie carries the login session.
headers = {
    'cookie': 'XSRF-TOKEN=V48EJHd1wO3DP9ffnlwgfvQr; WBPSESS=yr8Ogb3qBlrorv2L6-ukSsE1SdVJvjLsi6ub0yOZpfazK2TqOMmvxlay7kNrt6LGuwSQINF-zpQWhR5GxHKCX1k4G2jaPAJoABJpxykZAJt4WAVgjdO_FFGWKvaHbvCJoOFzEoJ5rXkc31Ex4pDEylNKVb9H913jTpjFGBoBha4=; login_sid_t=8f13cfe80a400ba04cd5d9094175b145; cross_origin_proto=SSL; WBStorage=4d96c54e|undefined; _s_tentry=weibo.com; Apache=9429320084537.793.1662010843614; SINAGLOBAL=9429320084537.793.1662010843614; ULV=1662010843618:1:1:1:9429320084537.793.1662010843614:; wb_view_log=1920*10801; SSOLoginState=1662010869; SUB=_2A25OFDZPDeRhGeFI6lsT-CnPyDqIHXVtYCCHrDV8PUNbmtANLXDXkW9NfV7QbU7-nuy6Ejf4yBGzw8ymJY1CysT9; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWGGcL4DsCvRg-RQA6cXEKN5JpX5KzhUgL.FoMceK.E1hM0e0q2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNSo24eonNe0ec; ALF=1693546910; wvr=6; wb_view_log_7619287336=1920*10801; webim_unReadCount=%7B%22time%22%3A1662011426887%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A23%2C%22msgbox%22%3A0%7D; PC_TOKEN=b6ef7633b7',
    # NOTE(review): the original very likely also sent a 'user-agent' and
    # 'referer' header here — lost in the paste, confirm against the source.
}

# Output CSV that the word-cloud script below reads back in.
# NOTE(review): writer setup was missing from the paste; reconstructed.
f_csv = open('微博评论.csv', mode='a', encoding='utf-8', newline='')
csv_writer = csv.writer(f_csv)


def get_next(sub_):
    """Fetch one page of weibo comments, append rows to the CSV, recurse.

    `sub_` is the query-string fragment (id=...&...) identifying the page;
    recursion stops when no next-page action-data token is found.
    """
    # NOTE(review): the exact endpoint was lost in the paste; this is the
    # weibo big-comment ajax endpoint the response shape matches — confirm.
    url = f'https://weibo.com/aj/v6/comment/big?ajwvr=6&{sub_}'
    response = requests.get(url=url, headers=headers)
    # The ajax endpoint wraps the rendered comment list HTML in JSON.
    html_data = response.json()['data']['html']
    selector = parsel.Selector(html_data)
    # .list_box > .list_ul > div .list_con .WB_text:nth-child(1)
    divs = selector.css('.list_box > .list_ul > div')
    # Pull the next-page token out of the embedded action-data attribute;
    # on the last page the pattern is absent and we fall back to ''.
    try:
        sub_ = re.findall('action-data="(id=4808806519278561.*?)"', html_data)[0]
    except IndexError:  # was a bare `except:`; only the [0] lookup can fail
        sub_ = ''
    print(sub_)
    # Last div is pagination chrome, not a comment — skip it.
    for div in divs[0:-1]:
        content = div.css('.list_con .WB_text:nth-child(1)::text').getall()[1].replace(':', '').replace(' ', '')
        imgUrl = div.css('.WB_face.W_fl img::attr(src)').get()
        user = div.css('.WB_text a:nth-child(1)::text').get()
        time_ = div.css('.WB_from.S_txt2::text').get()
        print(user, content, time_, imgUrl)
        csv_writer.writerow([user, content, time_, imgUrl])
    if sub_ == '':
        return 0
    # NOTE(review): `time` is imported above but never used in the paste —
    # possibly a time.sleep() throttle lived here originally.
    get_next(sub_)


# Seed the crawl with the first page's query fragment.
get_next('id=4808806519278561&from=singleWeiBo&__rnd=1662011439459')
词云图代码
import jieba
import pandas as pd
import stylecloud
# Load the scraped comments CSV produced by the scraper script above.
df_wb = pd.read_csv('微博评论.csv')
def get_cut_words(content_series):
    """Segment a pandas Series of comment strings with jieba.

    Joins all comments into one text, cuts it in precise mode, and returns
    a list of tokens with stop words and single-character tokens removed.
    """
    # Load the stop-word list, one word per line.
    with open("stop_words.txt", 'r', encoding='utf-8') as f:
        stop_words = [line.strip() for line in f]
    # Teach jieba domain phrases so they are not split apart.
    my_words = ['没有欲望', '便宜点']
    for word in my_words:
        jieba.add_word(word)
    # Ad-hoc extra stop words (currently none).
    my_stop_words = []
    stop_words.extend(my_stop_words)
    # Segment the concatenated comments (precise mode, cut_all=False).
    word_num = jieba.lcut(content_series.str.cat(sep='。'), cut_all=False)
    # Set gives O(1) membership tests instead of O(n) list scans per token.
    banned = set(stop_words)
    # Keep tokens that are not stop words and are at least 2 chars long.
    return [w for w in word_num if w not in banned and len(w) >= 2]
# Tokenize the comment column, then render the tokens as a word cloud image.
tokens = get_cut_words(content_series=df_wb['content'])
stylecloud.gen_stylecloud(
    text=' '.join(tokens),
    size=768,
    collocations=False,                       # single tokens only, no bigrams
    icon_name='fab fa-apple',                 # apple-shaped mask
    font_path=r'C:\Windows\Fonts\msyh.ttc',   # Microsoft YaHei for CJK glyphs
    output_name='iPhone.png',
)
记得点赞鸭~
尾语 💝
感谢你观看我的文章呐~本次航班到这里就结束啦 🛬
希望本篇文章有对你带来帮助 🎉,有学习到一点知识~
躲起来的星星🍥也在努力发光,你也要努力加油(让我们一起努力叭)。
最后,博主要一下你们的三连呀(点赞、评论、收藏),不要钱的还是可以搞一搞的嘛~
不知道评论啥的,即使扣个6666也是对博主的鼓舞吖 💞 感谢 💐
网友评论