import requests
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Crawl the data
for i in range(0, 200, 20):
    # The data API was found with the browser's "Inspect" tool: while scrolling the page,
    # only the "start" parameter in the URL changes, taking the values 0, 20, 40, 60, 80, ...
    url = 'https://m.douban.com/rexxar/api/v2/gallery/topic/125573/items?' \
          'sort=new&start={}&count=20&status_full_text=1&guest_only=0&ck=null'.format(i)
    print(url)
    # Work around the anti-scraping check by sending request headers
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
               'Referer': 'https://www.douban.com/gallery/topic/125573/?from=gallery_trend&sort=hot'}
    # Send the request and get the response
    response = requests.get(url, headers=headers)
    html = response.json()
    # Parse the response, extract each short comment, and append it to a local file
    for item in html['items']:
        abst = item['abstract']
        with open("want_after.txt", "a", encoding='utf-8') as f:
            f.write(abst)
        print(abst)
# Build the whitespace-separated text that WordCloud expects
with open("want_after.txt", "r", encoding='utf-8') as f:
    words = ' '.join(jieba.cut(f.read(), cut_all=False))
background_image = plt.imread(r'C:\Users\tomding\Videos\图片1.png')  # background image used as the mask
# WordCloud parameter settings
wcloud = WordCloud(
    background_color='white',
    mask=background_image,
    font_path=r'C:\Windows\Fonts\simhei.ttf',
    max_words=200,
    max_font_size=200,
    min_font_size=8,
    random_state=50)
# Generate the word cloud, display it, and save it to disk
word_cloud = wcloud.generate_from_text(words)
plt.imshow(word_cloud)
plt.axis('off')
plt.show()
wcloud.to_file('结果.jpg')
# Check which words occur most frequently
process_word = wcloud.process_text(words)
words_sorted = sorted(process_word.items(), key=lambda item: item[1], reverse=True)
sort_after = words_sorted[:50]
print(sort_after)
# Save the word frequencies to a CSV file
df = pd.DataFrame(sort_after, columns=['word', 'count'])
# utf_8_sig keeps the Chinese text from being garbled when the file is opened in Excel
df.to_csv('sort_after.csv', encoding='utf_8_sig')