没错又来水博客
和图书爬虫思路一致，加了点花里胡哨的东西。
直接上代码
# 作者:Lino
# 参考于作者:Charles
import re
import os
import requests
from bs4 import BeautifulSoup
import bs4
import xlwt
import time
import pickle
# 简化版豆瓣影评获取器
# 生成词云
# 影评及作者.xls保存于当前目录
# 暂不拥有模拟登录功能
def get_page(url):
    """Download *url* and return the response body as text.

    This is a best-effort fetch: any network or HTTP error (connection
    failure, timeout, non-2xx status) yields an empty string instead of
    raising, so callers can simply skip pages that could not be loaded.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The decoded response body, or "" when the request failed.
    """
    headers = {
        # The header name must be spelled exactly "User-Agent"; a misspelled
        # key is sent as an unrelated header and the default client
        # identification of the HTTP library is used instead.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
    except requests.RequestException:
        # Only request-related failures are swallowed; KeyboardInterrupt and
        # programming errors still propagate.
        return ""
    return res.text
def fillCommentsDatas(data, html):
    """Parse one comments page and add its entries to *data* in place.

    Every ``<div class="comment-item">`` contributes one entry
    ``data[nickname] = [date, star, comment]``. Entries are keyed by
    nickname, so a later comment with the same nickname overwrites the
    earlier one.

    Args:
        data: Dict that is updated in place.
        html: HTML text of a comments page; an empty string adds nothing.

    Returns:
        None. Results are written into *data*.
    """
    soup = BeautifulSoup(html, 'lxml')
    for div in soup.find_all('div', attrs={'class': 'comment-item'}):
        if not isinstance(div, bs4.element.Tag):
            continue

        name_tag = div.find('a', attrs={'title': True})
        titled_spans = div.find_all('span', attrs={'title': True})
        short_tag = div.find('span', attrs={'class': 'short'})
        # Skip items missing any required element instead of letting one
        # malformed comment abort the whole page with an AttributeError or
        # IndexError.
        if name_tag is None or not titled_spans or short_tag is None:
            continue

        nickname = name_tag.get('title')
        if len(titled_spans) == 2:
            # Two titled spans: the first carries the rating in markup such as
            # "allstar50", the second carries the date in its title.
            rating = re.search(r'allstar(\d\d)', str(titled_spans[0]))
            star = float(rating.group(1)) / 10 if rating else "无"
            date = titled_spans[1].get('title')
        else:
            # No separate rating span: the first titled span is taken as the date.
            star = "无"
            date = titled_spans[0].get('title')

        # get_text() also works when the span holds more than one child node,
        # a case in which ``.string`` would be None.
        comment = short_tag.get_text().strip()
        data[nickname] = [date, star, comment]
def save_to_pkl(savepath, data):
    """Pickle *data* into the file "影评.pkl" inside directory *savepath*.

    An existing file of that name is overwritten.

    Args:
        savepath: Directory in which the pickle file is created.
        data: Any picklable object.
    """
    # The context manager closes the file even when pickling raises.
    with open(os.path.join(savepath, "影评.pkl"), 'wb') as f:
        pickle.dump(data, f)
def write_to_excel(data):
    """Write the collected comments to '影评.xls' in the current directory.

    The sheet has a header row followed by one row per entry of *data*,
    with columns nickname, rating, date and comment.

    Args:
        data: Mapping of nickname -> [date, star, comment].
    """
    book = xlwt.Workbook(encoding='ascii')
    sheet = book.add_sheet('BookSheet')

    # Column widths for nickname, rating, date and comment, in that order.
    for col_index, col_width in enumerate((4000, 3000, 8000, 30000)):
        sheet.col(col_index).width = col_width

    # One shared style: centred both ways, wrapped, 11 * 20 font height.
    centered = xlwt.Alignment()
    centered.horz = xlwt.Alignment.HORZ_CENTER
    centered.vert = xlwt.Alignment.VERT_CENTER
    centered.wrap = 1

    cell_font = xlwt.Font()
    cell_font.name = "宋体"
    cell_font.height = 11 * 20

    cell_style = xlwt.XFStyle()
    cell_style.font = cell_font
    cell_style.alignment = centered

    # Header row.
    for col_index, title in enumerate(("昵称", "评分", "日期", "评论")):
        sheet.write(0, col_index, title, cell_style)

    # Data rows start at row 1; each record is stored as [date, star, comment]
    # but the sheet shows the rating before the date.
    for row_index, (nickname, record) in enumerate(data.items(), start=1):
        sheet.write(row_index, 0, nickname, cell_style)
        sheet.write(row_index, 1, record[1], cell_style)
        sheet.write(row_index, 2, record[0], cell_style)
        sheet.write(row_index, 3, record[2], cell_style)

    book.save('影评.xls')
if __name__ == '__main__':
    # Crawler entry point: ask for a movie id, fetch 20 comment pages of
    # 20 comments each, then persist everything to 影评.pkl and 影评.xls in
    # the current working directory.
    data = {}
    # Surrounding whitespace in the typed id would corrupt the URL.
    mid = input("输入电影的代号:").strip()
    try:
        for i in range(20):
            url = ("https://movie.douban.com/subject/" + str(mid)
                   + "/comments?start=" + str(i * 20)
                   + "&limit=20&sort=new_score&status=P")
            html = get_page(url)
            # get_page returns "" on failure; there is nothing to parse then.
            if html:
                fillCommentsDatas(data, html)
            # Pause between requests to keep the request rate low.
            time.sleep(1)
    finally:
        # Persist once, and do it even when the crawl is interrupted, so the
        # comments collected so far are not lost.
        save_to_pkl(os.getcwd(), data)
        write_to_excel(data)
from wordcloud import WordCloud
import pickle
import os
import jieba
def generateWordCloud(words, savepath, font_path='simkai.ttf'):
    """Render a word cloud and save it as 'commentscloud.jpg' in *savepath*.

    Args:
        words: Mapping of word -> frequency.
        savepath: Directory in which the image file is written.
        font_path: Font file used for rendering. It defaults to the
            previously hard-coded 'simkai.ttf'; pass another font that
            contains the needed glyphs when that file is not available.
    """
    wc = WordCloud(
        font_path=font_path,
        background_color='white',
        max_words=2000,
        width=1920,
        height=1080,
        margin=5,
    )
    wc.generate_from_frequencies(words)
    wc.to_file(os.path.join(savepath, 'commentscloud.jpg'))
def frequencies(texts, stopwords):
    """Count how often each token occurs across *texts*.

    Each text is segmented with ``jieba.cut``; tokens that appear in
    *stopwords* are ignored.

    Args:
        texts: Iterable of strings to segment.
        stopwords: Iterable of tokens to exclude from the counts.

    Returns:
        A dict mapping token -> number of occurrences. Empty when *texts*
        is empty.
    """
    # Build the lookup set once: membership tests on a list are O(n) and
    # would be repeated for every single token.
    stopword_set = set(stopwords)
    words_dict = {}
    for text in texts:
        for token in jieba.cut(text):
            if token in stopword_set:
                continue
            words_dict[token] = words_dict.get(token, 0) + 1
    return words_dict
if __name__ == '__main__':
    # Word-cloud entry point: load the pickled comments, count token
    # frequencies without stop words and render the cloud into the current
    # working directory.
    #
    # NOTE: pickle.load can execute arbitrary code; only load a 影评.pkl that
    # was produced locally by the crawler, never a file from an untrusted source.
    with open('影评.pkl', 'rb') as f:
        data = pickle.load(f)
    # Each record is [date, star, comment]; only the comment text is needed.
    texts = [record[2] for record in data.values()]
    # Read one stop word per line. Stripping the newline per line keeps the
    # last word even when the file does not end with a newline, and the
    # context manager closes the file.
    with open('stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = [line.rstrip('\n') for line in f]
    words_dict = frequencies(texts, stopwords)
    generateWordCloud(words_dict, os.getcwd())
效果
Excel
词云
网友评论