import requests
from lxml import etree
import json
import time
from wordcloud import WordCloud
from matplotlib import pyplot as plt
# Scrape the reviews of one JD.com product and render them as a word cloud.
#
# Flow: fetch the product page to print the product name, page through the
# comment API collecting each review's text, then draw a word cloud of the
# combined text with matplotlib.

# --- Configuration ---------------------------------------------------------
PRODUCT_ID = '100004404916'
url = 'https://item.jd.com/{}.html'.format(PRODUCT_ID)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134'
}
# Location of the product title attribute inside the product page.
PRODUCT_NAME_XPATH = '//*[@id="detail"]/div[2]/div[1]/div[1]/ul[3]/li[1]/@title'
COMMENT_URL_TEMPLATE = (
    'https://sclub.jd.com/comment/productPageComments.action'
    '?&productId={product_id}&score=0&sortType=5&page={page}'
    '&pageSize=10&isShadowSku=0&rid=0&fold=1'
)
PAGE_COUNT = 20        # number of comment pages to request
REQUEST_TIMEOUT = 10   # seconds; without a timeout a stalled request hangs forever
PAGE_DELAY = 10        # seconds to wait between two comment-page requests
# A font with CJK glyphs is required for Chinese text, otherwise the words
# are rendered as empty boxes. NOTE: this path only exists on Windows.
FONT_PATH = r'C:\Windows\Fonts\simhei.ttf'


def comment_page_url(product_id, page):
    """Return the comment-API URL for one zero-based ``page`` of a product."""
    return COMMENT_URL_TEMPLATE.format(product_id=product_id, page=page)


def parse_comments(text):
    """Extract ``(nickname, content)`` pairs from a comment-API response body.

    Args:
        text: Raw response body, expected to be a JSON object with a
            ``comments`` list of objects.

    Returns:
        A list of ``(nickname, content)`` tuples with newlines removed from
        the content. The list is empty when the body is not valid JSON, is
        not a JSON object, or carries no comments.
    """
    try:
        payload = json.loads(text)
    except ValueError:
        # Empty or non-JSON body (json.JSONDecodeError subclasses ValueError).
        return []
    if not isinstance(payload, dict):
        return []
    raw_comments = payload.get('comments')
    if not isinstance(raw_comments, list):
        return []
    parsed = []
    for comment in raw_comments:
        if not isinstance(comment, dict):
            continue
        content = (comment.get('content') or '').replace('\n', '')
        nickname = comment.get('nickname') or ''
        parsed.append((nickname, content))
    return parsed


def fetch_product_name(product_url, request_headers):
    """Download the product page and return the product title.

    Returns the placeholder ``'(unknown product)'`` when the title cannot be
    located, so that a layout change does not abort the whole run.
    """
    res = requests.get(product_url, headers=request_headers,
                       timeout=REQUEST_TIMEOUT)
    res.encoding = 'gbk'
    # requests substitutes undecodable bytes with a replacement character
    # that GBK cannot encode, so a strict encode() could raise here.
    html = etree.HTML(res.text.encode('gbk', errors='ignore'))
    # Guard against a parse that produced no tree at all.
    titles = html.xpath(PRODUCT_NAME_XPATH) if html is not None else []
    return titles[0] if titles else '(unknown product)'


def fetch_comment_contents(product_id, request_headers, page_count=PAGE_COUNT):
    """Request ``page_count`` comment pages and return all review texts.

    Every response (status code and raw body) and every numbered review is
    printed as it is received. Pages that cannot be parsed contribute no
    reviews; the remaining pages are still requested.
    """
    contents = []
    for page in range(page_count):
        comment_res = requests.get(comment_page_url(product_id, page),
                                   headers=request_headers,
                                   timeout=REQUEST_TIMEOUT)
        print(comment_res.status_code, comment_res.text)
        for nickname, content in parse_comments(comment_res.text):
            contents.append(content)
            print(str(len(contents))+'、',nickname+':',content)
        # Pause between requests only; nothing follows the last page.
        if page < page_count - 1:
            time.sleep(PAGE_DELAY)
    return contents


def show_wordcloud(text, font_path=FONT_PATH):
    """Render ``text`` as a 959x959 word cloud and display it with pyplot."""
    wc = WordCloud(
        font_path=font_path,
        background_color='white',
        width=959,
        height=959,
        # Optional: mask=<image array> shapes the cloud,
        # stopwords=<set of words> hides words you do not want shown.
    ).generate(text)
    # wc.to_file('ss.png') would additionally save the image to disk.
    plt.imshow(wc)
    plt.axis('off')  # hide the axes
    plt.show()


def main():
    """Run the scrape-and-plot pipeline for ``PRODUCT_ID``."""
    productname = fetch_product_name(url, headers)
    print('***************',productname,'**************')
    time.sleep(2)

    # The comment requests carry the product page as Referer; the copy keeps
    # the shared ``headers`` dict unmodified.
    comment_headers = dict(headers, Referer=url)
    contents = fetch_comment_contents(PRODUCT_ID, comment_headers)

    # Each review is preceded by a newline, one review per line.
    all_content = ''.join('\n' + content for content in contents)
    print(all_content)

    if not all_content.strip():
        # WordCloud.generate() raises ValueError when given no words.
        print('No comments were collected; skipping the word cloud.')
        return
    show_wordcloud(all_content)


if __name__ == '__main__':
    main()
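# Figure_1.png  (presumably the screenshot of the resulting word-cloud figure)
# 网友评论  ("user comments" - leftover page text, not part of the script)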