![](https://img.haomeiwen.com/i730751/d0501c97f04f8c17.png)
审查元素发现字符内容的xpath样式是:
//div[@class="weibo-text"]/text()
把这个xpath输进去也能匹配到内容,但是用Python跑出来的竟然是空列表。
import requests
from lxml import etree
# Fetch the status page directly (no JavaScript rendering) and run the
# xpath that matches the post text in the browser's element inspector.
url_weibo = 'https://m.weibo.cn/status/H3HRLlgEm?mblogid=H3HRLlgEm&luicode=10000011&lfid=1076032618726865&sudaref=m.weibo.cn&display=0&retcode=6102'
response = requests.get(url_weibo)
tree = etree.HTML(response.text)
texts = tree.xpath('//div[@class="weibo-text"]/text()')
print(texts)
运行结果为
[]
我说这是什么事儿呢?
原来这个破网页也有js跳转。鸡贼的是,抓到的响应体里其实还是有微博内容的,只不过位置和浏览器渲染后的页面不太一样,所以用这个xpath抓出来是空的。
好吧,那就直接上splash就可以了。先爬10页看看吧。
import requests
import json
from lxml import etree
import pandas as pd
import threading
import time
# Browser-like request headers sent with the timeline API requests.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.102 Safari/537.36'
    ),
    'Upgrade-Insecure-Requests': '1',
}

# Example of a single status page URL, kept for reference:
# url_weibo = 'https://m.weibo.cn/status/H3HRLlgEm?mblogid=H3HRLlgEm&luicode=10000011&lfid=1076032618726865&sudaref=m.weibo.cn&display=0&retcode=6102'

# Timeline API endpoint; the page number is appended to the end.
# Each request returns ten posts.
weibo_url = (
    'https://m.weibo.cn/api/container/getIndex'
    '?display=0&retcode=6102&containerid=1076032618726865'
    '&page='
)
def process_page(url_weibo: str) -> tuple:
    """Render one status page through the local Splash server and scrape it.

    Args:
        url_weibo: URL of the status page to render.

    Returns:
        A 6-tuple ``(cont, timep, device, repo, comm, like)`` holding the
        post text, the time string, the device string and the three counter
        strings read from the page's tab bar. Any field that cannot be
        extracted is ``'未知'``; all six are ``'未知'`` when the rendered page
        contains the "打开微博客户端" marker.
    """
    # Fixed pause before every render request.
    time.sleep(2.3)

    # Let requests build the query string so url_weibo is percent-encoded.
    # A status URL may carry its own '?' and '&', which would otherwise be
    # parsed as arguments to Splash instead of as part of the target URL.
    html = requests.get(
        'http://localhost:8050/render.html',
        params={'url': url_weibo, 'wait': 3},
    )

    # The caller detects a failed page by checking cont == '未知'.
    if '打开微博客户端' in html.text:
        return '未知', '未知', '未知', '未知', '未知', '未知'

    page_xpath = etree.HTML(html.text)
    cont = ''.join(page_xpath.xpath('//div[@class="weibo-text"]/text()'))

    try:
        timep = page_xpath.xpath("//span[@class='time']/text()")[0]
    except IndexError:
        print('时间出错地址为:' + url_weibo)
        timep = '未知'

    try:
        # The second whitespace-separated token of the "from" span is kept.
        device = page_xpath.xpath("//span[@class='from']/text()")[0].split()[1]
    except IndexError:
        print('设备出错地址为:' + url_weibo)
        device = '未知'

    # Unpacking directly would raise ValueError whenever the tab bar does
    # not yield exactly three counters; fall back to '未知' instead.
    counts = page_xpath.xpath('//div[@class="lite-page-tab"]/div/i[2]/text()')
    if len(counts) == 3:
        repo, comm, like = counts
    else:
        print('计数出错地址为:' + url_weibo)
        repo = comm = like = '未知'

    return cont, timep, device, repo, comm, like
# Step 1: collect (post id, status page URL) pairs from the timeline API.
all_lists = []
for j in range(10):
    print(f'正在处理第{j+1}页API')
    time.sleep(2)
    # NOTE(review): the message above says page j+1 but page j is requested
    # (0..9) -- confirm which numbering the API expects.
    wb_data = requests.get(weibo_url + str(j), headers=headers).json()
    # Cards lacking 'mblog' or 'scheme' are skipped so that they cannot
    # abort the whole crawl with a KeyError.
    all_lists.extend(
        (card['mblog']['id'], card['scheme'])
        for card in wb_data['data']['cards']
        if 'mblog' in card and 'scheme' in card
    )

# Step 2: drop duplicate pairs, then scrape every status page.
items = []
all_lists = list(set(all_lists))
wbcount = len(all_lists)  # loop-invariant, so computed once
# enumerate() yields the same 0-based position that all_lists.index(item)
# returned (entries are unique after set()), without the O(n) scan per item.
for position, (weibo_id, page_url) in enumerate(all_lists):
    cont, timep, device, repo, comm, like = process_page(page_url)
    if cont == '未知':
        print(f'正在处理第{position}条数据,共{wbcount}条,但是这条有问题')
    else:
        print(f'正在处理第{position}条数据,共{wbcount}条')
    items.append({
        'id': weibo_id,
        'time': timep,
        'device': device,
        'repo': repo,
        'comm': comm,
        'like': like,
        'cont': cont,
    })

# Step 3: persist the results for the later analysis script.
wbdf = pd.DataFrame(items)
wbdf.to_csv('xtyweibo.csv')
这回就没问题了。
爬完以后整理文本、整理发出设备。
import matplotlib.pyplot as plt
import requests
import json
import pandas as pd
from matplotlib import font_manager as fm
# Load the scraped posts and dump their text to weibo.txt, one post per line.
df = pd.read_csv('xtyweibo.csv')

# A post with empty text is written to the CSV as an empty field, which
# read_csv loads back as NaN (a float); `NaN + '\n'` would raise TypeError.
# Normalise every entry to str before writing.
# NOTE(review): the names say "dev" but the 'cont' column is read -- switch
# to df['device'] if the device pie chart is what is wanted.
dev = df['cont'].fillna('').astype(str)
dev2 = list(dev)

# Distinct values without the '未知' placeholder; only the commented-out
# pie chart code consumes `outset`.
devset = set(dev)
devset.discard('未知')
outset = list(devset)

with open('weibo.txt', 'w', encoding='utf-8') as f:
    f.writelines(text + '\n' for text in dev2)
# counts = [dev2.count(item) for item in outset]
#
# plt.figure()
#
# plt.rcParams['font.sans-serif'] = ['STFangsong']
# plt.rcParams['axes.unicode_minus'] = False
# proptease = fm.FontProperties()
# proptease.set_size('large')
# patches, texts, autotexts = plt.pie(
# counts, #哪个变量要画饼
# labels = outset, #变量的标签
# autopct = '%.2f%%', #显示数目百分比
# shadow = True, #阴影饼状图
# #explode = [0,0.05], #使各个扇形彼此分离,列表内的值是距离圆心的offset
# #fontproperties=FontProperties(fname='/System/Library/Fonts/PingFang.ttc')
#
# )
# plt.setp(autotexts, fontproperties=proptease)
# plt.setp(texts, fontproperties=proptease)
# plt.tight_layout()
# plt.show()
后面注释掉的部分是画饼图的代码。
![](https://img.haomeiwen.com/i730751/24d5a69d3980e3b7.png)
![](https://img.haomeiwen.com/i730751/57648b946d84bc57.png)
![](https://img.haomeiwen.com/i730751/aa59ce465044a8ad.png)
网友评论