以绝世高手为例,爬取前三页的所有音频
采集地址:https://www.ximalaya.com/youshengshu/16411402/
思路只有两个:获取一级页面、获取二级页面
先一步一步看一下代码:
def get_mes(self):
    """Collect the track ids listed on the first three album pages.

    Every title that is found is appended to ``self.name_list`` in the same
    order as the returned ids, so the two lists stay index-aligned.

    Returns:
        list[str]: the last path segment of each matched link, which is the
        id later passed to the audio endpoint.

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    # Compiled once instead of on every page; re.S lets "." span newlines.
    link_re = re.compile(r'<a title="(.*?)" href="(.*?)">', re.S)
    id_list = []
    # range() excludes its upper bound, so this visits pages 1, 2 and 3.
    for page in range(1, 4):
        page_url = f'https://www.ximalaya.com/youshengshu/16411402/p{page}/'
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url=page_url, headers=self.headers, timeout=10)
        if not response.ok:
            # Skip error pages instead of scraping them as if they were listings.
            print(f'第{page}页请求失败,状态码{response.status_code}')
            continue
        html = response.content.decode('utf-8')
        # The first six matches are dropped, exactly as before; presumably
        # they are non-track links at the top of the page -- confirm against
        # the live markup.
        for title, href in link_re.findall(html)[6:]:
            self.name_list.append(title)
            # The id is the final path segment of the link.
            id_list.append(href.split('/')[-1])
    return id_list
如果你觉得两个for看着不舒服也可以拆成两个方法
翻页的规律可以通过观察页面地址得到(第 i 页的地址以 /p{i}/ 结尾):
![](https://img.haomeiwen.com/i23190432/fb76f42e21e3e589.png)
def get_audio(self, id_list):
    """Download every track in ``id_list`` and save it as ``<title>.m4a``.

    Args:
        id_list: track ids, index-aligned with ``self.name_list``.

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    src_re = re.compile(r'"src":"(.*?)"', re.S)
    # zip pairs each id with its own title, so a skipped or failed download
    # can never shift the names of the tracks that follow.
    for track_id, name in zip(id_list, self.name_list):
        audio_url = f'https://www.ximalaya.com/revision/play/v1/audio?id={track_id}&ptype=1'
        res = requests.get(audio_url, headers=self.headers, timeout=10).content.decode('utf-8')
        found = src_re.search(res)
        if found is None or not found.group(1):
            # No usable "src" value in the response: skip this track rather
            # than crash on an empty match list.
            print(f'{name}未找到音频地址,已跳过')
            continue
        response = requests.get(found.group(1), timeout=30)
        # A Response is truthy only for a successful status code.
        if response:
            # Replace characters that are not allowed in file names.
            filename = re.sub(r'[\\/:*?"<>|]', '_', name)
            with open(f'{filename}.m4a', 'wb') as f:
                f.write(response.content)
            print(f'{name}爬取成功')
![](https://img.haomeiwen.com/i23190432/f94d3a5ead3fb6c5.png)
![](https://img.haomeiwen.com/i23190432/6c9087994f03094a.png)
完整代码:
import requests
import re
class Spider(object):
    """Download the audio tracks listed on the first pages of one album.

    ``get_mes`` scrapes the listing pages for track titles and ids,
    ``get_audio`` resolves each id to an audio URL and saves the file, and
    ``run`` chains the two.
    """

    # Anchors carrying both a title and an href; re.S lets "." span newlines.
    _LINK_RE = re.compile(r'<a title="(.*?)" href="(.*?)">', re.S)
    # The audio address inside the play endpoint's response text.
    _SRC_RE = re.compile(r'"src":"(.*?)"', re.S)
    # Characters that are not allowed in file names on common file systems.
    _UNSAFE_CHARS_RE = re.compile(r'[\\/:*?"<>|]')
    # Leading matches dropped on every page, as in the original slice;
    # presumably non-track links -- confirm against the live markup.
    _SKIPPED_LINKS = 6

    def __init__(self):
        # Track titles, in the same order as the ids returned by get_mes().
        self.name_list = []
        # Request headers sent with the listing and play-endpoint requests.
        self.headers = {
            'user-agent': '这里是你的user-agent的内容'
        }

    @classmethod
    def _extract_tracks(cls, html):
        """Return ``(title, track_id)`` pairs parsed from one listing page."""
        matches = cls._LINK_RE.findall(html)[cls._SKIPPED_LINKS:]
        # The id is the final path segment of the link.
        return [(title, href.split('/')[-1]) for title, href in matches]

    @classmethod
    def _safe_filename(cls, title):
        """Return ``title`` with characters invalid in file names replaced by ``_``."""
        return cls._UNSAFE_CHARS_RE.sub('_', title)

    def get_mes(self):
        """Collect the track ids listed on the first three album pages.

        Titles are appended to ``self.name_list`` in the same order as the
        returned ids.

        Returns:
            list[str]: one id per matched track link.

        Raises:
            requests.RequestException: on connection errors or timeouts.
        """
        id_list = []
        # range() excludes its upper bound, so this visits pages 1, 2 and 3.
        for page in range(1, 4):
            page_url = f'https://www.ximalaya.com/youshengshu/16411402/p{page}/'
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url=page_url, headers=self.headers, timeout=10)
            if not response.ok:
                # Skip error pages instead of scraping them as listings.
                print(f'第{page}页请求失败,状态码{response.status_code}')
                continue
            html = response.content.decode('utf-8')
            for title, track_id in self._extract_tracks(html):
                self.name_list.append(title)
                id_list.append(track_id)
        return id_list

    def get_audio(self, id_list):
        """Download every track in ``id_list`` and save it as ``<title>.m4a``.

        Args:
            id_list: track ids, index-aligned with ``self.name_list``.

        Raises:
            requests.RequestException: on connection errors or timeouts.
        """
        # zip pairs each id with its own title, so a skipped or failed
        # download can never shift the names of the tracks that follow.
        for track_id, name in zip(id_list, self.name_list):
            audio_url = f'https://www.ximalaya.com/revision/play/v1/audio?id={track_id}&ptype=1'
            res = requests.get(audio_url, headers=self.headers, timeout=10).content.decode('utf-8')
            found = self._SRC_RE.search(res)
            if found is None or not found.group(1):
                # No usable "src" value in the response: skip this track
                # rather than crash on an empty match list.
                print(f'{name}未找到音频地址,已跳过')
                continue
            response = requests.get(found.group(1), timeout=30)
            # A Response is truthy only for a successful status code.
            if response:
                with open(f'{self._safe_filename(name)}.m4a', 'wb') as f:
                    f.write(response.content)
                print(f'{name}爬取成功')

    def run(self):
        """Scrape the track ids, then download the audio for each of them."""
        id_list = self.get_mes()
        self.get_audio(id_list)
# Script entry point: build the spider and start the crawl immediately.
# NOTE(review): this runs at module level, so importing this file also
# triggers the download.
spider = Spider()
spider.run()
讨论、交流。加Q群313074041领取
网友评论