python爬取喜马拉雅有声小说

作者: BABYMISS | 来源:发表于2020-05-19 17:11 被阅读0次

python爬取喜马拉雅有声小说
2017-12-31
python 爬虫练习（一）
身为一个程序员看小说还需要花钱么，不存在的，Python爬虫摆设
BeautifulSoup4小试牛刀
python 爬虫爬取小说
scrapy对爬取的内容进行更新爬取
不巧看到这篇文章，那就送你2000套妹子图吧！
Python 爬取喜马拉雅音频
爬取Python教程博客并转成PDF

以绝世高手为例，爬取前三页的所有音频

采集地址：https://www.ximalaya.com/youshengshu/16411402/

思路只有两步：先请求一级页面（专辑的分页列表），拿到每个音频的名字和id；再请求二级页面（音频接口），拿到真实的音频地址并下载。

先一步一步看一下代码：

def get_mes(self):
    """Collect the track ids listed on the first three album pages.

    Every page is requested with ``self.headers`` and scanned for
    ``<a title="..." href="...">`` anchors.  For each anchor kept, the
    title is appended to ``self.name_list`` and the last path segment of
    the href is collected as the track id, so both lists share one order.

    Returns:
        list: the track ids (strings), in page order.
    """
    id_list = []
    # The id is the key to fetching the audio later; the title becomes the file name.
    link_pattern = re.compile(r'<a title="(.*?)" href="(.*?)">', re.S)
    # The first six matches on a page are skipped; presumably they are
    # navigation links rather than tracks -- TODO confirm against the live page.
    skipped_links = 6

    # Pages 1..3 (range excludes its upper bound).
    for page in range(1, 4):
        page_url = f'https://www.ximalaya.com/youshengshu/16411402/p{page}/'
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=page_url, headers=self.headers, timeout=10)
        if not response.ok:
            # An error page holds no track links; report it instead of parsing it.
            print(f'第{page}页请求失败（状态码 {response.status_code}），已跳过')
            continue
        html = response.content.decode('utf-8')

        for title, href in link_pattern.findall(html)[skipped_links:]:
            self.name_list.append(title)
            # rstrip guards against a trailing slash producing an empty id.
            id_list.append(href.rstrip('/').split('/')[-1])

    return id_list

如果你觉得两个for看着不舒服也可以拆成两个方法

翻页的规律是通过观察地址得到的：第 i 页的地址以 p{i}/ 结尾。拿到id之后，再用下面的方法获取音频地址并下载：

def get_audio(self, id_list):
    """Download the audio of every track id and save it as ``<title>.m4a``.

    Titles are taken from ``self.name_list`` position by position, so
    ``id_list`` is expected to be in the same order as that list.

    Args:
        id_list: track ids, e.g. the list returned by ``get_mes``.
    """
    src_pattern = re.compile(r'"src":"(.*?)"', re.S)

    # zip pairs each id with the title stored at the same index, replacing
    # the hand-maintained counter.
    for title, track_id in zip(self.name_list, id_list):
        audio_url = f'https://www.ximalaya.com/revision/play/v1/audio?id={track_id}&ptype=1'
        res = requests.get(audio_url, headers=self.headers, timeout=10).content.decode('utf-8')

        sources = src_pattern.findall(res)
        if not sources:
            # No quoted "src" in the reply (nothing to download): skip the
            # track instead of failing with IndexError.
            print(f'{title}没有找到音频地址，已跳过')
            continue

        response = requests.get(sources[0], timeout=30)
        if not response.ok:
            print(f'{title}下载失败（状态码 {response.status_code}）')
            continue

        # Characters that are illegal in file names would make open() fail.
        file_name = re.sub(r'[\\/:*?"<>|]', '_', title)
        with open(f'{file_name}.m4a', 'wb') as f:
            f.write(response.content)
        print(f'{title}爬取成功')

完整代码：

import requests

import re

class Spider(object):
    """Download the tracks of one Ximalaya audiobook album.

    ``run`` first collects titles and track ids from the album's listing
    pages (``get_mes``) and then downloads each track (``get_audio``).
    """

    def __init__(self):
        # Track titles, in the order their ids are collected.
        self.name_list = []
        # Request headers sent with the page and API requests.
        self.headers = {
            'user-agent': '这里是你的user-agent的内容'
        }

    @staticmethod
    def _parse_tracks(html):
        """Return ``(title, track_id)`` pairs found in one album page."""
        links = re.findall(r'<a title="(.*?)" href="(.*?)">', html, re.S)
        # The first six matches are skipped; presumably they are navigation
        # links rather than tracks -- TODO confirm against the live page.
        # rstrip guards against a trailing slash producing an empty id.
        return [(title, href.rstrip('/').split('/')[-1]) for title, href in links[6:]]

    @staticmethod
    def _first_audio_src(text):
        """Return the first quoted ``"src"`` value in *text*, or ``None``."""
        found = re.search(r'"src":"(.*?)"', text, re.S)
        return found.group(1) if found else None

    @staticmethod
    def _safe_filename(title):
        """Replace characters that are illegal in file names with ``_``."""
        return re.sub(r'[\\/:*?"<>|]', '_', title)

    def get_mes(self):
        """Collect the track ids listed on the first three album pages.

        Titles are appended to ``self.name_list`` in the same order as the
        ids that are returned.

        Returns:
            list: the track ids (strings), in page order.
        """
        id_list = []
        # Pages 1..3 (range excludes its upper bound).
        for page in range(1, 4):
            page_url = f'https://www.ximalaya.com/youshengshu/16411402/p{page}/'
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url=page_url, headers=self.headers, timeout=10)
            if not response.ok:
                # An error page holds no track links; report it instead of parsing it.
                print(f'第{page}页请求失败（状态码 {response.status_code}），已跳过')
                continue
            for title, track_id in self._parse_tracks(response.content.decode('utf-8')):
                self.name_list.append(title)
                id_list.append(track_id)
        return id_list

    def get_audio(self, id_list):
        """Download the audio of every track id and save it as ``<title>.m4a``.

        Args:
            id_list: track ids in the same order as ``self.name_list``.
        """
        # zip pairs each id with the title stored at the same index.
        for title, track_id in zip(self.name_list, id_list):
            audio_url = f'https://www.ximalaya.com/revision/play/v1/audio?id={track_id}&ptype=1'
            res = requests.get(audio_url, headers=self.headers, timeout=10).content.decode('utf-8')

            src = self._first_audio_src(res)
            if src is None:
                # Nothing to download: skip the track instead of failing
                # with IndexError.
                print(f'{title}没有找到音频地址，已跳过')
                continue

            response = requests.get(src, timeout=30)
            if not response.ok:
                print(f'{title}下载失败（状态码 {response.status_code}）')
                continue

            with open(f'{self._safe_filename(title)}.m4a', 'wb') as f:
                f.write(response.content)
            print(f'{title}爬取成功')

    def run(self):
        """Collect the track ids, then download every track."""
        id_list = self.get_mes()
        self.get_audio(id_list)

# Script entry: build the spider and start the download as soon as the file is executed.
spider = Spider()

spider.run()