- Step 1: analyze the chapter list pages of the 笔趣看 site.
url = https://www.biqukan.com/1_1094/ (the chapter index page of the novel 一念永恒)
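Before writing the full spider, it helps to confirm the page structure by hand. The snippet below is a minimal sketch for that check, assuming the chapter list still lives in a <div class="listmain"> element (the selector used by the spider further down); the site layout may have changed since, and the simple "Mozilla/5.0" User-Agent is just a placeholder.

# Quick structure check: print the first few chapter links on the index page.
import requests
from bs4 import BeautifulSoup

url = "https://www.biqukan.com/1_1094/"
headers = {"User-Agent": "Mozilla/5.0"}          # placeholder UA; the spider below uses a full one
html = requests.get(url, headers=headers).text
soup = BeautifulSoup(html, 'lxml')
links = soup.find('div', class_='listmain').find_all('a')
for a in links[:15]:                             # the first ~12 entries are "latest chapter" shortcuts
    print(a.text, a['href'])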
# encoding:utf-8
__author__ = 'Fioman'
__date__ = '2018/10/25 16:17'
import os

import requests
from bs4 import BeautifulSoup

"""
Description: downloads novels from the 笔趣看 site: https://www.biqukan.com/
Params:
    url - the chapter index URL of the chosen novel on 笔趣看 (string)
"""


class BiquSpider(object):
    def __init__(self):
        self.base_url = "https://www.biqukan.com/"    # site home page
        self.url = "https://www.biqukan.com/1_1094/"  # chapter index url of the novel
        self.chapter_names = []                       # chapter titles
        self.chapter_urls = []                        # url of each chapter
        self.numbers = 0                              # number of chapters
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
            'HOST': 'www.biqukan.com',
            "Referer": 'http://www.biqukan.com/',
        }

    # Collect the chapter titles and their urls, then download every chapter.
    def get_chapter_urls(self):
        # Suppress the InsecureRequestWarning raised because SSL verification is disabled.
        requests.packages.urllib3.disable_warnings()
        res = requests.get(url=self.url, headers=self.headers, verify=False)
        html = res.text
        soup = BeautifulSoup(html, 'lxml')
        # The chapter list sits inside the <div class="listmain"> element.
        div = soup.find('div', class_='listmain')
        # Grab all <a> tags in that div; the first 12 are "latest chapter" shortcuts, so skip them.
        chapters = div.find_all('a')[12:]
        self.numbers = len(chapters)
        # Record each chapter title together with its absolute url.
        for each in chapters:
            self.chapter_names.append(each.text)
            self.chapter_urls.append(self.base_url + each['href'])
        # For every chapter, fetch its content and write it to a local file.
        for chapter_name, chapter_url in zip(self.chapter_names, self.chapter_urls):
            content = self.getChapterContent(chapter_url)
            self.savePage(chapter_name, content)

    # Fetch the body text of a single chapter from its url.
    def getChapterContent(self, url):
        res = requests.get(url, headers=self.headers, verify=False)
        html = res.text
        soup = BeautifulSoup(html, 'lxml')
        # The chapter body sits in <div id="content" class="showtxt">.
        div = soup.find('div', id='content', class_='showtxt')
        return div.text

    # Save one chapter's content into its own text file under novel/.
    def savePage(self, name, content):
        os.makedirs('novel', exist_ok=True)  # make sure the output directory exists
        filename = 'novel/' + name + '.txt'
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(content)


if __name__ == '__main__':
    spider = BiquSpider()
    spider.get_chapter_urls()
- The formatting of the downloaded text is not great; improvements will follow later.
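One possible cleanup for that formatting issue is sketched below. It assumes, as is common for this site, that div.text returns paragraphs separated by runs of non-breaking spaces (\xa0) instead of newlines; the exact replacement pattern, and the hypothetical clean_content helper itself, would need to be adjusted to whatever the page actually returns.

# Hypothetical cleanup helper: turn the \xa0 padding into real paragraph breaks.
def clean_content(raw_text):
    # Collapse runs of non-breaking-space padding into blank-line paragraph breaks.
    text = raw_text.replace('\xa0' * 8, '\n\n')
    # Replace any remaining non-breaking spaces with ordinary spaces.
    text = text.replace('\xa0', ' ')
    return text.strip()

# Possible hook inside the spider (not part of the original code):
#     content = clean_content(self.getChapterContent(chapter_url))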