Let's start with the code:
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests, sys, os, json, re
"""
Class description: download novels such as 《斗罗大陆》 from the 唐三书 site (tangsanshu.com)
Parameters:
    None
Returns:
    None
Modify:
    2018-09-23
"""
class downloader(object):
    def __init__(self, server, target, bookname):
        self.server = server                        # base URL that chapter links are joined to
        self.target = target                        # URL of the book's chapter index page
        self.names = []                             # chapter titles
        self.urls = []                              # chapter links
        self.nums = 0                               # number of chapters
        self.bookname = bookname
        self.book_storge_path = bookname + '.txt'   # output text file
        self.book_config_path = bookname + '.json'  # download-progress (resume) file
        self.bookconfig = []                        # per-chapter progress records
"""
函数说明:获取下载链接
Parameters:
无
Returns:
无
Modify:
2018-09-23
"""
def get_download_url(self):
req = requests.get(url = self.target)
req.encoding = 'gbk'
html = req.text
div_bf = BeautifulSoup(html,features = "html.parser")
div = div_bf.find_all('table')
a_bf = BeautifulSoup(str(div[0]),features = "html.parser")
a = a_bf.find_all('a')
self.nums = 0
for each in a:
if not each.string is None:
self.names.append(each.string)
self.urls.append(self.server + each.get('href'))
self.nums +=1
"""
函数说明:获取章节内容
Parameters:
target - 下载连接(string)
Returns:
texts - 章节内容(string)
Modify:
2018-09-23
"""
def get_contents(self, target):
req = requests.get(url = target)
req.encoding = 'gbk'
html = req.text
bf = BeautifulSoup(html,features="html.parser")
#print(bf)
texts = bf.find_all('div', class_ = 'blockcontent')
texts = texts[0].text.replace('\xa0'*8,'\n\n')
return texts
"""
函数说明:将爬取的文章内容写入文件
Parameters:
name - 章节名称(string)
path - 当前路径下,小说保存名称(string)
text - 章节内容(string)
Returns:
无
Modify:
2018-09-23
"""
def writer(self, name, path, text):
#write_flag = True
with open(path, 'a', encoding='utf-8') as f:
f.write(name + '\n')
f.writelines(text)
f.write('\n\n')
"""
函数说明:自动开始下载
Parameters:
无
Returns:
无
Modify:
2018-09-23
"""
def auto_download(self):
self.get_download_url()
print('《' + self.bookname + '》开始下载:')
if os.path.isfile(self.book_config_path):
with open(self.book_config_path,'r',encoding = 'utf-8') as file:
config_str = file.read()
config_str = json.loads(config_str)
for each in config_str:
self.bookconfig.append({'name':each['name'],'url':each['url'],'isDownloaded':each['isDownloaded']})
else:
print('进度文件不存在,从头开始下载')
for i in range(self.nums):
self.bookconfig.append({'name':self.names[i],'url':self.urls[i],'isDownloaded':'no'})
for i in range(self.nums):
if self.bookconfig[i]['isDownloaded'] == 'yes':
print(self.names[i] + ' 已下载,跳过')
continue
self.writer(self.names[i], self.bookname + '.txt', self.get_contents(self.urls[i]))
sys.stdout.write(self.names[i] +' 下载完成,完成度:%.2f%%'%float((i+1)/self.nums*100) + '\r')
sys.stdout.flush()
self.bookconfig[i]['isDownloaded'] = 'yes'
with open(self.book_config_path,'w',encoding = 'utf-8') as file:
file.write(json.dumps(self.bookconfig,indent = 2,ensure_ascii = False))
print('《' + self.bookname + '》下载完成')
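# For reference, the progress file <bookname>.json written above is a JSON list with one
# record per chapter; the chapter titles and URLs below are illustrative only:
# [
#   {"name": "第一章 ...", "url": "http://www.tangsanshu.com/.../1.html", "isDownloaded": "yes"},
#   {"name": "第二章 ...", "url": "http://www.tangsanshu.com/.../2.html", "isDownloaded": "no"}
# ]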
if __name__ == "__main__":
    dl = []
    nums = 0
    urls = []
    names = []
    server = 'http://www.tangsanshu.com'
    req = requests.get(url=server)
    req.encoding = 'gbk'
    html = req.text
    div_bf = BeautifulSoup(html, features="html.parser")
    div = div_bf.find_all('div', id='navmenu')
    a_bf = BeautifulSoup(str(div[0]), features="html.parser")
    a = a_bf.find_all('a')
    for each in a:
        if each.string is not None:
            names.append(each.string)
            urls.append(server + each.get('href'))
            # every entry in the site's navigation menu is one book; its index page is used
            # both as the base URL for chapter links and as the download target
            dl.append(downloader(server + each.get('href'), server + each.get('href'), each.string))
            dl[nums].auto_download()
            nums += 1
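If you only want a single book rather than looping over every entry in the site menu, the downloader class can also be used on its own. A minimal sketch, assuming you already know the book's chapter-index URL (the path below is only illustrative, not taken from the site):

book_url = 'http://www.tangsanshu.com/douluodalu/'  # illustrative path; substitute the real index page
d = downloader(book_url, book_url, '斗罗大陆')
d.auto_download()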