The previous post in this series:
https://www.jianshu.com/p/c6cb2684ec35
It has been about a year since I last wrote a scraper for a novel site, and looking back, that script was fairly simple.
This time I picked a slightly harder target: a site that does not offer novel downloads.
Writing it took roughly three hours, during which I consulted the Beautiful Soup documentation, which is even available in Chinese:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#find-all-tag
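The part of the docs I leaned on most was find_all. As a quick, self-contained illustration (the HTML snippet here is invented for illustration, not taken from the target site), find_all can match a list of tag names or filter by attribute, which is how the chapter index and the pagination marker are located in the script further down:

import bs4

html = '<dl><dt>正文</dt><dd><a href="/105940/1.html">第一章</a></dd></dl>'
soup = bs4.BeautifulSoup(html, 'html.parser')
print(soup.find_all(['dt', 'dd']))      # match any of several tag names
print(soup.find_all('a', href=True))    # filter by attribute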
Screenshot: the finished script in action.
The third-party libraries needed to run the script are listed below. lxml is optional; I installed it because it parses faster than the built-in parser.
pip install requests
pip install beautifulsoup4
pip install lxml
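Since lxml is optional, one way (my own addition, not part of the original script) to fall back gracefully when it is not installed:

# prefer lxml when it is available, otherwise use the bundled html.parser
try:
    import lxml  # noqa: F401
    PARSER = 'lxml'
except ImportError:
    PARSER = 'html.parser'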
There is still room for improvement: crawling in a single thread is a bit slow. I have recently been learning Python 3 asyncio and plan to use it to speed up the crawl later; a rough sketch of that idea is below.
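For reference, a minimal sketch of what concurrent fetching could look like with asyncio plus aiohttp (aiohttp is an assumed extra dependency; preserving chapter order and respecting the site's limits are left out):

import asyncio
import aiohttp

async def fetch(session, url):
    # download one chapter page's HTML
    async with session.get(url) as resp:
        return await resp.text()

async def fetch_all(urls):
    # fetch all chapter pages concurrently
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, u) for u in urls))

# hypothetical usage: pages = asyncio.run(fetch_all(chapter_urls))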
The script crawls novels from the Mianhuatang (棉花糖, mht.tw) site. To crawl a different novel, only the id needs to change; the id used for this test is 105940.
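For clarity, the catalogue URL is built from base_url and the id exactly as start_download does in the script below:

base_url = 'https://www.mht.tw'
id = '105940'
home_url = str(base_url) + '/' + str(id) + '/'
print(home_url)  # https://www.mht.tw/105940/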

The full script:
#!/usr/bin/env python3
# -*- coding:UTF-8 -*-
import os, sys
import requests, bs4


def init_soup(url):
    r = requests.get(url)
    # html.parser is the parser bundled with Python; lxml parses faster but must be installed separately
    # d = bs4.BeautifulSoup(r.content.decode('utf-8'), features='html.parser')
    d = bs4.BeautifulSoup(r.content.decode('utf-8'), 'lxml')
    return d


def gen_article_index(soup):
    # collect the chapter list (name + url) from the catalogue page
    lists = soup.find_all(['dt', 'dd'])
    article_index = []
    flag = 0
    for i in lists:
        urls = i.find_all('a')
        if len(urls) > 0:
            # base_url is the module-level global set under __main__
            url = base_url + str(urls[0].get('href'))
            if flag == 1:
                res = {}
                res['name'] = str(i.text)
                res['url'] = str(url)
                article_index.append(res)
        elif '正文' in i.string:
            # chapters only start after the <dt> labelled “正文” (main text)
            flag = 1
    return article_index


def get_context(url, base_url, save_file):
    # extract the chapter text on the current page
    soup = init_soup(url)
    res = str(soup.find_all('div', id='content')[0]).replace('<br/>', '\n').split('\n')
    for line in res:
        if '<' not in line and '>' not in line:
            line = " " + line.lstrip() + '\n'
            save_file.write(line)
    # chapters are paginated, so recurse until there is no next-page link
    res = soup.find_all('p', attrs={'data-id': "10000"})
    if len(res) > 0:
        next_page_url = base_url + res[0].find('a').get('href')
        get_context(next_page_url, base_url, save_file)
    else:
        # print('last page!')
        return


def start_download(id, base_url):
    # resolve the novel's title and open the file it will be saved to
    home_url = str(base_url) + '/' + str(id) + '/'
    soup = init_soup(home_url)
    article_name = soup.find_all('div', id='info')[0].find('h1').string
    file_name = article_name + '.txt'
    if os.path.isfile(file_name):
        os.remove(file_name)
    save_file = open(file_name, 'a', encoding='utf-8')
    article_index = gen_article_index(soup)
    last_name = article_index[-1]['name']
    for i in article_index:
        name = '\n' + i['name'] + '\n'
        url = i['url']
        save_file.write(name)
        get_context(url, base_url, save_file)
        # show which chapter was just written and which one is last
        print("%s -- %s" % (name, last_name))
    save_file.close()


if __name__ == '__main__':
    os.chdir(sys.path[0])
    id = '105940'
    base_url = 'https://www.mht.tw'
    start_download(id, base_url)
Run result (screenshot): opening the generated file shows the novel downloaded successfully.

Adding a progress bar

This uses print's end='\r'; I tested it and it works on both Windows and Linux. A minimal demo of the trick is shown below, followed by the full script.
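A tiny standalone sketch (separate from the downloader) of how end='\r' keeps redrawing the same line:

import time

for i in range(1, 11):
    # '\r' returns the cursor to the start of the line, so each print overwrites the last one
    print('[' + '=' * i + ' ' * (10 - i) + '] %d/10' % i, end='\r')
    time.sleep(0.2)
print()  # final newline so the prompt does not overwrite the finished bar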
#!/usr/bin/env python3
# -*- coding:UTF-8 -*-
import os, sys, time
import requests, bs4


def init_soup(url):
    r = requests.get(url)
    # html.parser is the parser bundled with Python; lxml parses faster but must be installed separately
    # d = bs4.BeautifulSoup(r.content.decode('utf-8'), features='html.parser')
    d = bs4.BeautifulSoup(r.content.decode('utf-8'), 'lxml')
    return d


def process_bar_text(total_num, index, start_time):
    # build one line of progress-bar text plus the terminator to print it with
    # ('\r' redraws the line in place, '\n' once everything is done)
    end_tag = '\r'
    total_num_bar = 30
    percent = index / total_num
    i = int(percent * total_num_bar)
    space = ' ' * (total_num_bar - i)
    if total_num_bar == i:
        end_tag = '\n'
    bar = '=' * (i + 1) + '>'
    total_str = '[' + bar + space + '] ' + str("%.2f" % round(percent * 100, 2)) + '% '
    total_str = total_str + '(' + str(index) + '/' + str(total_num) + ')'
    exec_time = str("%.2f" % round((time.time() - start_time), 2))
    total_str = total_str + ' ' + str(exec_time) + 's'
    return total_str, end_tag


def gen_article_index(soup):
    # collect the chapter list (name + url) from the catalogue page
    lists = soup.find_all(['dt', 'dd'])
    article_index = []
    flag = 0
    for i in lists:
        urls = i.find_all('a')
        if len(urls) > 0:
            # base_url is the module-level global set under __main__
            url = base_url + str(urls[0].get('href'))
            if flag == 1:
                res = {}
                res['name'] = str(i.text)
                res['url'] = str(url)
                article_index.append(res)
        elif '正文' in i.string:
            # chapters only start after the <dt> labelled “正文” (main text)
            flag = 1
    return article_index


def get_context(url, base_url, save_file):
    # extract the chapter text on the current page
    soup = init_soup(url)
    res = str(soup.find_all('div', id='content')[0]).replace('<br/>', '\n').split('\n')
    for line in res:
        if '<' not in line and '>' not in line:
            line = " " + line.lstrip() + '\n'
            save_file.write(line)
    # chapters are paginated, so recurse until there is no next-page link
    res = soup.find_all('p', attrs={'data-id': "10000"})
    if len(res) > 0:
        next_page_url = base_url + res[0].find('a').get('href')
        get_context(next_page_url, base_url, save_file)
    else:
        # print('last page!')
        return


def start_download(id, base_url):
    # resolve the novel's title and open the file it will be saved to
    home_url = str(base_url) + '/' + str(id) + '/'
    soup = init_soup(home_url)
    article_name = soup.find_all('div', id='info')[0].find('h1').string
    file_name = article_name + '.txt'
    if os.path.isfile(file_name):
        # instead of deleting an existing file, back it up with a timestamp suffix
        print('File already exists ==> %s' % file_name)
        bak_tag = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        bak_name = file_name + '-bak' + str(bak_tag)
        # os.remove(file_name)
        try:
            os.rename(file_name, bak_name)
        except Exception as e:
            print('Backup failed!')
            return
        else:
            print('Backed up to ==> %s' % bak_name)
    save_file = open(file_name, 'a', encoding='utf-8')
    article_index = gen_article_index(soup)
    last_name = article_index[-1]['name']
    total_num = len(article_index)
    start_time = time.time()
    pre_text = 'Download starting, please wait\n' + 'Working directory: ' + str(os.getcwd()) + '\n'
    pre_text = pre_text + 'Saving to file: ' + file_name
    end_text = 'Download finished!'
    print(pre_text)
    # for i in article_index:
    for li in range(total_num):
        i = article_index[li]
        name = '\n' + i['name'] + '\n'
        url = i['url']
        save_file.write(name)
        get_context(url, base_url, save_file)
        total_str, end_tag = process_bar_text(total_num, li + 1, start_time)
        print(total_str, end=end_tag)
    save_file.close()
    print(end_text)


if __name__ == '__main__':
    os.chdir(sys.path[0])
    id = '96465'
    base_url = 'https://www.mht.tw'
    start_download(id, base_url)
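A quick way to eyeball the bar formatting is to call process_bar_text directly after defining it (the values here are illustrative):

import time
start = time.time() - 3.2                  # pretend roughly 3.2 seconds have elapsed
text, end_tag = process_bar_text(10, 5, start)
print(text, end=end_tag)                   # one line with the bar, 50.00%, (5/10) and the elapsed seconds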