# Environment: Python 3.6
# Goal: scrape the novel "一念永恒" from http://www.biqukan.com.
import requests
from bs4 import BeautifulSoup
import re
class Downloader(object):
    """Download a novel's chapter index and chapter text from a book page.

    ``baseLink`` is the URL of the book's index page.  After a successful
    ``downloadMenu()`` call the instance exposes:

    * ``chapterNames`` -- chapter titles, in page order
    * ``chapterLinks`` -- absolute chapter URLs, parallel to ``chapterNames``
    * ``chapterCount`` -- number of chapters found
    """

    # The first 15 links in the index repeat chapters that are listed again
    # further down the page, so they are skipped.
    _DUPLICATE_LINK_COUNT = 15

    # Seconds to wait for the server; without a timeout requests.get() can
    # block forever on an unresponsive host.
    _REQUEST_TIMEOUT = 10

    # Captures (href, title) for every chapter anchor.  The groups are
    # bounded ([^"] and the lazy .+?) so that several anchors sharing one
    # line are matched individually instead of being merged into one match.
    _CHAPTER_LINK_RE = re.compile(r'<a href="([^"]+\.html)">(.+?)</a>')

    def __init__(self, baseLink):
        self.baseLink = baseLink
        self.chapterNames = []
        self.chapterLinks = []
        self.chapterCount = 0

    @classmethod
    def _extractChapters(cls, listHtml):
        """Return a list of ``(href, title)`` tuples found in ``listHtml``."""
        return cls._CHAPTER_LINK_RE.findall(listHtml)

    def _chapterUrl(self, href):
        """Build the absolute chapter URL from the file name part of ``href``."""
        # rstrip avoids a double slash when baseLink already ends with "/".
        return self.baseLink.rstrip("/") + "/" + href.split("/")[-1]

    def downloadMenu(self):
        """Fetch the index page and fill in the chapter names and links.

        Returns True on success.  Returns False (after printing a message)
        when the page cannot be fetched or holds no ``listmain`` div.  The
        chapter attributes are only replaced on success, so calling this
        method repeatedly never accumulates duplicate entries.
        """
        try:
            response = requests.get(self.baseLink, timeout=self._REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException:
            print("can not download menu")
            return False

        bf = BeautifulSoup(response.text, "lxml")
        listBf = bf.find("div", {"class": "listmain"})
        if listBf is None:
            # Without this check str(None) would simply yield no matches and
            # the failure would be reported as a success with zero chapters.
            print("can not download menu")
            return False

        chapters = self._extractChapters(str(listBf))[self._DUPLICATE_LINK_COUNT:]
        self.chapterNames = [title for _, title in chapters]
        self.chapterLinks = [self._chapterUrl(href) for href, _ in chapters]
        self.chapterCount = len(chapters)
        return True

    def downloadContent(self, chapterLink):
        """Fetch one chapter page and return its text.

        Returns an empty string (after printing a message) when the page
        cannot be fetched or holds no ``showtxt`` div.
        """
        print('start download %s' % chapterLink)
        try:
            response = requests.get(chapterLink, timeout=self._REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException:
            print("can not download content")
            return ""

        content = BeautifulSoup(response.text, "lxml").find("div", {"class": "showtxt"})
        if content is None:
            print("can not download content")
            return ""
        # Each run of eight non-breaking spaces is turned into a blank line.
        return content.text.replace("\xa0" * 8, "\n\n")

    def writer(self, name, path, text):
        """Append a chapter (title line, body, blank line) to the file at ``path``."""
        with open(path, "a", encoding="utf-8") as f:
            f.write(name + "\n")
            # writelines accepts a plain string as well as an iterable of strings.
            f.writelines(text)
            f.write('\n\n')
if __name__ == "__main__":
    # Script entry point: fetch the chapter index, then append the first
    # chapters to a local text file.
    dl = Downloader("http://www.biqukan.com/1_1094")
    if dl.downloadMenu():
        print("start download...")
        # Single-threaded downloading is slow, so only the first 10 chapters
        # are fetched as a test.  Slicing instead of indexing with range(10)
        # avoids an IndexError when the index holds fewer than 10 chapters.
        for name, link in zip(dl.chapterNames[:10], dl.chapterLinks[:10]):
            dl.writer(name, "一念永恒.txt", dl.downloadContent(link))
        print("end download!")
    else:
        print("download failure")
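# 网友评论 ("reader comments" -- appears to be a heading carried over from the source web page, not part of the script)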