python爬虫(3)-下载一本小说

作者: 初灬终 | 来源:发表于2017-11-27 21:10 被阅读12次

python爬虫(3)-下载一本小说
python各类爬虫案例，爬到你手软！（附代码）
Python3 网络爬虫与开发实战
《Python 3网络爬虫开发实战》高清中文版PDF+源代码免费
想要玩爬虫！正则表达式是你的必修课程！这篇足以你玩转爬虫了！
想玩好爬虫！正则表达式是必须精通的！带来正则表达式大全！
想玩好爬虫！正则表达式是必须精通的！带来正则表达式大全！
5.3黑客成长日记——爬虫篇(1)
Python爬虫下载Bilibili番剧弹幕
Python爬虫开发与项目实战

环境：Python 3.6（依赖第三方库 requests、beautifulsoup4、lxml）
目标：从 http://www.biqukan.com 抓取小说《一念永恒》。

import requests
from bs4 import BeautifulSoup
import re


class Downloader(object):
    """Fetch a novel's chapter menu and chapter text, and append it to a file.

    Typical use: call ``downloadMenu()`` once to fill ``chapterNames`` and
    ``chapterLinks``, then call ``downloadContent()`` for each link and pass
    the result to ``writer()``.

    Attributes:
        baseLink: URL of the book's index page.
        timeout: Seconds to wait for each HTTP request.
        skipCount: Number of leading chapter anchors to discard from the menu.
        chapterNames: Chapter titles, in menu order.
        chapterLinks: Absolute chapter URLs, parallel to ``chapterNames``.
        chapterCount: Number of chapters found by the last successful
            ``downloadMenu()`` call.
    """

    # One chapter anchor: group 1 is the href ending in ".html", group 2 the
    # title. The href group stops at the closing quote and the title group is
    # non-greedy, so several anchors on one line are matched separately.
    _CHAPTER_LINK = re.compile(r'<a href="([^"]+\.html)">(.+?)</a>')

    def __init__(self, baseLink, timeout=10, skipCount=15):
        """Store the index URL and start with an empty chapter list.

        Args:
            baseLink: URL of the book's index page.
            timeout: Seconds to wait for each HTTP request before giving up.
            skipCount: How many leading anchors of the menu to drop; the
                default of 15 covers the duplicated links at the top of the
                chapter list.
        """
        self.baseLink = baseLink
        self.timeout = timeout
        self.skipCount = skipCount
        self.chapterNames = []
        self.chapterLinks = []
        self.chapterCount = 0

    def _parseChapters(self, listHtml):
        """Extract chapter titles and absolute links from the menu markup.

        Args:
            listHtml: Markup of the chapter list, as a string.

        Returns:
            A ``(names, links)`` pair of parallel lists, with the first
            ``skipCount`` anchors removed.
        """
        matches = self._CHAPTER_LINK.findall(listHtml)[self.skipCount:]
        # Strip a trailing slash so the joined URL never contains "//".
        base = self.baseLink.rstrip("/")
        names = [title for _, title in matches]
        # Only the file name of each href is kept and re-rooted on baseLink.
        links = [base + "/" + href.split("/")[-1] for href, _ in matches]
        return names, links

    def downloadMenu(self):
        """Download the index page and rebuild the chapter lists.

        The chapter attributes are replaced only on success, so calling this
        method again does not accumulate duplicates and a failed call leaves
        the previous state untouched.

        Returns:
            True if the chapter list was found and parsed, False otherwise.
        """
        try:
            html = requests.get(self.baseLink, timeout=self.timeout).text
            bf = BeautifulSoup(html, "lxml")
            listBf = bf.find("div", {"class": "listmain"})
            if listBf is None:
                # The page has no chapter list container: treat as a failure
                # instead of silently reporting an empty menu.
                print("can not download menu")
                return False
            names, links = self._parseChapters(str(listBf))
        except Exception as err:
            # Best effort: report the reason and signal failure, but let
            # KeyboardInterrupt / SystemExit propagate.
            print("can not download menu: %s" % err)
            return False

        self.chapterNames = names
        self.chapterLinks = links
        self.chapterCount = len(names)
        return True

    def downloadContent(self, chapterLink):
        """Download one chapter page and return its text.

        Args:
            chapterLink: Absolute URL of the chapter page.

        Returns:
            The chapter text, or an empty string if the page could not be
            fetched or contains no ``showtxt`` container.
        """
        print('start download %s' % chapterLink)
        try:
            html = requests.get(chapterLink, timeout=self.timeout).text
            content = BeautifulSoup(html, "lxml").find("div", {"class": "showtxt"})
            if content is None:
                print("can not download content")
                return ""
            # Runs of eight non-breaking spaces (&nbsp;) become a blank line.
            return content.text.replace("\xa0" * 8, "\n\n")
        except Exception as err:
            # Best effort: a failed chapter yields empty text, not a crash.
            print("can not download content: %s" % err)
            return ""

    def writer(self, name, path, text):
        """Append one chapter to a UTF-8 text file.

        Args:
            name: Chapter title, written on its own line.
            path: File to append to; created if it does not exist.
            text: Chapter body, followed by a blank line.
        """
        with open(path, "a", encoding="utf-8") as f:
            f.write(name + "\n")
            # write(), not writelines(): text is a single string, and
            # writelines() would iterate it character by character.
            f.write(text)
            f.write('\n\n')


if __name__ == "__main__":

    # Script entry point: fetch the chapter menu of the book, then append the
    # first few chapters to a local text file.
    dl = Downloader("http://www.biqukan.com/1_1094")
    if dl.downloadMenu():
        print("start download...")

        # Single-threaded downloading is slow, so only the first 10 chapters
        # are fetched as a test. Slicing (instead of indexing range(10))
        # avoids an IndexError when fewer than 10 chapters were found.
        for name, link in zip(dl.chapterNames[:10], dl.chapterLinks[:10]):
            dl.writer(name, "一念永恒.txt", dl.downloadContent(link))

        print("end download!")
    else:
        print("download failure")