美文网首页
python爬虫(3)-下载一本小说

python爬虫(3)-下载一本小说

作者: 初灬终 | 来源:发表于2017-11-27 21:10 被阅读12次

环境:Python 3.6
目标:从 http://www.biqukan.com 抓取小说《一念永恒》。

import requests
from bs4 import BeautifulSoup
import re


class Downloader:
    """Fetch a novel's chapter index and chapter texts, and append them to a file.

    Attributes:
        baseLink: URL of the novel's index page, as passed to the constructor.
        chapterNames: Chapter titles collected by ``downloadMenu``.
        chapterLinks: Absolute chapter URLs, parallel to ``chapterNames``.
        chapterCount: Number of chapters collected by ``downloadMenu``.
    """

    # One chapter anchor inside the index markup: group 1 is the href, group 2
    # the title. Raw string so ``\.`` is a regex escape rather than an invalid
    # string escape; non-greedy so several anchors on one line stay separate.
    _CHAPTER_ANCHOR = re.compile(r'<a href="(.+?\.html)">(.+?)</a>')

    # The first links of the index repeat chapters that appear again further
    # down (per the original author's note), so they are skipped.
    _DUPLICATE_LINK_COUNT = 15

    # Seconds to wait for the server; without a timeout requests may block
    # forever on a stalled connection.
    _REQUEST_TIMEOUT = 10

    def __init__(self, baseLink):
        self.baseLink = baseLink
        self.chapterNames = []
        self.chapterLinks = []
        self.chapterCount = 0

    def downloadMenu(self) -> bool:
        """Download the index page and populate the chapter lists.

        The chapter attributes are replaced (not extended) on success, so the
        method can be called repeatedly without accumulating duplicates, and
        they are left untouched on failure.

        Returns:
            True when the index page was fetched and its chapter list
            container was found; False on a network/HTTP error or when the
            container is missing from the page.
        """
        try:
            response = requests.get(self.baseLink, timeout=self._REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException:
            print("can not download menu")
            return False

        listBf = BeautifulSoup(response.text, "lxml").find("div", {"class": "listmain"})
        if listBf is None:
            # str(None) would silently yield zero matches and a bogus success.
            print("can not download menu")
            return False

        # Each result is a (href, title) tuple.
        results = self._CHAPTER_ANCHOR.findall(str(listBf))
        results = results[self._DUPLICATE_LINK_COUNT:]

        # Avoid a double slash when baseLink already ends with "/".
        base = self.baseLink.rstrip("/")
        self.chapterNames = [title for _, title in results]
        # Only the file name of each href is kept and re-rooted at baseLink.
        self.chapterLinks = [base + "/" + href.split("/")[-1] for href, _ in results]
        self.chapterCount = len(results)
        return True

    def downloadContent(self, chapterLink: str) -> str:
        """Download one chapter page and return its text.

        Args:
            chapterLink: Absolute URL of the chapter page.

        Returns:
            The chapter text, or an empty string on a network/HTTP error or
            when the page has no ``showtxt`` container.
        """
        print('start download %s' % chapterLink)
        try:
            response = requests.get(chapterLink, timeout=self._REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException:
            print("can not download content")
            return ""

        content = BeautifulSoup(response.text, "lxml").find("div", {"class": "showtxt"})
        if content is None:
            print("can not download content")
            return ""

        # Runs of eight non-breaking spaces (&nbsp;) are turned into blank
        # lines; presumably they mark paragraph starts on the page.
        return content.text.replace("\xa0" * 8, "\n\n")

    def writer(self, name: str, path: str, text: str) -> None:
        """Append one chapter to a UTF-8 text file.

        Args:
            name: Chapter title, written on its own line.
            path: File to append to; created if it does not exist.
            text: Chapter body, followed by a blank line.
        """
        with open(path, "a", encoding="utf-8") as f:
            f.write(name + "\n")
            # text is a single string; write() emits it in one call instead
            # of iterating over it character by character.
            f.write(text)
            f.write("\n\n")


if __name__ == "__main__":

    dl = Downloader("http://www.biqukan.com/1_1094")
    if dl.downloadMenu():
        print("start download...")

        # Single-threaded downloading is slow, so only the first 10 chapters
        # are fetched as a test. Slicing (rather than indexing 0..9) keeps
        # this safe when the menu holds fewer than 10 chapters.
        for name, link in zip(dl.chapterNames[:10], dl.chapterLinks[:10]):
            dl.writer(name, "一念永恒.txt", dl.downloadContent(link))

        print("end download!")
    else:
        print("download failure")

相关文章

网友评论

      本文标题:python爬虫(3)-下载一本小说

      本文链接:https://www.haomeiwen.com/subject/dxhjbxtx.html