Scraping novels from Biquw (笔趣网)

Author: 52_St | Published 2018-05-01 18:54
    import os
    
    import requests
    from lxml import etree
    
    '''
    Download novels from www.biquw.com
    '''
    
    
    class BiquwNovel:
    
        # Initialize with the book's catalog page URL
        def __init__(self, url):
            self.__url = url
    
        # Fetch a page and parse it into an lxml element tree
        def __parse(self, url):
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
            }
            try:
                response = requests.get(url, headers=headers, timeout=10)
                # Let requests guess the real encoding from the content to avoid garbled text on GBK pages
                response.encoding = response.apparent_encoding
                self.__html = response.text
            except Exception as e:
                print(e)
                return False
            self.__tree = etree.HTML(self.__html)
            return True
    
        # Download the novel and save each chapter under file_path
        def download(self, file_path):
            # Abort if the catalog page cannot be fetched
            if not self.__parse(self.__url):
                return
            # Extract the book title
            book_name = self.__tree.xpath('//h1/text()')[0].strip()
            # Extract the chapter links
            chapter_links = self.__tree.xpath('//div[@class="book_list"]/ul//li/a/@href')
            # Create a directory named after the book
            file_path = os.path.join(file_path, book_name)
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            print('Start downloading... {}'.format(book_name))
            # Download every chapter in turn
            for link in chapter_links:
                # Chapter links are relative to the catalog URL; skip a chapter whose page cannot be fetched
                if not self.__parse(self.__url + link):
                    continue
                chapter_name = self.__tree.xpath('//h1/text()')[0].strip()
                content = self.__tree.xpath('//div[@id="htmlContent"]/text()')
                # Save the chapter content to a file named after the chapter
                self.__process_text(file_path, chapter_name, content)
                print(chapter_name + ' ... done!')
    
        # Write one downloaded chapter to a text file
        @staticmethod
        def __process_text(file_path, chapter_name, content):
            with open(os.path.join(file_path, '{}.txt'.format(chapter_name)), 'w', encoding='utf-8') as f:
                for text in content:
                    text = text.strip()
                    if text:
                        f.write(text + '\n\n')
    
    
    # Create an instance for the book's catalog page URL
    book = BiquwNovel('http://www.biquw.com/book/900/')
    # Call download() to fetch and save the novel
    book.download(r'E:\Python\python_work')
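One caveat: chapter titles scraped from the page can contain characters such as `?` or `*` that Windows does not allow in file names, which would make the `open()` call in `__process_text` fail. A minimal sketch of a sanitizing helper you could apply to `chapter_name` before writing the file (the helper name, regex, and replacement character are my own assumptions, not part of the original script):

    import re

    def safe_filename(name):
        # Replace characters Windows forbids in file names (\ / : * ? " < > |)
        # with an underscore, and trim surrounding whitespace
        return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

    # Example (hypothetical chapter title): safe_filename('第1章 缘起?') -> '第1章 缘起_'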
    
