Scraping Free Books from Qidian (起点中文网)

Author: 哼Bug | Published 2019-06-24 17:10

    This script crawls Qidian's free-book listing page by page, collects each book's ID, title, and author, then uses a process pool to download every chapter into a local text file.


    import os
    import requests
    from multiprocessing import Pool
    from bs4 import BeautifulSoup
    
    
    class QidianSpider:
        # Initialize with page count, listing URL, and local save folder
        def __init__(self, pages, url, localPath):
            self.pages = pages
            self.url = url
            self.localPath = localPath
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
                'Connection': 'close'
            }
    
        # Download books page by page; each listing page holds 20 books
        def download_book(self):
            self.create_folder()
            for i in range(self.pages):
                param = {
                    "orderId": '',
                    "vip": 'hidden',
                    "style": 1,
                    'pageSize': 20,
                    "siteid": 1,
                    "pubflag": 0,
                    "hiddenField": 1,
                    "page": i + 1
                }
                try:
                    # fetch one listing page
                    data_responses = self.get_responses(param)
                    # pull each book's title, ID, and author from the page;
                    # the ID is used later to fetch the book's content
                    book_info_list = self.get_book_info(data_responses)
                    # download this page's books with a process pool
                    self.multiprocess_download(book_info_list, 10)
                except Exception as e:
                    print(e)
    
        # Create the target folder if it does not exist
        def create_folder(self):
            if not os.path.exists(self.localPath):
                try:
                    os.makedirs(self.localPath)
                except Exception as e:
                    raise
    
        # Fetch one listing page; let errors propagate to the caller's try block
        def get_responses(self, param):
            data_responses = requests.get(self.url, params=param, headers=self.headers, timeout=10)
            data_responses.raise_for_status()
            return data_responses
    
        # Extract book info from one listing page; the IDs drive the per-book downloads
        def get_book_info(self, data_responses):
            soup = BeautifulSoup(data_responses.text, 'lxml')
            book_info_raw = soup.select('div.book-mid-info')
            book_info_list = []
            for book_info_raw_single in book_info_raw:
                book_info_dict = dict()
                book_info_dict["title"] = book_info_raw_single.select('h4 > a')[0].get_text()
                book_info_dict["id"] = book_info_raw_single.select('h4 > a')[0].get('data-bid')
                book_info_dict['author'] = book_info_raw_single.select('.name')[0].get_text()
                book_info_list.append(book_info_dict)
            return book_info_list
    
        # Download a page's books concurrently with a process pool
        def multiprocess_download(self, book_info_list, process):
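            # NOTE: Pool() spawns separate worker processes; on Windows each
            # worker re-imports this module, which is why the script entry
            # point below stays under the `if __name__ == '__main__':` guard.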
            pool = Pool(process)
            for book_info_dict in book_info_list:
                pool.apply_async(self.download_one, (book_info_dict,))
            pool.close()
            pool.join()
    
        # Worker process: download one book
        def download_one(self, book_info_dict):
            book_path = self.localPath + book_info_dict["title"] + '.txt'
            if os.path.exists(book_path):
                print('exists:', book_path)
                return
            # catch per-book errors so one failure does not kill the worker
            try:
                book_catalog_responses = requests.get("https://book.qidian.com/info/%s#Catalog" % book_info_dict["id"],
                                                      timeout=10, headers=self.headers)
                if book_catalog_responses.status_code == 200:
                    print("当前进程ID:{},图书信息:{}".format(os.getpid(), book_info_dict))
                    self.get_book_catalog_url(book_catalog_responses, book_info_dict)
                    self.save_book_content(book_info_dict)
            except Exception as e:
                print("异常:{}".format(book_info_dict), e)
    
        # Collect the chapter URLs from the book's catalog page
        def get_book_catalog_url(self, book_catalog_responses, book_info_dict):
            soup = BeautifulSoup(book_catalog_responses.text, 'html.parser')
            book_catalog_info_raw = soup.select('.volume-wrap li[data-rid] a[href]')
            book_catalog_url_list = []
            for book_catalog_info_raw_single in book_catalog_info_raw:
                book_catalog_url_list.append(book_catalog_info_raw_single['href'])
            book_info_dict['bookCatalogUrl'] = book_catalog_url_list
    
        # Fetch every chapter and save the book as a text file
        def save_book_content(self, book_info_dict):
            with open(self.localPath + '{}.txt'.format(book_info_dict["title"]), 'w', encoding='utf-8') as f:
                f.write(book_info_dict['title'] + '\n')
                f.write("作者:" + book_info_dict['author'] + '\n')
                for url in book_info_dict['bookCatalogUrl']:
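                    # chapter hrefs on the catalog page are protocol-relative
                    # ("//..."), hence the "https:" prefix on the request below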
                    try:
                        book_content_responses = requests.get("https:" + url, timeout=10, headers=self.headers)
                        if book_content_responses.status_code == 200:
                            soup = BeautifulSoup(book_content_responses.text, 'html.parser')
                            book_catalog = soup.find('h3', attrs={'class': 'j_chapterName'}).get_text()
                            f.write('\n' + book_catalog + '\n')
                            book_content = soup.find('div', attrs={'class': "read-content j_readContent"})
                            f.write('\t' + book_content.get_text() + '\n')
                    except Exception as e:
                        print('Error: failed to fetch a chapter of {}'.format(book_info_dict['title']), e)
    
    
    if __name__ == '__main__':
        spider = QidianSpider(100, 'https://www.qidian.com/free/all', 'E:/book/')
        spider.download_book()
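
    To sanity-check the CSS selectors used in get_book_info without touching the network, you can run them against a static snippet. This is a minimal sketch; the HTML below is a hand-written stand-in for the real listing markup, which may have changed since this post was written.

    from bs4 import BeautifulSoup

    sample_html = '''
    <div class="book-mid-info">
      <h4><a data-bid="1001">Example Title</a></h4>
      <p class="author"><a class="name">Example Author</a></p>
    </div>
    '''
    soup = BeautifulSoup(sample_html, 'html.parser')
    for node in soup.select('div.book-mid-info'):
        print(node.select('h4 > a')[0].get_text())       # Example Title
        print(node.select('h4 > a')[0].get('data-bid'))  # 1001
        print(node.select('.name')[0].get_text())        # Example Author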
    
    

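    If Qidian starts throttling the spider, one way to harden the fetches is to reuse a single requests.Session with automatic retries instead of calling requests.get each time (in that case, drop the 'Connection': 'close' header so connections can be reused). This is a sketch of the idea, not part of the original script, and the retry parameters are placeholders:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    def make_session():
        # retry transient server errors with exponential backoff
        retry = Retry(total=3, backoff_factor=1,
                      status_forcelist=[429, 500, 502, 503, 504])
        session = requests.Session()
        session.mount('https://', HTTPAdapter(max_retries=retry))
        return session

    # Inside the spider, replace each requests.get(...) with session.get(...),
    # creating one session per worker process.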