美文网首页
Python爬取笔趣阁小说

Python爬取笔趣阁小说

作者: 言蹊灬 | 来源:发表于2020-12-11 15:46 被阅读0次

    注意:本教程基于 Python 3.9;代码使用了 f-string,需要 Python 3.6 及以上版本,不适用于 2.x 版本。
    需要用到 requests、BeautifulSoup(bs4)以及 requests_html 中的 HTMLSession。
    pip3 install requests beautifulsoup4 requests-html

    import sys
    import requests
    from collections import deque
    from bs4 import BeautifulSoup
    from requests_html import HTMLSession
    
    # Keyword posted to the site's search endpoint.
    searchkey = '唐家三少'

    # Search results: one {'href': ..., 'text': ...} dict per matching link.
    bookList = []
    mainUrl = 'http://www.xbiquge.la/'
    session = HTMLSession()
    # The crawl functions defined further down call each other recursively,
    # so the interpreter's recursion limit is raised far above the default.
    sys.setrecursionlimit(100000)

    bookListReq = requests.post("http://www.xbiquge.la/modules/article/waps.php", data={'searchkey': searchkey})
    bookListReq.encoding = 'utf-8'
    bookListHt = BeautifulSoup(bookListReq.text, 'html.parser')
    for aTag in bookListHt.find_all('a'):
        tagHtml = str(aTag)
        # Keep only anchors whose markup contains both 'target' and '//'.
        # The previous `a > -1 & b > -1` form parsed as the chained
        # comparison `a > (-1 & b) > -1` because `&` binds tighter than `>`,
        # so it compared the two positions instead of testing both for
        # presence. The href guard avoids a KeyError on anchors without one.
        if 'target' in tagHtml and '//' in tagHtml and aTag.has_attr('href'):
            bookList.append({'href': aTag["href"], 'text': aTag.text})

    # A deque lets the crawler consume results from the left end.
    bookList = deque(bookList)
    length: int = len(bookList)  # total number of search results
    idx: int = 0  # 1-based counter of the book currently being processed
    
    
    # Fetch the text of one chapter page
    class GetTxt:
        """A single chapter page fetched through the module-level ``session``.

        Attributes set by ``__init__``:
            url:      the address that was requested.
            response: the object returned by ``session.get(url)``.
            title:    text of the chapter heading element.
            text:     text of the chapter content element.
        """

        # Class-level defaults; every instance overwrites them in __init__.
        response = None
        text = None
        title = None
        url = None

        # Common CSS prefix of the container that all three selectors start from.
        _READ_AREA = 'body > #wrapper > div.content_read > div'

        def __init__(self, url):
            """Request ``url`` and extract the chapter title and body text.

            Indexing ``[0]`` raises IndexError when a selector matches nothing.
            """
            # Previously the ``url`` attribute was declared but never assigned,
            # so it always read as None.
            self.url = url
            self.response = session.get(url)
            page = self.response.html
            self.title = page.find(f'{self._READ_AREA} > div.bookname > h1')[0].text
            self.text = page.find(f'{self._READ_AREA} > div#content')[0].text

        def getNextUrl(self):
            """Return the ``links`` of the fourth child of the ``bottem1`` bar.

            The caller iterates over the result to find the following page.
            Raises IndexError when the selector matches nothing.
            """
            return self.response.html.find(
                f'{self._READ_AREA} > div.bookname > div.bottem1 > a:nth-child(4)')[0].links
    
    
    def getBook(url, book_name):
        """Append every chapter reachable from ``url`` to ``<book_name>.txt``.

        Starting at ``url``, each page is fetched with ``GetTxt``, its title
        and text are appended to the output file, and the "next" link is
        followed. A link with at least three ``/``-separated parts is treated
        as another chapter; any other link ends this book and hands over to
        ``readBookWeb()`` for the next search result.

        Chapters are followed with a loop rather than one recursive call per
        chapter, so stack depth no longer grows with the length of the novel.

        Args:
            url: absolute address of the first chapter to download.
            book_name: book title; used as the output file name (without
                extension) and in the progress message.
        """
        next_url = url
        start_next_book = False
        while next_url is not None:
            book = GetTxt(next_url)
            print(f'{idx}/{length} {book_name} 正在写入章节 -----> ' + book.title)
            # NOTE(review): '\r' (bare carriage return) is kept as the
            # separator to preserve the existing file format; confirm whether
            # '\n' was intended.
            with open(f'{book_name}.txt', "a", encoding='utf-8') as f:
                f.write('\r' + book.title + '\r' + book.text)
            # Stop unless a chapter link is found below.
            next_url = None
            for u in book.getNextUrl():
                if len(u.split('/')) >= 3:
                    next_url = f'{mainUrl}{u}'
                else:
                    start_next_book = True
        if start_next_book:
            readBookWeb()
    
    
    # Read a novel's detail page
    def readBookWeb():
        """Take the next book from ``bookList`` and download it from chapter one.

        Increments the global ``idx`` counter, fetches the book's detail page,
        reads the link inside the first ``<dd>`` element as the first chapter,
        creates (or truncates) ``<book title>.txt`` and passes control to
        ``getBook``. Returns without doing anything when ``bookList`` is empty.

        Raises:
            AttributeError: if the detail page has no ``<dd>`` element or that
                element contains no ``<a>`` (``find`` returns None).
        """
        global idx
        if not bookList:
            # Every search result has been consumed (or the search returned
            # none); stop here instead of letting popleft() raise IndexError.
            return
        idx = idx + 1
        book = bookList.popleft()
        temp_book_name = book['text']
        temp_book_href = book['href']
        response = session.get(temp_book_href)
        response.encoding = 'utf-8'
        book_detail_html = BeautifulSoup(response.text, 'html.parser')
        first_chapter_href = book_detail_html.find('dd').find('a')['href']
        # Start from an empty file; getBook() only ever appends to it.
        with open(f'{temp_book_name}.txt', 'w', encoding='utf-8'):
            pass
        getBook(f'{mainUrl}{first_chapter_href}', temp_book_name)
    
    
    # Start the crawl: readBookWeb() takes the first entry of bookList.
    readBookWeb()
    
    
    image.png

    相关文章

      网友评论

          本文标题:Python爬取笔趣阁小说

          本文链接:https://www.haomeiwen.com/subject/rlijgktx.html