美文网首页
Python爬取笔趣阁小说

Python爬取笔趣阁小说

作者: 言蹊灬 | 来源:发表于2020-12-11 15:46 被阅读0次

注意:本教程基于 Python 3.9;代码使用了 f-string 和变量注解,需要 Python 3.6 及以上版本,不适用于 2.x 版本。
需要用到 requests、BeautifulSoup(bs4)和 requests_html(HTMLSession):
pip3 install requests beautifulsoup4 requests-html

import sys
import requests
from collections import deque
from bs4 import BeautifulSoup
from requests_html import HTMLSession

# Keyword posted to the site's search endpoint.
searchkey = '唐家三少'

# Search hits as {'href': ..., 'text': ...} dicts; turned into a deque below so
# readBookWeb() can consume them from the left.
bookList = []
mainUrl = 'http://www.xbiquge.la/'
session = HTMLSession()
# readBookWeb() and getBook() call each other recursively, so the default
# recursion limit is raised.
sys.setrecursionlimit(100000)

bookListReq = requests.post("http://www.xbiquge.la/modules/article/waps.php", data={'searchkey': searchkey})
bookListReq.encoding = 'utf-8'
bookListHt = BeautifulSoup(bookListReq.text, 'html.parser')
for aTag in bookListHt.find_all('a'):
    tag_html = str(aTag)
    # Use boolean `and` with membership tests.  The previous
    # `x > -1 & y > -1` parsed as `x > (-1 & y) > -1` (bitwise `&` binds
    # tighter than comparisons), which only matched when "target" happened
    # to appear after "//" in the tag markup.
    if 'target' in tag_html and '//' in tag_html:
        bookList.append({'href': aTag["href"], 'text': aTag.text})

bookList = deque(bookList)
# Total number of books found, and the 1-based counter of the book currently
# being downloaded (incremented by readBookWeb()); both are used for progress
# output.
length: int = len(bookList)
idx: int = 0


# Fetch the text of one chapter page.
class GetTxt:
    """One downloaded chapter page.

    Attributes:
        url: The URL the page was fetched from.
        response: The response object returned by ``session.get(url)``.
        title: Text of the ``h1`` inside ``div.bookname``.
        text: Text of ``div#content``.

    Raises:
        IndexError: If the page does not contain the expected title or
            content element (the first match is taken unconditionally).
    """

    # Class-level defaults; every instance overwrites them in __init__.
    response = None
    text = None
    title = None
    url = None

    def __init__(self, url):
        # Remember where the page came from; previously this attribute was
        # declared on the class but never filled in.
        self.url = url
        self.response = session.get(url)
        self.title = self.response.html.find('body > #wrapper > div.content_read > div > div.bookname > h1')[0].text
        self.text = self.response.html.find('body > #wrapper > div.content_read > div > div#content')[0].text

    def getNextUrl(self):
        """Return the ``links`` of the fourth child anchor in ``div.bottem1``.

        NOTE(review): presumably this anchor is the page's "next chapter"
        button -- confirm against the site's markup.

        Raises:
            IndexError: If the anchor is not present on the page.
        """
        return self.response.html.find(
            'body > #wrapper > div.content_read > div > div.bookname > div.bottem1 > a:nth-child(4)')[0].links


def _write_chapter(url, book_name):
    """Fetch one chapter, append it to ``<book_name>.txt`` and return its next links."""
    book = GetTxt(url)
    print(f'{idx}/{length} {book_name} 正在写入章节 -----> ' + book.title)
    with open(f'{book_name}.txt', "a", encoding='utf-8') as f:
        f.write('\r' + book.title + '\r' + book.text)
    return book.getNextUrl()


def getBook(url, book_name):
    """Download a book chapter by chapter, starting at ``url``.

    Each chapter is appended to ``<book_name>.txt``.  Every link returned by
    the chapter's ``getNextUrl()`` is then examined: a link with at least
    three ``/``-separated parts is fetched as the next chapter (prefixed with
    ``mainUrl``); any other link hands over to ``readBookWeb()`` to start the
    next queued book.

    The walk is depth-first in the same order as a recursive implementation,
    but uses an explicit stack of link iterators.  This way a book with
    thousands of chapters neither exhausts the interpreter's call stack nor
    keeps every chapter's response alive until the very end.

    Args:
        url: Absolute URL of the first chapter to fetch.
        book_name: Book title; used as the output file stem and in progress
            output.
    """
    pending = [iter(_write_chapter(url, book_name))]
    while pending:
        u = next(pending[-1], None)
        if u is None:
            # All links of the most recent chapter have been handled.
            pending.pop()
            continue
        if len(u.split('/')) >= 3:
            pending.append(iter(_write_chapter(f'{mainUrl}{u}', book_name)))
        else:
            readBookWeb()


# Read a book's detail page and start downloading it.
def readBookWeb():
    """Take the next book off ``bookList`` and download it via ``getBook``.

    Increments the global ``idx`` progress counter, fetches the book's detail
    page, takes the ``href`` of the first ``<a>`` inside the first ``<dd>`` as
    the first chapter, creates (or truncates) ``<book name>.txt`` and passes
    the chapter URL to ``getBook``.

    Returns immediately, without touching ``idx``, when ``bookList`` is empty.
    Previously the unconditional ``popleft()`` raised ``IndexError`` once the
    last book had been downloaded or when the search returned nothing.
    """
    global idx
    if not bookList:
        return
    idx = idx + 1
    book = bookList.popleft()
    temp_book_name = book['text']
    temp_book_href = book['href']
    response = session.get(temp_book_href)
    response.encoding = 'utf-8'
    book_detail_html = BeautifulSoup(response.text, 'html.parser')
    first_chapter_href = book_detail_html.find('dd').find('a')['href']
    # Start from an empty file: getBook() only ever appends to it.
    with open(f'{temp_book_name}.txt', 'w', encoding='utf-8'):
        pass
    getBook(f'{mainUrl}{first_chapter_href}', temp_book_name)


# Entry point: start with the first queued search result. getBook() calls
# readBookWeb() again when it meets a link that is not a chapter-style path.
readBookWeb()

image.png

相关文章

网友评论

      本文标题:Python爬取笔趣阁小说

      本文链接:https://www.haomeiwen.com/subject/rlijgktx.html