美文网首页
爬虫18:练习之小说下载

爬虫18:练习之小说下载

作者: _百草_ | 来源:发表于2022-09-14 18:13 被阅读0次
# -*- coding:utf-8 -*-
"""
@author:百草Lily
@file:test_fiction.py
@time:2022/9/14
"""
import time
import random
from faker import Faker
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from queue import Queue


class Fiction:
    """Download a novel ("Romance of the Three Kingdoms") chapter by chapter
    from shicimingju.com and append each chapter's text to fiction.txt."""

    def __init__(self):
        self.base_url = "https://www.shicimingju.com"
        self.url = self.base_url + "/book/sanguoyanyi.html"
        # Random zh_CN user agent so requests look like a regular browser.
        fake = Faker(locale="zh_CN")
        self.headers = {
            "user-agent": fake.user_agent()
        }
        self.q = Queue()  # FIFO queue of chapter URLs awaiting download

    def get_request(self, url):
        """Fetch *url* with the fake-UA headers and return the raw body bytes.

        The response is used as a context manager so the underlying
        connection is always closed (the original leaked it).
        """
        req = Request(url, headers=self.headers)
        with urlopen(req) as resp:
            return resp.read()

    @staticmethod
    def save_file(content):
        """Append *content* to fiction.txt in the working directory (UTF-8)."""
        with open("fiction.txt", "a", encoding="utf-8") as f:
            f.write(content)

    def get_urls(self):
        """Parse the table-of-contents page and enqueue every chapter URL."""
        html = self.get_request(self.url)
        soup = BeautifulSoup(html, "lxml")
        # Each <a> under .book-mulu holds a site-relative chapter link.
        for anchor in soup.select(".book-mulu a"):
            self.q.put(self.base_url + anchor["href"])

    def parse_html(self, url):
        """Download one chapter page and return its visible text.

        Expected markup:
        <div id="main_left">
          <div class="card bookmark-list">
              <h1>chapter title</h1>
              <div class="chapter_content">chapter body</div>
          </div>
          <div class="book-page-nav">pagination</div>
        </div>
        """
        html = self.get_request(url).decode("utf8")
        soup = BeautifulSoup(html, "lxml")
        eles = soup.select("#main_left .card")
        # ele.text would work as well as get_text().
        return "".join(ele.get_text() for ele in eles)

    def run(self):
        """Drain the queue: download each chapter and append it to the file."""
        self.get_urls()
        while not self.q.empty():
            url = self.q.get()
            print(f"-----开始下载:{url}-----")
            content = self.parse_html(url)
            self.save_file(content)
            time.sleep(random.uniform(1, 3))  # throttle to avoid bans
            print(f"-----结束下载:{url}-----")


if __name__ == "__main__":
    # Kick off the full download run when executed as a script.
    Fiction().run()

反思:

  1. 每次请求的时间间隔:time.sleep(random.uniform(1, 3)) # 避免请求频繁
  2. 文本保存异常的处理:html = self.get_request(url).decode("utf8") # 编码

参考

  1. Python爬虫下载小说

相关文章

网友评论

      本文标题:爬虫18:练习之小说下载

      本文链接:https://www.haomeiwen.com/subject/mmtfirtx.html