美文网首页
起点免费小说爬取

起点免费小说爬取

作者: 凉风有信2020 | 来源:发表于2020-06-15 22:26 被阅读0次

写了一个小爬虫,用来爬取起点的每日免费小说,并保存在本地文件夹中。

之后会做进一步的改善,包括增加 IP 池、Header 池、容错机制、多线程爬取,以及将小说章节保存到数据库等,让爬虫更加健壮。

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_free_book(start_url, timeout=10):
    """Collect the book links found on the daily-free listing page.

    Args:
        start_url: URL of the listing page; also used as the base for
            resolving relative links.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        A list of absolute book URLs, in page order. List items that do not
        contain a title link are skipped instead of raising ``IndexError``.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
    """
    res = requests.get(start_url, timeout=timeout)
    soup = BeautifulSoup(res.text, "html.parser")
    book_list = []
    for book in soup.select('div.limit-book-list li'):
        link = book.select_one('div.book-mid-info h4 a')
        href = link.get("href") if link is not None else None
        if not href:
            # Not a book entry (or the markup changed); nothing to follow.
            continue
        book_list.append(urljoin(start_url, href))
    return book_list

def get_chapters(book_url, timeout=10):
    """Collect the chapter links listed on a book page.

    Args:
        book_url: URL of the book page; also used as the base for resolving
            relative chapter links.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute chapter URLs, in page order. Anchors without an
        ``href`` are ignored; previously ``urljoin`` turned them into the
        book URL itself, which was then treated as a chapter.

    Raises:
        requests.RequestException: If the book page cannot be fetched.
    """
    # NOTE(review): a URL fragment is normally not transmitted to the server,
    # so "#Catalog" most likely has no effect on the response; kept as-is.
    chapter_list_url = book_url + "#Catalog"
    res = requests.get(chapter_list_url, timeout=timeout)
    soup = BeautifulSoup(res.text, "html.parser")
    hrefs = (tag.get("href") for tag in soup.select("div.volume ul.cf li a"))
    return [urljoin(book_url, href) for href in hrefs if href]

def parse_bookname(first_chapter_url, timeout=10):
    """Extract the book title and author from a chapter page.

    The result is used by ``download_book`` to build the ``<title>.txt``
    file name.

    Args:
        first_chapter_url: URL of a chapter page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``(name, author)`` as strings, or ``(None, None)`` when the page
        cannot be fetched or does not contain both elements.
    """
    try:
        res = requests.get(first_chapter_url, timeout=timeout)
    except requests.RequestException:
        # Only network/HTTP-layer failures are treated as "not found";
        # KeyboardInterrupt and programming errors are no longer swallowed.
        return None, None

    soup = BeautifulSoup(res.text, "html.parser")
    name_tag = soup.select_one('.book-cover-wrap h1')
    author_tag = soup.select_one('.book-cover-wrap h2 a')
    if name_tag is None or author_tag is None:
        return None, None
    return name_tag.get_text(), author_tag.get_text()

def parse_detail(chapter_url, timeout=10):
    """Fetch one chapter page and extract its text and metadata.

    Args:
        chapter_url: URL of the chapter page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``(title, content, num, time)`` where ``title`` is the text of the
        ``.content-wrap`` element, ``content`` is the chapter body with each
        whitespace-separated piece on its own tab-indented line, and ``num``
        / ``time`` are the texts of ``.j_chapterWordCut`` and
        ``.j_updateTime`` (written out by ``download_book`` as word count
        and publication time).

    Raises:
        requests.RequestException: If the page cannot be fetched.
        IndexError: If any of the expected elements is missing.
    """
    res = requests.get(chapter_url, timeout=timeout)
    soup = BeautifulSoup(res.text, "html.parser")
    title = soup.select('.content-wrap')[0].get_text()
    num = soup.select('.j_chapterWordCut')[0].get_text()
    # Named update_time rather than "time" to avoid shadowing the stdlib module.
    update_time = soup.select('.j_updateTime')[0].get_text()
    # Only the first matched <p> is read, exactly as before.
    # NOTE(review): presumably the page's <p> tags nest under html.parser so
    # the first one holds the whole chapter - confirm against live markup.
    pieces = soup.select('.read-content p')[0].text.split()
    content = "".join("\t" + piece + "\n" for piece in pieces)
    return title, content, num, update_time

def download_book(chapter_list):
    """Download every chapter of a book into ``<title>.txt``.

    The book title is looked up on the chapter pages in order, moving on to
    the next page whenever ``parse_bookname`` reports no title. The output
    file is created in the current working directory, overwriting any
    existing file of the same name.

    Args:
        chapter_list: Chapter URLs in reading order.

    Returns:
        None. An empty ``chapter_list`` is a no-op.

    Raises:
        ValueError: If no chapter page yields a book title (previously this
            ran past the end of the list and raised ``IndexError``).
    """
    import re  # local import so this block does not depend on file-level imports

    if not chapter_list:
        return

    name = None
    for candidate_url in chapter_list:
        name, _author = parse_bookname(candidate_url)
        if name:
            break
    if not name:
        raise ValueError("could not find a book title on any chapter page")

    # The title comes from a remote page: replace path separators and other
    # characters that are illegal in file names so the write stays in the
    # current directory and works on every platform.
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name).strip() or "untitled"

    # A single "w" open both truncates and writes; the separate truncate-then-
    # append opens were redundant.
    with open("%s.txt" % (safe_name,), "w", encoding="utf-8") as file:
        for chapter_url in chapter_list:
            title, content, num, time = parse_detail(chapter_url)
            file.write("%s 字数:%s 发表时间:%s \n%s" % (title, num, time, content))

if __name__ == '__main__':
    # Script entry point: walk the daily-free listing and save each book
    # to a text file, one book at a time.
    start_url = 'https://www.qidian.com/free'
    for book_url in get_free_book(start_url):
        download_book(get_chapters(book_url))

相关文章

网友评论

      本文标题:起点免费小说爬取

      本文链接:https://www.haomeiwen.com/subject/bhcnzhtx.html