写了一个小爬虫,用来爬取起点的每日免费小说,保存在本地文件夹中。
之后会做进一步的改善,包括增加IP池,Header池,容错机制,多线程爬取和将小说章节保存到数据库等,让爬虫更加健壮。
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def get_free_book(start_url):
    """Scrape the daily-free-novel listing page and return book-detail URLs.

    :param start_url: URL of the free-book listing page.
    :return: list of absolute book URLs (empty if the selector matches nothing).
    :raises requests.HTTPError: if the listing page returns an error status.
    """
    # timeout keeps the crawler from hanging forever on a stalled connection
    res = requests.get(start_url, timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    # urljoin resolves protocol-relative hrefs (e.g. //book.qidian.com/...)
    return [
        urljoin(start_url, book.select('div.book-mid-info h4 a')[0].get("href"))
        for book in soup.select('div.limit-book-list li')
    ]
def get_chapters(book_url):
    """Scrape a book's detail page and return its chapter URLs in order.

    :param book_url: absolute URL of the book's detail page.
    :return: list of absolute chapter URLs.
    :raises requests.HTTPError: if the catalog page returns an error status.
    """
    # "#Catalog" jumps to the table-of-contents section of the detail page
    res = requests.get(book_url + "#Catalog", timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    return [
        urljoin(book_url, tag.get("href"))
        for tag in soup.select("div.volume ul.cf li a")
    ]
def parse_bookname(first_chapter_url):
    """Extract the book name and author from a chapter page.

    Used by download_book to name the output "<name>.txt" file; some chapter
    pages lack the metadata block, in which case (None, None) is returned so
    the caller can try the next page.

    :param first_chapter_url: absolute URL of a chapter page.
    :return: (name, author) tuple, or (None, None) if the page has no metadata
             or the request fails.
    """
    try:
        res = requests.get(first_chapter_url, timeout=10)
        soup = BeautifulSoup(res.text, "html.parser")
        name = soup.select('.book-cover-wrap h1')[0].get_text()
        author = soup.select('.book-cover-wrap h2 a')[0].get_text()
        return name, author
    # Narrowed from a bare `except:` — only swallow the failures this lookup
    # can actually produce (network error, selector found no element).
    except (requests.RequestException, IndexError):
        return None, None
def parse_detail(chapter_url):
    """Parse one chapter page and return its content and metadata.

    :param chapter_url: absolute URL of the chapter page.
    :return: (title, content, word_count, update_time) — content has each
             paragraph indented with a tab and terminated with a newline.
    :raises requests.HTTPError: if the page returns an error status.
    :raises IndexError: if any expected element is missing from the page.
    """
    res = requests.get(chapter_url, timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    title = soup.select('.content-wrap')[0].get_text()
    word_count = soup.select('.j_chapterWordCut')[0].get_text()
    # renamed from `time` to avoid shadowing the stdlib module name
    update_time = soup.select('.j_updateTime')[0].get_text()
    paragraphs = soup.select('.read-content p')[0].text.split()
    # join once instead of quadratic `content += ...` in a loop
    content = "".join("\t" + p + "\n" for p in paragraphs)
    return title, content, word_count, update_time
def download_book(chapter_list):
    """Download every chapter in chapter_list into "<book name>.txt".

    Walks forward through the chapter pages until one of them yields the
    book name (some pages lack the metadata block), then writes all
    chapters sequentially to a fresh file.

    :param chapter_list: list of absolute chapter URLs in reading order.
    :raises ValueError: if no page in chapter_list exposes the book name
                        (the original code crashed with an opaque IndexError).
    """
    name = None
    for probe_url in chapter_list:
        name, author = parse_bookname(probe_url)
        if name:
            break
    if not name:
        raise ValueError("could not determine book name from any chapter page")
    # A single open in "w" mode truncates and writes in one step; the
    # original opened the file twice (once to truncate, once to append).
    with open("%s.txt" % (name,), "w", encoding="utf-8") as file:
        for chapter_url in chapter_list:
            title, content, num, time = parse_detail(chapter_url)
            file.write("%s 字数:%s 发表时间:%s \n%s" % (title, num, time, content))
if __name__ == '__main__':
    # Entry point: crawl the daily-free listing, then fetch every book it links to.
    listing_url = 'https://www.qidian.com/free'
    for book_url in get_free_book(listing_url):
        download_book(get_chapters(book_url))
网友评论