美文网首页
小说的爬取(1)

小说的爬取(1)

作者: 楚糖的糖 | 来源:发表于2018-10-29 21:23 被阅读0次

    本章讲了小说爬取下来的两种存储方式:(1)按章节(2)按整本书
    1.下面的这种方式是将小说按章节进行保存

    import re
    import sys
    import urllib.request
    import time
    
    
    def Novel(url):
        """Scrape a book's landing page: pull the TOC link, title, status and
        author, then hand off to chaptList() to crawl every chapter.

        url -- landing-page URL of one book on duquanben.com (GBK-encoded).
        """
        raw = urllib.request.urlopen(url).read().decode('gbk')
        # Strip whitespace/markup noise so the regexes can match in one pass.
        for junk in ('\n', '\t', '\r', '<br />', '&nbsp;'):
            raw = raw.replace(junk, '')
        # The "start reading" button carries both the TOC URL and the book title.
        chapturl, bookname = re.findall(
            '投票推荐</span></a></span><span class="btopt"><a href="(.*?)" title="(.*?)" target="_blank"><span>开始阅读</span></a></span>',
            raw)[0]
        status = re.findall(r'float:right">(.*?)</div>', raw)[0]
        author = re.findall(r'作者:(.*?)   最新章节', raw)[0]
        chaptList(chapturl, author, status, bookname)
    
    
    def chaptList(chapturl, author, status, bookname):  # 获取了章节的部分链接和章节的名字
        """Fetch the book's table-of-contents page and crawl each chapter.

        chapturl -- TOC page URL (also the base for relative chapter links).
        author/status/bookname -- passed through to chaptcontent().
        """
        html = urllib.request.urlopen(chapturl).read().decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '')
        # Chapter links sit between the mulu_list marker and show_index3.
        reg = r'mulu_list(.*?)show_index3'
        chapthtml = re.findall(reg, html)
        for chapt in chapthtml:
            chaptlist = re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', chapt, re.S)
            # Removed the unused `allNum`/`num` counters — they were leftovers
            # from the progress-bar variant of this script and were never read.
            for url1, chaptname in chaptlist:
                chaptcontent(url1, chapturl, chaptname, author, status, bookname)
    
    
    
    def chaptcontent(url1, chapturl, chaptname, author, status, bookname):
        """Download one chapter and save it to its own per-chapter text file.

        url1     -- chapter link relative to the TOC page.
        chapturl -- TOC page URL used as the base for url1.
        chaptname -- chapter title, used as both file name and heading.
        author/status/bookname -- kept for interface parity with the
        whole-book variant of this function; unused here.
        """
        url = chapturl + url1
        html = urllib.request.urlopen(url).read().decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '')
        # Chapter body sits between the contentbox div and the ad block.
        reg = r'class="contentbox">(.*?)<div class="ad00">'
        content = re.findall(reg, html)[0].replace('<br />', '').replace('&nbsp;', '').replace('>', '').replace('<',
                                                                                                                '').replace(
            '[..]', '').replace('-a', '').replace('/a ', '')
        print("正在保存%s" % chaptname)
        # BUG FIX: the original open() call was syntactically invalid
        # (`open('%s(''{}.txt.format(chaptname),'w', ...)` — mangled quoting
        # and broken indentation). Save one file per chapter, named after it.
        with open('%s.txt' % chaptname, 'w', encoding='utf-8') as f:
            f.write(chaptname)
            f.write('\n')
            f.writelines(content)
            f.write('\n\n')
        # BUG FIX: removed the trailing `Novel(url)` call — it fed a chapter
        # URL back into the book-page scraper, causing an IndexError or an
        # unbounded re-crawl instead of returning to the TOC loop.
    

    2.如下方式是将一本小说完全下载到一个txt文件中去,并且加入了进度条,爬取时间等

    import re
    import sys
    import urllib.request
    import time
    
    
    def Novel(url):
        """Entry point of the crawl: parse a book's landing page for its TOC
        URL, title, completion status and author, then crawl the chapters.

        url -- landing-page URL of one book on duquanben.com (GBK-encoded).
        """
        page = urllib.request.urlopen(url).read().decode('gbk')
        page = page.replace('\n', '').replace('\t', '').replace('\r', '')
        page = page.replace('<br />', '').replace('&nbsp;', '')
        # The "start reading" button holds the TOC href and the book title.
        matches = re.findall(
            '投票推荐</span></a></span><span class="btopt"><a href="(.*?)" title="(.*?)" target="_blank"><span>开始阅读</span></a></span>',
            page)
        chapturl, bookname = matches[0]
        status = re.findall(r'float:right">(.*?)</div>', page)[0]
        author = re.findall(r'作者:(.*?)   最新章节', page)[0]
        chaptList(chapturl, author, status, bookname)
    
    
    def chaptList(chapturl, author, status, bookname):  # 获取了章节的部分链接和章节的名字
        """Fetch the TOC page, crawl every chapter, and draw a progress bar.

        chapturl -- TOC page URL (also the base for relative chapter links).
        author/status/bookname -- passed through to chaptcontent(), which
        builds the output file name from them.
        """
        html = urllib.request.urlopen(chapturl).read().decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '')
        # Chapter links sit between the mulu_list marker and show_index3.
        reg = r'mulu_list(.*?)show_index3'
        chapthtml = re.findall(reg, html)
        for chapt in chapthtml:
            chaptlist = re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', chapt, re.S)
            allNum = len(chaptlist)  # total chapters, denominator of the bar
            num = 0
            for url1, chaptname in chaptlist:
                print(chaptname)
                num += 1
                downRate = num / allNum
                rate_num = (downRate * 100)
                # '\r' rewinds to the line start so the bar redraws in place.
                r = '\r%s>%.1f%%' % ('=' * round(rate_num), rate_num,)
                sys.stdout.write(r)
                # BUG FIX: original read `sys.stdout.flush` (no parentheses),
                # which only references the method and never flushes; the bar
                # stayed buffered. Call it.
                sys.stdout.flush()
                chaptcontent(url1, chapturl, chaptname, author, status, bookname)
    
    
    
    def chaptcontent(url1, chapturl, chaptname, author, status, bookname):
        """Download one chapter and append it to the book's single output file.

        url1     -- chapter link relative to the TOC page.
        chapturl -- TOC page URL used as the base for url1.
        chaptname -- chapter title written as a heading before the body.
        author/status/bookname -- embedded in the output file's name.
        """
        page = urllib.request.urlopen(chapturl + url1).read().decode('gbk')
        page = page.replace('\n', '').replace('\t', '').replace('\r', '')
        # Chapter body sits between the contentbox div and the ad block.
        body = re.findall(r'class="contentbox">(.*?)<div class="ad00">', page)[0]
        # Strip markup remnants in the same order as the original cleanup.
        for junk in ('<br />', '&nbsp;', '>', '<', '[..]', '-a', '/a '):
            body = body.replace(junk, '')
        # Append mode: every chapter accumulates into one file per book.
        filename = '%s(作者:%s,状态:%s).txt' % (bookname, author, status)
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(chaptname)
            f.write('\n')
            f.writelines(body)
            f.write('\n\n')
    
    
    if __name__ == '__main__':
        # Crawl one book from its landing page and report total wall time.
        t0 = time.time()
        url = "https://www.duquanben.com/xiazai/22/22319/"
        Novel(url)
        elapsed = time.time() - t0
        print("Total use time: %.4f" % elapsed)
    
    

    输出的内容有三个,爬取的进度,爬取的内容章节,总的爬取时间

    爬取的进度我使用了两种方法:
    #(1)无进度条的
                num += 1
                downRate = num / allNum
                rate_num=(downRate * 100)
                # print("\r 小说下载进度为:%.1f%%" % (rate_num), end="")
    
    #(2)有进度条的
                num += 1
                downRate = num / allNum
                rate_num=(downRate * 100)
                r = '\r%s>%.1f%%' % ('=' * round(rate_num), rate_num,)
                sys.stdout.write(r)
                sys.stdout.flush()
           
    
    总的爬取时间计算
    if __name__ == '__main__':
        starttime = time.time()
        url = "https://www.duquanben.com/xiazai/22/22319/"
        Novel(url)
        endtime = time.time()
        print("Total use time: %.4f" % (endtime - starttime))
    

    爬取过程如下:
    ![S{N]D7PQVYL[2X}$6]CJVXQ.png](https://img.haomeiwen.com/i11616627/8aa6582a9f8742bf.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/900)

    爬取结果如下:
    ![[M(GRQKWG9DA`UYGW@H1JP.png

    爬取完成


    7U`1DB$R_D}C3%I$`L(OKAC.png VOA63[]PT1()Q))HE(JHD4F.png

    进度条部分参考的代码如下:

    相关文章

      网友评论

          本文标题:小说的爬取(1)

          本文链接:https://www.haomeiwen.com/subject/sqcitqtx.html