美文网首页
小说的爬取(1)

小说的爬取(1)

作者: 楚糖的糖 | 来源:发表于2018-10-29 21:23 被阅读0次

本章讲了小说爬取下来的两种存储方式:(1)按章节(2)按整本书
1.下面的这种方式是将小说按章节进行保存

import re
import sys
import urllib.request
import time


def Novel(url):
    """Scrape a book's landing page and hand off to the chapter lister.

    Fetches the (GBK-encoded) landing page, extracts the chapter-index
    URL, book name, completion status and author, then delegates to
    chaptList() to download the chapters.
    """
    page = urllib.request.urlopen(url).read().decode('gbk')
    # Flatten the page so the regexes below can match across line breaks.
    for token in ('\n', '\t', '\r', '<br />', '&nbsp;'):
        page = page.replace(token, '')
    # The "开始阅读" (start reading) button carries both the chapter-index
    # URL and the book title.
    matches = re.findall(
        '投票推荐</span></a></span><span class="btopt"><a href="(.*?)" title="(.*?)" target="_blank"><span>开始阅读</span></a></span>',
        page)
    chapturl, bookname = matches[0]
    status = re.findall(r'float:right">(.*?)</div>', page)[0]
    author = re.findall(r'作者:(.*?)   最新章节', page)[0]
    chaptList(chapturl, author, status, bookname)


def chaptList(chapturl, author, status, bookname):
    """Fetch the chapter index page and download every listed chapter.

    Parameters are forwarded unchanged to chaptcontent(); `chapturl` is
    the base URL of the chapter index, the rest is book metadata.
    """
    html = urllib.request.urlopen(chapturl).read().decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '')
    # Chapter links sit between the "mulu_list" and "show_index3" markers.
    reg = r'mulu_list(.*?)show_index3'
    chapthtml = re.findall(reg, html)
    for chapt in chapthtml:
        chaptlist = re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', chapt, re.S)
        # NOTE: removed dead counters `allNum`/`num` — they were assigned
        # but never used in this (per-chapter-file) variant.
        for url1, chaptname in chaptlist:
            chaptcontent(url1, chapturl, chaptname, author, status, bookname)



def chaptcontent(url1, chapturl, chaptname, author, status, bookname):
    """Download one chapter and save it to its own per-chapter .txt file.

    `url1` is the chapter's relative link, appended to `chapturl` to form
    the full chapter URL. The chapter title is written as the first line,
    followed by the cleaned chapter text.
    """
    url = chapturl + url1
    html = urllib.request.urlopen(url).read().decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '')
    # Chapter text sits between the "contentbox" div and the "ad00" ad div.
    reg = r'class="contentbox">(.*?)<div class="ad00">'
    content = re.findall(reg, html)[0].replace('<br />', '').replace('&nbsp;', '').replace('>', '').replace('<',
                                                                                                            '').replace(
        '[..]', '', ).replace('-a', '').replace('/a ', '')
    print("正在保存%s" % chaptname)
    # BUG FIX: the original `with open(...)` line was syntactically invalid
    # (mis-indented, with a broken mix of %-formatting and str.format).
    # Build the per-chapter filename with a single format expression.
    with open('%s.txt' % chaptname, 'w', encoding='utf-8') as f:
        f.write(chaptname)
        f.write('\n')
        f.writelines(content)
        f.write('\n\n')
    # BUG FIX: the original ended with `Novel(url)`, re-scraping each
    # chapter page as if it were a book landing page — an unbounded,
    # incorrect recursion that would fail on the first chapter. Removed.

2.如下方式是将一本小说完全下载到一个txt文件中去,并且加入了进度条,爬取时间等

import re
import sys
import urllib.request
import time


def Novel(url):
    """Scrape a book's landing page and kick off the whole-book download.

    Pulls the chapter-index URL, book name, status and author from the
    (GBK-encoded) landing page, then calls chaptList().
    """
    raw = urllib.request.urlopen(url).read().decode('gbk')
    # Remove whitespace and layout tags so each regex matches on one long line.
    for junk in ('\n', '\t', '\r', '<br />', '&nbsp;'):
        raw = raw.replace(junk, '')
    # The "开始阅读" button anchor yields (chapter-index URL, book title).
    chapturl, bookname = re.findall(
        '投票推荐</span></a></span><span class="btopt"><a href="(.*?)" title="(.*?)" target="_blank"><span>开始阅读</span></a></span>',
        raw)[0]
    status = re.findall(r'float:right">(.*?)</div>', raw)[0]
    author = re.findall(r'作者:(.*?)   最新章节', raw)[0]
    chaptList(chapturl, author, status, bookname)


def chaptList(chapturl, author, status, bookname):  # 获取了章节的部分链接和章节的名字
    """Fetch the chapter index, then download each chapter with a progress bar.

    Prints each chapter name and an in-place '=' progress bar on stdout,
    then forwards every chapter link to chaptcontent().
    """
    html = urllib.request.urlopen(chapturl).read().decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '')
    # Chapter links sit between the "mulu_list" and "show_index3" markers.
    reg = r'mulu_list(.*?)show_index3'
    chapthtml = re.findall(reg, html)
    for chapt in chapthtml:
        chaptlist = re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', chapt, re.S)
        allNum = len(chaptlist)
        for num, (url1, chaptname) in enumerate(chaptlist, start=1):
            print(chaptname)
            rate_num = num / allNum * 100
            r = '\r%s>%.1f%%' % ('=' * round(rate_num), rate_num,)
            sys.stdout.write(r)
            # BUG FIX: original had `sys.stdout.flush` (bare attribute
            # access, a no-op); it must be *called* so the progress bar
            # actually refreshes between chapters.
            sys.stdout.flush()
            chaptcontent(url1, chapturl, chaptname, author, status, bookname)



def chaptcontent(url1, chapturl, chaptname, author, status, bookname):
    """Download one chapter and append it to the book's single .txt file.

    The output file is named from the book title, author and status, and
    is opened in append mode so successive chapters accumulate in order.
    """
    page = urllib.request.urlopen(chapturl + url1).read().decode('gbk')
    for token in ('\n', '\t', '\r'):
        page = page.replace(token, '')
    # Chapter text sits between the "contentbox" div and the "ad00" ad div.
    body = re.findall(r'class="contentbox">(.*?)<div class="ad00">', page)[0]
    # Strip leftover markup fragments from the extracted text.
    for token in ('<br />', '&nbsp;', '>', '<', '[..]', '-a', '/a '):
        body = body.replace(token, '')
    with open('%s(作者:%s,状态:%s).txt' % (bookname, author, status), 'a', encoding='utf-8') as f:
        f.write(chaptname)
        f.write('\n')
        f.writelines(body)
        f.write('\n\n')


if __name__ == '__main__':
    # Crawl one book end-to-end and report the total wall-clock time.
    t0 = time.time()
    url = "https://www.duquanben.com/xiazai/22/22319/"
    Novel(url)
    elapsed = time.time() - t0
    print("Total use time: %.4f" % elapsed)

输出的内容有三个,爬取的进度,爬取的内容章节,总的爬取时间

爬取的进度我使用了两种方法:
#(1)无进度条的
            num += 1
            downRate = num / allNum
            rate_num=(downRate * 100)
            # print("\r 小说下载进度为:%.1f%%" % (rate_num), end="")
#(2)有进度条的
            num += 1
            downRate = num / allNum
            rate_num=(downRate * 100)
            r = '\r%s>%.1f%%' % ('=' * round(rate_num), rate_num,)
            sys.stdout.write(r)
            sys.stdout.flush()
       
总的爬取时间计算
if __name__ == '__main__':
    starttime = time.time()
    url = "https://www.duquanben.com/xiazai/22/22319/"
    Novel(url)
    endtime = time.time()
    print("Total use time: %.4f" % (endtime - starttime))

爬取过程如下:
![S{N]D7PQVYL[2X}$6]CJVXQ.png](https://img.haomeiwen.com/i11616627/8aa6582a9f8742bf.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/900)

爬取结果如下:
![[M(GRQKWG9DA`UYGW@H1JP.png

爬取完成


7U`1DB$R_D}C3%I$`L(OKAC.png VOA63[]PT1()Q))HE(JHD4F.png

进度条部分参考的代码如下:

相关文章

网友评论

      本文标题:小说的爬取(1)

      本文链接:https://www.haomeiwen.com/subject/sqcitqtx.html