美文网首页
小说的爬取(1)

小说的爬取(1)

作者: 楚糖的糖 | 来源:发表于2018-10-29 21:23 被阅读0次

    本章讲了小说爬取下来的两种存储方式:(1)按章节(2)按整本书
    1.下面的这种方式是将小说按章节进行保存

    import re
    import sys
    import urllib.request
    import time
    
    
    def Novel(url):
        """Scrape a book's landing page: pull the TOC link, title, status and
        author, then hand off to chaptList() to crawl every chapter.

        url -- landing-page URL of one book on duquanben.com (GBK-encoded).
        """
        raw = urllib.request.urlopen(url).read().decode('gbk')
        # Strip whitespace/markup noise so the regexes can match in one pass.
        for junk in ('\n', '\t', '\r', '<br />', '&nbsp;'):
            raw = raw.replace(junk, '')
        # The "start reading" button carries both the TOC URL and the book title.
        chapturl, bookname = re.findall(
            '投票推荐</span></a></span><span class="btopt"><a href="(.*?)" title="(.*?)" target="_blank"><span>开始阅读</span></a></span>',
            raw)[0]
        status = re.findall(r'float:right">(.*?)</div>', raw)[0]
        author = re.findall(r'作者:(.*?)   最新章节', raw)[0]
        chaptList(chapturl, author, status, bookname)
    
    
    def chaptList(chapturl, author, status, bookname):  # 获取了章节的部分链接和章节的名字
        """Fetch the book's table-of-contents page and crawl each chapter.

        chapturl -- TOC page URL (also the base for relative chapter links).
        author/status/bookname -- passed through to chaptcontent().
        """
        html = urllib.request.urlopen(chapturl).read().decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '')
        # Chapter links sit between the mulu_list marker and show_index3.
        reg = r'mulu_list(.*?)show_index3'
        chapthtml = re.findall(reg, html)
        for chapt in chapthtml:
            chaptlist = re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', chapt, re.S)
            # Removed the unused `allNum`/`num` counters — they were leftovers
            # from the progress-bar variant of this script and were never read.
            for url1, chaptname in chaptlist:
                chaptcontent(url1, chapturl, chaptname, author, status, bookname)
    
    
    
    def chaptcontent(url1, chapturl, chaptname, author, status, bookname):
        """Download one chapter and save it to its own per-chapter text file.

        url1     -- chapter link relative to the TOC page.
        chapturl -- TOC page URL used as the base for url1.
        chaptname -- chapter title, used as both file name and heading.
        author/status/bookname -- kept for interface parity with the
        whole-book variant of this function; unused here.
        """
        url = chapturl + url1
        html = urllib.request.urlopen(url).read().decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '')
        # Chapter body sits between the contentbox div and the ad block.
        reg = r'class="contentbox">(.*?)<div class="ad00">'
        content = re.findall(reg, html)[0].replace('<br />', '').replace('&nbsp;', '').replace('>', '').replace('<',
                                                                                                                '').replace(
            '[..]', '').replace('-a', '').replace('/a ', '')
        print("正在保存%s" % chaptname)
        # BUG FIX: the original open() call was syntactically invalid
        # (`open('%s(''{}.txt.format(chaptname),'w', ...)` — mangled quoting
        # and broken indentation). Save one file per chapter, named after it.
        with open('%s.txt' % chaptname, 'w', encoding='utf-8') as f:
            f.write(chaptname)
            f.write('\n')
            f.writelines(content)
            f.write('\n\n')
        # BUG FIX: removed the trailing `Novel(url)` call — it fed a chapter
        # URL back into the book-page scraper, causing an IndexError or an
        # unbounded re-crawl instead of returning to the TOC loop.
    

    2.如下方式是将一本小说完全下载到一个txt文件中去,并且加入了进度条,爬取时间等

    import re
    import sys
    import urllib.request
    import time
    
    
    def Novel(url):
        """Entry point of the crawl: parse a book's landing page for its TOC
        URL, title, completion status and author, then crawl the chapters.

        url -- landing-page URL of one book on duquanben.com (GBK-encoded).
        """
        page = urllib.request.urlopen(url).read().decode('gbk')
        page = page.replace('\n', '').replace('\t', '').replace('\r', '')
        page = page.replace('<br />', '').replace('&nbsp;', '')
        # The "start reading" button holds the TOC href and the book title.
        matches = re.findall(
            '投票推荐</span></a></span><span class="btopt"><a href="(.*?)" title="(.*?)" target="_blank"><span>开始阅读</span></a></span>',
            page)
        chapturl, bookname = matches[0]
        status = re.findall(r'float:right">(.*?)</div>', page)[0]
        author = re.findall(r'作者:(.*?)   最新章节', page)[0]
        chaptList(chapturl, author, status, bookname)
    
    
    def chaptList(chapturl, author, status, bookname):  # 获取了章节的部分链接和章节的名字
        """Fetch the TOC page, crawl every chapter, and draw a progress bar.

        chapturl -- TOC page URL (also the base for relative chapter links).
        author/status/bookname -- passed through to chaptcontent(), which
        builds the output file name from them.
        """
        html = urllib.request.urlopen(chapturl).read().decode('gbk').replace('\n', '').replace('\t', '').replace('\r', '')
        # Chapter links sit between the mulu_list marker and show_index3.
        reg = r'mulu_list(.*?)show_index3'
        chapthtml = re.findall(reg, html)
        for chapt in chapthtml:
            chaptlist = re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', chapt, re.S)
            allNum = len(chaptlist)  # total chapters, denominator of the bar
            num = 0
            for url1, chaptname in chaptlist:
                print(chaptname)
                num += 1
                downRate = num / allNum
                rate_num = (downRate * 100)
                # '\r' rewinds to the line start so the bar redraws in place.
                r = '\r%s>%.1f%%' % ('=' * round(rate_num), rate_num,)
                sys.stdout.write(r)
                # BUG FIX: original read `sys.stdout.flush` (no parentheses),
                # which only references the method and never flushes; the bar
                # stayed buffered. Call it.
                sys.stdout.flush()
                chaptcontent(url1, chapturl, chaptname, author, status, bookname)
    
    
    
    def chaptcontent(url1, chapturl, chaptname, author, status, bookname):
        """Download one chapter and append it to the book's single output file.

        url1     -- chapter link relative to the TOC page.
        chapturl -- TOC page URL used as the base for url1.
        chaptname -- chapter title written as a heading before the body.
        author/status/bookname -- embedded in the output file's name.
        """
        page = urllib.request.urlopen(chapturl + url1).read().decode('gbk')
        page = page.replace('\n', '').replace('\t', '').replace('\r', '')
        # Chapter body sits between the contentbox div and the ad block.
        body = re.findall(r'class="contentbox">(.*?)<div class="ad00">', page)[0]
        # Strip markup remnants in the same order as the original cleanup.
        for junk in ('<br />', '&nbsp;', '>', '<', '[..]', '-a', '/a '):
            body = body.replace(junk, '')
        # Append mode: every chapter accumulates into one file per book.
        filename = '%s(作者:%s,状态:%s).txt' % (bookname, author, status)
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(chaptname)
            f.write('\n')
            f.writelines(body)
            f.write('\n\n')
    
    
    if __name__ == '__main__':
        # Crawl one book from its landing page and report total wall time.
        t0 = time.time()
        url = "https://www.duquanben.com/xiazai/22/22319/"
        Novel(url)
        elapsed = time.time() - t0
        print("Total use time: %.4f" % elapsed)
    
    

    输出的内容有三个,爬取的进度,爬取的内容章节,总的爬取时间

    爬取的进度我使用了两种方法:
    #(1)无进度条的
                num += 1
                downRate = num / allNum
                rate_num=(downRate * 100)
                # print("\r 小说下载进度为:%.1f%%" % (rate_num), end="")
    
    #(2)有进度条的
                num += 1
                downRate = num / allNum
                rate_num=(downRate * 100)
                r = '\r%s>%.1f%%' % ('=' * round(rate_num), rate_num,)
                sys.stdout.write(r)
                sys.stdout.flush()
           
    
    总的爬取时间计算
    if __name__ == '__main__':
        starttime = time.time()
        url = "https://www.duquanben.com/xiazai/22/22319/"
        Novel(url)
        endtime = time.time()
        print("Total use time: %.4f" % (endtime - starttime))
    

    爬取过程如下:
    ![S{N]D7PQVYL[2X}$6]CJVXQ.png](https://img.haomeiwen.com/i11616627/8aa6582a9f8742bf.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/900)

    爬取结果如下:
    ![[M(GRQKWG9DA`UYGW@H1JP.png

    爬取完成


    7U`1DB$R_D}C3%I$`L(OKAC.png VOA63[]PT1()Q))HE(JHD4F.png

    进度条部分参考的代码如下:

    相关文章

      网友评论

          本文标题:小说的爬取(1)

          本文链接:https://www.haomeiwen.com/subject/sqcitqtx.html