Scraping Novel Chapters with Python 3 (2020-09-24)

Author: 番茄晓蛋 | Published 2020-09-25 00:05
    # coding=utf-8
    import glob
    import re
    import urllib.error
    import urllib.request

    from urllib.parse import urljoin
    
    def cleanhtml(raw_html):
        """Strip HTML tags from a string with a non-greedy regex."""
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext
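
    # Quick sanity check of cleanhtml on a made-up snippet (not from the site):
    #   cleanhtml('<font size="5">第1章 开端</font><br />')  ->  '第1章 开端'
    # The regex only strips tags; HTML entities such as &nbsp; are left as-is.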
    
    
    def fetch_section(webroot):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        for page in range(8, 9):
            print('Downloading page ' + str(page) + ' of the chapter list')

            url = 'http://www.cangshubao.net/forum-915-' + str(page) + '.html'
            try:
                request = urllib.request.Request(url, headers=headers)
                response = urllib.request.urlopen(request, timeout=180)
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                continue  # without this, `response` would be undefined below
    
            # The forum pages are served as GBK, not UTF-8.
            html = response.read().decode('gbk')

            # Each chapter link looks like
            # <span id="..."><a href="thread-....html">第...</a></span>;
            # see the worked example after this function.
            pattern = re.compile(u'<span id="(.*?)"><a href="(.*?)">第(.*?)</a></span>')
            items = re.findall(pattern, html)
    
            for item in items:
                try:
                    # item = (span id, relative link, chapter title minus the leading '第')
                    book_link = item[1]
                    book_name = item[2]
                    book_full_link = urljoin(webroot, book_link)  # absolute URL of the chapter

                    # Request the chapter page itself.
                    try:
                        request = urllib.request.Request(book_full_link, headers=headers)
                        response = urllib.request.urlopen(request, timeout=180)
                    except urllib.error.URLError as e:
                        if hasattr(e, "code"):
                            print(e.code)
                        if hasattr(e, "reason"):
                            print(e.reason)
                        continue
                    html = response.read().decode('gbk')
                    
                    pattern = re.compile('<table cellspacing="0" cellpadding="0"><tr><td class="t_msgfont" id="(.*?)"><font size="5">(.*?)</font></td></tr></table>', re.S)
                    section_content = re.findall(pattern, html)

                    # Write the chapter text to its own file.
                    try:
                        fp = open(book_name + '.txt', 'w')
                    except IOError:
                        # The captured title may still contain markup that is
                        # illegal in a filename; extract the plain name first.
                        pattern = re.compile('<font size="5">(.*?)</font><br />', re.S)
                        book_name = re.findall(pattern, book_name)[0]
                        fp = open(book_name + '.txt', 'w')
                    print('start download')
                    cleanTxt = cleanhtml(str(section_content[0][1]))
                    fp.write(cleanTxt)
                    print('download finished\n')
                    fp.close()
                except Exception as e:
                    print('Failed to parse this entry; skipping it')
                    print(e)
                    print('')
                    with open('error.log', 'a') as log:
                        log.write('page:' + str(page) + '\n')
                        # the regex yields only three groups, so log the whole tuple
                        log.write(str(item))
                        log.write('\nThere is an error in parsing process.\n\n')
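
    # Example of what the chapter-list regex captures, using a link that appears
    # in the page (it also shows up in the author's commented-out experiments):
    #   <span id="..."><a href="thread-1453325-1-3.html">第205章行内人的密报</a></span>
    #   -> (span id, 'thread-1453325-1-3.html', '205章行内人的密报')
    # The captured title starts after the literal '第'.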
    
    def merge_txt_files():
        """Concatenate every downloaded chapter file into result.txt."""
        read_files = glob.glob("*.txt")
        with open("result.txt", "wb") as outfile:
            for f in read_files:
                if f == "result.txt":
                    continue  # don't merge an old result.txt into the new one
                with open(f, "rb") as infile:
                    print(f)
                    outfile.write(f.encode('gbk'))  # filename as a section header
                    outfile.write(infile.read())
    
    def main():
        webroot = 'http://www.cangshubao.net/'
        # Step 1: crawl the chapter list and download every chapter.
        # fetch_section(webroot)
        # Step 2: merge the downloaded .txt files into one volume.
        merge_txt_files()
    
    if __name__ == '__main__':
        main()
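
The regex in cleanhtml is good enough for the simple markup on these chapter pages, but it leaves HTML entities untouched and can stumble over malformed tags. Below is a minimal sketch of a sturdier tag stripper built on the standard library's html.parser; it is my own addition, not part of the original script, and cleanhtml_strict could be swapped in wherever cleanhtml is called:

    from html.parser import HTMLParser

    class TagStripper(HTMLParser):
        """Collect only the text nodes; all tags are discarded."""
        def __init__(self):
            # convert_charrefs=True turns entities like &nbsp; into real characters
            super().__init__(convert_charrefs=True)
            self.parts = []

        def handle_data(self, data):
            self.parts.append(data)

        def get_text(self):
            return ''.join(self.parts)

    def cleanhtml_strict(raw_html):
        stripper = TagStripper()
        stripper.feed(raw_html)
        return stripper.get_text()

Unlike the regex version, this also decodes entities, so an &nbsp; inside a chapter body comes out as a real non-breaking space instead of literal markup.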
    
