美文网首页python自学
python爬取笔趣阁的热门小说并保存

python爬取笔趣阁的热门小说并保存

作者: 刘年 | 来源:发表于2020-03-27 21:54 被阅读0次

    过程比较简单,但是思路一定要清晰


    第一、获取该页的小说名和地址；第二、获取小说章节名和地址；第三、解析章节内容,获取文本
    import requests
    from lxml import etree
    import time
    # Listing page of the "cultivation" (修真) novel category on biquge5200.
    base_url ='https://www.biquge5200.cc/xiuzhenxiaoshuo/'
    # Mimic a mobile Chrome browser; presumably the site rejects or throttles
    # requests without a referer / user-agent — TODO confirm.
    headers={
        'referer': 'https://www.biquge5200.cc/',
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Mobile Safari/537.36',
    }
    # Collect the name and URL of every novel on the hot-novel listing page.
    def many_novel_message():
        """Scrape the category listing page and map novel title -> novel URL.

        Returns:
            dict[str, str]: novel title mapped to its index-page URL.
        """
        main_sourse = requests.get(base_url, headers=headers).text
        main_html = etree.HTML(main_sourse)
        novel_urls = main_html.xpath('//div[@class="l"]//span[@class="s2"]//a/@href')
        novel_titles = main_html.xpath('//div[@class="l"]//span[@class="s2"]//a/text()')
        # zip() pairs the two node lists safely even if their lengths differ;
        # the old enumerate/index approach raised IndexError on a mismatch.
        return dict(zip(novel_titles, novel_urls))
    # Collect the chapter titles and chapter URLs of one novel.
    def chap_message(novel_url):
        """Scrape a novel's index page and map chapter title -> chapter URL.

        Args:
            novel_url: URL of the novel's chapter-index page.

        Returns:
            dict[str, str]: chapter title mapped to its page URL.
        """
        chap_sourse = requests.get(novel_url, headers=headers).text
        chap_html = etree.HTML(chap_sourse)
        # The second box_con div holds the chapter list; the first 9 entries
        # are skipped — presumably a "latest chapters" shortcut block that
        # duplicates real chapters. TODO confirm against the live page.
        chap_titles = chap_html.xpath('//div[@class="box_con"][2]//dl/dd/a/text()')[9:]
        chap_urls = chap_html.xpath('//div[@class="box_con"][2]//dl/dd/a/@href')[9:]
        # zip() tolerates unequal list lengths, unlike indexing one list
        # by the other's enumerate index.
        return dict(zip(chap_titles, chap_urls))
    # Given a chapter URL, download the page and return the chapter text.
    def chap_cont(chap_url):
        """Download one chapter page and return its body text.

        Args:
            chap_url: URL of a single chapter page.

        Returns:
            str: the chapter's paragraphs joined with newlines.
        """
        detail_sourse = requests.get(chap_url, headers=headers).text
        detail_html = etree.HTML(detail_sourse)
        chap_conts = detail_html.xpath('//div[@id="content"]/p/text()')
        # Throttle so we don't hammer the site with back-to-back requests.
        time.sleep(1)
        # (Removed a dead `conts = []` that was immediately overwritten.)
        return '\n'.join(chap_conts)
    
    if __name__ == '__main__':
        # Step 1: every novel title and its index-page URL on the listing page.
        novel_dict = many_novel_message()
        # Step 2/3: walk each novel, fetch its chapters, append them to a file.
        for novel_title in novel_dict:
            novel_url = novel_dict[novel_title]
            # `with` guarantees the file is closed even when a request raises
            # mid-novel; the original leaked the handle on any exception.
            # NOTE(review): path uses a Windows backslash and assumes the
            # '小说' directory already exists — confirm before running.
            with open('小说\\{0}.txt'.format(novel_title), 'a', encoding='utf-8', errors='ignore') as ff:
                chap_dict = chap_message(novel_url)
                for chap_title in chap_dict:
                    conts = chap_cont(chap_dict[chap_title])
                    ff.write(chap_title + '\n' + conts + '\n')
    
    

    相关文章

      网友评论

        本文标题:python爬取笔趣阁的热门小说并保存

        本文链接:https://www.haomeiwen.com/subject/txtduhtx.html