爬小说

作者: 机会call | 来源:发表于2019-04-07 20:05 被阅读0次

    '''
    import requests
    import os
    from lxml import etree
    class Spider(object):
        """Scrape free novel chapters from qidian.com.

        Workflow: list all books on the "all" page, make one directory per
        book, then fetch each book's chapter list and append every chapter's
        text to ``<book>/<chapter>.txt``.

        Relies on module-level ``requests``, ``os`` and ``lxml.etree``.
        NOTE(review): the XPath expressions are tied to qidian.com's 2019
        markup — verify they still match before relying on this.
        """

        def start_request(self):
            """Fetch the book listing and scrape each not-yet-seen book.

            A book is skipped when a directory with its title already
            exists, so re-runs only pick up new books.
            """
            # timeout so a stalled connection cannot hang the scraper forever
            response = requests.get("https://www.qidian.com/all", timeout=10)
            html = etree.HTML(response.content.decode())
            # Titles and hrefs come from the same <a> nodes, so zip() pairs
            # them up index-for-index.
            book_titles = html.xpath('//div[@class="book-mid-info"]/h4/a/text()')
            book_links = html.xpath('//div[@class="book-mid-info"]/h4/a/@href')
            for book_title, book_link in zip(book_titles, book_links):
                if not os.path.exists(book_title):
                    os.mkdir(book_title)
                    self.file_data(book_title, book_link)

        def file_data(self, bigtit, bigsrc):
            """Fetch one book's chapter list and download every chapter.

            :param bigtit: book title, used as the output directory name
            :param bigsrc: protocol-relative URL of the book's detail page
            """
            response = requests.get("https:" + bigsrc, timeout=10)
            html = etree.HTML(response.content.decode())
            chapter_titles = html.xpath('//ul[@class="cf"]/li/a/text()')
            chapter_links = html.xpath('//ul[@class="cf"]/li/a/@href')
            for chapter_title, chapter_link in zip(chapter_titles, chapter_links):
                self.finally_file(chapter_title, chapter_link, bigtit)

        def finally_file(self, littit, litsrc, bigtit):
            """Download a single chapter and append its text to disk.

            :param littit: chapter title, used as the file name
            :param litsrc: protocol-relative URL of the chapter page
            :param bigtit: book title, i.e. the target directory
            """
            response = requests.get("https:" + litsrc, timeout=10)
            html = etree.HTML(response.content.decode())
            content = "\n".join(
                html.xpath('//div[@class="read-content j_readContent"]/p/text()'))
            # os.path.join instead of a hard-coded "\\" so the path is
            # correct on every OS, not just Windows.
            file_name = os.path.join(bigtit, littit + ".txt")
            print("正在抓取文章" + file_name)
            # append mode: chapters of one book accumulate across calls;
            # binary media would need "wb" instead
            with open(file_name, "a", encoding="utf-8") as f:
                f.write(content)
    

    # Entry-point guard: only crawl when run as a script, never on import.
    if __name__ == "__main__":
        spider = Spider()
        spider.start_request()
    '''

    相关文章

      网友评论

          本文标题:爬小说

          本文链接:https://www.haomeiwen.com/subject/zhheiqtx.html