美文网首页
爬取小说

爬取小说

作者: Suria007 | 来源:发表于2019-12-30 21:43 被阅读0次
    from pyquery import PyQuery as pq 
    import requests
    import jieba
    import os
    
    # Module-level word-frequency table shared by every GetContent instance:
    # maps each token produced by jieba to the number of times it was seen.
    # Author's note: unfortunately, printing this global gave incomplete output.
    Count = dict()
    # def Conn()
    
    class GetContent:
        """Download every chapter of a novel listed on an index page.

        The index page at ``url`` is fetched and parsed with PyQuery.  Each
        element matching the CSS selector ``plabel`` is expected to hold an
        ``<a>`` tag; its ``href`` is appended to ``mainurl`` to build the
        chapter URL.  The chapter text is the text of the element matching
        ``clabel`` on that page.
        """

        # Seconds to wait on a request before giving up, so that one
        # unresponsive server cannot stall the crawl indefinitely.
        TIMEOUT = 30

        def __init__(self, url, mainurl, plabel, clabel, ecode):
            """Store the crawl settings.

            Args:
                url: URL of the index (table of contents) page.
                mainurl: Prefix that each chapter ``href`` is appended to.
                plabel: CSS selector for the index entries holding chapter links.
                clabel: CSS selector for the chapter text on a chapter page.
                ecode: Text encoding used to decode responses (e.g. ``'gbk'``).
            """
            self.url = url
            self.mainurl = mainurl
            self.plabel = plabel
            self.clabel = clabel
            self.ecode = ecode

        def GetPages(self):
            """Fetch the index page and save each linked chapter to a file.

            The page's ``<h1>`` text is passed to ``SaveFile``; each chapter is
            then written through ``Save`` under its 1-based position in the
            index and fed to ``CutContent`` to update the global word counts.
            A chapter that fails is reported and skipped so the remaining
            chapters are still processed.
            """
            r = requests.get(self.url, timeout=self.TIMEOUT)
            r.encoding = self.ecode
            doc = pq(r.text)
            SaveFile(doc('h1').text())
            for j, entry in enumerate(doc(self.plabel).items(), start=1):
                href = entry('a').attr.href
                if not href:
                    # Entry without a link: nothing to download.  It still
                    # consumes a number so file names match index positions.
                    print('chapter %d skipped: no link found' % j)
                    continue
                curl = self.mainurl + href  # URL of the chapter page
                try:
                    # Some chapter pages cannot be opened normally.
                    c = self.Content(curl)
                    # The link text does not identify chapters reliably, so
                    # the running index is used as the file name.
                    Save(str(j), c)
                    self.CutContent(c)  # tokenize the chapter and count words
                except Exception as err:
                    # Best effort: report the failure instead of hiding it,
                    # then carry on with the next chapter.
                    print('chapter %d skipped (%s): %s' % (j, curl, err))

        def Content(self, curl):
            """Return the text of the ``clabel`` element on the page at ``curl``.

            Args:
                curl: URL of a chapter page.

            Returns:
                The text PyQuery extracts for the ``clabel`` selector.
            """
            r = requests.get(curl, timeout=self.TIMEOUT)
            r.encoding = self.ecode
            doc = pq(r.text)
            return doc(self.clabel).text()

        def SaveWolds(self, content):
            """Tokenize ``content`` with jieba and return the tokens as a list.

            NOTE(review): the original body was an unfinished stub that passed
            ``self`` to ``jieba.cut`` and discarded the result; the name
            suggests the words were meant to be saved somewhere -- confirm the
            intended destination.
            """
            return list(jieba.cut(content, cut_all=False))

        def CutContent(self, content):
            """Add the jieba tokens of ``content`` to the global ``Count`` table.

            Author's note: printing the global table gave incomplete output, so
            this approach was abandoned in favour of per-chapter analysis.

            Args:
                content: Chapter text to tokenize.

            Returns:
                The module-level ``Count`` dict, updated in place.
            """
            for word in jieba.cut(content, cut_all=False):
                Count[word] = Count.get(word, 0) + 1
            return Count
    
    def SaveFile(title):
        """Create a directory named ``title`` in the current directory and enter it.

        The directory is created if it does not exist yet and reused otherwise.
        The process working directory is then changed to it, so files opened
        with relative names afterwards land inside it.  An empty ``title``
        leaves the working directory unchanged.

        Args:
            title: Name of the directory to create and switch into.

        Raises:
            OSError: If the directory cannot be created or entered (for
                example, a regular file with that name already exists).
        """
        # os.path.join picks the right separator for the platform; a literal
        # backslash only works on Windows.
        target = os.path.join(os.getcwd(), title)
        os.makedirs(target, exist_ok=True)
        os.chdir(target)
    
    def Save(name, content):
        """Write ``content`` to ``<name>.txt`` as UTF-8, replacing any existing file.

        Args:
            name: File name without the ``.txt`` extension; a relative name is
                resolved against the current working directory.
            content: Text to write.
        """
        # The with-block closes the file even if the write raises.
        with open(name + '.txt', 'w', encoding='utf-8') as out:
            out.write(content)
    
    
    
    if __name__ == "__main__":
        # Settings for zhuaji.org: index page, link prefix, and the CSS
        # selectors for index entries and chapter text.  Only used by the
        # disabled run below.
        surl, smainurl = 'https://www.zhuaji.org/read/548/', 'https://www.zhuaji.org'
        splabel, sclabel = 'dd', '#content'

        # Settings for t7yyw.com: the index URL itself is used as the prefix
        # that chapter links are appended to.
        turl = 'http://www.t7yyw.com/97/97685/'
        tmainurl, tplabel, tclabel = turl, '.ml_list li', '.novelcontent'

        # Disabled run for the zhuaji.org novel:
        # San = GetContent(surl, smainurl, splabel, sclabel, 'utf-8')
        # San.GetPages()

        Tao = GetContent(turl, tmainurl, tplabel, tclabel, 'gbk')
        Tao.GetPages()

        # Disabled: print the global word counts sorted by frequency.
        # items = list(Count.items())
        # items.sort(key = lambda x:x[1], reverse=True)

        # # print(Count)
        # print(items)

        # Files produced: "桃花寨" -> 44 files
        # Files produced: "三生三世" -> 60 files
    
    
    不知道为什么,全局变量 Count 的内容打印不完全。
    打印结果
    我也不知道为什么,只能分章分析了。

    相关文章

      网友评论

          本文标题:爬取小说

          本文链接:https://www.haomeiwen.com/subject/rohvoctx.html