Scraping Novels

Author: Suria007 | Published 2019-12-30 21:43
from pyquery import PyQuery as pq 
import requests
import jieba
import os

Count = {}  # global word-frequency dict
# Unfortunately, the global dict does not print completely.
# def Conn()

class GetContent:
    def __init__(self, url, mainurl, plabel, clabel, ecode):
        self.url = url          # URL of the chapter-list page
        self.mainurl = mainurl  # prefix used to build each chapter's URL
        self.plabel = plabel    # CSS selector for the chapter-link elements
        self.clabel = clabel    # CSS selector for the chapter body text
        self.ecode = ecode      # page encoding, e.g. 'utf-8' or 'gbk'
        global Count            # not strictly needed: the dict is only mutated, never rebound

    def GetPages(self):
        # Fetch the chapter-list page and walk every chapter link it contains.
        r = requests.get(self.url)
        r.encoding = self.ecode
        doc = pq(r.text)
        y = doc(self.plabel)
        title = doc('h1').text()
        SaveFile(title)  # create a folder named after the novel and cd into it
        j = 0
        for i in y.items():
            u = i('a').attr.href
            curl = self.mainurl + u  # build the URL of the chapter page
            j += 1
            try:
                # Some chapter pages cannot be opened normally.
                c = self.Content(curl)   # fetch the chapter body
                # name = i('a').text()   # could not tell chapters apart by title
                Save(str(j), c)          # save each chapter as <number>.txt
                w = self.CutContent(c)   # cut the chapter text into words
            except Exception:
                pass
            # j += 1
            # print(j, w)


    def Content(self, curl):
        # Fetch one chapter page and return the text inside the content selector.
        r = requests.get(curl)
        r.encoding = self.ecode
        doc = pq(r.text)
        c = doc(self.clabel).text()
        return c

    def SaveWolds(self, content):
        # Unused and unfinished: only cuts the text into words.
        li = jieba.cut(content, cut_all=False)
        return list(li)


    # The global dict did not print completely, so this approach was abandoned.
    def CutContent(self, content):
        # Cut one chapter into words and accumulate counts in the global Count dict.
        li = jieba.cut(content, cut_all=False)
        for i in li:
            if i in Count:
                Count[i] += 1
            else:
                Count[i] = 1
        return Count

def SaveFile(title):
    # Create a folder named after the novel (if missing) and switch into it.
    path = os.getcwd()
    if title not in os.listdir():
        os.mkdir(title)
    os.chdir(os.path.join(path, title))

def Save(name, content):
    # Write one chapter's text to <name>.txt in the current folder.
    with open(name + '.txt', 'w', encoding='utf-8') as c:
        c.write(content)



if __name__ == "__main__":
    # Site 1: chapter links sit in <dd> elements, chapter text in #content.
    surl = 'https://www.zhuaji.org/read/548/'
    smainurl = 'https://www.zhuaji.org'
    splabel = 'dd'
    sclabel = '#content'
    # Site 2: chapter links sit in .ml_list li, chapter text in .novelcontent.
    turl = 'http://www.t7yyw.com/97/97685/'
    tmainurl = turl
    tplabel = '.ml_list li'
    tclabel = '.novelcontent'
    # San = GetContent(surl, smainurl, splabel, sclabel, 'utf-8')
    # San.GetPages()
    Tao = GetContent(turl, tmainurl, tplabel, tclabel, 'gbk')
    Tao.GetPages()
    # items = list(Count.items())
    # items.sort(key = lambda x:x[1], reverse=True)

    # # print(Count)
    # print(items)
    
    # 桃花寨: 44 chapter files
    # 三生三世: 60 chapter files
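
To make the two selectors concrete, here is a small, self-contained pyquery sketch (my own illustration, not part of the original script). The HTML fragment is invented, not copied from the real sites; it shows how GetPages iterates the elements matched by plabel and reads each chapter link, exactly as the loop above does.

from pyquery import PyQuery as pq

# Invented chapter-list markup, for illustration only.
sample = '''<ul class="ml_list">
  <li><a href="/97/97685/1.html">Chapter 1</a></li>
  <li><a href="/97/97685/2.html">Chapter 2</a></li>
</ul>'''

doc = pq(sample)
for item in doc('.ml_list li').items():  # same selector as tplabel
    print(item('a').attr.href, item('a').text())
# -> /97/97685/1.html Chapter 1
# -> /97/97685/2.html Chapter 2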

I don't know why the global word-count dict does not print completely.
Printed result:
I can't figure it out either, so the analysis has to be done chapter by chapter.
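
As a rough sketch of that chapter-by-chapter analysis (my own addition, not in the original script): read back the numbered .txt files that Save() wrote and count words per file with collections.Counter instead of the global dict. The count_chapter helper and the top=10 cutoff are assumptions for illustration.

import os
from collections import Counter
import jieba

def count_chapter(path, top=10):
    # Count the words in one saved chapter and return the `top` most common.
    with open(path, encoding='utf-8') as f:
        words = jieba.cut(f.read(), cut_all=False)
    # Skip single-character tokens (mostly punctuation and particles).
    return Counter(w for w in words if len(w) > 1).most_common(top)

# Run inside the folder created by SaveFile(), e.g. after Tao.GetPages().
chapters = sorted((f for f in os.listdir() if f.endswith('.txt')),
                  key=lambda n: int(n[:-4]))  # files are named 1.txt, 2.txt, ...
for name in chapters:
    print(name, count_chapter(name))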
