from pyquery import PyQuery as pq
import requests
import jieba
import os
# Module-level word-frequency table: GetContent.CutContent adds one to
# Count[token] for every token it produces and returns this same dict.
Count = {} # set up the global variable
# Unfortunately, printing this global variable gives incomplete output.
# def Conn()
class GetContent:
    """Download the chapters listed on an index page and count their words.

    The index page at ``url`` is queried with the ``plabel`` selector to find
    the chapter entries.  Each entry's link is appended to ``mainurl``, the
    text matched by ``clabel`` on that page is written to a numbered file and
    its tokens are added to the module-level ``Count`` dict.
    """

    # Seconds handed to ``requests.get`` so a stalled server cannot block the
    # whole run; override on the class or on an instance if needed.
    REQUEST_TIMEOUT = 10

    def __init__(self, url, mainurl, plabel, clabel, ecode):
        """Store the scraping configuration.

        Args:
            url: Address of the index page that lists the chapters.
            mainurl: Prefix that every chapter href is appended to.
            plabel: Selector matching one element per chapter entry.
            clabel: Selector matching the chapter text on a chapter page.
            ecode: Encoding assigned to each response before it is decoded.
        """
        self.url = url
        self.mainurl = mainurl
        self.plabel = plabel
        self.clabel = clabel
        self.ecode = ecode

    def _fetch(self, address):
        """Download *address*, decode it with ``ecode`` and return the parsed document."""
        response = requests.get(address, timeout=self.REQUEST_TIMEOUT)
        response.encoding = self.ecode
        return pq(response.text)

    def GetPages(self):
        """Save every chapter linked from the index page and count its words.

        The text of the index page's ``h1`` is passed to ``SaveFile`` before
        any chapter is written.  Chapters are saved as ``<n>.txt`` where ``n``
        is the 1-based position of the entry on the index page; the link text
        is not used as a file name because it does not tell chapters apart.
        Entries that fail still consume their number.

        Returns:
            None.
        """
        doc = self._fetch(self.url)
        title = doc('h1').text()
        SaveFile(title)
        for number, entry in enumerate(doc(self.plabel).items(), start=1):
            href = entry('a').attr.href
            if href is None:
                # Entry without a link: there is no chapter page to fetch.
                continue
            chapter_url = self.mainurl + href  # address of the linked chapter page
            try:
                # Some pages cannot be opened normally, so one bad chapter
                # must not abort the remaining ones.
                text = self.Content(chapter_url)
                Save(str(number), text)
                self.CutContent(text)  # tokenise and count this chapter
            except Exception as exc:
                # Still best-effort, but report the failure instead of hiding
                # it.  ``!a`` keeps the message ASCII-only so printing cannot
                # fail on consoles with a narrow encoding.
                print("chapter {} skipped ({}): {!a}".format(
                    number, chapter_url, exc))

    def Content(self, curl):
        """Return the text of the elements matched by ``clabel`` on page *curl*."""
        return self._fetch(curl)(self.clabel).text()

    def SaveWolds(self, content):
        """Return the jieba tokens of *content* as a list.

        The earlier version passed ``self`` to ``jieba.cut`` and discarded the
        result; this one tokenises *content* and returns the tokens.
        """
        return list(jieba.cut(content, cut_all=False))

    def CutContent(self, content):
        """Tokenise *content* with jieba and add the tokens to ``Count``.

        Returns:
            The module-level ``Count`` dict (token -> occurrences so far).
        """
        for word in jieba.cut(content, cut_all=False):
            Count[word] = Count.get(word, 0) + 1
        return Count
def SaveFile(title):
    """Create directory *title* inside the current directory and enter it.

    The directory is created only if it is missing, and the process working
    directory is then switched to it, so files opened afterwards with a
    relative name are written there.

    Args:
        title: Name of the directory to create and enter.

    Raises:
        OSError: If the directory cannot be created or entered.
    """
    # os.path.join instead of a hard-coded '\\' so this also works outside
    # Windows; exist_ok avoids the separate (racy) listdir check.
    target = os.path.join(os.getcwd(), title)
    os.makedirs(target, exist_ok=True)
    os.chdir(target)
def Save(name, content):
    """Write *content* to ``<name>.txt`` as UTF-8, replacing any existing file.

    Args:
        name: File name (or path) without the ``.txt`` extension.
        content: Text to write.
    """
    # The with-block closes the file even when write() raises.
    with open(name + '.txt', 'w', encoding='utf-8') as out:
        out.write(content)
if __name__ == "__main__":
    # Settings for the zhuaji.org book; its run is disabled below.
    surl, smainurl, splabel, sclabel = (
        'https://www.zhuaji.org/read/548/',
        'https://www.zhuaji.org',
        'dd',
        '#content',
    )
    # Settings for the t7yyw.com book; the index URL doubles as the prefix
    # for the chapter links.
    turl = 'http://www.t7yyw.com/97/97685/'
    tmainurl = turl
    tplabel, tclabel = '.ml_list li', '.novelcontent'

    # Disabled run for the first book:
    # San = GetContent(surl, smainurl, splabel, sclabel, 'utf-8')
    # San.GetPages()
    Tao = GetContent(
        url=turl,
        mainurl=tmainurl,
        plabel=tplabel,
        clabel=tclabel,
        ecode='gbk',
    )
    Tao.GetPages()

    # Disabled report of the word counts, most frequent first:
    # items = list(Count.items())
    # items.sort(key = lambda x:x[1], reverse=True)
    # # print(Count)
    # print(items)

    # Taohuazhai: 44 files
    # Sansheng Sanshi: 60 files
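# I don't know why printing the global variable gives incomplete output.
# Printed result:
# I don't know the reason either, so the analysis has to be done chapter by chapter.
# Reader comments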