import requests
import lxml.html
from multiprocessing.dummy import Pool
class Novespider(object):
    """Scrape novel sections from web pages and save them to a text file.

    Parsed sections accumulate in ``novel_dict`` (heading -> body text) so
    that several pages can be fetched before ``write_txt`` writes them all
    out in one go.
    """

    def __init__(self, timeout=10):
        """Create a spider with no sections collected yet.

        Args:
            timeout: Seconds to wait for the server on each request.
                ``None`` disables the limit (the previous behaviour).
        """
        # Maps section heading -> section body (paragraphs joined by '\n').
        self.novel_dict = {}
        self.timeout = timeout

    def get_content(self, url):
        """Download ``url`` and store every novel section found on the page.

        Args:
            url: Address of the page to fetch.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                does not complete within ``self.timeout`` seconds.
        """
        # Without a timeout a stalled server would block this call (and the
        # worker running it) indefinitely.
        response = requests.get(url, timeout=self.timeout)
        self.get_novel(response.content)

    def get_novel(self, html):
        """Extract the sections of an HTML page into ``novel_dict``.

        Every ``<div class="readtext">`` element is treated as one section.
        The first text node of its ``<h4>`` children becomes the key, and the
        text nodes directly inside its ``<p>`` children, joined with
        newlines, become the value. A heading that is already present is
        overwritten by the newer section.

        Args:
            html: Raw page markup. Empty or whitespace-only input is ignored
                because the parser rejects an empty document.
        """
        if not html or not html.strip():
            return

        document = lxml.html.fromstring(html)
        for section in document.xpath('//div[@class="readtext"]'):
            headings = section.xpath('h4/text()')
            if not headings:
                # No heading text means there is no key to file this section
                # under; skip it instead of failing with IndexError.
                continue
            body = '\n'.join(section.xpath('p/text()'))
            self.novel_dict[headings[0]] = body

    def write_txt(self, path='东野圭吾.txt'):
        """Write all collected sections to a UTF-8 text file.

        Sections are written in ascending order of their heading; each one
        is the heading on its own line followed by the body and a newline.

        Args:
            path: Destination file. Defaults to the name that was previously
                hard-coded, so existing callers are unaffected.
        """
        # Keys of a dict are unique, so sorting the (heading, body) pairs
        # orders them by heading alone.
        with open(path, 'w', encoding='utf-8') as out:
            for title, body in sorted(self.novel_dict.items()):
                out.write('{}\n{}\n'.format(title, body))
def main():
    """Fetch pages 1-7 of the chapter concurrently and save them to disk."""
    page_url = 'http://www.dongyeguiwu.com/books/baiyexing/53.html/{}'
    urls = [page_url.format(page) for page in range(1, 8)]

    spider = Novespider()
    # multiprocessing.dummy.Pool is backed by threads, so every worker
    # stores its result in the same `spider` instance. The `with` block
    # releases the pool's workers once map() has returned.
    with Pool() as pool:
        pool.map(spider.get_content, urls)

    spider.write_txt()
    print('搞掂了')


if __name__ == '__main__':
    main()
效果图片
网友评论