"""
精神心理科:
"https://youlai.cn/dise/pk_9_0_1.html"
"""
from requests_html import HTMLSession
import csv
import re
class Spider:
    """Crawl disease articles from youlai.cn (psychiatry section) into a CSV file.

    Flow: level-1 category page -> per-disease article-list pages (paginated)
    -> individual article pages, whose fields are appended to '内科文章.csv'.
    """

    def __init__(self):
        self.session = HTMLSession()
        self.level1_url = "https://youlai.cn/dise/pk_9_0_1.html"  # level-1 disease listing
        self.HomeUrl = "https://youlai.cn"  # site root, prefixed to relative hrefs

    def parseLevel1(self):
        """Parse every disease category linked from the level-1 page."""
        response = self.session.get(url=self.level1_url)
        titles = response.html.xpath('//dl[@class="textList"]//a/text()')
        hrefs = response.html.xpath('//dl[@class="textList"]//a/@href')
        # zip() pairs each href with its title and stops at the shorter list,
        # so a mismatch between the two XPath results cannot raise IndexError.
        for href, title in zip(hrefs, titles):
            self.parseArticleHome(href, title)

    def parseArticleHome(self, url, title):
        """Walk every paginated article-list page for one disease category."""
        # e.g. /dise/480.html -> https://youlai.cn/dise/articlelist/480_%s.html
        articleHomeUrl = (self.HomeUrl + url).replace('dise/', 'dise/articlelist/').replace('.html', '_%s.html')
        page = 1
        last_page = 1
        while page <= last_page:
            response = self.session.get(url=articleHomeUrl % page)
            if last_page == 1:
                # The second-to-last pager item holds the highest page number.
                # Guard against single-page categories where the pager is absent
                # (the original unguarded [0] raised IndexError there).
                pager = response.html.xpath('//div[@id="pages"]//li[last()-1]/a/text()')
                if pager:
                    last_page = int(pager[0])
            self.parseArticle(self.getArticleList(response))
            page += 1

    def parseArticle(self, articleList):
        """Fetch each article URL and append its details to the CSV file."""
        # Open the output file once per batch instead of once per article
        # ('a+' append mode keeps the original accumulate-rows behavior).
        with open('内科文章.csv', 'a+', encoding='utf-8', newline='') as fp:
            writer = csv.writer(fp)
            for url in articleList:
                response = self.session.get(url=self.HomeUrl + url)
                title = response.html.xpath('//h3[@class="v_title"]/text()')[0]  # article title
                createTime = response.html.xpath('//span[@class="fl_left time"]/text()')[0]  # publish time
                readingQuantity = response.html.xpath('//span[@class="fl_left num"]/text()')[0].replace('阅读:', '')  # read count
                doctorName = response.html.xpath('//dl[contains(@class,"doc_pic_box")]/dd//li/strong/text()')  # doctor name(s)
                hospitalName = response.html.xpath('//dl[contains(@class,"doc_pic_box")]/dd//p[1]/text()')  # hospital name(s)
                officeName = response.html.xpath('//dl[contains(@class,"doc_pic_box")]/dd//p[2]/text()')  # department name(s)
                # Article body: raw inner HTML of the first <div class="text">.
                content = re.findall('<div class="text">(.*?)</div>', response.text, re.S)[0].strip()
                writer.writerow((title, createTime, readingQuantity, doctorName, hospitalName, officeName, content))
                print("******", title, createTime, readingQuantity, doctorName, hospitalName, officeName, "******")
                print(content)

    def getArticleList(self, response):
        """Return the list of relative article URLs on one article-list page."""
        return response.html.xpath('//ul[@class="article_left article_l_list bd_none"]//h3/a/@href')

    def run(self):
        """Entry point: start the crawl from the level-1 category page."""
        self.parseLevel1()
if __name__ == '__main__':
    crawler = Spider()
    # Truncate the output file and write the header row once; the spider
    # itself appends one row per scraped article afterwards.
    header_row = ('文章标题', '发表时间', '阅读量', '医生名', '院名', '科室', '正文')
    with open('内科文章.csv', 'w', encoding='utf-8', newline='') as out_file:
        csv.writer(out_file).writerow(header_row)
    crawler.run()
"""
精神分裂症:
https://youlai.cn/dise/480.html
https://youlai.cn/dise/articlelist/480_1.html
自闭症:
https://youlai.cn/dise/481.html
https://youlai.cn/dise/articlelist/481_1.html
"""
# 文章到这里就结束了!希望大家能多多支持Python(系列)!六个月带大家学会Python,私聊我,可以问关于本文章的问题!以后每天都会发布新的文章,喜欢的点点关注!一个陪伴你学习Python的新青年!不管多忙都会更新下去,一起加油!
# Editor:Lonelyroots
# 网友评论