闲来无事,想找点段子乐呵一下,就逛到了糗事百科。这次爬取没有什么难度,唯一值得一提的是稍微增强了代码的健壮性。
import requests
from lxml import etree
class Spider():
    """Crawl text pages 1-12 of qiushibaike.com and store the results in data.txt.

    The helper methods use double-underscore names, so Python mangles them to
    ``_Spider__name``; inside the class they must be called with the double
    underscore (``self.__get_page``), otherwise an AttributeError is raised.
    """

    def __get_page(self, url, headers):
        """Download ``url`` and return the response body as text.

        Returns ``None`` when the request fails or the status code is not 200.
        """
        try:
            # A timeout keeps a single stalled connection from hanging the crawl.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    def __parse_page(self, html):
        """Extract author, content and vote count for every entry in ``html``.

        Returns a flat list ``[author, content, number, author, content, ...]``.
        Missing fields are replaced by default strings. An empty or ``None``
        ``html`` yields an empty list.
        """
        if not html:
            # Nothing was downloaded; do not hand None/'' to the parser.
            return []
        results = []
        data = etree.HTML(html)
        if data is None:
            return results
        for item in data.xpath('//div[@id="content-left"]/div'):
            # Author name, with a placeholder when the node is absent.
            author = item.xpath('./div[1]/a[2]/h2/text()')
            results.append(author[0].strip() if author else '匿名用户')
            # Entry text: join all text nodes and drop the line breaks.
            content = item.xpath('./a[1]/div/span/text()')
            if content:
                results.append(''.join(content).replace('\n', ''))
            else:
                results.append('此用户没有内容')
            # Vote count, defaulting to '0'.
            number = item.xpath('./div[2]/span[1]/i/text()')
            results.append(number[0] if number else '0')
        return results

    def __save_to_txt(self, data, mode='w'):
        """Write ``data`` to data.txt (UTF-8).

        ``mode`` is the file mode passed to ``open``; the default ``'w'``
        truncates the file, ``'a'`` appends to it.
        """
        with open('data.txt', mode, encoding='utf-8') as f:
            f.write(data)

    def run(self):
        """Fetch, parse and save pages 1 through 12, one line per page."""
        # The headers never change, so build them once outside the loop.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
                                 ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
                   }
        # The first successful write truncates data.txt; later pages are
        # appended so that earlier pages are not overwritten.
        mode = 'w'
        for i in range(1, 13):
            url = 'https://www.qiushibaike.com/text/page/' + str(i)
            html = self.__get_page(url, headers)
            if html is None:
                # Skip pages that could not be downloaded.
                continue
            result = self.__parse_page(html)
            self.__save_to_txt(str(result) + '\n', mode)
            mode = 'a'
# Instantiate the spider and start the crawl.
spider = Spider()
spider.run()
- 其中类里以双下划线开头的方法是私有方法(Python 通过名称改写实现),外部不应直接调用;在类的内部调用时同样要带上双下划线,例如 `self.__get_page(url, headers)`。
- 解析网页部分增加了对字段为空情况的处理:取不到作者、内容或好笑数时,使用默认值代替。
- 欢迎大家跟我交流学习。
网友评论