学习Python后,就想着自己能够做一些小玩意来实践一下学习效果。下面是简单的爬虫实现。
目标:
1.抓取糗事百科的段子
2.将段子保存到本地文件
我们首先分析糗事百科的页面。
每个段子都是以author clearfix开始,下面的div分别是内容,用户名,点赞数等。今天这个例子中获取用户名,内容以及点赞数。从html文件中获取这些信息就要用到正则表达式了,python中提供了RE库完美解决了这个问题。
pattern = re.compile('<div class="author.*?>.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<span class="stats.*? class="number">(.*?)</i>',re.S)
items = re.findall(pattern, respone.text)
上面这段代码就是从html文件中解析我们所需要的内容。(.*?)是一个group,就是我们需要保存的内容。
获取页面的话用requests就可以了。下面是部分代码。
import re
import requests

# Target listing page on qiushibaike.com.
# url = "http://www.qiushibaike.com/hot/"
url = "http://www.qiushibaike.com/8hr/page/2/?s=4975313"

session = requests.session()
agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
host = "www.qiushibaike.com"
headers = {
    'Host': host,
    'User-Agent': agent
}
# Bug fix: the headers dict was built but never sent with the request;
# also renamed the misspelled `respone` variable.
response = session.get(url, headers=headers)
# Raw string for the regex; re.S lets '.' span newlines in the HTML.
# Groups: 1 = author name, 2 = story text, 3 = vote count.
pattern = re.compile(r'<div class="author.*?>.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<span class="stats.*? class="number">(.*?)</i>', re.S)
items = re.findall(pattern, response.text)
for item in items:
    print("author=" + item[0] + "\n" + "articl=" + item[1] + "\n" + "vote=" + item[2])
完整代码:
import re
import requests
import time
import random
class QSBK:
    """Interactive scraper for qiushibaike.com joke listings.

    Fetches listing pages, extracts (author, story text, vote count) with a
    regex, prints each story and appends it to ``qsbks.txt``. Press Enter to
    see the next story; enter "Q" to quit.
    """

    # Compiled once at class creation and shared by every page fetch.
    # Groups: 1 = author name, 2 = story text, 3 = vote count.
    PATTERN = re.compile(
        r'<div class="author.*?>.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>'
        r'.*?<span class="stats.*? class="number">(.*?)</i>',
        re.S,
    )

    def __init__(self):
        self.pageIndex = 1  # next page number to fetch
        self.user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
        # Bug fix: the original read an undefined name `pageIndex` here
        # (NameError on instantiation) and stored a full URL; the Host
        # header must be just the hostname.
        self.host = "www.qiushibaike.com"
        self.headers = {
            'Host': self.host,
            'User-Agent': self.user_agent
        }
        self.stories = []  # buffered pages; each entry is a list of [author, text, votes]
        self.session = requests.session()
        self.enable = False  # main-loop flag; start() switches it on

    def getPage(self, pageIndex):
        """Fetch listing page *pageIndex* and return its HTML, or None on failure."""
        # Bug fix: the original ignored pageIndex and fetched a fixed URL,
        # and never sent the headers prepared in __init__.
        url = "http://www.qiushibaike.com/8hr/page/" + str(pageIndex) + "/?s=4975313"
        try:
            response = self.session.get(url, headers=self.headers)
            if response.text:
                return response.text
        except requests.RequestException:
            # Narrowed from a bare `except:` so real bugs aren't swallowed.
            print("network wrong")
        return None

    def getPageItems(self, pageIndex):
        """Download page *pageIndex*; return a list of [author, text, votes]."""
        pageCode = self.getPage(pageIndex)
        if not pageCode:
            # Bug fix: the original printed this and then still passed None
            # to re.findall, raising TypeError. Return an empty page instead.
            print("获取页面失败!")
            return []
        return [[m[0], m[1], m[2]] for m in self.PATTERN.findall(pageCode)]

    # 加载并提取页面的内容,加入到列表中
    def loadPage(self):
        """Fetch the next page into the buffer when fewer than 2 are queued."""
        if self.enable:
            if len(self.stories) < 2:
                pageStories = self.getPageItems(self.pageIndex)
                if pageStories:
                    self.stories.append(pageStories)
                    # Advance so the next load reads the following page.
                    self.pageIndex += 1

    def getOneStoty(self, pageStories, page):
        """Interactively show each story of *page*; input "Q" stops the spider.

        NOTE(review): the misspelled name is kept for backward compatibility
        with existing callers.
        """
        for story in pageStories:
            game = input()
            self.loadPage()
            if game == "Q":
                self.enable = False
                return
            # Append the story to the local file; mode 'a' adds to the end.
            # `with` guarantees the file is closed even if write() raises.
            with open('qsbks.txt', 'a', encoding='utf-8') as f:
                f.write(story[0] + '\n' + story[1] + '\n' + story[2] + '\n\n')
            print("第%s页\t发布人:%s\t\n%s\n赞:%s" % (page, story[0], story[1], story[2]))

    def start(self):
        """Main loop: prime the buffer, then show stories page by page."""
        print("正在读取糗事百科")
        self.enable = True
        self.loadPage()
        nowPage = 0
        while self.enable:
            if self.stories:
                pageStories = self.stories.pop(0)
                nowPage += 1
                self.getOneStoty(pageStories, nowPage)
            else:
                # Bug fix: the original spun forever here once no stories
                # could be fetched.
                break
if __name__ == "__main__":
    # Run the spider only when executed as a script, not when imported.
    spider = QSBK()
    spider.start()
网友评论