最简单的爬取网络页面数据的方式：根据链接的规律对 id 进行遍历。但有时 id 的变化不一定是连续的，加一个小判断（统计连续空页次数）即可解决这种情况。
import urllib.request
import urllib.parse
def download(url, headers=None, repeatTimes=5):
    """Download *url* and return the raw response body as bytes.

    Retries on HTTP 5xx server errors, up to *repeatTimes* attempts.

    Args:
        url: The URL to fetch.
        headers: Optional dict of extra HTTP request headers.
        repeatTimes: Remaining attempts; once exhausted, gives up.

    Returns:
        The response body as bytes, or None on failure / exhausted retries.
    """
    if repeatTimes <= 0:
        return None
    print("downloading:" + url)
    # `headers` defaults to None (not a shared mutable {}); substitute an
    # empty dict at call time.
    request = urllib.request.Request(url, headers=headers or {})
    try:
        response = urllib.request.urlopen(request)
    except urllib.error.URLError as e:
        print(e)
        # Retry only 5xx (server-side) errors; a 4xx client error or a
        # connection failure without a status code will not improve on retry.
        if hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, headers, repeatTimes - 1)
        return None
    return response.read()
# Crawl pages by enumerating sequential country ids; ids are mostly but not
# always contiguous, so stop only after five consecutive missing pages.
urlFormat = 'http://127.0.0.1/places/default/view/{0}'
countryId = 1
emptyCount = 0
while True:
    # Build the concrete URL for the current id, then advance the id.
    realUrl = urlFormat.format(countryId)
    countryId += 1
    # Download the page (None means missing/failed).
    html = download(realUrl)
    if html is None:
        # Missing page: count it; five in a row means we're past the end.
        emptyCount += 1
        if emptyCount >= 5:
            break
    else:
        # Got data: reset the consecutive-miss counter for the next gap.
        emptyCount = 0
print('finished')
网友评论