# Use proxy IPs to crawl sites that employ anti-scraping measures.
import urllib.error
import urllib.parse
import urllib.request
def download(url, user_agent='Mozilla/5.0', proxy=None, num_retries=2):
    """Fetch *url* and return the response body as bytes, or None on failure.

    Parameters
    ----------
    url : str
        The URL to fetch.
    user_agent : str
        Value sent in the ``User-agent`` header (helps evade naive
        anti-scraping checks).
    proxy : dict | None
        Optional HTTP proxy as ``{"host": str, "port": int}``.
    num_retries : int
        Remaining retries for server-side (5xx) errors; the function
        recurses with ``num_retries - 1`` on each 5xx response.
    """
    print('download to。。', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    if proxy:
        # Route this request through the given proxy.  Build a dedicated
        # opener instead of calling install_opener(): the original mutated
        # global state, silently proxying every later urlopen() call in
        # the whole process.
        proxy_handler = urllib.request.ProxyHandler(
            {"http": "http://%(host)s:%(port)d" % proxy})
        opener = urllib.request.build_opener(proxy_handler)
        open_url = opener.open
    else:
        open_url = urllib.request.urlopen
    try:
        # Close the response explicitly — the original leaked the
        # connection by never closing the urlopen() result.
        with open_url(request) as response:
            html = response.read()
    except urllib.error.URLError as e:
        print('url_error:', e.reason)
        html = None
        # Retry only on server errors (5xx); client errors and network
        # failures (URLError without .code) will not heal on retry.
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, proxy, num_retries - 1)
    return html
print(download("http://xxxx/",
proxy={"host": "xxxx", "port": xxxx},
user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'))
# 网友评论 ("reader comments") — trailing artifact from the scraped article page.