# middlewares.py
import random
import time

from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.exceptions import NotConfigured
from scrapy.utils.response import response_status_message

from .useragents import USER_AGENT_LIST
from .proxyip import ProxyList
class UserAgentDownloadMiddleware(object):
    """
    Middleware that attaches a random User-Agent to every request.
    """

    def process_request(self, request, spider):
        """
        Pick a random UA from USER_AGENT_LIST for each outgoing request.
        :param request:
        :param spider:
        :return:
        """
        user_agent = random.choice(USER_AGENT_LIST)
        request.headers['User-Agent'] = user_agent
        spider.logger.debug("[User-Agent] %s", user_agent)
class RandomProxyMiddleware(object):
    """
    Middleware that assigns a random proxy IP to every request.
    """

    def __init__(self, settings):
        self.proxies = settings.get("IP_PROXY_LIST")
        self.proxy_list = ProxyList()

    @classmethod
    def from_crawler(cls, crawler):
        """
        Read the proxy configuration from settings and raise an error if
        proxying is disabled (see the settings sketch after this file).
        :param crawler:
        :return:
        """
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self, request, spider):
        while 1:
            proxy = self.proxy_list.get_proxy()
            # Getting no proxy means fetching the IP list failed;
            # wait 5 seconds and try again.
            if not proxy:
                time.sleep(5)
                continue
            # Scrapy expects meta["proxy"] to be a proxy URL string
            # such as "http://1.2.3.4:8080".
            request.meta["proxy"] = proxy
            spider.logger.debug("-" * 120)
            spider.logger.debug(proxy)
            spider.logger.debug("-" * 120)
            break
class RetryFailedUrl(RetryMiddleware):  # subclass of RetryMiddleware
    """
    A customized RetryMiddleware that records failed URLs and rotates
    the proxy IP whenever a request errors out.
    """

    def __init__(self, settings):
        self.proxy_list = ProxyList()
        super().__init__(settings)
    def process_response(self, request, response, spider):
        """
        Record URLs that failed to download; you could persist them to
        some other store instead of a text file.
        :param request:
        :param response:
        :param spider:
        :return:
        """
        # A request built with meta={'dont_retry': True} opts out of retrying.
        if request.meta.get('dont_retry', False):
            return response
        # Only handle status codes listed in RETRY_HTTP_CODES.
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            # Save the URL to a text file for later inspection.
            self.save_into_txt(spider.name, response.url)
            # Drop the proxy IP that just failed.
            self.delete_proxy_ip(request.meta.get('proxy', False))
            spider.logger.info('Request failed, retrying with a new proxy IP ...')
            # Fall back to the response once retries are exhausted.
            return self._retry(request, reason, spider) or response
        return response
    def process_exception(self, request, exception, spider):
        """
        Handle download exceptions.
        :param request:
        :param exception:
        :param spider:
        :return:
        """
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
                and not request.meta.get('dont_retry', False):
            # Save the URL to a text file for later inspection.
            self.save_into_txt(spider.name, request.url)
            # Drop the proxy IP that just failed.
            self.delete_proxy_ip(request.meta.get('proxy', False))
            spider.logger.info('Connection error, retrying with a new proxy IP ...')
            return self._retry(request, exception, spider)
    def save_into_txt(self, name, text):
        """
        Append one line of text to <spider name>.txt.
        :param name:
        :param text:
        :return:
        """
        with open(str(name) + ".txt", "a") as f:
            f.write(str(text) + "\n")
    def fetch_proxy_ip(self):
        """
        Fetch a fresh proxy IP.
        :return:
        """
        self.proxy_list.get_proxy()

    def delete_proxy_ip(self, proxy_ip):
        """
        Remove a proxy IP that is no longer usable.
        :param proxy_ip:
        :return:
        """
        self.proxy_list.delete_proxy_ip(proxy_ip)
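None of these middlewares run until they are registered. Below is a minimal settings.py sketch, assuming the classes above live in myproject/middlewares.py (the project name myproject is a placeholder): the stock UserAgentMiddleware and RetryMiddleware are disabled so the custom versions take over, and HTTPPROXY_ENABLED must be truthy or RandomProxyMiddleware.from_crawler raises NotConfigured.

# settings.py -- minimal sketch; "myproject" is a placeholder project name
HTTPPROXY_ENABLED = True   # RandomProxyMiddleware refuses to load without this
RETRY_TIMES = 3            # consumed by the inherited RetryMiddleware logic
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    'myproject.middlewares.UserAgentDownloadMiddleware': 543,
    'myproject.middlewares.RandomProxyMiddleware': 544,
    'myproject.middlewares.RetryFailedUrl': 545,
}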
# proxyip.py
import random
import time

import requests

"""
ProxyList exposes get_proxy, which returns a usable proxy IP;
ProxyList exposes delete_proxy_ip, which discards a proxy IP that no longer works;
the main program maintains a list of proxy IPs;
whenever the list holds fewer than 5 entries, it fetches a new proxy IP,
verifies that it works, and appends it, until the list is back up to 5
(a threaded usage sketch follows main() below).
"""
class ProxyList:
    def __init__(self):
        self.proxy_list = []

    def get_proxy(self):
        """
        Pick a random proxy from the pool. Randomizing spreads requests
        across proxies, so one dead IP does not fail every request.
        :return: a proxy URL string, or None when the pool is empty
        """
        try:
            # Return the raw string: Scrapy's meta["proxy"] expects a URL
            # such as "http://1.2.3.4:8080", not a requests-style dict.
            return random.choice(self.proxy_list)
        except IndexError:
            return None
    def add_proxy(self, proxy):
        """
        Append a newly verified proxy to the pool.
        :param proxy:
        :return:
        """
        self.proxy_list.append(proxy)
        print(f"Added new proxy {proxy} to the pool")

    def delete_proxy_ip(self, proxy):
        """
        Remove a proxy that is no longer usable.
        :param proxy:
        :return:
        """
        if proxy in self.proxy_list:
            self.proxy_list.remove(proxy)
            print(f"Removed dead proxy {proxy}")
    def get_new_proxy(self):
        """
        Fetch a fresh proxy IP from the provider.
        :return:
        """
        proxy_api = "xxx"  # URL of your proxy provider's API
        proxy = requests.get(url=proxy_api)
        proxy_ip = proxy.text.strip()
        return proxy_ip
    def test_ip(self, proxy_ip):
        print(f"TEST PROXY {proxy_ip}")
        test_url = 'http://www.qq.com/'
        try:
            response = requests.get(test_url, proxies={"http": proxy_ip}, timeout=10, verify=False)
            if response.status_code == 200:
                print(f"PROXY {proxy_ip} AVAILABLE")
                return True
            else:
                return False
        except requests.RequestException:
            print(f"PROXY {proxy_ip} NOT AVAILABLE")
            return False
def main():
    p_list = ProxyList()
    while 1:
        if len(p_list.proxy_list) < 5:
            try:
                new_proxy_ip = p_list.get_new_proxy()
            except requests.RequestException:
                print("[ip]: failed to fetch a new IP, retrying in 5 seconds ...")
                time.sleep(5)
                continue
            if p_list.test_ip(new_proxy_ip):
                p_list.add_proxy(new_proxy_ip)
        else:
            # Pool is full; avoid busy-spinning the CPU.
            time.sleep(1)

if __name__ == '__main__':
    main()
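As the overview docstring says, the main program keeps the pool topped up while the middlewares consume it. One way to run both sides in a single process is a daemon thread, sketched below; run_pool_in_background and replenish are hypothetical helpers that simply wrap the same loop as main().

import threading

def run_pool_in_background():
    """Hypothetical helper: keep a shared ProxyList topped up in a daemon thread."""
    p_list = ProxyList()

    def replenish():
        while True:
            if len(p_list.proxy_list) < 5:
                try:
                    new_proxy_ip = p_list.get_new_proxy()
                except requests.RequestException:
                    time.sleep(5)
                    continue
                if p_list.test_ip(new_proxy_ip):
                    p_list.add_proxy(new_proxy_ip)
            else:
                time.sleep(1)

    threading.Thread(target=replenish, daemon=True).start()
    return p_list

# Usage: consumers call get_proxy() / delete_proxy_ip() on the returned pool
# while the thread keeps refilling it.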
# useragents.py
USER_AGENT_LIST = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
"Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36"
]
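Putting it all together, a minimal spider is enough to watch the rotating User-Agent and proxy in the debug log. Everything here apart from the Scrapy API is hypothetical: the spider name, the target URL, and the assumption that the settings fragment above is active.

import scrapy

class DemoSpider(scrapy.Spider):
    name = "demo"                        # hypothetical spider name
    start_urls = ["http://www.qq.com/"]  # same test target used by ProxyList.test_ip

    def parse(self, response):
        # With the middlewares enabled, each request here goes out with a
        # random User-Agent and a proxy drawn from the pool.
        self.logger.info("fetched %s with status %s", response.url, response.status)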