Scrapy: User-Agent, IP Proxy, and Retry Middleware in Scrapy

Author: dex0423 | Published 2020-08-06 09:33
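
This post collects three Scrapy downloader middlewares that work together: one rotates a random User-Agent per request, one routes each request through a random proxy IP, and a customized Retry middleware logs failed URLs and discards dead proxies before retrying.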
  • middlewares.py
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message
import random
from .useragents import USER_AGENT_LIST
from scrapy.exceptions import NotConfigured
from .proxy_ip import ProxyList
import time


class UserAgentDownloadMiddleware(object):
    """
    随机使用 User Agent 中间件
    """
    def process_request(self, request, spider):
        """
        每次请求都会添加一个随机的 UA
        :param request:
        :param spider:
        :return:
        """
        user_agent = random.choice(USER_AGENT_LIST)
        request.headers['User-Agent'] = user_agent
        spider.logger.debug("[User-Agent] ", user_agent)


class RandomProxyMiddleware(object):
    """
    随机 ip 中间件
    """
    def __init__(self, settings):
        self.proxies = settings.get("IP_PROXY_LIST")    # optional static proxy list from settings; the live pool comes from ProxyList
        self.proxy_list = ProxyList()

    @classmethod
    def from_crawler(cls, crawler):
        """
        获取 settings 配置中的 proxy 配置,如果未设置则报错
        :param crawler:
        :return:
        """
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured

        return cls(crawler.settings)

    def process_request(self, request, spider):
        while True:
            proxy = self.proxy_list.get_proxy()
            if proxy:
                break
            # An empty result means fetching the ip list went wrong; wait 5 seconds and retry
            time.sleep(5)
        request.meta["proxy"] = proxy
        spider.logger.debug("-" * 120)
        spider.logger.debug(proxy)
        spider.logger.debug("-" * 120)


class RetryFailedUrl(RetryMiddleware):      # inherits from RetryMiddleware
    """
    Subclassed and customized RetryMiddleware for handling errors when they occur.
    """
    def __init__(self, settings):
        self.proxy_list = ProxyList()
        super().__init__(settings)      # RetryMiddleware.__init__ expects the settings object

    def process_response(self, request, response, spider):
        """
        Store URLs that failed to be crawled; any other storage also works.
        :param request:
        :param response:
        :param spider:
        :return:
        """
        # A request built earlier can carry dont_retry in its meta to decide whether to retry
        if request.meta.get('dont_retry', False):
            return response
        # If the response status code is one that calls for a retry, handle it here
        if response.status in self.retry_http_codes:    # only handle status codes listed in retry_http_codes
            reason = response_status_message(status=response.status)
            # Save to a text file for later analysis
            self.save_into_txt(spider.name, response.url)
            # Drop this proxy_ip
            self.delete_proxy_ip(request.meta.get('proxy', False))
            spider.logger.info('Request failed; retrying with a different proxy IP ...')
            return self._retry(request, reason, spider)
        return response

    def process_exception(self, request, exception, spider):
        """
        Handle exceptions raised during the request.
        :param request:
        :param exception:
        :param spider:
        :return:
        """
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY):
            # Save to a text file for later analysis
            self.save_into_txt(spider.name, request.url)
            # Drop this proxy_ip
            self.delete_proxy_ip(request.meta.get('proxy', False))
            spider.logger.info('Connection error; retrying with a different proxy IP ...')
            return self._retry(request, exception, spider)

    def save_into_txt(self, name, text):
        """
        Append one line of text to a .txt file named after the spider.
        :param name:
        :param text:
        :return:
        """
        with open(str(name) + ".txt", "a") as f:
            f.write(str(text) + "\n")

    def fetch_proxy_ip(self):
        """
        Fetch a new proxy_ip from the pool.
        :return:
        """
        return self.proxy_list.get_proxy()

    def delete_proxy_ip(self, proxy_ip):
        """
        Remove an unusable proxy_ip from the pool.
        :param proxy_ip:
        :return:
        """
        self.proxy_list.delete_proxy_ip(proxy_ip)
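
To take effect, these middlewares have to be registered in settings.py. Below is a minimal sketch; the priority numbers and the myproject package name are assumptions, and the built-in UserAgentMiddleware and RetryMiddleware are disabled so the custom versions take over:

# settings.py -- illustrative sketch, adjust names and priorities to your project
HTTPPROXY_ENABLED = True    # RandomProxyMiddleware raises NotConfigured without this
IP_PROXY_LIST = []          # read by RandomProxyMiddleware.__init__ (optional)

RETRY_ENABLED = True
RETRY_TIMES = 3                                     # retry each failed request up to 3 times
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]   # status codes that trigger a retry

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    'myproject.middlewares.UserAgentDownloadMiddleware': 400,
    'myproject.middlewares.RandomProxyMiddleware': 410,
    'myproject.middlewares.RetryFailedUrl': 550,
}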

  • proxy_ip.py
import requests
import time
import random


"""
ProxyList 类对外提供 get_proxy 方法,用于获取一个可用的 proxy_ip;
ProxyList 类对外提供 delete_proxy_ip 方法,用于剔除不可用的 proxy_ip;

main 主程序维护一个由 proxy_ip 组成的 list;
当 list 中元素少于 5 个时就获取一个新的 proxy_ip,检测可用以后,加入 list 中,直到够 5 个为止;
"""


class ProxyList:
    def __init__(self):
        self.proxy_list = []

    def get_proxy(self):
        """
        Pick one proxy from the in-memory pool.
        :return: a proxy string suitable for request.meta["proxy"], or None if the pool is empty
        """
        try:
            # Choose a random proxy per request so traffic is spread across IPs,
            # which limits the damage when any single IP goes bad
            proxy = random.choice(self.proxy_list)
            return proxy
        except IndexError:      # random.choice raises IndexError on an empty list
            return None

    def add_proxy(self, proxy):
        """
        Append a newly fetched proxy to the pool.
        :param proxy:
        :return:
        """
        self.proxy_list.append(proxy)
        print(f"Added new proxy {proxy} to the pool")

    def delete_proxy_ip(self, proxy):
        """
        Remove an unusable proxy from the pool.
        :param proxy:
        :return:
        """
        if proxy in self.proxy_list:    # the proxy may already have been removed
            self.proxy_list.remove(proxy)
            print(f"Removed unusable proxy {proxy}")

    def get_new_proxy(self):
        """
        Fetch a fresh proxy IP address.
        :return:
        """
        # Fetch a new proxy here; the API is assumed to return one proxy per call, e.g. "http://ip:port"
        proxy_api = "xxx"           # your proxy provider's API endpoint
        proxy = requests.get(url=proxy_api)
        proxy_ip = proxy.text
        return proxy_ip

    def test_ip(self, proxy_ip):
        print(f"TEST PROXY {proxy_ip}")
        test_url = 'http://www.qq.com/'
        try:
            response = requests.get(test_url, proxies={"http": proxy_ip}, timeout=10, verify=False)
            if response.status_code == 200:
                print(f"PROXY {proxy_ip} AVAILABLE")
                return True
            else:
                return False
        except requests.RequestException:
            print(f"PROXY {proxy_ip} NOT AVAILABLE")
            return False


def main():
    p_list = ProxyList()
    while True:
        if len(p_list.proxy_list) < 5:
            try:
                new_proxy_ip = p_list.get_new_proxy()
            except requests.RequestException:
                print("[ip]: failed to fetch a new ip, retrying in 5 seconds ...")
                time.sleep(5)
                continue
            if p_list.test_ip(new_proxy_ip):
                p_list.add_proxy(new_proxy_ip)
        else:
            time.sleep(1)       # pool is full; idle briefly instead of spinning


if __name__ == '__main__':
    main()
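
As written, RandomProxyMiddleware and RetryFailedUrl each construct their own ProxyList, and main() runs as a separate script, so the pools never actually talk to each other; the docstrings suggest the original design shared the pool through redis. A minimal in-process alternative, purely a sketch, is one module-level ProxyList plus a maintenance loop in a daemon thread; the names shared_pool, maintain_pool, and start_pool_maintainer are assumptions, not part of the original code:

# proxy_ip.py (sketch) -- shares one pool per process instead of redis
import threading


shared_pool = ProxyList()       # module-level singleton both middlewares can import


def maintain_pool(pool, size=5):
    """Keep the pool topped up with `size` tested proxies (same loop as main())."""
    while True:
        if len(pool.proxy_list) < size:
            try:
                candidate = pool.get_new_proxy()
            except requests.RequestException:
                time.sleep(5)
                continue
            if pool.test_ip(candidate):
                pool.add_proxy(candidate)
        else:
            time.sleep(1)


def start_pool_maintainer():
    """Start maintain_pool in a daemon thread, e.g. from a middleware's from_crawler."""
    threading.Thread(target=maintain_pool, args=(shared_pool,), daemon=True).start()

The middlewares would then do from .proxy_ip import shared_pool instead of instantiating ProxyList themselves, so the pool the retry middleware prunes is the same one the proxy middleware draws from.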

  • useragents.py
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36"
]
