Building an IP Proxy Pool with Python

Author: Orient_ZY | Published 2017-04-23 14:39

    When crawling websites with Python, you quickly run into anti-crawler mechanisms, and one important way to get around them is to use IP proxies. Plenty of IP proxies can be found online, but stable ones tend to be expensive. Building your own proxy pool from free proxies is therefore well worth doing. This article shows how to build your own IP proxy pool in Python.
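    For context, a minimal sketch of what "using an IP proxy" means in requests; the address 123.45.67.89:8080 below is only a placeholder, not a real proxy:

    import requests

    # route the request through a proxy instead of your own IP
    # (123.45.67.89:8080 is a placeholder address)
    proxies = {'http': 'http://123.45.67.89:8080', 'https': 'http://123.45.67.89:8080'}
    response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(response.text)    # should report the proxy's IP, not yours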

    First, two free IP proxy sources worth recommending:
    http://www.haoip.cc
    http://www.xicidaili.com

    This article uses www.haoip.cc/tiqu.htm as the example source for building the proxy pool.

    First, import the modules the program needs:
    import requests
    import re
    import random
    import time
    
    Crawl the IP addresses published by the proxy site and store them in the list ip_list
    url = 'http://www.haoip.cc/tiqu.htm'    # requests needs the full URL, including the scheme
    ip_list = []
    ip_list_washed = []
    
    def get_ip_list(url):
        html = requests.get(url)
        ip_listn = re.findall(r'r/>(.*?)<b', html.text, re.S)
        for ipn in ip_listn:
            ip = re.sub('\n', '', ipn)    # strip newline characters
            ip_list.append(ip.strip())
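    
    Note that the regex above assumes each address on the page sits between an r/> fragment and a <b tag; if the site changes its markup, the pattern will silently match nothing. A quick sanity check after fetching (the 'ip:port' entry format is an assumption based on the site) could look like this:
    
    get_ip_list(url)
    print(len(ip_list), 'proxies fetched')
    # each entry is expected to look like '123.45.67.89:8080'
    print(ip_list[:5])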
    
    Check each IP stored in ip_list
    # Since we will use www.baidu.com to test whether a proxy works, set up the request headers first
    user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    
    def check_ip(ip):
        test_url = 'https://www.baidu.com'
        # register the proxy for both schemes (the test URL is https, so an
        # 'http'-only entry would never be used) and give the proxy URL an
        # explicit scheme, which requests expects
        proxy = {'http': 'http://' + ip, 'https': 'http://' + ip}
        user_agent = random.choice(user_agent_list)
        headers = {'User-Agent': user_agent}
        try:
            response = requests.get(test_url, headers=headers, proxies=proxy, timeout=5)
            time.sleep(5)    # pause between checks to avoid hammering the test site
            if response.status_code == 200:
                return True
            else:
                return False
        except Exception as e:
            print(e)
            return False
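    
    The snippet above declares ip_list_washed but never fills it. A minimal sketch of the washing step, combining get_ip_list and check_ip (the order of the calls is my assumption, not part of the original snippet):
    
    get_ip_list(url)                      # fetch raw proxies into ip_list
    for ip in ip_list:
        if check_ip(ip):                  # keep only proxies that reach the test URL
            ip_list_washed.append(ip)
    print('%d of %d proxies usable' % (len(ip_list_washed), len(ip_list)))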
    
    Here is the complete code

    IPProxyPool.py

    import requests
    import re
    import random
    import time
    
    
    class IPProxyPool:
    
        # Initialisation: define an empty list ip_list to store the proxies
        def __init__(self):
            self.ip_list = []
            # self.ip_list_washed = []
            self.user_agent_list = [
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
                "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
                "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
                "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
                "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
            ]
    
        def get_ip_list(self, haoip_url='http://www.haoip.cc/tiqu.htm'):
            html = requests.get(haoip_url)
            ip_listn = re.findall(r'r/>(.*?)<b', html.text, re.S)
            for ipn in ip_listn:
                ip = re.sub('\n', '', ipn).strip()    # strip newlines and surrounding spaces
                # check whether the proxy actually works
                status = self.check_ip(ip)
                print(status)
                if status:
                    # store working proxies in the list ip_list
                    self.ip_list.append(ip)
                print(self.ip_list)
    
        def check_ip(self, ip):
            test_url = 'https://www.baidu.com'
            # register the proxy for both schemes (the test URL is https) and
            # give the proxy URL an explicit scheme, which requests expects
            proxy = {'http': 'http://' + ip, 'https': 'http://' + ip}
            user_agent = random.choice(self.user_agent_list)
            headers = {'User-Agent': user_agent}
            try:
                response = requests.get(test_url, headers=headers, proxies=proxy, timeout=5)
                time.sleep(5)    # pause between checks to avoid hammering the test site
                if response.status_code == 200:
                    return True
                else:
                    return False
            except Exception as e:
                print(e)
                return False
    
    
    ip_proxy_pool = IPProxyPool()    # avoid shadowing the class name with the instance
    ip_proxy_pool.get_ip_list()
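    
    A short sketch of how the pool could then be used in a crawler, picking a random working proxy for each request; the target URL below is just a placeholder, not part of the original code:
    
    target_url = 'http://example.com'    # placeholder target
    if ip_proxy_pool.ip_list:
        ip = random.choice(ip_proxy_pool.ip_list)
        proxies = {'http': 'http://' + ip, 'https': 'http://' + ip}
        response = requests.get(target_url, proxies=proxies, timeout=5)
        print(response.status_code)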
    
    

    The complete code is on GitHub: https://github.com/Orient-ZY/IPProxyPool (new features and optimisations will keep being added; if this helped you, please take a second to leave a Star, much appreciated!)
    My blog: http://www.orient-zy.cn (still a work in progress...)
