Building a Private Proxy IP Pool with Python

Author: kaliarch | Published 2018-07-16 17:34

    1. Background

    When writing crawlers in Python, you constantly run into the contest between crawling and anti-crawling measures. High-intensity, high-frequency scraping puts heavy load on a site's servers and hurts their performance, so a single IP that keeps requesting pages is quickly banned by the site administrator. The way around this is to build our own pool of proxy IPs and keep switching the address we crawl from, so that no single IP can get the crawler blocked. We scrape candidate proxies, test which of them actually work, and store the usable addresses in MySQL, Redis, MongoDB, or Memcache; later, whenever a crawler needs a proxy, it simply fetches one from this private pool.
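
    The whole trick of "switching IPs" rests on one requests feature: the proxies argument, which routes a request through a proxy instead of our own address. A minimal sketch of the idea (the proxy address below is a placeholder, not a working proxy):

    import requests

    # Route the request through a proxy instead of our own IP (placeholder address).
    proxies = {'http': 'http://1.2.3.4:8080'}
    response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    # httpbin echoes back the origin IP, which should now be the proxy's.
    print(response.text)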

    2. References

    2.1 Python modules used

    • requests for sending HTTP requests
    • BeautifulSoup for parsing the fetched pages and extracting the fields we need
    • configparser for reading connection settings from the configuration file
    • pymysql for MySQL operations
    • redis for Redis operations
    • pymongo for MongoDB operations
    • memcache for Memcache operations

    Apart from configparser, which ships with Python 3, these can all be installed with pip (BeautifulSoup is packaged as beautifulsoup4, and the memcache client is typically packaged as python-memcached).

    2.2 Related links

    For Redis, see "Redis 3.2 Master-Slave Replication and Cluster Setup"
    For MongoDB, see "MongoDB Basics"
    For Memcache, see "Memcached Installation Script (with Auto-Start on Boot)"
    For basic Python crawling, see "Using Python to Search 51CTO Recommended Blogs and Save the Results to Excel"

    3. Code examples

    3.1 GitHub repository

    PROXIES

    3.2 Code


    a. spider.py

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # _author:kaliarch

    import requests
    from bs4 import BeautifulSoup
    import random

    class GetProxyIP:

        def __init__(self, page=10):
            self._page = page
            self.url_head = 'http://www.xicidaili.com/wt/'

        def get_ip(self):
            """
            get raw proxy ip pool
            :return: res_pool list
            """
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
            res_pool = []
            # Crawl listing pages 1..page of the proxy site.
            for pagenum in range(1, self._page + 1):
                url = self.url_head + str(pagenum)
                response = requests.get(url, headers=headers)
                soup = BeautifulSoup(response.text, "html.parser")
                soup_tr = soup.find_all('tr')
                for item in soup_tr:
                    try:
                        soup_td = item.find_all('td')
                        # Table columns: td[1] is the IP, td[2] the port, td[5] the protocol.
                        res_pool.append(soup_td[5].text.lower() + '://' + soup_td[1].text + ':' + soup_td[2].text)
                    except IndexError:
                        # Header rows have no <td> cells; skip them.
                        pass
            return res_pool

        def right_proxies(self, res_pool):
            """
            check available ip
            :param res_pool:
            :return: right_pool list
            """
            right_pool = []
            check_urllist = ['http://www.baidu.com', 'http://www.taobao.com', 'https://cloud.tencent.com/']
            for ip in res_pool:
                # Build the proxies dict under the scheme the proxy actually speaks.
                if ip.startswith('https'):
                    proxies = {'https': ip}
                else:
                    proxies = {'http': ip}
                try:
                    response = requests.get(random.choice(check_urllist), proxies=proxies, timeout=1)
                    # Keep only proxies that answer with a successful status code.
                    if response.status_code == 200:
                        right_pool.append(proxies)
                        print('add ip %s' % proxies)
                except Exception:
                    continue
            return right_pool

    if __name__ == '__main__':
        # Instantiate the class; the number of listing pages to crawl can be passed in.
        proxyhelper = GetProxyIP(2)
        res_pool = proxyhelper.get_ip()
        proxy_ip = proxyhelper.right_proxies(res_pool)
        print(proxy_ip)
    

    b. db.conf

    [mysql]
    HOST = 172.20.6.100
    PORT = 3306
    USER = root
    PASSWD = mysqladmin
    DB = pydb
    TABLE = pytab
    CHARSET = utf8
    
    [redis]
    HOST = 172.20.6.100
    PORT = 6379
    PASSWD = redisadmin
    
    [memcache]
    HOST = 172.20.6.100
    PORT = 11211
    
    [mongodb]
    HOST = 172.20.6.100
    PORT = 27017
    DB = db1
    USER = mongoadmin
    PASSWD = mongopwd
    

    c. save_mysql.py

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # _author:kaliarch

    import pymysql
    import configparser
    import spider

    class MysqlOper:

        def __init__(self, result_list):
            # Read the MySQL connection settings from db.conf.
            config = configparser.ConfigParser()
            config.read('db.conf')
            self.host = config['mysql']['HOST']
            self.port = int(config['mysql']['PORT'])
            self.user = config['mysql']['USER']
            self.passwd = config['mysql']['PASSWD']
            self.db = config['mysql']['DB']
            self.table = config['mysql']['TABLE']
            self.charset = config['mysql']['CHARSET']
            self.result_list = result_list

        def mysql_save(self):
            # Create the db connection and cursor.
            try:
                DB = pymysql.connect(host=self.host, user=self.user, password=self.passwd,
                                     database=self.db, port=self.port, charset=self.charset)
                cursor = DB.cursor()
            except Exception as e:
                print("connect dbserver fail, please see information:")
                print(e)
                exit(1)

            # Create the table only if it does not exist yet.
            cursor.execute('show tables in %s' % self.db)
            tables = [tab[0] for tab in cursor.fetchall()]
            if self.table in tables:
                print('%s already exists' % self.table)
            else:
                cursor.execute('create table %s (id int unsigned not null primary key auto_increment, '
                               'protocol varchar(10), content varchar(50))' % self.table)

            # Write the collected proxies into MySQL.
            for values in self.result_list:
                for prot, cont in values.items():
                    try:
                        cursor.execute('insert into ' + self.table + ' (protocol, content) values (%s, %s)',
                                       [prot, cont])
                    except Exception as e:
                        print("insert into db raised an error:", e)
            # Without a commit the inserts are discarded when the connection closes.
            DB.commit()
            DB.close()

    if __name__ == "__main__":
        proxyhelper = spider.GetProxyIP(3)
        res_pool = proxyhelper.get_ip()
        proxy_ip = proxyhelper.right_proxies(res_pool)
        dbhelper = MysqlOper(proxy_ip)
        dbhelper.mysql_save()
    

    d. save_redis.py

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # _author:kaliarch

    import ast
    import redis
    import random
    import configparser
    import spider

    class RedisOper:

        def __init__(self):
            """
            initialize Redis connection information
            """
            config = configparser.ConfigParser()
            config.read('db.conf')
            self.host = config['redis']['HOST']
            self.port = int(config['redis']['PORT'])
            self.passwd = config['redis']['PASSWD']
            self.pool = redis.ConnectionPool(host=self.host, port=self.port, password=self.passwd)
            self.redis_helper = redis.Redis(connection_pool=self.pool)
            self.pipe = self.redis_helper.pipeline(transaction=True)

        def redis_save(self, result_list):
            """
            save data
            :return: None
            """
            # Queue all writes on the pipeline and send them in one round trip.
            # Redis values must be strings/bytes, so the proxies dict is stored via str().
            for num, cont in enumerate(result_list):
                self.pipe.set(num, str(cont))
            self.pipe.execute()

        def redis_gain(self):
            """
            gain data
            :return: proxies dict, or None if nothing is stored
            """
            # Pick a random key; assumes this database only holds pool entries.
            count = self.redis_helper.dbsize()
            if not count:
                return None
            ip = self.redis_helper.get(random.randint(0, count - 1))
            # Parse the stored str(dict) back into a proxies dict.
            return ast.literal_eval(ip.decode()) if ip else None


    if __name__ == '__main__':
        proxyhelper = spider.GetProxyIP(2)
        res_pool = proxyhelper.get_ip()
        proxy_ip = proxyhelper.right_proxies(res_pool)
        dbhelper = RedisOper()
        dbhelper.redis_save(proxy_ip)
        ip = dbhelper.redis_gain()
        print(ip)
    

    e. save_mongodb.py

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # _author:kaliarch

    import configparser
    import spider
    from pymongo import MongoClient


    class MongodbOper:

        def __init__(self):
            """
            initialize MongoDB connection information
            """
            config = configparser.ConfigParser()
            config.read('db.conf')
            self.host = config['mongodb']['HOST']
            self.port = int(config['mongodb']['PORT'])
            self.db = config['mongodb']['DB']
            self.user = config['mongodb']['USER']
            self.pwd = config['mongodb']['PASSWD']
            # Authenticate against the admin database while connecting.
            self.client = MongoClient(self.host, self.port, username=self.user,
                                      password=self.pwd, authSource='admin')
            self.DB = self.client[self.db]
            self.collection = self.DB.myset

        def mongodb_save(self, result_list):
            """
            save data
            :return: None
            """
            for values in result_list:
                # insert_one replaces the deprecated Collection.insert().
                self.collection.insert_one(values)

        def mongodb_gain(self):
            """
            gain data
            :return: proxies
            """
            ip = self.collection.find_one()
            return ip


    if __name__ == '__main__':
        proxyhelper = spider.GetProxyIP(2)
        res_pool = proxyhelper.get_ip()
        proxy_ip = proxyhelper.right_proxies(res_pool)
        dbhelper = MongodbOper()
        dbhelper.mongodb_save(proxy_ip)
        ip = dbhelper.mongodb_gain()
        print(ip)
    

    f. save_memcache.py

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # _author:kaliarch

    import memcache
    import random
    import configparser
    import spider

    class MemcacheOper:

        def __init__(self):
            """
            initialize Memcache connection information
            """
            config = configparser.ConfigParser()
            config.read('db.conf')
            self.host = config['memcache']['HOST']
            self.port = config['memcache']['PORT']
            self.mcoper = memcache.Client([self.host + ':' + self.port], debug=True)

        def memcache_save(self, result_list):
            """
            save data
            :return: None
            """
            for num, cont in enumerate(result_list):
                self.mcoper.set(str(num), cont)
            # Remember the pool size so memcache_gain can pick a valid key.
            self.mcoper.set('pool_size', len(result_list))

        def memcache_gain(self):
            """
            gain data
            :return: proxies, or None if the pool is empty
            """
            size = self.mcoper.get('pool_size')
            if not size:
                return None
            # Pick a random key within the range of stored entries.
            num = random.randint(0, size - 1)
            ip = self.mcoper.get(str(num))
            return ip


    if __name__ == '__main__':
        proxyhelper = spider.GetProxyIP(2)
        res_pool = proxyhelper.get_ip()
        proxy_ip = proxyhelper.right_proxies(res_pool)
        dbhelper = MemcacheOper()
        dbhelper.memcache_save(proxy_ip)
        ip = dbhelper.memcache_gain()
        print(ip)
    

    4. Results

    Running spider.py on its own prints the scraped and validated pool of usable proxies as a list of proxies dicts, e.g. [{'http': 'http://ip:port'}, ...] (screenshot omitted).

    Running the save scripts, you can then log into the corresponding database and inspect the stored records (screenshots of the MySQL, Redis, MongoDB, and Memcache results are omitted here).
    With that, we have used Python to build a private proxy pool of our own; whenever a crawl needs a proxy, one can simply be fetched from the database.
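
    To close the loop, here is a minimal sketch of that retrieval step, assuming the MySQL layout created by save_mysql.py above (a pytab table with protocol and content columns) and the credentials from db.conf:

    import random
    import pymysql
    import requests

    # Read all stored proxies from the private pool (credentials as in db.conf).
    db = pymysql.connect(host='172.20.6.100', user='root', password='mysqladmin',
                         database='pydb', charset='utf8')
    cursor = db.cursor()
    cursor.execute('select protocol, content from pytab')
    rows = cursor.fetchall()
    db.close()

    # Pick a random proxy and crawl through it.
    if rows:
        protocol, content = random.choice(rows)
        resp = requests.get('http://www.baidu.com', proxies={protocol: content}, timeout=5)
        print(resp.status_code)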
