西刺代理 IP Pool

Author: 领悟悟悟 | Published 2017-12-26 18:59

The script below scrapes the free proxy list at xicidaili.com into a MongoDB collection, then checks each proxy concurrently and flags the working ones as available.
    import requests
    from fake_useragent import UserAgent
    from pymongo import MongoClient
    from bs4 import BeautifulSoup
    import lxml  # not used directly; installed so BeautifulSoup's 'lxml' parser is available
    import datetime
    from urllib.parse import urljoin
    import threadpool  # third-party package: pip install threadpool
    
    
    ua = UserAgent()
    
    try:
        # Python 3.x
        from urllib.parse import quote_plus
    except ImportError:
        # Python 2.x
        from urllib import quote_plus
    
    # MongoDB connection settings
    PORT = "27017"
    IP = "192.168.101.203"
    USERNAME = ""
    PASSWORD = ""
    
    class MyMongoOperator(object):
        def __init__(self, userName=USERNAME, password=PASSWORD, port=PORT, IP=IP):
            if userName:
                uri = "mongodb://%s:%s@%s:%s" % (
                    quote_plus(userName), quote_plus(password), IP, port)
            else:
                uri = "mongodb://%s:%s" % (IP, port)
            self.client = MongoClient(uri)
    
        # Select the database and collection
        def connect2DB_COLL(self, dbname, collection):
            # dict-style access instead of eval(), which was unsafe and unidiomatic
            self.db = self.client[dbname]
            self.collection = self.db[collection]
            return self
    
        def find(self, *args, **kwargs):
            return self.collection.find(*args, **kwargs)
    
        def findOne(self, query):
            return self.collection.find_one(query)
    
        # collection-level insert/update/remove were deprecated in pymongo 3
        # and removed in pymongo 4, so delegate to the modern equivalents
        def insert(self, args):
            return self.collection.insert_one(args)

        def update(self, *args):
            return self.collection.update_one(*args)

        def remove(self, args):
            return self.collection.delete_many(args)
    
        def closeConn(self):
            self.client.close()
    
    def getConn():
        obj = MyMongoOperator()
        obj.connect2DB_COLL('spider', 'IP_pool')
        return obj
    
    # Module-level MongoDB connection object
    mongo = getConn()
    
    class XCspider(object):
        def __init__(self):
            self.header = {
                'User-Agent':ua.random,
                'Referer':'http://www.xicidaili.com/'
            }
            self.today = datetime.datetime.today()
            self.dayStart = datetime.datetime(year=self.today.year, month=self.today.month, day=self.today.day)
    
        def getTime(self, timeStr):
            # timestamps on the site use a two-digit year, e.g. '17-12-26 18:59'
            time = datetime.datetime.strptime(timeStr, '%y-%m-%d %H:%M')
            return time
    
        def get(self, url):
            response = requests.get(url,headers=self.header)
            if response.status_code == 200:
                return response
            elif response.status_code == 404:
                raise requests.HTTPError('404 Not Found')
            elif response.status_code == 500:
                raise requests.HTTPError('500 Server Error')
            else:
                raise requests.HTTPError('Unexpected response status {}'.format(response.status_code))
    
        def parse(self, response):
            soup = BeautifulSoup(response.content, 'lxml')
            ip_list = soup.find_all('tr')
            if ip_list:
                ip_list = ip_list[1:]
            for ip in ip_list:
                info = ip.text.strip("'")
                ip_info = list(filter(lambda each: each, info.split('\n')))
                if len(ip_info) < 7:
                    # skip malformed rows (e.g. header or ad rows)
                    print(ip_info)
                    continue
                # keep only proxies posted today
                if self.getTime(ip_info[-1]) > self.dayStart:
                    # write the record to MongoDB; lowercase the scheme key,
                    # since requests matches proxies against the lowercase URL scheme
                    mongo.insert({
                        'proxy': {ip_info[4].lower(): "http://{}:{}".format(ip_info[0], ip_info[1])},
                        'area':ip_info[2],
                        'type':ip_info[3],
                        'time':ip_info[6],
                        'insertTime':datetime.datetime.now().strftime('%y-%m-%d %H:%M'),
                        'inspect':False,
                        'available':False,
                      }
                    )
                else:
                    # rows are newest-first, so stop at the first non-today entry
                    return False
            next_page = soup.find('a',attrs={'class':'next_page'})
            if next_page:
                next_url = urljoin('http://www.xicidaili.com/', next_page.attrs['href'])
                return next_url
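        # Example of a stored document (shape mirrors the insert() call above;
        # the address below is a made-up placeholder):
        #   {'proxy': {'http': 'http://1.2.3.4:8118'},
        #    'area': '...', 'type': '...', 'time': '...',
        #    'insertTime': '17-12-26 18:59', 'inspect': False, 'available': False}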
    
        def clear(self):
            '''
            Empty the IP pool
            :return:
            '''
            mongo.remove({})
    
    
        def verify(self):
            pool = threadpool.ThreadPool(10)
            ipList = list(mongo.find({'inspect': False}))
            # one work item per unchecked record (the variable is not named
            # `requests`, which would shadow the requests library)
            work_requests = threadpool.makeRequests(verify_ip, ipList)
            [pool.putRequest(req) for req in work_requests]
            pool.wait()
            print('IP check finished')
    
    def verify_ip(each):
        # try the proxy against a test page: always mark the record inspected,
        # and flag it available only when the request succeeds
        headers = {
            'User-Agent': ua.random,
            'Referer': 'http://www.baidu.com/'
        }
    
        try:
            response = requests.get('http://www.dianping.com/', proxies=each['proxy'], timeout=3, headers=headers)
        except Exception as e:
            print(e)
            mongo.update({'_id':each['_id']}, {'$set':{'inspect':True}})
        else:
            if response.status_code == 200:
                mongo.update({'_id': each['_id']}, {'$set': {'inspect': True, 'available':True}})
            else:
                mongo.update({'_id': each['_id']}, {'$set': {'inspect': True}})
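
    # A standard-library alternative to XCspider.verify, for environments where
    # the legacy `threadpool` package is not installed. This is a minimal sketch;
    # the name verify_with_futures is ours, not part of the original script.
    def verify_with_futures(max_workers=10):
        from concurrent.futures import ThreadPoolExecutor
        ipList = list(mongo.find({'inspect': False}))
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            # consume the iterator so all checks finish before we report
            list(pool.map(verify_ip, ipList))
        print('IP check finished')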
    
    def getProxy():
        xc = XCspider()
        xc.clear()
        response = xc.get(url='http://www.xicidaili.com/nn/')
        url = xc.parse(response)
        while url:
            print('\tCrawling: %s' % url)
            response = xc.get(url=url)
            url = xc.parse(response)
    
    def verify():
        xc = XCspider()
        xc.verify()
    
    def parse():
        '''
        Offline debugging helper: parses a locally saved copy of the page
        ('西刺代理.html') instead of fetching it over the network.
        '''
        with open('西刺代理.html', 'r', encoding='utf-8') as f:
            data = f.read()
        soup = BeautifulSoup(data, 'lxml')
        ip_list = soup.find_all('tr')
        if ip_list:
            ip_list = ip_list[1:]
        for ip in ip_list:
            info = ip.text.strip("'")
            ip_info = list(filter(lambda each: each, info.split('\n')))
            # write to the database here (same shape as in XCspider.parse)
    
    
    if __name__ == '__main__':

        # getProxy()
        # verify()

        # one-off manual check of a single proxy
        headers = {
            'User-Agent': ua.random,
            'Referer': 'http://www.baidu.com/'
        }
        # the proxies key must be the lowercase scheme ('http'), or requests ignores it
        response = requests.get('http://www.dianping.com/', proxies={"http": "http://112.114.98.66:8118"},
                                headers=headers)
        print(response.status_code)
        mongo.closeConn()
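
Once the pool has been filled and verified, a consumer can pull any record flagged available and pass its proxy field straight to requests. A minimal sketch against the collection layout above (the target URL is just a placeholder):

    import random

    conn = getConn()
    available = list(conn.find({'available': True}))
    if available:
        pick = random.choice(available)
        # the stored 'proxy' field is already in requests' proxies format
        resp = requests.get('http://example.com/', proxies=pick['proxy'], timeout=3)
        print(resp.status_code)
    conn.closeConn()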
    
