美文网首页
IP代理池基于mongodb数据库

IP代理池基于mongodb数据库

作者: 天地清霜love橙 | 来源:发表于2017-07-22 09:37 被阅读48次

    代码使用 Python 2.7,抓取西刺(xici)免费代理,检测可用性后存入 MongoDB 数据库,为以后的爬虫做准备。下面直接上代码:

    ```

    #-*-encoding=utf-8-*-

    import time
    from multiprocessing import Pool

    import pymongo
    import requests
    from lxml import etree

    class Getproxy(object):
        """Maintain a pool of HTTP proxies scraped from xicidaili.com in MongoDB.

        Listing pages are fetched with ``requests`` and parsed with ``lxml``;
        each proxy is upserted into the ``xiciipinfo`` collection of the
        ``xici`` database on ``localhost:27017``. ``iptest`` issues a test
        request through a proxy and deletes it from the collection on failure.
        """

        def __init__(self):
            """Set the request headers and base URL and open the MongoDB client."""
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
            self.url = 'http://www.xicidaili.com/wt/'
            self.client = pymongo.MongoClient('localhost', 27017)
            self.xici = self.client['xici']
            self.xiciipinfo = self.xici['xiciipinfo']

        @staticmethod
        def _first_text(row, path):
            """Return the first node matched by XPath *path* under *row*, or ''."""
            found = row.xpath(path)
            return found[0] if found else ''

        def getip(self, num):
            """Scrape listing page *num* and upsert every proxy found on it.

            Args:
                num: Page number appended to the base listing URL.
            """
            url = self.url + str(num)
            wb_data = requests.get(url, headers=self.headers)
            html = etree.HTML(wb_data.text)
            # Fields are read row by row: extracting each column as a separate
            # flat list and zipping them misaligns every following record as
            # soon as one row lacks a cell (e.g. no area link).
            # NOTE(review): only rows with class="odd" are selected, exactly as
            # in the original query -- confirm the other rows are meant to be
            # skipped.
            for row in html.xpath('//tr[@class="odd"]'):
                ip = self._first_text(row, 'td[2]/text()')
                port = self._first_text(row, 'td[3]/text()')
                if not ip or not port:
                    # Without address and port the row cannot form a proxy URL.
                    continue
                data = {
                    'ip': ip,
                    'port': port,
                    'protocol': self._first_text(row, 'td[6]/text()'),
                    'area': self._first_text(row, 'td[4]/a/text()'),
                }
                print(data)
                # Upsert keyed on the IP so re-scraping refreshes the existing
                # document instead of inserting a duplicate.
                self.xiciipinfo.update_one({'ip': ip}, {'$set': data}, upsert=True)

        def count(self, num):
            """Scrape listing pages 1 through ``num - 1``, sleeping 2 s after each.

            Args:
                num: Exclusive upper bound of the page range; ``count(2)``
                    fetches page 1 only.
            """
            for page in range(1, num):
                self.getip(page)
                time.sleep(2)

        def dbclose(self):
            """Close the MongoDB client."""
            self.client.close()

        def getiplist(self):
            """Return every stored proxy as a ``requests``-style proxies dict.

            Returns:
                A list of ``{"http": "http://<ip>:<port>"}`` dicts, one per
                document in the collection.
            """
            return [
                {"http": "http" + "://" + doc['ip'] + ":" + doc['port']}
                for doc in self.xiciipinfo.find()
            ]

        def iptest(self, proxy):
            """Send a test request through *proxy*; delete the proxy if it fails.

            Args:
                proxy: A ``{"http": "http://<ip>:<port>"}`` dict as produced by
                    ``getiplist``.
            """
            # Drop the 7-character "http://" prefix, then the ":<port>" suffix.
            ip = proxy['http'][7:].split(':')[0]
            try:
                requests.get('http://wenshu.court.gov.cn/', proxies=proxy, timeout=6)
            except Exception:
                # Any failure marks the proxy as unusable. Catching Exception
                # rather than using a bare except lets Ctrl-C and SystemExit
                # still stop the run.
                print('field...............>>>>>>>>>>>>>>>>>>>>>>>>')
                self.xiciipinfo.delete_many({'ip': ip})
                print('remove it now.....{}'.format(ip))
            else:
                print('<<<<<<<<<<<<<<<<<.............success')
                print(proxy)

    if __name__ == '__main__':
        # Scrape the first listing page, then test every stored proxy.
        # No multiprocessing Pool is created here: the original built one but
        # never dispatched work to it, which only spawned idle worker processes.
        proxy = Getproxy()
        try:
            proxy.count(2)
            # Explicit loop instead of map(): the calls are made for their side
            # effects, and map() is lazy on Python 3, so nothing would run.
            for candidate in proxy.getiplist():
                proxy.iptest(candidate)
        finally:
            # Close the MongoDB client even if scraping or checking raises.
            proxy.dbclose()

    ```

    相关文章

      网友评论

          本文标题:IP代理池基于mongodb数据库

          本文链接:https://www.haomeiwen.com/subject/vonykxtx.html