Crawlers: Maintaining a Proxy Pool, Part 1 (The Getter Module)


Author: 八戒无戒 | Published 2019-09-29 00:22

    This post documents the maintenance of a free proxy pool, which consists of four modules:
    Getter module: scrapes the latest free proxies published by the various free-proxy sites, fetches the pages and parses them
    Storage module: saves the proxies produced by the getter module into a Redis database (a minimal sketch of the interface the getter relies on follows this list)
    Checker module: tests whether each proxy in the Redis database is still usable and assigns it a weight accordingly
    Scheduler module: ties the getter, storage and checker modules together and wraps them up
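
    Since the getter code below calls self.db_client.add(proxy) on a ProxyRedisClient, here is a minimal, hypothetical sketch of the interface it assumes: a class with an add() method that writes a proxy into a Redis sorted set with an initial weight. The class name mirrors what the getter expects; the key name "proxies", the initial score, and the redis-py 3.x style zadd call are assumptions, and the real storage module is covered in the next part of this series.

    import redis

    INITIAL_SCORE = 10      # assumed starting weight for a newly crawled proxy
    REDIS_KEY = "proxies"   # assumed sorted-set key that holds the proxy pool

    class ProxyRedisClient(object):
        def __init__(self, host="localhost", port=6379, password=None):
            # decode_responses=True returns proxies as str instead of bytes
            self.db = redis.StrictRedis(host=host, port=port,
                                        password=password, decode_responses=True)

        def add(self, proxy, score=INITIAL_SCORE):
            """Insert a proxy with an initial score unless it is already in the pool."""
            if not self.db.zscore(REDIS_KEY, proxy):
                return self.db.zadd(REDIS_KEY, {proxy: score})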

    The main topics covered:

    • metaclasses
    • operating Redis from Python with the redis library
    • the requests library
    • pyquery for HTML parsing (a tiny example follows this list)
    • a simple use of the aiohttp asynchronous HTTP framework
    • multithreading and multiprocessing
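
    Every crawl_* function in the getter below follows the same pyquery pattern: select the table rows of the listing (skipping the header row with :gt(0) where the site has one), then read the first cell as the IP and the second cell as the port. A tiny self-contained illustration, using made-up HTML:

    from pyquery import PyQuery as pq

    html = """
    <div class="containerbox">
      <table>
        <tr><th>IP</th><th>PORT</th></tr>
        <tr><td>1.2.3.4</td><td>8080</td></tr>
        <tr><td>5.6.7.8</td><td>3128</td></tr>
      </table>
    </div>
    """
    p = pq(html)
    doc = p(".containerbox table tr:gt(0)")          # :gt(0) skips the header row
    for item in doc.items():
        proxy_ip = item("td:first-child").text()     # first cell  -> IP address
        proxy_port = item("td:nth-child(2)").text()  # second cell -> port
        print(":".join([proxy_ip, proxy_port]))      # 1.2.3.4:8080, 5.6.7.8:3128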

    The Getter Module

    # -*- coding: utf-8 -*-
    """
    __author__ = 'bingo'
    __date__ = '2019/9/7'
    # code is far away from bugs with the god animal protecting
        I love animals. They taste delicious.
                 ┏┓   ┏┓
                ┏┛┻━━━┛┻┓
                ┃     ☃ ┃
                ┃  ┳┛  ┗┳  ┃
                ┃      ┻   ┃
                ┗━┓      ┏━┛
                    ┃    ┗━━━┓
                    ┃  神兽保 ┣┓
                    ┃ 永无BUG┏┛
                    ┗ ┓┏━┳┓┏┛
                      ┃┫┫  ┃┫┫
                      ┗┻┛  ┗┻┛
    """
    import random
    import asyncio
    import requests
    import time
    import redis
    import aiohttp
    from pyquery import PyQuery as pq
    from redis import ResponseError
    from requests import RequestException
    from concurrent.futures import ThreadPoolExecutor
    from multiprocessing import Process
    from flask import Flask
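    # Note: several of these imports (random, time, asyncio, aiohttp, redis,
    # ResponseError, Process, Flask) are not used by the getter module itself;
    # they are presumably shared with the storage, checker and scheduler parts
    # of the full script, which later posts in this series cover.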
    
    # Getter module
    class ProxyMeta(type):
        def __new__(cls, name, bases, attrs):
            crawl_count = 0
            attrs["__CrawlFunc__"] = []
    
            # collect every crawl_* method of the getter class, stored as call strings
            for k, v in attrs.items():
                if k.startswith("crawl_"):
                    func = "self.{}()".format(k)
                    attrs["__CrawlFunc__"].append(func)
                    crawl_count += 1
    
            # record how many proxy-crawling functions were found
            attrs["__CrawlFuncCount__"] = crawl_count
            return type.__new__(cls, name, bases, attrs)
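    # After class creation, CrawlerGetter.__CrawlFunc__ holds call strings such as
    # "self.crawl_66daili()"; run() below eval()s each one to obtain the matching
    # proxy generator, and __CrawlFuncCount__ records how many crawlers were found.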
    
    
    class CrawlerGetter(object, metaclass=ProxyMeta):
    
        def __init__(self):
            headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'}
            self.headers = headers
            self.proxy_count = 0
            self.db_client = ProxyRedisClient()
    
        def get_page(self, url, encoding):
            try:
                res = requests.get(url, headers=self.headers, timeout=2.5)
                if res.status_code == 200:
                    res.encoding = encoding
                    html = res.text
                    return html
                else:
                    return None
            except RequestException:
                # covers connect timeouts, read timeouts and connection errors
                return None
    
        def crawl_66daili(self):
            """
            66ip proxy (www.66ip.cn)
            :return: generator yielding "ip:port" proxy strings
            """
            i = 0
            url = "http://www.66ip.cn/{page}.html"
            for page in range(1, 11):
                html = self.get_page(url.format(page=page), 'gbk')
                if html:
                    p = pq(html)
                    doc = p(".containerbox table tr:gt(0)")
                    for item in doc.items():
                        proxy_ip = item("td:first-child").text()
                        proxy_port = item("td:nth-child(2)").text()
                        if proxy_ip and proxy_port:
                            proxy = ":".join([proxy_ip, proxy_port])
                            i += 1
                            print("[66ip proxy %s]: %s" % (i, proxy))
                            self.proxy_count += 1
                            yield proxy
                else:
                    print("【66代理】获取代理失败page:%s" % page)
                    continue
    
        def crawl_iphai(self):
            """
            IPhai proxy (www.iphai.com)
            :return: generator yielding "ip:port" proxy strings
            """
            i = 0
            urls = ["http://www.iphai.com/free/ng", "http://www.iphai.com/free/wg"]
            for url in urls:
                html = self.get_page(url, 'utf8')
                if html:
                    p = pq(html)
                    doc = p(".table-responsive table tr:gt(0)")
                    for item in doc.items():
                        proxy_ip = item("td:first-child").text()
                        proxy_port = item("td:nth-child(2)").text()
                        if proxy_ip and proxy_port:
                            proxy = ":".join([proxy_ip, proxy_port])
                            i += 1
                            print("[IPhai proxy %s]: %s" % (i, proxy))
                            self.proxy_count += 1
                            yield proxy
                else:
                    print("【IP海代理】获取代理失败: %s" % url)
                    continue
    
        def crawl_qiyun(self):
            """
            Qiyun proxy (www.qydaili.com)
            :return: generator yielding "ip:port" proxy strings
            """
            i = 0
            url = "http://www.qydaili.com/free/?action=china&page={page}"
            for page in range(1, 11):
                html = self.get_page(url.format(page=page), "utf8")
                if html:
                    p = pq(html)
                    doc = p(".table tbody tr")
                    for item in doc.items():
                        proxy_ip = item("td:first-child").text()
                        proxy_port = item("td:nth-child(2)").text()
                        if proxy_ip and proxy_port:
                            proxy = ":".join([proxy_ip, proxy_port])
                            i += 1
                            print("[Qiyun proxy %s]: %s" % (i, proxy))
                            self.proxy_count += 1
                            yield proxy
                else:
                    print("【齐云代理】获取代理失败page:%s" % page)
                    continue
    
        def crawl_89daili(self):
            """
            89ip free proxy (www.89ip.cn)
            :return: generator yielding "ip:port" proxy strings
            """
            i = 0
            url = "http://www.89ip.cn/index_{page}.html"
            for page in range(1, 21):
                html = self.get_page(url.format(page=page), "utf8")
                if html:
                    p = pq(html)
                    doc = p(".layui-table tbody tr")
                    for item in doc.items():
                        proxy_ip = item("td:first-child").text()
                        proxy_port = item("td:nth-child(2)").text()
                        if proxy_ip and proxy_port:
                            proxy = ":".join([proxy_ip, proxy_port])
                            i += 1
                            print("[89ip proxy %s]: %s" % (i, proxy))
                            self.proxy_count += 1
                            yield proxy
                else:
                    print("【89免费代理】获取代理失败page:%s" % page)
                    continue
    
        def crawl_kuaidaili(self):
            """
            Kuaidaili (www.kuaidaili.com)
            :return: generator yielding "ip:port" proxy strings
            """
            i = 0
            url = "https://www.kuaidaili.com/free/inha/{page}/"
            for page in range(1, 11):
                html = self.get_page(url.format(page=page), "utf8")
                if html:
                    p = pq(html)
                    doc = p("table tbody tr")
                    for item in doc.items():
                        proxy_ip = item("td:first-child").text()
                        proxy_port = item("td:nth-child(2)").text()
                        if proxy_ip and proxy_port:
                            proxy = ":".join([proxy_ip, proxy_port])
                            i += 1
                            print("[Kuaidaili %s]: %s" % (i, proxy))
                            self.proxy_count += 1
                            yield proxy
                else:
                    print("【快代理】获取代理失败page:%s" % page)
                    continue
    
        def crawl_yundaili(self):
            """
            Yun proxy (www.ip3366.net)
            :return: generator yielding "ip:port" proxy strings
            """
            i = 0
            url = "http://www.ip3366.net/free/?stype=1&page={page}"
            for page in range(1, 8):
                html = self.get_page(url.format(page=page), "gb2312")
                if html:
                    p = pq(html)
                    doc = p("table tbody tr")
                    for item in doc.items():
                        proxy_ip = item("td:first-child").text()
                        proxy_port = item("td:nth-child(2)").text()
                        if proxy_ip and proxy_port:
                            proxy = ":".join([proxy_ip, proxy_port])
                            i += 1
                            print("[Yun proxy %s]: %s" % (i, proxy))
                            self.proxy_count += 1
                            yield proxy
                else:
                    print("【云代理】获取代理失败page:%s" % page)
                    continue
    
        def crawl_xicidaili(self):
            """
            Xici proxy (www.xicidaili.com)
            :return: generator yielding "ip:port" proxy strings
            """
            i = 0
            url = "https://www.xicidaili.com/nn/{page}"
            for page in range(1, 6):
                html = self.get_page(url.format(page=page), "utf8")
                if html:
                    p = pq(html)
                    doc = p(".proxies table tr:gt(0)")
                    for item in doc.items():
                        proxy_ip = item("td:nth-child(2)").text()
                        proxy_port = item("td:nth-child(3)").text()
                        if proxy_ip and proxy_port:
                            proxy = ":".join([proxy_ip, proxy_port])
                            i += 1
                            print("[Xici proxy %s]: %s" % (i, proxy))
                            self.proxy_count += 1
                            yield proxy
                else:
                    print("【西刺代理】获取代理失败page:%s" % page)
                    continue
    
        def run(self):
            """
            Obtain the proxy generator from each crawl_* method and store the
            proxies in the Redis database using a thread pool.
            :return:
            """
            crawl_funcs_list = []
            try:
                executor = ThreadPoolExecutor(max_workers=10)
                for crawl_func_name in self.__CrawlFunc__:
                    crawl_funcs_list.append(eval(crawl_func_name))
                for crawl_func in crawl_funcs_list:
                    executor.submit(self.to_redis_db, crawl_func)
                executor.shutdown()
            except Exception as e:
                print("ERROR:", e)
    
        def to_redis_db(self, generation):
            """
            Consume a generator of proxy addresses and store each proxy in the Redis proxy pool.
            :param generation: a generator yielding "ip:port" proxy strings
            :return:
            """
            for proxy in generation:
                self.db_client.add(proxy)
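
    # A minimal usage sketch, assuming a local Redis instance is running and a
    # ProxyRedisClient like the one sketched earlier is defined (the real storage
    # class belongs to the next part of this series):
    if __name__ == "__main__":
        crawler_getter = CrawlerGetter()
        # call strings collected by the ProxyMeta metaclass, e.g. "self.crawl_66daili()"
        print(crawler_getter.__CrawlFuncCount__, crawler_getter.__CrawlFunc__)
        # crawl every source concurrently and push the proxies into Redis
        crawler_getter.run()
        # rough total, incremented from the worker threads
        print("proxies fetched this run:", crawler_getter.proxy_count)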
    
    

Original link: https://www.haomeiwen.com/subject/ioexpctx.html