美文网首页
python 代理ip爬取,ip代理,数据库存储,去重,验证。

python 代理ip爬取,ip代理,数据库存储,去重,验证。

作者: wangcc_sd | 来源:发表于2019-04-17 23:20 被阅读0次

    本文是对goubanjia的网站的一个综合性爬取。

    实现代理ip爬取,存入数据库,使用代理,去重,验证。

    首先是ip_request.py

    # -*-coding:utf-8 -*-
    # BY WANGCC
    from bs4 import BeautifulSoup
    import requests
    
    from bs4 import BeautifulSoup
    import requests
    from ip_to_mysql import mysql_proxies,mysql_delete
    url = 'http://ip.tool.chinaz.com/'
    
    
    def str2header(headers_raw):
        """Parse a raw HTTP header blob (one "Name: value" per line) into a dict.

        Lines without a ':' separator (e.g. the request line "GET / HTTP/1.1")
        are skipped. Returns None when headers_raw is None.
        """
        if headers_raw is None:
            return None
        parsed = {}
        for line in headers_raw.splitlines():
            # partition splits on the first ':' only, so header values that
            # themselves contain colons stay intact.
            name, sep, value = line.partition(':')
            if sep:
                parsed[name.strip()] = value.strip()
        return parsed
    
    
    # Raw header string captured from a browser (devtools / packet capture).
    r_h = ''' 
    
    GET / HTTP/1.1
    Host: www.baidu.com
    Connection: keep-alive
    Cache-Control: max-age=0
    Upgrade-Insecure-Requests: 1
    User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) 
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
    Accept-Encoding: gzip, deflate, br
    Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
    Cookie: *** 
    
    '''
    # headers dict ready to be passed as requests.get(url, headers=headers)
    headers = str2header(r_h)
    
    # Take one proxy URL (e.g. "http://1.2.3.4:80") from the database and build
    # the requests proxies mapping {scheme: url}.
    proxy_url = mysql_proxies()
    scheme = proxy_url.split(":")[0]
    proxies = {scheme: proxy_url}
    
    # Bug fix: headers were built but never sent; pass them along with the
    # proxy. A timeout keeps a dead proxy from hanging the script forever.
    r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    soup = BeautifulSoup(r.text, 'lxml')
    parent_node = soup.find(class_="IpMRig-tit")
    # Bug fix: the original tested the string against a list of bs4 Tag
    # objects, which is never a member - compare the tags' text instead.
    # '58.87.119.xxxx' is the (masked) real IP of this machine.
    shown_ips = [dd.get_text() for dd in parent_node.find_all('dd')]
    if '58.87.119.xxxx' not in shown_ips:
        # The IP-lookup page does not show our real IP -> the proxy works.
        print('success')
    else:
        # Bug fix: mysql_delete expects the proxy URL string, not the dict.
        mysql_delete(proxy_url)
    
    

    这个模块是对代理ip的一个验证:把抓包得到的header字符串解析成请求头,再通过代理访问IP查询页,检查返回的是否还是本机IP,失效的代理会从数据库中删除。

    ip_to_mysql.py

    # -*-coding:utf-8 -*-
    # BY WANGCC
    
    import pymysql,datetime
    import logger
    
    # Module-wide logger; the level name is upper-cased inside Logger.
    log = logger.Logger("debug")
    
    # MySQL connection settings for the proxy store (database ip_Original,
    # table ip_original). passwd is masked in the article - fill in the
    # real credential before running.
    DB_CONFIG = {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "admin",
        "passwd": "*******",
        "db": "ip_Original",
        "charset": "utf8"
    }
    
    
    def mysql(ip_list):
        """Insert each proxy URL in ip_list into the ip_original table.

        Proxies whose ip value is already stored are skipped (deduplication).
        Each successful insert is committed individually; a failed insert is
        rolled back and logged without aborting the rest of the list.

        :param ip_list: iterable of proxy URL strings like "http://1.2.3.4:80"
        """
        db = pymysql.connect(
            host=DB_CONFIG["host"],
            port=DB_CONFIG["port"],
            user=DB_CONFIG["user"],
            passwd=DB_CONFIG["passwd"],
            db=DB_CONFIG["db"],
            charset=DB_CONFIG["charset"])
        try:
            cursor = db.cursor()
            date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            for ip in ip_list:
                # Parameterized queries instead of %-string formatting:
                # avoids SQL injection and quoting bugs in the ip value.
                cursor.execute("select count(*) from ip_original where ip=%s", (ip,))
                if cursor.fetchone()[0] == 0:
                    try:
                        cursor.execute(
                            "insert into ip_original(ip,date) values (%s,%s)",
                            (ip, date))
                        # commit each row so one bad row doesn't lose the rest
                        db.commit()
                        log.info(ip + 'insert to ip_original success!')
                    except Exception:
                        log.info('insert ' + ip + ' fail')
                        # roll back the failed statement and continue
                        db.rollback()
                else:
                    log.info(ip + ': is existence !!')
        finally:
            # Bug fix: close the connection even if an unexpected error
            # escapes the loop (the original leaked it in that case).
            db.close()
    
    # Fetch one proxy from the pool.
    def mysql_proxies():
        """Return the first proxy URL stored in ip_original.

        The row layout is presumably (id, ip, date) - column 1 holds the
        proxy URL the insert wrote. NOTE(review): when the table is empty
        fetchone() returns None and this raises TypeError (the original
        raised IndexError); callers should seed the table first.
        """
        db = pymysql.connect(
            host=DB_CONFIG["host"],
            port=DB_CONFIG["port"],
            user=DB_CONFIG["user"],
            passwd=DB_CONFIG["passwd"],
            db=DB_CONFIG["db"],
            charset=DB_CONFIG["charset"])
        try:
            cursor = db.cursor()
            cursor.execute("select * from ip_original")
            row = cursor.fetchone()
            return row[1]
        finally:
            # Bug fix: close the connection even if the query fails
            # (the original leaked it on any exception).
            db.close()
    
    
    # Delete one row (a dead proxy).
    def mysql_delete(proxies):
        """Delete the ip_original row whose ip equals *proxies*.

        :param proxies: the proxy URL string, e.g. "http://1.2.3.4:80"
        :return: the same proxy URL, for convenience
        """
        db = pymysql.connect(
            host=DB_CONFIG["host"],
            port=DB_CONFIG["port"],
            user=DB_CONFIG["user"],
            passwd=DB_CONFIG["passwd"],
            db=DB_CONFIG["db"],
            charset=DB_CONFIG["charset"])
        try:
            cursor = db.cursor()
            # Parameterized delete instead of %-string formatting
            # (SQL injection / quoting safety).
            cursor.execute("delete from ip_original where ip = %s", (proxies,))
            # Bug fix: the original never committed, so the delete was lost
            # when the connection closed and the dead proxy kept coming back.
            db.commit()
            # Bug fix: the original tried fetchmany() on a DELETE (which
            # returns no rows) and indexed into it, raising IndexError; it
            # also logged "success" before executing anything.
            log.info('delete data ' + str(proxies) + ' success')
            return proxies
        finally:
            # close the connection even if the delete fails
            db.close()
    
    if __name__ == "__main__":
        # Seed list of proxy URLs (scheme://host:port) for the ip_original table.
        ip_list = [
            'http://117.191.11.108:80',
            'http://134.209.15.143:8080',
            'http://157.230.232.130:80',
            'http://111.206.6.100:80',
            'http://159.138.5.222:80',
            'http://178.128.12.118:8080',
            'http://83.142.126.147:80',
            'http://150.109.55.190:83',
            'http://165.227.62.167:8080',
            'http://167.114.153.18:80',
            'http://39.137.69.10:8080',
            'http://111.206.6.101:80',
            'http://165.227.29.189:8080',
            'http://175.139.252.192:80',
            'http://103.42.213.176:8080',
            'http://211.23.149.29:80',
            'http://211.23.149.28:80',
            'http://47.94.57.119:80',
            'http://175.139.252.194:80',
            'http://47.94.217.37:80',
        ]
        # mysql(ip_list)  # uncomment to load the seed list into the database
        mysql_proxies()
    

    这个模块实现对ip的存储(带去重)、提取和删除操作。

    logger.py

    # -*-coding:utf-8 -*-
    # BY WANGCC
    import logging
    import os
    import sys
    import time
    
    
    class Logger:
        """Wrapper around logging.Logger that wires up a file handler and an
        optional console handler in one call.

        Any attribute not defined here (info, debug, setLevel, ...) is proxied
        to the wrapped logging.Logger via __getattr__.
        """

        def __init__(self, set_level="INFO",
                     name=os.path.split(os.path.splitext(sys.argv[0])[0])[-1],
                     log_name=time.strftime("%Y-%m-%d.log", time.localtime()),
                     log_path=os.path.join(os.path.dirname(os.path.abspath(__file__)), "log"),
                     use_console=True):
            """
            :param set_level: level name ["NOTSET"|"DEBUG"|"INFO"|"WARNING"|"ERROR"|"CRITICAL"]; unknown names fall back to INFO
            :param name: logger name shown in records; defaults to the running script's basename
            :param log_name: log file name; defaults to today's date (YYYY-MM-DD.log)
            :param log_path: log directory; defaults to a "log" folder next to this file

            :param use_console: also print records to the console when True

            NOTE: the name/log_name/log_path defaults are evaluated once at
            import time, so a long-running process keeps the start-up date.
            """
            if not set_level:
                set_level = self._exec_type()  # set_level=None -> derive level from run mode
            self.__logger = logging.getLogger(name)
            # 3-arg getattr replaces the hasattr/getattr pair: unknown level
            # names fall back to INFO instead of raising.
            self.setLevel(getattr(logging, set_level.upper(), logging.INFO))
            os.makedirs(log_path, exist_ok=True)  # create the log directory if missing
            # Bug fix: constructing several Logger instances with the same name
            # used to stack duplicate handlers on the shared logging.Logger, so
            # every message was emitted multiple times. Attach handlers once.
            if not self.__logger.handlers:
                formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
                handler_list = [logging.FileHandler(os.path.join(log_path, log_name))]
                if use_console:
                    handler_list.append(logging.StreamHandler())
                for handler in handler_list:
                    handler.setFormatter(formatter)
                    self.addHandler(handler)

        def __getattr__(self, item):
            # Delegate unknown attributes to the wrapped logging.Logger.
            return getattr(self.logger, item)

        @property
        def logger(self):
            return self.__logger

        @logger.setter
        def logger(self, func):
            self.__logger = func

        def _exec_type(self):
            # DEBUG when the IPYTHONENABLE env var is set (e.g. an IDE's
            # IPython console), INFO otherwise.
            return "DEBUG" if os.environ.get("IPYTHONENABLE") else "INFO"
    
    
    

    这个是对日志的一个封装,这样用起来方便些。

    总结,目前整体程序可优化的空间很大,这算是1.0版本

    相关文章

      网友评论

          本文标题:python 代理ip爬取,ip代理,数据库存储,去重,验证。

          本文链接:https://www.haomeiwen.com/subject/ikkawqtx.html