This post is a complete walkthrough of crawling the goubanjia proxy site.
It covers crawling proxy IPs, storing them in a database, using the proxies, deduplication, and validation.
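The crawl step itself is not listed in this post. As a rough sketch of its shape only, assuming a plain table layout (the URL and selectors below are hypothetical; in practice goubanjia obfuscates its listing, so real parsing needs more work), the results would be handed to mysql() from ip_to_mysql.py:
# -*- coding: utf-8 -*-
# Hypothetical sketch of the crawl step; the selectors are assumptions,
# not goubanjia's real (obfuscated) markup.
import requests
from bs4 import BeautifulSoup
from ip_to_mysql import mysql

def crawl_proxies(list_url='http://www.goubanjia.com/'):
    r = requests.get(list_url, timeout=10)
    soup = BeautifulSoup(r.text, 'lxml')
    ip_list = []
    for row in soup.select('table tr'):        # hypothetical selector
        cells = row.find_all('td')
        if len(cells) >= 2:
            ip = cells[0].get_text(strip=True)
            port = cells[1].get_text(strip=True)
            ip_list.append('http://%s:%s' % (ip, port))
    return ip_list

if __name__ == '__main__':
    mysql(crawl_proxies())  # mysql() already skips duplicates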
First, ip_request.py:
# -*- coding: utf-8 -*-
# BY WANGCC
from bs4 import BeautifulSoup
import requests
from ip_to_mysql import mysql_proxies, mysql_delete
url = 'http://ip.tool.chinaz.com/'
def str2header(headers_raw):
    """Convert a raw header string, as copied from a packet capture, into a dict."""
    if headers_raw is None:
        return None
    headers = headers_raw.splitlines()
    headers_tuples = [header.split(':', 1) for header in headers]
    result_dict = {}
    for header_item in headers_tuples:
        # Skip the request line and blank lines, which have no 'key: value' form
        if len(header_item) != 2:
            continue
        item_key = header_item[0].strip()
        item_value = header_item[1].strip()
        result_dict[item_key] = item_value
    return result_dict
# Raw header string captured from the browser's request
r_h = '''
GET / HTTP/1.1
Host: www.baidu.com
Connection: keep-alive
Cache-Control: max-age=0
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
Cookie: ***
'''
# The headers dict is ready; it can be passed straight to requests.get(url, headers=headers)
headers = str2header(r_h)
proxy_str = mysql_proxies()        # e.g. 'http://117.191.11.108:80'
scheme = proxy_str.split(':')[0]   # protocol part of the URL: 'http' or 'https'
proxies = {scheme: proxy_str}
r = requests.get(url, headers=headers, proxies=proxies)
soup = BeautifulSoup(r.text, 'lxml')
parent_node = soup.find(class_="IpMRig-tit")
shown_ips = [dd.get_text() for dd in parent_node.find_all('dd')]
# If our real IP (masked here) does not appear on the page, the request
# went out through the proxy; otherwise remove the dead proxy from the database.
if '58.87.119.xxxx' not in shown_ips:
    print('success')
else:
    mysql_delete(proxy_str)
This module validates a proxy IP by requesting ip.tool.chinaz.com through it, building the request headers from a raw header string captured in the browser.
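As a quick reference, this is what str2header produces and how requests consumes the headers and proxies dict (the values here are made up, and str2header is the function defined above):
# Assumes str2header as defined above in ip_request.py
raw = '''
GET / HTTP/1.1
Host: www.baidu.com
Connection: keep-alive
'''
print(str2header(raw))
# -> {'Host': 'www.baidu.com', 'Connection': 'keep-alive'}
# The request line 'GET / HTTP/1.1' is skipped: it has no ':'-separated value.

# requests expects the proxies dict to be keyed by scheme:
# requests.get('http://ip.tool.chinaz.com/', headers=str2header(raw),
#              proxies={'http': 'http://117.191.11.108:80'})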
ip_to_mysql.py
# -*- coding: utf-8 -*-
# BY WANGCC
import pymysql
import datetime
import logger

log = logger.Logger("debug")
DB_CONFIG = {
    "host": "127.0.0.1",
    "port": 3306,
    "user": "admin",
    "passwd": "*******",
    "db": "ip_Original",
    "charset": "utf8"
}
def mysql(ip_list):
    # Open the database connection
    db = pymysql.connect(
        host=DB_CONFIG["host"],
        port=DB_CONFIG["port"],
        user=DB_CONFIG["user"],
        passwd=DB_CONFIG["passwd"],
        db=DB_CONFIG["db"],
        charset=DB_CONFIG["charset"])
    # Get a cursor with the cursor() method
    cursor = db.cursor()
    date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    for ip in ip_list:
        # Parameterized queries avoid quoting bugs and SQL injection
        check_sql = "select count(*) from ip_original where ip = %s"
        insert_sql = "insert into ip_original(ip, date) values (%s, %s)"
        cursor.execute(check_sql, (ip,))
        number = cursor.fetchall()
        if number[0][0] == 0:
            try:
                # Execute the SQL statement
                cursor.execute(insert_sql, (ip, date))
                log.info(ip + ' insert to ip_original success!')
                # Commit the transaction
                db.commit()
            except Exception:
                log.info('executing sql --> ' + insert_sql + ' failed')
                # Roll back on error
                db.rollback()
        else:
            log.info(ip + ': already exists!')
    # Close the database connection
    db.close()
# Fetch one proxy from the pool
def mysql_proxies():
    # Open the database connection
    db = pymysql.connect(
        host=DB_CONFIG["host"],
        port=DB_CONFIG["port"],
        user=DB_CONFIG["user"],
        passwd=DB_CONFIG["passwd"],
        db=DB_CONFIG["db"],
        charset=DB_CONFIG["charset"])
    # Get a cursor with the cursor() method
    cursor = db.cursor()
    check_sql = "select * from ip_original"
    cursor.execute(check_sql)
    row = cursor.fetchmany(1)
    proxies = row[0][1]  # the ip column is the second field of the row
    # Close the database connection
    db.close()
    return proxies
# Delete one record
def mysql_delete(proxies):
    # Open the database connection
    db = pymysql.connect(
        host=DB_CONFIG["host"],
        port=DB_CONFIG["port"],
        user=DB_CONFIG["user"],
        passwd=DB_CONFIG["passwd"],
        db=DB_CONFIG["db"],
        charset=DB_CONFIG["charset"])
    # Get a cursor with the cursor() method
    cursor = db.cursor()
    delete_sql = "delete from ip_original where ip = %s"
    cursor.execute(delete_sql, (proxies,))
    # A DELETE returns no rows, so there is nothing to fetch;
    # commit so the deletion actually takes effect
    db.commit()
    log.info('delete data ' + proxies + ' success')
    # Close the database connection
    db.close()
if __name__=="__main__":
ip_list = ['http://117.191.11.108:80', 'http://134.209.15.143:8080', 'http://157.230.232.130:80',
'http://111.206.6.100:80', 'http://159.138.5.222:80', 'http://178.128.12.118:8080',
'http://83.142.126.147:80', 'http://150.109.55.190:83', 'http://165.227.62.167:8080',
'http://167.114.153.18:80', 'http://39.137.69.10:8080', 'http://111.206.6.101:80',
'http://165.227.29.189:8080', 'http://175.139.252.192:80', 'http://103.42.213.176:8080',
'http://211.23.149.29:80', 'http://211.23.149.28:80', 'http://47.94.57.119:80',
'http://175.139.252.194:80', 'http://47.94.217.37:80']
#mysql(ip_list)
mysql_proxies()
This module handles storing proxy IPs, fetching one to use, and deleting dead ones.
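All three functions above repeat the same connect/close boilerplate. One way to factor it out is a small context manager; this is only a sketch built on the existing DB_CONFIG, not part of the original code:
from contextlib import contextmanager
import pymysql

@contextmanager
def open_db(config=DB_CONFIG):
    # Open a connection, hand back (connection, cursor), and always close it
    db = pymysql.connect(host=config["host"], port=config["port"],
                         user=config["user"], passwd=config["passwd"],
                         db=config["db"], charset=config["charset"])
    try:
        yield db, db.cursor()
    finally:
        db.close()

# Usage, with a parameterized query:
# with open_db() as (db, cursor):
#     cursor.execute("select count(*) from ip_original where ip = %s", (ip,))
#     print(cursor.fetchone()[0])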
logger.py
# -*- coding: utf-8 -*-
# BY WANGCC
import logging
import os
import sys
import time
class Logger:
    def __init__(self, set_level="INFO",
                 name=os.path.split(os.path.splitext(sys.argv[0])[0])[-1],
                 log_name=time.strftime("%Y-%m-%d.log", time.localtime()),
                 log_path=os.path.join(os.path.dirname(os.path.abspath(__file__)), "log"),
                 use_console=True):
        """
        :param set_level: log level ["NOTSET"|"DEBUG"|"INFO"|"WARNING"|"ERROR"|"CRITICAL"], defaults to INFO
        :param name: the name printed in log records, defaults to the name of the running program
        :param log_name: the log file name, defaults to the current date (YYYY-MM-DD.log)
        :param log_path: the log directory, defaults to a "log" folder next to logger.py
        :param use_console: whether to also print to the console, defaults to True
        """
        if not set_level:
            set_level = self._exec_type()  # if set_level is None, derive it from the run mode
        self.__logger = logging.getLogger(name)
        # Set the log level; fall back to INFO for unknown level names
        self.setLevel(
            getattr(logging, set_level.upper()) if hasattr(logging, set_level.upper()) else logging.INFO)
        if not os.path.exists(log_path):  # create the log directory
            os.makedirs(log_path)
        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        handler_list = list()
        handler_list.append(logging.FileHandler(os.path.join(log_path, log_name)))
        if use_console:
            handler_list.append(logging.StreamHandler())
        for handler in handler_list:
            handler.setFormatter(formatter)
            self.addHandler(handler)

    def __getattr__(self, item):
        # Forward unknown attributes (info, error, ...) to the wrapped logger
        return getattr(self.logger, item)

    @property
    def logger(self):
        return self.__logger

    @logger.setter
    def logger(self, func):
        self.__logger = func

    def _exec_type(self):
        return "DEBUG" if os.environ.get("IPYTHONENABLE") else "INFO"
This is a thin wrapper around the logging module that makes it a bit more convenient to use.
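Typical usage, matching how ip_to_mysql.py instantiates it:
from logger import Logger

log = Logger("debug")             # level name is case-insensitive
log.info("proxy stored")          # forwarded to the wrapped logging.Logger via __getattr__
log.error("validation failed")    # written to ./log/<YYYY-MM-DD>.log and, by default, the console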
To sum up: there is still a lot of room to optimize the program as a whole; consider this version 1.0.