一、验证器从西刺爬取第一页,约100个代理,按照HTTP和HTTPS归类存储在属性里,多线程验证,将结果存储在验证完毕的数据集中。
二、base.ini 是验证网站集,可以自行更换或增删验证网站;国内网站一般有十几个代理验证成功,国外网站成功率基本为 0。
三、 完成版代码
# test_proxy.py
import requests
from lxml.etree import HTML
import re
import os
import pandas as pd
from fake_useragent import UserAgent
from threading import Thread
from configparser import ConfigParser
import logging
from datetime import date
from urllib.parse import urlparse
from pprint import pprint
# Module-level logger; ERROR by default so the per-proxy debug chatter is
# suppressed unless a caller lowers the level.
log = logging.getLogger(__name__)
log.setLevel(logging.ERROR)  # use the public API instead of assigning .level
class Verifier:
    """Scrape the xicidaili.com front page for proxies, split them into
    HTTP and HTTPS pools, and verify each proxy concurrently against the
    test URLs configured in base.ini.

    Results accumulate in ``verified_proxies_http`` / ``verified_proxies_https``
    as ``"ip:port"`` strings.
    """

    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"user-agent": self.ua.ie}
        self.conf = ConfigParser()
        self.conf.read("base.ini")
        self.test_url_http = self.conf["test_urls"]["test_url_http"]
        self.test_url_https = self.conf["test_urls"]["test_url_https"]
        self.df = self.gen_dataframe()
        # Column 4 of the scraped table holds the scheme (HTTP/HTTPS),
        # columns 0/1 hold ip and port — TODO confirm against the live page.
        self.df["proxy"] = self.df[4].str.lower() + "," + self.df[0] + ":" + self.df[1]
        self.proxies = [
            {scheme: addr}
            for scheme, addr in (entry.split(",") for entry in self.df["proxy"].tolist())
        ]
        # Exact-key membership ("http" in d) is False for {"https": ...}.
        self.proxies_http = [d for d in self.proxies if "http" in d]
        self.proxies_https = [d for d in self.proxies if "https" in d]
        self.verified_proxies_http = set()
        self.verified_proxies_https = set()

    def gen_dataframe(self):
        """Fetch the proxy listing page and return one DataFrame row per
        table row, with the <td> texts as integer-indexed columns."""
        r = requests.get("https://www.xicidaili.com/",
                         headers=self.headers, timeout=20)
        html = HTML(r.content)
        rows = []
        for tr in html.xpath("//tr"):
            # The original used `is not ""`, which relies on CPython string
            # interning; plain truthiness is the correct emptiness test.
            data = [cell.strip() for cell in tr.xpath("./td/text()") if cell.strip()]
            if data:
                rows.append(pd.Series(data))
        # DataFrame.append in a loop is quadratic and was removed in
        # pandas 2.0; build once from the collected rows instead.
        return pd.DataFrame(rows).reset_index(drop=True)

    def _verify(self, test_url, proxy=None):
        """Attempt one GET through ``proxy``; record ``ip:port`` in the
        matching verified set on success.  A proxy whose scheme does not
        match ``test_url``'s scheme (or a missing proxy) is a no-op.
        All request failures are swallowed and logged at DEBUG — a dead
        proxy is the expected case, not an error.
        """
        if not proxy:  # was a mutable default `{}`; None sentinel is safe
            return
        scheme = urlparse(test_url).scheme
        try:
            if scheme == "http" and "http" in proxy:
                log.debug("test...%s" % proxy)
                # Result intentionally discarded; storing it on self from
                # worker threads (as the original did) was a data race.
                requests.get(test_url, proxies=proxy,
                             headers=self.headers, timeout=20)
                self.verified_proxies_http.add(proxy["http"])
                log.debug("%s success!" % proxy)
            elif scheme == "https" and "https" in proxy:
                # verify=False: many free proxies man-in-the-middle TLS.
                requests.get(test_url, proxies=proxy, headers=self.headers,
                             timeout=20, verify=False)
                self.verified_proxies_https.add(proxy["https"])
                log.debug("%s success!" % proxy)
        except Exception as e:
            log.debug(e)

    def _run_pool(self, proxies, test_url, label):
        """Verify every proxy in ``proxies`` against ``test_url``, one
        thread each (requests releases the GIL while blocking on I/O)."""
        threads = [Thread(target=self._verify, args=(test_url, p)) for p in proxies]
        print("*正在测试%s" % label)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        print("*测试完毕\n测试网站:%s" % test_url)

    def verify_all(self):
        """Run the HTTP pool, then the HTTPS pool, then print a summary."""
        self._run_pool(self.proxies_http, self.test_url_http, "HTTP")
        self._run_pool(self.proxies_https, self.test_url_https, "HTTPS")
        print("本次测试HTTP代理{}个,有效 {} 个,HTTPS代理{}个,有效 {} 个".format(
            len(self.proxies_http), len(self.verified_proxies_http),
            len(self.proxies_https), len(self.verified_proxies_https)))
def _main():
    """Build a Verifier, run every check, and print the surviving proxies."""
    checker = Verifier()
    checker.verify_all()
    print(checker.verified_proxies_http)
    pprint(checker.verified_proxies_https)


if __name__ == "__main__":
    _main()
# base.ini
[test_urls]
test_url_basic = http://www.ip138.com/
url_proxies_source = https://www.xicidaili.com/nn/
test_url_http = http://www.ip138.com/
; test_url_https = https://www.tumblr.com/
; test_url_https = https://www.google.com/
; test_url_https = http://www.google.co.uk/
test_url_https = https://www.baidu.com/
四、验证结果
*正在测试HTTP
*测试完毕
测试网站:http://www.ip138.com/
*正在测试HTTPS
*测试完毕
测试网站:https://www.baidu.com/
本次测试HTTP代理39个,有效 2 个,HTTPS代理39个,有效 4 个
{'163.125.252.118:9797', '182.92.113.183:8118'}
{'111.177.177.158:9999',
'119.102.188.233:9999',
'163.125.70.21:8888',
'60.186.73.201:9999'}
网友评论