1.使用xpath提取数据
2.使用telnet验证代理是否可用
3.将可用数据追加写入csv文件,ip地址、端口各占一列
代码如下:
#!/usr/bin/python3.7.3
# coding:utf-8
# Author:cj
# contact: ingchang@qq.com
# Time:2019/8/26
# desc:爬取代理http://www.89ip.cn,验证可用,保存本地文件
import requests
from lxml import etree
import csv
import telnetlib
def pagesproxies(first_page=1, last_page=10):
    """Scrape a range of 89ip.cn listing pages and merge the results.

    Args:
        first_page: Number of the first listing page to fetch (inclusive).
        last_page: Number of the last listing page to fetch (inclusive).
            The defaults reproduce the original fixed range of pages 1-10.

    Returns:
        A dict combining whatever ``getproxies`` returns for each page.
        Because the dicts are merged with ``update``, a key seen on a later
        page overwrites the value stored for it by an earlier page.
    """
    pages = {}
    for page in range(first_page, last_page + 1):
        # The loop variable used to be called `list`, hiding the builtin.
        page_url = f'http://www.89ip.cn/index_{page}.html'
        pages.update(getproxies(page_url))
    return pages
def getproxies(url, timeout=10):
    """Download one listing page and extract the proxies it shows.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` waits for the server. Without a
            timeout a single unresponsive server would block the whole
            scrape indefinitely.

    Returns:
        A dict mapping the stripped text of the first ``<td>`` of each table
        row (the IP) to the stripped text of the second ``<td>`` (the port).
        Pairs where either side is empty after stripping are left out.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds ``timeout``.
    """
    r = requests.get(url=url, timeout=timeout)
    html = etree.HTML(r.text)
    # The cell text comes padded with whitespace/newlines, so strip it.
    ips = [text.strip() for text in html.xpath("//tr/td[1]/text()")]
    ports = [text.strip() for text in html.xpath("//tr/td[2]/text()")]
    # Pair the two columns positionally; a blank IP or port can never be a
    # usable proxy, so such pairs are skipped.
    return {ip: port for ip, port in zip(ips, ports) if ip and port}
def telnet(proxiesdata, timeout=3.0):
    """Return the subset of proxies that accept a TCP connection.

    Args:
        proxiesdata: Dict mapping host (IP string) to port.
        timeout: Seconds to wait for each connection attempt.

    Returns:
        A new dict holding only the host/port pairs that could be connected
        to. The input dict is not modified.

    A progress line is printed for every proxy, followed by the total number
    of reachable ones.
    """
    # Imported locally so this function does not depend on the file's import
    # block. A plain TCP connect is the only thing the check needs.
    import socket

    newdict = {}
    for host, port in proxiesdata.items():
        try:
            # The `with` block closes the probe socket right away; previously
            # every successful connection was left open until exit.
            with socket.create_connection((host, port), timeout=timeout):
                pass
        except Exception:
            # Deliberately broad: any failure (refused, timed out, bad
            # address) just means "not usable". Unlike a bare `except:`,
            # this still lets Ctrl-C / SystemExit stop the scan.
            print('%s:%s connect failure' % (host, port))
        else:
            print('%s:%s connect success' % (host, port))
            newdict[host] = port
    print("可用代理共%s个" % len(newdict))
    return newdict
def download(proxiescheckeddata, file_path=None):
    """Append proxies to a CSV file, one row per proxy: IP, then port.

    Args:
        proxiescheckeddata: Dict mapping IP to port; each item becomes one
            CSV row with the IP in the first column and the port in the
            second.
        file_path: Destination CSV file. When omitted, the original
            hard-coded desktop path is used.

    The file is opened in append mode, so earlier contents are kept. Write
    failures are reported on stdout rather than raised, so a failed save
    does not abort the script.
    """
    if file_path is None:
        # NOTE(review): a raw string with doubled backslashes produces doubled
        # separators in the actual path. Presumably Windows tolerates this;
        # kept unchanged so the default destination stays the same.
        file_path = r"C:\\Users\\cj\\Desktop\\newip.csv"
    try:
        # newline='' lets the csv module control line endings itself.
        with open(file_path, "a", newline='', encoding="utf-8") as f:
            csv.writer(f).writerows(proxiescheckeddata.items())
    except (OSError, csv.Error) as exc:
        # Include the cause; a bare "error!" gave no hint what went wrong.
        print("error! %s" % exc)
if __name__ == '__main__':
    # Script entry point: scrape the listing pages, keep only the proxies
    # that pass the connection check, then append those to the CSV file.
    download(telnet(pagesproxies()))
网友评论