Create a Scrapy project:
scrapy startproject <project-name>
scrapy genspider <spider-name> www.baidu.com   (the target site's domain)
Note: the example site is https://www.ip.cn
scrapy startproject An          # project name: An
cd An                           # enter the An directory
scrapy genspider app1 hee       # the domain can be anything here; we edit it inside the spider later
Full project layout:
Note: run.py is the launcher script, added by hand.
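The commands above produce the standard Scrapy layout shown below; run.py is assumed to sit at the project root next to scrapy.cfg:
An/
├── run.py                  # launcher script, added manually
├── scrapy.cfg
└── An/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── app1.py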
Contents of run.py:
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'app1'])    # same as running "scrapy crawl app1" in a terminal
app1.py:
import scrapy


class App1Spider(scrapy.Spider):
    name = 'app1'
    # allowed_domains = ['hee']       # not needed for now, commented out
    # start_urls = ['https://ip.cn']

    def start_requests(self):
        # issue the same request many times so each one can go out through a different proxy
        for i in range(1, 1000):
            url = 'https://ip.cn'
            yield scrapy.Request(url, self.show_ip, dont_filter=True)

    def show_ip(self, response):
        # ip.cn echoes back the IP the request came from
        ip = response.xpath('//*[@id="result"]/div/p[1]/code/text()').extract()
        print(ip)
Enable the downloader middleware in settings.py (screenshot: downloader middleware). Proxy IP pool reference: https://www.jianshu.com/writer#/notebooks/46730504/notes/73115192
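Enabling it means uncommenting (or adding) the DOWNLOADER_MIDDLEWARES setting in settings.py; the entry below is the one scrapy startproject An generates by default:
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'An.middlewares.AnDownloaderMiddleware': 543,
}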
Modify middlewares.py:
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class AnDownloaderMiddleware:    # class generated by scrapy; must match the entry in DOWNLOADER_MIDDLEWARES

    def process_request(self, request, spider):
        ip = 'https://' + r.rpop('ip_list').decode('utf8')   # pop a proxy from the right end of the list
        request.meta['proxy'] = ip
        request.meta['timeout'] = 5
        return None

    def process_response(self, request, response, spider):
        if response.status == 200:                      # check the status code
            ip = request.meta['proxy'].split('//')[-1]  # the proxy worked, so keep it
            r.lpush('ip_list', ip)                      # push it back in from the left
            print(response.status)
            return response
        else:
            return request                              # send the request back to the scheduler queue

    def process_exception(self, request, exception, spider):
        return request                                  # send the request back to the scheduler queue
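The middleware assumes the Redis list ip_list already holds usable proxies in host:port form (building and refreshing that list is what the proxy pool reference above covers). A minimal sketch for seeding the list by hand; the two addresses are placeholders, not real proxies:
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)

# push a few proxy addresses (host:port) into the list the middleware reads from;
# replace the placeholder addresses with proxies you actually control
for proxy in ['1.2.3.4:8080', '5.6.7.8:3128']:
    r.lpush('ip_list', proxy)

print(r.llen('ip_list'))    # number of proxies currently available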