1. How to crawl dynamic web pages
First, override the process_request method in the downloader middleware (middlewares.py):
from selenium import webdriver
from scrapy.http import HtmlResponse

def process_request(self, request, spider):
    path = r'example_path'  # path to the chromedriver executable
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    driver = webdriver.Chrome(executable_path=path, options=option)
    driver.get(request.url)
    # Scroll the page so lazily loaded content gets rendered.
    # You also need to add the following under DOWNLOADER_MIDDLEWARES in settings.py:
    # "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None
    js = "var q = document.documentElement.scrollTop = {}"
    for i in range(10):
        driver.execute_script(js.format(i * 200))
    url = driver.current_url
    body = driver.page_source
    res = HtmlResponse(url=url, body=body, encoding='utf-8', request=request)
    return res
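To actually activate this Selenium middleware it also has to be registered in settings.py. A minimal sketch, assuming the project is called daili (as in the proxy example below) and the method above lives in a class named SeleniumMiddleware (a hypothetical name):

# settings.py -- sketch only; 'SeleniumMiddleware' is an assumed class name
DOWNLOADER_MIDDLEWARES = {
    'daili.middlewares.SeleniumMiddleware': 543,
    # disable the default user-agent middleware, as noted in the comment above
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

It is also worth calling driver.quit() somewhere (for example in a spider_closed signal handler), otherwise headless Chrome processes pile up between crawls.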
2. Using IP proxies
First, define an IP pool in the settings file (settings.py):
IPPOOL = [
{ 'ip':''},
{ 'ip':''},
{ 'ip':''},
{ 'ip':''},
{ 'ip':''},
{ 'ip':''},
{ 'ip':''},
]
Then override process_request in the middleware (a full class sketch is given after the settings below):
import random
from daili.settings import IPPOOL  # import the IP pool defined in settings.py

# Redefine the ip attribute on the middleware class
def __init__(self, ip=''):
    self.ip = ip

# Override the process_request method
def process_request(self, request, spider):
    # Pick a random IP from the pool and set it as the proxy for this request
    ip = random.choice(IPPOOL)['ip']
    request.meta['proxy'] = 'https://' + ip
# Add the following configuration in settings.py
DOWNLOADER_MIDDLEWARES = {
# 'daili.middleware.ExampleDownloaderMiddleware': 543,
'daili.middlewares.ExampleDownloaderMiddleware':222,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware':333,
}
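Putting the pieces together, the proxy middleware might look like the sketch below. The class name ExampleDownloaderMiddleware matches the settings entry above; everything else just restates the snippets from this section:

import random
from daili.settings import IPPOOL

class ExampleDownloaderMiddleware(object):
    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        # Choose a random proxy from the pool for every outgoing request
        ip = random.choice(IPPOOL)['ip']
        request.meta['proxy'] = 'https://' + ip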