spiders
import scrapy
from selenium import webdriver
from scrapy import signals
from pydispatch import dispatcher
from scrapy.http import HtmlResponse
class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['oschina.net']
    start_urls = ['https://www.oschina.net/blog']

    def __init__(self):
        self.browser = webdriver.Chrome()
        super(ZhihuSpider, self).__init__()
        # Hook up the signal: when the spider shuts down, Scrapy emits
        # spider_closed; on that signal, call closeSpider to close the browser.
        dispatcher.connect(self.closeSpider, signals.spider_closed)

    def closeSpider(self, spider):
        print("spider closed")
        # Quit the browser when the spider exits
        self.browser.quit()

    def parse(self, response):
        # data = response.css(".SignFlow-accountInpu input[]").extract()
        pass
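The dispatcher.connect call above relies on the external pydispatch package, which is the older way to hook Scrapy signals. Recent Scrapy versions expose the same mechanism through crawler.signals, usually wired up in from_crawler. A minimal sketch of the equivalent spider, assuming a current Scrapy release:

# Sketch: same browser lifecycle, but connected through Scrapy's own
# signal API (crawler.signals) instead of pydispatch.
import scrapy
from scrapy import signals
from selenium import webdriver


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['oschina.net']
    start_urls = ['https://www.oschina.net/blog']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        spider.browser = webdriver.Chrome()
        # Same effect as dispatcher.connect above, without pydispatch
        crawler.signals.connect(spider.closeSpider, signals.spider_closed)
        return spider

    def closeSpider(self, spider):
        # Quit the browser when the spider exits
        spider.browser.quit()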
middlewares.py
import time

from scrapy.http import HtmlResponse


class JSPageMiddleware(object):
    def process_request(self, request, spider):
        if spider.name == "zhihu":
            spider.browser.get(request.url)
            time.sleep(3)
            print(f"Visiting: {request.url}")
            # The request has already been fulfilled by the browser; returning an
            # HtmlResponse short-circuits the downloader and hands the rendered
            # page straight back to the spider.
            return HtmlResponse(url=spider.browser.current_url,
                                body=spider.browser.page_source,
                                encoding="utf-8", request=request)
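The fixed time.sleep(3) is a blunt instrument: it wastes three seconds on fast pages and may still be too short on slow ones. If some element only appears once the JavaScript has rendered, an explicit wait is usually more reliable. A sketch of the same middleware using Selenium's WebDriverWait; the selector .blog-item is a hypothetical placeholder for whatever marks the rendered content:

# Sketch: wait for the rendered content instead of sleeping a fixed 3 s.
# ".blog-item" is a hypothetical placeholder selector, not taken from the site.
from scrapy.http import HtmlResponse
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class JSPageMiddleware(object):
    def process_request(self, request, spider):
        if spider.name == "zhihu":
            spider.browser.get(request.url)
            # Block until the target element exists, up to 10 seconds
            WebDriverWait(spider.browser, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".blog-item"))
            )
            return HtmlResponse(url=spider.browser.current_url,
                                body=spider.browser.page_source,
                                encoding="utf-8", request=request)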
settings.py
DOWNLOADER_MIDDLEWARES = {
    # 'outlook.middlewares.MyCustomDownloaderMiddleware': 543,
    'outlook.middlewares.JSPageMiddleware': 1,
}
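Giving JSPageMiddleware the priority value 1 places it ahead of all of Scrapy's built-in downloader middlewares, so its process_request runs first and matching requests are answered by the browser before any default download handling can occur.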