The example code in this article is based on scrapy-tutorial
Env
Here an OS environment variable is used to separate development and production configuration.
rm tutorial/settings.py
mkdir tutorial/settings
vim tutorial/settings/__init__.py
# -*- coding: utf-8 -*-
import os
from .base import *
if os.getenv('SCRAPY_PROD'):
    from .production import *
else:
    from .development import *
vim tutorial/settings/base.py
# -*- coding: utf-8 -*-
BOT_NAME = 'tutorial'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
ROBOTSTXT_OBEY = True
# Export
FEED_EXPORT_ENCODING = 'utf-8'
# Pipelines
ITEM_PIPELINES = {
    'tutorial.pipelines.MongoPipeline': 300,
}
vim tutorial/settings/development.py
# -*- coding: utf-8 -*-
# Mongo
MONGO_URI = 'localhost:27017'
MONGO_DB = 'tutorial-dev'
vim tutorial/settings/production.py
# -*- coding: utf-8 -*-
# Mongo
MONGO_URI = 'localhost:27017'
MONGO_DB = 'tutorial-prod'
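The MONGO_URI and MONGO_DB settings above are consumed by the MongoPipeline registered in ITEM_PIPELINES. The pipeline itself is not shown in this article; below is a minimal sketch of what such a pipeline typically looks like, assuming pymongo is installed (the hooks follow the standard Scrapy pipeline interface; the collection name is illustrative):
vim tutorial/pipelines.py
# -*- coding: utf-8 -*-
import pymongo

class MongoPipeline(object):
    collection_name = 'posts'  # illustrative collection name

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings defined in settings/*.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(dict(item))
        return item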
- Crawl
scrapy crawl zhgcloud
export SCRAPY_PROD=1
# unset SCRAPY_PROD
env | grep SCRAPY_PROD
# SCRAPY_PROD=1
scrapy crawl zhgcloud
Note: tutorial/settings/production.py must not be checked into version control.
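Assuming the project is tracked with git, one way to enforce this:
vim .gitignore
tutorial/settings/production.py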
Paging
vim tutorial/spiders/zhgcloud.py
# Unmodified code omitted
class ZhgcloudSpider(scrapy.Spider):
    name = 'zhgcloud'
    allowed_domains = ['zhgcloud.com']
    start_urls = ['https://www.zhgcloud.com/source/']

    def parse(self, response):
        # Read the last page number from the pager, then request every page.
        # response.text is str; Selector(text=...) rejects bytes (response.body).
        body = scrapy.Selector(text=response.text)
        total_page = body.xpath(
            '//ul[@class="el-pager"]/li[last()]/text()').extract()[0]
        for page in range(1, int(total_page) + 1):
            yield scrapy.Request(url=(self.start_urls[0] + '?page=' + str(page)),
                                 callback=self.parse_page)

    def parse_page(self, response):
        body = scrapy.Selector(text=response.text)
        posts = body.xpath('//div[@class="el-card post"]')
        # Unmodified code omitted
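To see what the last-page XPath selects, here is a hedged standalone check against a simplified pager fragment (the HTML is illustrative, not the site's actual markup):
from scrapy import Selector

html = '<ul class="el-pager"><li>1</li><li>2</li><li>12</li></ul>'
# li[last()] picks the final <li>, whose text is the total page count.
print(Selector(text=html).xpath(
    '//ul[@class="el-pager"]/li[last()]/text()').extract()[0])
# 12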
Proxy
Here the Abuyun (阿布云) tunnel proxy is used.
vim tutorial/middlewares.py
# -*- coding: utf-8 -*-
import base64
class ProxyMiddleware(object):
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://http-pro.abuyun.com:9010'
        # b64encode takes and returns bytes, so encode the credentials first.
        credentials = 'H2658392JGC7V70P' + ':' + '0717F6EEBD56A544'
        request.headers['Proxy-Authorization'] = \
            b'Basic ' + base64.b64encode(credentials.encode('ascii'))
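The header value is standard HTTP basic auth: 'Basic ' plus the base64 of 'user:password'. A quick hedged check with throwaway credentials:
import base64

# In Python 3, b64encode takes and returns bytes, hence encode()/decode().
token = base64.b64encode(b'user:pass').decode('ascii')
print('Basic ' + token)
# Basic dXNlcjpwYXNz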
vim tutorial/settings/base.py
# -*- coding: utf-8 -*-
BOT_NAME = 'tutorial'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
ROBOTSTXT_OBEY = True
# Export
FEED_EXPORT_ENCODING = 'utf-8'
# Middlewares
DOWNLOADER_MIDDLEWARES = {
    'tutorial.middlewares.ProxyMiddleware': 25,
}
# Pipelines
ITEM_PIPELINES = {
    'tutorial.pipelines.MongoPipeline': 300,
}
- Crawl
scrapy crawl zhgcloud
For more proxy service providers, see 说说代理IP哪家好?
Selenium
Scrapy fetches the pages returned by plain HTTP requests, but it cannot scrape pages rendered dynamically by JavaScript. Selenium can be brought in to handle such pages. For a longer introduction to Selenium, see 基于Selenium的Web自动化测试.
vim tutorial/middlewares.py
# -*- coding: utf-8 -*-
import base64
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from scrapy.http import HtmlResponse
# Unmodified code omitted
class SeleniumMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls()

    def __init__(self):
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        self.browser = webdriver.Chrome(chrome_options=chrome_options)

    def __del__(self):
        # quit() shuts down the browser and the chromedriver process;
        # close() would only close the current window.
        self.browser.quit()

    def process_request(self, request, spider):
        try:
            # Render the page in headless Chrome, then hand the rendered
            # HTML back to Scrapy as a normal response.
            self.browser.get(request.url)
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)
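Note that page_source is captured as soon as get() returns, which can be before the JavaScript has finished rendering. A hedged refinement is to add an explicit wait for a known element (the XPath mirrors the pager used earlier and is only an example):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get('https://www.zhgcloud.com/source/')
# Block (up to 10s) until the pager exists, i.e. the list has rendered.
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.XPATH, '//ul[@class="el-pager"]')))
html = browser.page_source
browser.quit()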
vim tutorial/settings/base.py
# 省略了未修改的代码
# Middlewares
DOWNLOADER_MIDDLEWARES = {
# 'tutorial.middlewares.ProxyMiddleware': 25,
'tutorial.middlewares.SeleniumMiddleware': 30,
}
# Pipelines
ITEM_PIPELINES = {
'tutorial.pipelines.MongoPipeline': 300,
}
- 抓取
scrapy crawl zhgcloud
Deploy
- Server
pip install scrapyd
scrapyd --version
# twistd (the Twisted daemon) 18.9.0
scrapyd
# http://localhost:6800/
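With the daemon up, its JSON API can be queried directly (hedged example; daemonstatus.json is part of the scrapyd API, and the response shown is the documented shape rather than captured output):
curl http://localhost:6800/daemonstatus.json
# {"status": "ok", "running": 0, "pending": 0, "finished": 0, ...}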
- Client
pip install scrapyd-client
vim scrapy.cfg
# Unmodified code omitted
[deploy]
url = http://localhost:6800/
project = tutorial
scrapyd-deploy
Packing version 1542768294
Deploying to project "tutorial" in http://localhost:6800/addversion.json
Server response (200):
{"status": "ok", "project": "tutorial", "version": "1542768294", "spiders": 1, "node_name": "Kevins-iMac.local"}
curl http://localhost:6800/schedule.json -d project=tutorial -d spider=zhgcloud
{"status": "ok", "jobid": "dea392b8ed3711e8a374787b8aaaf9c0", "node_name": "Kevins-iMac.local"}
- Keeper
pip install spiderkeeper
spiderkeeper --server=http://localhost:6800
# http://localhost:5000/
For more SpiderKeeper options and usage, see SpiderKeeper.
Manage -> Create Project -> tutorial
-> scrapyd-deploy --build-egg output.egg -> submit output.egg
-> Periodic jobs -> Add Job
[Screenshot: scrapy-tutorial-01.png]
As shown above, SpiderKeeper can manage not only one-off jobs but also periodic jobs.