美文网首页
Scrapy管道及中间件个人常用配置示例

Scrapy管道及中间件个人常用配置示例

作者: 越大大雨天 | 来源:发表于2019-08-22 13:45 被阅读0次

中间件的使用示例:

随机UserAgent示例
  1. 使用fake_useragent生成随机UA
class UserAgentMiddleware(object):
    """Downloader middleware that stamps a random User-Agent onto each request.

    Relies on the ``fake_useragent`` package (must be imported at module
    level) to generate realistic UA strings.
    """

    def __init__(self):
        # One factory per middleware instance; .random picks a fresh UA each call.
        self._ua_factory = fake_useragent.UserAgent()

    def process_request(self, request, spider):
        # Overwrite any UA already present so every request looks different.
        request.headers['User-Agent'] = self._ua_factory.random
  1. 使用scrapy_fake_useragent组件
  • pip install scrapy_fake_useragent
  • 注释掉内置的UserAgentMiddleware,添加scrapy_fake_useragent
# Downloader-middleware wiring: disable Scrapy's built-in UA middleware and
# let scrapy_fake_useragent supply a random User-Agent instead.
# (Fix: the original snippet never closed the dict literal.)
DOWNLOADER_MIDDLEWARES = {
    # None disables the built-in UA middleware so it cannot overwrite ours.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
}
随机IP代理示例
  • 使用阿布云动态版IP代理示例
import base64

# 阿布云代理服务器
# Abuyun dynamic-proxy endpoint.
proxyServer = "http://http-dyn.abuyun.com:9020"

# Proxy tunnel credentials (masked placeholders).
proxyUser = "H4U********B3D"
proxyPass = "761*******9BC6"

# Pre-compute the Basic-auth header value once at import time:
# base64(user:pass), URL-safe alphabet, as the tunnel expects.
proxyAuth = "Basic " + base64.urlsafe_b64encode(
    (proxyUser + ":" + proxyPass).encode("ascii")
).decode("utf8")

# 代理设置
class ProxyMiddleware(object):
    """Route every outgoing request through the Abuyun proxy tunnel."""

    def process_request(self, request, spider):
        # Attach the pre-computed Basic auth header and point the request
        # at the proxy endpoint (module-level proxyAuth / proxyServer).
        request.headers["Proxy-Authorization"] = proxyAuth
        request.meta["proxy"] = proxyServer


响应重试示例
  • 使用process_response重试请求访问错误的页面
def process_response(self, request, response, spider):
    """Retry middleware hook: re-schedule error or empty responses.

    A 200 response with a non-empty body passes through untouched; anything
    else is turned into a fresh copy of the request (dont_filter so the
    dupe-filter does not swallow the retry).
    """
    # Success path: hand the response to the next middleware / spider.
    if response.status == 200 and response.body != b"":
        return response
    # Failure path: returning a Request makes Scrapy re-schedule it.
    retry_request = request.copy()
    retry_request.dont_filter = True
    return retry_request

管道使用示例

自定义去重方式示例:
from scrapy.exceptions import DropItem
class DuplicatesPipleline(object):
    """Drop duplicate items: dedupe by name when age == 0, else by card number."""

    def __init__(self):
        # Registries of values already seen, kept for the pipeline's lifetime.
        self.card_num_seen = set()
        self.name_seen = set()

    def process_item(self, item, spider):
        # age == 0 marks records without a usable card number, so the name
        # is the dedupe key; otherwise the card number is authoritative.
        if item["age"] == 0:
            key, seen = item['name'], self.name_seen
        else:
            key, seen = item['card_num'], self.card_num_seen
        if key in seen:
            raise DropItem('Duplicate item found: %s' % item)
        seen.add(key)
        print(seen)
        return item
MySQL异步写入Item示例:

from twisted.enterprise import adbapi
class MySQLPipeline(object):
    """Asynchronously write items into MySQL via Twisted's adbapi pool.

    Connection parameters come from the crawler settings in ``from_crawler``.
    Fix: the original assigned them to **class** attributes (``cls.HOST = ...``),
    mutating shared state; they are now instance attributes, with the same
    names so every other method is unchanged.
    """

    def __init__(self, db_name='dishonest', host='localhost', port=3306,
                 user='root', passwd='mysql'):
        # Same attribute names as before so open_spider stays untouched.
        self.MYSQL_DB_NAME = db_name
        self.HOST = host
        self.PORT = port
        self.USER = user
        self.PASSWD = passwd

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from project settings, falling back to defaults."""
        settings = crawler.settings
        return cls(
            db_name=settings.get("MYSQL_DB", 'dishonest'),
            host=settings.get("MYSQL_HOST", 'localhost'),
            port=settings.get("MYSQL_PORT", 3306),
            user=settings.get("MYSQL_USER", 'root'),
            passwd=settings.get("MYSQL_PASSWORD", 'mysql'),
        )

    def open_spider(self, spider):
        # One connection pool per spider run; 'pymysql' is imported lazily
        # by adbapi inside its worker threads.
        self.dbpool = adbapi.ConnectionPool(
            'pymysql',
            host=self.HOST,
            port=self.PORT,
            user=self.USER,
            passwd=self.PASSWD,
            db=self.MYSQL_DB_NAME,
            charset='utf8',
        )

    def close_spider(self, spider):
        # Release all pooled connections when the spider finishes.
        self.dbpool.close()

    def process_item(self, item, spider):
        # runInteraction executes insert_db on a pool thread; failures are
        # routed to handler_error instead of crashing the crawl.
        query = self.dbpool.runInteraction(self.insert_db, item)
        query.addErrback(self.handler_error, item, spider)
        return item

    def handler_error(self, failure, item, spider):
        # Best-effort reporting of the Twisted Failure; the item is dropped.
        print(failure)

    def insert_db(self, tx, item):
        """Run inside a pooled transaction: insert one dishonest record."""
        values = (
            item["name"],
            item["card_num"],
            item["age"],
            item["area"],
            item["business_entity"],
            item["content"],
            item["publish_date"],
            item["publish_unit"],
            item["create_date"],
            item["update_date"],
        )
        sql = 'INSERT INTO dishonest(name,card_num,age,area,business_entity,content,publish_date,publish_unit,create_date,update_date) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        tx.execute(sql, values)
MongoDB写入Item示例
from pymongo import MongoClient

class JdPipeline(object):
    """Write items from the 'book' spider into MongoDB (db: jd, collection: book)."""

    def open_spider(self, spider):
        # Runs once when the spider opens; only the 'book' spider needs Mongo.
        # (isinstance(spider, BookSpider) would work to distinguish spiders too.)
        if spider.name == 'book':
            self.client = MongoClient(host='127.0.0.1', port=27017)
            self.db = self.client["jd"]          # database named "jd"
            self.collection = self.db["book"]    # collection named "book"

    def process_item(self, item, spider):
        if spider.name == 'book':
            # insert() requires a plain dict (convert Item subclasses first)
            # and mutates it, adding the generated "_id" ObjectId.
            item = dict(item)
            self.collection.insert(item)
            # Fix: pop("_id") used to run for EVERY spider, but only 'book'
            # items were inserted and gained an "_id" -> KeyError for the
            # rest. Guarded here, with a default for extra safety.
            item.pop("_id", None)
        return item

    def close_spider(self, spider):
        # Fix: self.client only exists when open_spider ran for the 'book'
        # spider; closing unconditionally raised AttributeError otherwise.
        client = getattr(self, "client", None)
        if client is not None:
            client.close()

相关文章

网友评论

      本文标题:Scrapy管道及中间件个人常用配置示例

      本文链接:https://www.haomeiwen.com/subject/gxlnsctx.html