
(3) Distributed Crawlers (2): A Douban Group Spider Example

Author: 爱折腾的胖子 | Published 2018-09-27 17:53

    The previous posts all covered single-machine spiders that run on one box; this one looks at how to write a distributed spider.
    Creating a scrapy-redis project works the same way as a plain Scrapy project: create the project from the command line, then generate the spider. Once the spider's main logic is written, all that is left is to change the class the spider inherits from and to modify settings.py.


    Let's start the example. It assumes Redis and Docker are already installed.
    If they are not, install Docker on a Linux machine and pull a Redis image to it (see the Docker installation guide).
    Now, first analyze the page we need to crawl: the Douban Group discussions page.

    [Screenshot: the Douban Group discussions page]
    Besides the URL itself, there are seven clickable tabs. "精选" (Featured) is the current page; the other six each have their own URL suffix. We will use CrawlSpider to match these links.

    OK, here is the spider code:

    # -*- coding: utf-8 -*-
    
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from .. import utils, items, LinkExtractorRule
    import re
    
    class SpiderSpider(CrawlSpider):
        name = 'spider'
        allowed_domains = ['www.douban.com']
        # start_urls = ['https://www.douban.com/group/topic/124785371/']
        start_urls = ['https://www.douban.com/group/explore']
    
        # Pagination links on the listing page
        page_link = LinkExtractor(process_value=LinkExtractorRule.page_process_value, allow_domains=('www.douban.com'))
        # Discussion topics on each listing page
        topic_link = LinkExtractor(process_value=LinkExtractorRule.topic_process_value, allow_domains=('www.douban.com'))
        # Group home pages
        group_link = LinkExtractor(process_value=LinkExtractorRule.group_process_value, allow_domains=('www.douban.com'))
        # Comment pagination inside a topic
        comment_link = LinkExtractor(process_value=LinkExtractorRule.comment_process_value, allow_domains=('www.douban.com'))
    
    
        # Link-matching rules; a callback must not be named parse()
        rules = (
            Rule(page_link, follow=True),
            Rule(topic_link, callback="parse_topic", follow=True),
            Rule(group_link, callback="parse_group", follow=False),
            Rule(comment_link, callback="parse_comment", follow=True)
        )
    
        def parse_topic(self, response):
            print(response.url)
            topic_item = items.TopicItem()
            id = re.compile(r"/(\d+)/").search(response.url).group().replace("/", "")
            topic_item["id"] = id
            topic_item["title"] = utils.is_None(response.xpath("//*[@class=\"tablecc\"]"))[0]
            topic_item["person_name"] = utils.is_None(response.xpath("//*[@class=\"from\"]/a/text()"))[0]
            topic_item["content"] = utils.is_None(response.xpath("//*[@class=\"topic-content\"]"))[0]
            topic_item["group_id"] = re.compile(r"/group/\S+/\?ref=sidebar").search(utils.is_None(response.xpath("//*[@class=\"group-item\"]/div[@class=\"info\"]/div[@class=\"title\"]/a/@href"))[0]).group().replace("/group/", "").replace("/?ref=sidebar", "")
            yield topic_item
            comment_item_list = self.get_comments(response, id)
            for comment_item in comment_item_list:
                yield comment_item
    
        def parse_group(self, response):
            group_item = items.GroupItem()
            group_item["id"] = re.compile(r"/(\d+)/").search(response.url).group().replace("/", "")
            group_item["name"] = utils.is_None(response.xpath("//*[@id=\"group-info\"]/div/h1/text()"))[0]
            group_item["leader"] = utils.is_None(response.xpath("//*[@id=\"content\"]/div[@class=\"grid-16-8 clearfix\"]/div[@class=\"article\"]/div[@class=\"group-board\"]/p/a/text()"))[0]
            time = utils.is_None(response.xpath("//*[@id=\"content\"]/div[@class=\"grid-16-8 clearfix\"]/div[@class=\"article\"]/div[@class=\"group-board\"]/p/text()"))[0]
            group_item["time"] = re.compile("\d+\-\d+\-\d+").search( time if time!=""  else "0000-00-00").group()
            group_item["content"] = utils.is_None(response.xpath("//*[@id=\"content\"]/div[@class=\"grid-16-8 clearfix\"]/div[@class=\"article\"]/div[@class=\"group-board\"]/div[@class=\"group-intro\"]"))[0]
            yield group_item
    
        def parse_comment(self, response):
            topic_id = re.compile(r"/(\d+)/").search(response.url).group().replace("/", "")
            item_list = self.get_comments(response, topic_id)
            for item in item_list:
                yield item
    
        def get_comments(self, response, topic_id):
            item_list = []
            ul = response.xpath("//*[@id=\"comments\"]")
            li_list = ul.xpath("./li")
            for li in li_list:
                item = items.CommentItem()
                item["id"] = utils.is_None(li.xpath("./@data-cid"))[0]
                item["person_name"] = utils.is_None(li.xpath("./div[@class=\"reply-doc content\"]/div[@class=\"bg-img-green\"]/h4/a/text()"))[0]
                item["content"] = utils.is_None(li.xpath("./div[@class=\"reply-doc content\"]/p/text()"))[0]
                item["topic_id"] = topic_id
                item_list.append(item)
            return item_list
    
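    The spider imports a LinkExtractorRule module that the original post never shows. Each of its functions is passed to a LinkExtractor as process_value: it receives an extracted URL and returns it if it is a link we want, or None to drop it. Below is a minimal sketch of what such a module might look like; the URL patterns are assumptions for illustration, not the author's actual rules.

    # LinkExtractorRule.py (sketch): process_value callbacks for the LinkExtractors above.
    import re

    def page_process_value(value):
        # Pagination on the "explore" listing, e.g. /group/explore?start=30
        return value if re.search(r"/group/explore\?start=\d+", value) else None

    def topic_process_value(value):
        # A single discussion topic, e.g. /group/topic/124785371/
        return value if re.search(r"/group/topic/\d+/?$", value) else None

    def group_process_value(value):
        # A group home page linked from a topic's sidebar, e.g. /group/xxx/?ref=sidebar
        return value if re.search(r"/group/[^/]+/\?ref=sidebar", value) else None

    def comment_process_value(value):
        # Comment pagination inside a topic, e.g. /group/topic/124785371/?start=100
        return value if re.search(r"/group/topic/\d+/\?start=\d+", value) else None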

    First make sure the data can be crawled correctly, then convert the project to scrapy-redis.
    Start by modifying settings.py:

    # -*- coding: utf-8 -*-
    
    BOT_NAME = 'douban_group_spider'
    SPIDER_MODULES = ['douban_group_spider.spiders']
    NEWSPIDER_MODULE = 'douban_group_spider.spiders'
    
    # Use the request deduplication filter that ships with scrapy-redis
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    # Use the scrapy-redis scheduler
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    # Schedule requests with a plain FIFO queue
    SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
    # Allow pausing/resuming; the request records in Redis are not lost
    SCHEDULER_PERSIST = True
    
    ROBOTSTXT_OBEY = False  # Do not obey robots.txt
    
    DOWNLOAD_DELAY = 1  # Delay between requests (seconds)
    
    
    COOKIES_ENABLED = False
    
    DOWNLOADER_MIDDLEWARES = {
        'douban_group_spider.middlewares.DoubanGroupSpiderSpiderMiddleware': 543,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 560,
        'douban_group_spider.middlewares.ABProxyMiddleware' : 550,
        'douban_group_spider.middlewares.UserAgentMiddleware': 600,
    }
    
    ITEM_PIPELINES = {
        'douban_group_spider.pipelines.DoubanGroupSpiderPipeline': 300,
        'scrapy_redis.pipelines.RedisPipeline': 900,  # Stores scraped items in Redis; must be enabled
    }
    
    
    # Abuyun HTTP tunnel proxy settings, including the credentials
    import base64
    PROXY_USER = "通行证书"
    PROXY_PASS = "通行密钥"
    PROXY_HOST = "HTTP隧道服务器地址"
    PROXY_PORT = "端口"
    # for Python3
    PROXY_AUTH = "Basic " + base64.urlsafe_b64encode(bytes((PROXY_USER + ":" + PROXY_PASS), "ascii")).decode("utf8")
    PROXY_SERVER = "http://" + PROXY_HOST +  ":" + PROXY_PORT
    
    
    # Redis connection settings
    REDIS_HOST = '192.168.1.130'
    REDIS_PORT = 9901
    REDIS_DB = 0
    REDIS_URL = 'redis://' + REDIS_HOST + ':' + str(REDIS_PORT) + "/" + str(REDIS_DB)
    
    
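    The DOWNLOADER_MIDDLEWARES above also reference an ABProxyMiddleware and a UserAgentMiddleware whose code the original post does not show. As a minimal sketch of what the Abuyun proxy middleware typically looks like (the class and module names follow the settings above; the implementation itself is an assumption, not the author's code):

    # middlewares.py (sketch): route every request through the Abuyun HTTP tunnel.
    from douban_group_spider import settings

    class ABProxyMiddleware(object):
        def process_request(self, request, spider):
            # Send the request through the proxy tunnel...
            request.meta["proxy"] = settings.PROXY_SERVER
            # ...and authenticate with the pre-computed Basic auth header.
            request.headers["Proxy-Authorization"] = settings.PROXY_AUTH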

    Then modify the spider file:

    # -*- coding: utf-8 -*-
    
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import  Rule
    from scrapy_redis.spiders import RedisCrawlSpider
    from .. import utils, items, LinkExtractorRule
    import re
    
    # Run Redis in Docker in the background: docker run --name redis-test -p 9901:6379 -d redis
    
    class SpiderSpider(RedisCrawlSpider):
        name = 'spider'
        allowed_domains = ['www.douban.com']
        # start_urls = ['https://www.douban.com/group/topic/124785371/']
        # start_urls = ['https://www.douban.com/group/explore']
        redis_key = "douban_spider:start_urls"
    
        # The parsing logic is the same as before
        #.......
    

    OK, at this point the project is a scrapy-redis project and can be run on several machines.
    First, start the spiders. multiprocessing is the standard-library package for running multiple processes.

    from scrapy import cmdline
    from multiprocessing import Pool as Process_Pool
    from time import sleep
    
    
    def run_spider(number):
        print("Process " + str(number))
        cmdline.execute("scrapy crawl spider".split())
    
    if __name__ == '__main__':
        p_pool = Process_Pool()
        for i in range(5):
            p_pool.apply_async(run_spider, args=(i,))
            sleep(1)
        p_pool.close()
        p_pool.join()
        sleep(5)
    
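    As a rough alternative (a sketch, not from the original post), you can skip the pool entirely and launch several independent "scrapy crawl spider" processes with subprocess; each one then acts as a separate scrapy-redis worker:

    # Start five independent crawler processes and wait for all of them.
    # Run this from the project root (where scrapy.cfg lives).
    import subprocess

    if __name__ == "__main__":
        procs = [subprocess.Popen(["scrapy", "crawl", "spider"]) for _ in range(5)]
        for p in procs:
            p.wait()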

    Then push a start URL into Redis:

    # coding=utf-8
    from redis import StrictRedis,ConnectionPool
    
    # Redis connection pool
    pool = ConnectionPool.from_url("redis://@192.168.1.130:9901/0")
    redis = StrictRedis(connection_pool=pool)
    redis.lpush("douban_spider:start_urls", "https://www.douban.com/group/explore")
    pool.disconnect()
    
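    To double-check that the URL was queued (a quick sanity check, not part of the original post), you can read the list length back:

    from redis import StrictRedis

    r = StrictRedis.from_url("redis://192.168.1.130:9901/0")
    # Should print 1 until one of the spiders pops the start URL.
    print(r.llen("douban_spider:start_urls"))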

    At this point all of the spiders start working.
    The last step is to move the scraped data from Redis into MySQL.
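    By default, scrapy_redis.pipelines.RedisPipeline serializes each item to JSON and pushes it onto a Redis list named after the spider, here "spider:items". Below is a minimal sketch of draining that list into MySQL; the pymysql connection details and the comment table/columns are assumptions for illustration, not from the original post.

    import json

    import pymysql
    from redis import StrictRedis

    redis_cli = StrictRedis.from_url("redis://192.168.1.130:9901/0")
    db = pymysql.connect(host="127.0.0.1", user="root", password="your-password",
                         database="douban", charset="utf8mb4")

    with db.cursor() as cursor:
        while True:
            popped = redis_cli.blpop("spider:items", timeout=10)
            if popped is None:
                break  # queue drained, stop
            item = json.loads(popped[1])
            if "topic_id" in item:  # a CommentItem (routing by field is an assumption)
                cursor.execute(
                    "INSERT IGNORE INTO comment (id, person_name, content, topic_id) "
                    "VALUES (%s, %s, %s, %s)",
                    (item["id"], item["person_name"], item["content"], item["topic_id"]))
                db.commit()
    db.close()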
    Full code
