美文网首页
16.由redis客户端控制的scrapy-redis多页抓取

16.由redis客户端控制的scrapy-redis多页抓取

作者: starrymusic | 来源:发表于2019-04-01 17:04 被阅读0次

    修改前的lagou.py文件是这样滴:

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from lagouzhaopin.items import LagouItemLoad, LagouzhaopinItem
    from datetime import datetime
    class LagouSpider(CrawlSpider):
        """Crawl lagou.com job postings by following category/company/job links.

        Link-extraction rules funnel the crawl toward ``jobs/<id>.html`` pages,
        which are parsed into ``LagouzhaopinItem`` instances by ``parse_job``.
        """
        name = 'lagou'
        allowed_domains = ['www.lagou.com']
        start_urls = ['http://www.lagou.com/']

        # Raw strings avoid the invalid "\d" escape (SyntaxWarning on Python
        # 3.12+); "\." matches a literal dot, where the original "." matched
        # any character and could over-match URLs.
        rules = (
            Rule(LinkExtractor(allow=(r"zhaopin/.*",)), follow=True),
            Rule(LinkExtractor(allow=(r"gongsi/j\d+\.html",)), follow=True),
            Rule(LinkExtractor(allow=r'jobs/\d+\.html'), callback='parse_job', follow=True),
        )

        def parse_job(self, response):
            """Extract one job-detail page into a LagouzhaopinItem.

            Uses CSS selectors for stable, class-named nodes and positional
            XPath for the span sequence inside the ``job_request`` block.
            """
            item_loader = LagouItemLoad(item=LagouzhaopinItem(), response=response)
            item_loader.add_css("title", ".job-name::attr(title)")
            item_loader.add_value("url", response.url)
            item_loader.add_css("salary", ".salary::text")
            item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
            item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
            item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
            item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")
            item_loader.add_css("publish_time", "p.publish_time::text")
            item_loader.add_value("crawl_time", datetime.now())
            item_loader.add_css("job_advantage", ".job-advantage p::text")
            item_loader.add_css("job_desc", ".job-detail")
            item_loader.add_css("job_addr", ".work_addr")
            item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
            item_loader.add_css("company_url", "ul.c_feature li:last-child a::attr(href)")
            return item_loader.load_item()
    

    修改后的lagou.py文件是这样滴:

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from example.items import LagouItemLoad, LagouzhaopinItem
    from datetime import datetime
    from scrapy_redis.spiders import RedisCrawlSpider, RedisMixin
    
    
    class LagouSpider(RedisCrawlSpider):
        """Redis-driven variant of the lagou crawler.

        Instead of ``start_urls``, seed URLs are popped from the Redis list
        named by ``redis_key`` (pushed by a client, e.g.
        ``lpush lagou:start_urls https://www.lagou.com/``), so multiple spider
        processes can share one scheduling queue.
        """
        name = 'lagou_redis'
        redis_key = 'lagou:start_urls'

        # Raw strings avoid the invalid "\d" escape (SyntaxWarning on Python
        # 3.12+); "\." matches a literal dot instead of any character.
        rules = (
            Rule(LinkExtractor(allow=(r"zhaopin/.*",)), follow=True),
            Rule(LinkExtractor(allow=(r"gongsi/j\d+\.html",)), follow=True),
            Rule(LinkExtractor(allow=r'jobs/\d+\.html'), callback='parse_job', follow=True),
        )

        def set_crawler(self, crawler):
            # Dynamically define the allowed domains list.
            # NOTE(review): ``set_crawler`` is a legacy Scrapy hook (removed in
            # modern Scrapy in favor of ``from_crawler``); recent scrapy-redis
            # versions call ``setup_redis`` automatically — confirm against the
            # installed versions.
            CrawlSpider.set_crawler(self, crawler)  # default crawler wiring
            RedisMixin.setup_redis(self)  # hand URL scheduling over to Redis

        def parse_job(self, response):
            """Extract one job-detail page into a LagouzhaopinItem."""
            item_loader = LagouItemLoad(item=LagouzhaopinItem(), response=response)
            item_loader.add_css("title", ".job-name::attr(title)")
            item_loader.add_value("url", response.url)
            item_loader.add_css("salary", ".salary::text")
            item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
            item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
            item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
            item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")
            item_loader.add_css("publish_time", "p.publish_time::text")
            item_loader.add_value("crawl_time", datetime.now())
            item_loader.add_css("job_advantage", ".job-advantage p::text")
            item_loader.add_css("job_desc", ".job-detail")
            item_loader.add_css("job_addr", ".work_addr")
            item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
            item_loader.add_css("company_url", "ul.c_feature li:last-child a::attr(href)")
            return item_loader.load_item()
    

    修改后的items.py文件是这样滴:

    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/topics/items.html
    import scrapy
    from scrapy.item import Item, Field
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose, TakeFirst, Join
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose, TakeFirst, Join
    import scrapy
    from w3lib.html import remove_tags
    
    class LagouItemLoad(ItemLoader):
        """ItemLoader that keeps only the first extracted value for each field,
        so item fields end up as scalars instead of one-element lists."""
        default_output_processor = TakeFirst()
    
    
    class LagouzhaopinItem(scrapy.Item):
        """Item describing one lagou.com job posting.

        The helper functions below are plain functions evaluated at
        class-body time and plugged into ``MapCompose`` input processors;
        they are never called as bound methods.
        """
        def remove_splash(value):
            # Strip the "/" separators around the job city, e.g. "/北京/".
            return value.replace("/", "")

        def handle_job_addr(value):
            # Drop the "查看地图" ("view map") link text and collapse the
            # multi-line address into a single string.
            addr_list = value.split("\n")
            addr_list = [item.strip() for item in addr_list if item.strip() != "查看地图"]
            return "".join(addr_list)

        def handle_publish_time(value):
            # Remove the "发布于拉勾网" ("published on lagou.com") suffix.
            return value.replace("发布于拉勾网", "").strip()

        def remove_space(value):
            # Trim surrounding whitespace.
            return value.strip()

        title = scrapy.Field()
        url = scrapy.Field()
        salary = scrapy.Field(
            input_processor=MapCompose(remove_space),
        )
        job_city = scrapy.Field(
            input_processor=MapCompose(remove_splash),
        )
        work_years = scrapy.Field(
            input_processor=MapCompose(remove_splash, remove_space),
        )
        degree_need = scrapy.Field(
            input_processor=MapCompose(remove_splash),
        )
        job_type = scrapy.Field()
        publish_time = scrapy.Field(
            input_processor=MapCompose(handle_publish_time),
        )
        crawl_time = scrapy.Field()
        job_advantage = scrapy.Field()
        job_desc = scrapy.Field(
            input_processor=MapCompose(remove_tags, remove_space),
        )
        job_addr = scrapy.Field(
            input_processor=MapCompose(remove_tags, handle_job_addr),
        )
        company_name = scrapy.Field()
        company_url = scrapy.Field()
        # Spell these scrapy.Field() like every other field above (the bare
        # Field import is the same class, but mixing the two spellings in one
        # item definition is inconsistent).
        crawled = scrapy.Field()
        spider = scrapy.Field()
    
    class ExampleItem(Item):
        """Generic demo item shipped with the scrapy-redis example project;
        not used by the lagou spider above."""
        name = Field()
        description = Field()
        link = Field()
        crawled = Field()
        spider = Field()
        url = Field()
    
    
    class ExampleLoader(ItemLoader):
        """Loader for ExampleItem: strips every input string, keeps the first
        value per field, and joins the description pieces into one string."""
        default_item_class = ExampleItem
        default_input_processor = MapCompose(lambda s: s.strip())
        default_output_processor = TakeFirst()
        description_out = Join()
    

    这时在cmd.exe里就可以切换到项目文件夹执行

    scrapy runspider lagou.py
    

    然后等待在redis客户端输入要爬取的网页,因本次查询是全站爬取招聘信息,所以直接在redis客户端输入

    lpush lagou:start_urls https://www.lagou.com/
    

    需要注意的一点是,如果切换到项目文件夹下,在本例中为“D:\fscrapy\scrapy_redis\example-project-crawl\example”这时执行“scrapy runspider lagou.py”可能会报错说找不到lagou.py文件,可以执行“scrapy crawl lagou”具有相同效果。切换到lagou.py文件所在文件夹,然后执行“scrapy runspider lagou.py”,也能成功。

    上面能达到使用redis进行多页爬取的效果,但不能将拉勾的招聘信息爬取下来,因为拉勾的反爬措施存在,我们还需要在代码中加入代理。只需要三行代码就可以应付拉勾了。

    在settings.py文件中用如下代码:

    # Override the default request headers:
    DEFAULT_REQUEST_HEADERS = {
        'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/60.0.3112.113 Chrome/60.0.3112.113 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }
    

    覆盖原来的代码即可:

    USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'
    

    如果想要有更深层次的理解,可以参考文章(https://baagee.vip/index/article/id/108.html)

    相关文章

      网友评论

          本文标题:16.由redis客户端控制的scrapy-redis多页抓取

          本文链接:https://www.haomeiwen.com/subject/xmavbqtx.html