美文网首页项目Python-爬虫
分布式爬虫scrapy-redis的蜘蛛基本配置

分布式爬虫scrapy-redis的蜘蛛基本配置

作者: Joncc | 来源:发表于2017-11-30 23:31 被阅读28次

    scrapy-redis 分布式爬虫(spider)配置示例


    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    import re
    import datetime
    from datetime import timedelta
    # from Qiji_Project.items import DhinahrItem
    
    from scrapy_redis.spiders import RedisCrawlSpider#爬虫集成 
    

    启动爬虫前,先在 redis-cli 中向 Redis 推入一个 start_urls key(与下面
    spider 的 redis_key 一致),爬虫即会从该 URL 开始抓取:

    lpush chinahrspider:start_urls http://www.chinahr.com/
    
    class ChinahrSpider(RedisCrawlSpider):
        """Distributed spider for chinahr.com job postings (scrapy-redis).

        Instead of a hard-coded ``start_urls`` list, every slave crawler
        blocks on the Redis list named by ``redis_key``; seed it with e.g.::

            lpush chinahrspider:start_urls http://www.chinahr.com/
        """
        name = 'chinahr'

        allowed_domains = ['chinahr.com']

        # All slave crawlers poll this Redis list key for start URLs.
        redis_key = 'chinahrspider:start_urls'

        # Crawl rules: job-detail pages are parsed by parse_item; search
        # ("/sou/") and company job-listing pages are only followed to
        # discover more links.
        rules = (
            Rule(LinkExtractor(allow=r'.chinahr.com/job/\d+.html'), callback='parse_item', follow=False),
            Rule(LinkExtractor(allow=r'.chinahr.com/job/\d+.html?searchplace='), callback='parse_item', follow=False),
            Rule(LinkExtractor(allow=r'.chinahr.com/sou/'), follow=True),
            Rule(LinkExtractor(allow=r'.chinahr.com/.*/jobs/\d+/'), follow=True),
        )
        # Pre-compiled pattern pulling runs of digits out of free-text
        # fields such as "3天前" (3 days ago) or "3-5年" (3-5 years).
        num_pattern = re.compile(r'\d+')

        @staticmethod
        def _first(extracted):
            """Return the first element of an ``extract()`` result, or None
            when the XPath matched nothing."""
            return extracted[0] if extracted else None

        def parse_item(self, response):
            """Parse one job-detail page into a flat item dict.

            Every key is always present; fields missing from the page are
            emitted as None so downstream pipelines see a fixed schema.
            """
            first = self._first
            item = {}

            # Salary: normalised to a (start, end) pair; (0, 0) for "面议".
            money = first(response.xpath('//span[@class="job_price"]/text()').extract())
            smoney, emoney = self.process_money(money) if money else (None, None)

            # Required experience: normalised to a (start, end) year pair.
            year = first(response.xpath('//div[@class="job_require"]//span[@class="job_exp"]/text()').extract())
            syear, eyear = self.process_year(year) if year else (None, None)

            # Publication date: relative phrases resolved to YYYY-MM-DD.
            date_pub = first(response.xpath('//p[@class="updatetime"]/text()').extract())
            if date_pub:
                date_pub = self.process_date(date_pub)

            # Job description: joined and cleaned from all text nodes.
            jobdesc = response.xpath('//div[@class="job_intro_info"]/text()').extract()
            jobdesc = self.process_desc(jobdesc) if jobdesc else None

            item["url"] = response.url
            # Position name
            item["pname"] = first(response.xpath('//div[@class="base_info"]//span[@class="job_name"]/text()').extract())
            item["smoney"] = smoney
            item["emoney"] = emoney
            # Work city
            item["location"] = first(response.xpath('//div[@class="job_require"]//span[@class="job_loc"]/text()').extract())
            item["syear"] = syear
            item["eyear"] = eyear
            # Education requirement
            item["degree"] = first(response.xpath('//div[@class="job_require"]//span[4]/text()').extract())
            # Job type
            item["ptype"] = first(response.xpath('//div[@class="job_require"]//span[3]/text()').extract())
            item["tags"] = None  # not available on the detail page
            item["date_pub"] = date_pub
            # Benefits / perks (first entry only, matching the page layout)
            item["advantage"] = first(response.xpath('//ul[@class="clear"]//li/text()').extract())
            item["jobdesc"] = jobdesc
            # Work address (same node as the city on this page)
            item["jobaddr"] = first(response.xpath('//div[@class="job_require"]//span[@class="job_loc"]/text()').extract())
            # Company name
            item["company"] = first(response.xpath('//div[@class="job-detail-r"]//h4/a/text()').extract())
            # Crawl timestamp (date only)
            item["crawl_time"] = datetime.datetime.now().strftime('%Y-%m-%d')
            yield item

        def process_date(self, value):
            """Resolve a relative publish time ("今天", "昨天", "N天前")
            to an absolute YYYY-MM-DD string; return None when the text
            contains neither a known phrase nor a digit."""
            today = datetime.datetime.now()
            if '今天' in value:
                return today.strftime('%Y-%m-%d')
            if '昨天' in value:
                # BUG FIX: the original used timedelta(days=24), which put
                # "yesterday" 24 days in the past instead of 1.
                return (today - timedelta(days=1)).strftime('%Y-%m-%d')
            match = self.num_pattern.search(value)
            if match is None:
                # Unexpected format: don't crash the whole spider run.
                return None
            return (today - timedelta(days=int(match.group()))).strftime('%Y-%m-%d')

        def process_year(self, value):
            """Split an experience requirement into (start, end) years.

            "应届" (fresh graduate) maps to (0, 0); a range such as
            "3-5年" maps to ('3', '5'); a single figure is used for both
            ends; (None, None) when no digit is present.
            """
            if '应届' in value:
                return 0, 0
            nums = self.num_pattern.findall(value)
            if not nums:
                return None, None
            # First number is the lower bound, last the upper bound; for a
            # single number both bounds coincide (original code dropped the
            # upper bound of ranges).
            return nums[0], nums[-1]

        def process_money(self, value):
            """Split a salary string such as "8K-10K" into (start, end).

            Negotiable salaries (containing "面", as in "面议") map to
            (0, 0); a figure without a '-' separator is used for both ends
            instead of raising IndexError as the original did.
            """
            if "面" in value:
                return 0, 0
            parts = value.split('-')
            if len(parts) >= 2:
                return parts[0], parts[1]
            return parts[0], parts[0]

        def process_desc(self, value):
            """Join the extracted description fragments into one cleaned
            string (CRLF pairs removed, outer whitespace stripped)."""
            return ''.join(value).replace('\r\n', '').strip()
    
    

    相关文章

      网友评论

        本文标题:分布式爬虫scrapy-redis的蜘蛛基本配置

        本文链接:https://www.haomeiwen.com/subject/suzbbxtx.html