01 — Spider Module (Spider.py)


Author: 漓墨泫雨 | Published 2018-07-03 08:51

The spider module: a deep crawl that collects job-posting details and follows the pagination links.

    # -*- coding: utf-8 -*-

    '''Deep crawl: collect job details and follow the pagination links.'''

    # Rule-based spider base class and crawl rule
    from scrapy.spiders import CrawlSpider, Rule
    # Link extraction helper
    from scrapy.linkextractors import LinkExtractor
    # Item definition for the scraped fields
    from ..items import MyspiderItem


    class ZhilianJobSpider(CrawlSpider):

        name = "zhilian1"
        allowed_domains = ["zhaopin.com"]
        start_urls = ["http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%97%A0%E9%94%A1&kw=python&isadv=0&isfilter=1&p=1&pd=30"]

        # Extractor 1: links to individual job postings
        link_job = LinkExtractor(restrict_xpaths=("//td[@class='zwmc']/div/a[1]"))
        # Extractor 2: link to the next results page
        link_next_page = LinkExtractor(restrict_xpaths=("//a[@class='next-page']"))

        # Rules and the callbacks that handle them.
        # Note: never, ever override parse() in a CrawlSpider -- it is what dispatches the rules.
        rules = [
            Rule(link_job, callback="parse_job"),
            Rule(link_next_page, follow=True),
        ]

        def parse_job(self, response):
            '''The response here is a single job-detail page.'''
            print("---------------------job_name---------------------")
            item = MyspiderItem()

            # Job title
            job_name = response.xpath("//div[@class='top-fixed-box'][1]/div[@class='fixed-inner-box']/div[@class='inner-left fl']/h1/text()").extract_first()
            if job_name:
                item['name'] = job_name

            # Salary; strip() removes the whitespace left around the inline icon
            job_salary = response.xpath("//ul[@class='terminal-ul clearfix']/li[1]/strong/text()").extract_first('').strip()
            if job_salary:
                item['salary'] = job_salary

            # Company
            job_company = response.xpath("//div[@class='top-fixed-box'][1]/div[@class='fixed-inner-box']/div[@class='inner-left fl']/h2/a/text()").extract_first()
            if job_company:
                item['company'] = job_company

            # Posting date
            job_day = response.xpath("//strong/span[@id='span4freshdate']/text()").extract_first()
            if job_day:
                item['day'] = job_day

            # Required experience
            job_experience = response.xpath("//ul[@class='terminal-ul clearfix']/li[5]/strong/text()").extract_first()
            if job_experience:
                item['experience'] = job_experience

            # Work location
            job_area = response.xpath("//ul[@class='terminal-ul clearfix']/li[2]/strong/a/text()").extract_first()
            if job_area:
                item['area'] = job_area

            # Number of openings
            job_number = response.xpath("//ul[@class='terminal-ul clearfix']/li[7]/strong/text()").extract_first()
            if job_number:
                item['number'] = job_number

            # Job type (full-time / part-time)
            job_nature = response.xpath("//ul[@class='terminal-ul clearfix']/li[4]/strong/text()").extract_first()
            if job_nature:
                item['nature'] = job_nature

            # Required education
            education_background = response.xpath("//ul[@class='terminal-ul clearfix']/li[6]/strong/text()").extract_first()
            if education_background:
                item['education'] = education_background

            # Job description: take the full text of the first description block
            job_description = response.xpath("//div[@class='terminalpage-main clearfix']/div[@class='tab-cont-box']/div[@class='tab-inner-cont'][1]")
            if len(job_description) > 0:
                item['description'] = job_description.xpath("string(.)").extract()[0].strip()

            # Debug prints kept from development:
            # print(job_name); print(job_salary); print(job_company); print(job_day)
            # print(job_experience); print(job_area); print(job_number); print(job_nature)
            # print(education_background); print(item['description'])

            # Hand the item over to the pipelines module for processing
            yield item
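
The spider imports MyspiderItem from the project's items.py and fills the fields name, salary, company, day, experience, area, number, nature, education and description. That file is not shown in the post; a minimal sketch of what it could look like, assuming exactly those field names, is:

    # items.py -- minimal sketch; field names taken from the spider above (layout assumed)
    import scrapy

    class MyspiderItem(scrapy.Item):
        name = scrapy.Field()         # job title
        salary = scrapy.Field()       # salary range
        company = scrapy.Field()      # company name
        day = scrapy.Field()          # posting date
        experience = scrapy.Field()   # required experience
        area = scrapy.Field()         # work location
        number = scrapy.Field()       # number of openings
        nature = scrapy.Field()       # full-time / part-time
        education = scrapy.Field()    # required education
        description = scrapy.Field()  # job description text

The closing comment hands the yielded item to the pipelines module, which the post also omits. Below is a hypothetical pipeline that writes each job as one line of JSON; the class name, output file name and ITEM_PIPELINES path are assumptions, not the author's code:

    # pipelines.py -- hypothetical pipeline: append each item as one JSON line
    import json
    import codecs

    class ZhilianJsonPipeline(object):

        def open_spider(self, spider):
            # ensure_ascii=False below keeps the Chinese text readable in the output file
            self.file = codecs.open("zhilian_jobs.jsonl", "w", encoding="utf-8")

        def process_item(self, item, spider):
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
            return item

        def close_spider(self, spider):
            self.file.close()

    # settings.py -- the pipeline must be enabled there, e.g. (module path assumed):
    # ITEM_PIPELINES = {"myspider.pipelines.ZhilianJsonPipeline": 300}

With an items file and a pipeline in place, the spider is run from the project root with scrapy crawl zhilian1.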
    

Author: 陳CHEN
Link: https://www.jianshu.com/p/03bf988c88cb
Source: 简书 (Jianshu)
Copyright belongs to the author; please contact the author for authorization and credit the source before reposting.
