01 — Spider Module (Spider.py)


Author: 漓墨泫雨 | Published 2018-07-03 08:51

The spider module: a deep crawl that collects job-posting details and follows the pagination links.

    # -*- coding: utf-8 -*-

    '''Deep crawl: collect job details and follow the pagination links.'''

    # Rule-based spider base class and crawl rule
    from scrapy.spiders import CrawlSpider, Rule
    # Link extraction helper
    from scrapy.linkextractors import LinkExtractor
    # Item definition for the scraped fields
    from ..items import MyspiderItem


    class ZhilianJobSpider(CrawlSpider):

        name = "zhilian1"
        allowed_domains = ["zhaopin.com"]
        start_urls = ["http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%97%A0%E9%94%A1&kw=python&isadv=0&isfilter=1&p=1&pd=30"]

        # Extractor 1: links to individual job postings
        link_job = LinkExtractor(restrict_xpaths=("//td[@class='zwmc']/div/a[1]"))
        # Extractor 2: link to the next results page
        link_next_page = LinkExtractor(restrict_xpaths=("//a[@class='next-page']"))

        # Rules and the callbacks that handle them.
        # Note: never, ever override parse() in a CrawlSpider -- it is what dispatches the rules.
        rules = [
            Rule(link_job, callback="parse_job"),
            Rule(link_next_page, follow=True),
        ]

        def parse_job(self, response):
            '''The response here is a single job-detail page.'''
            print("---------------------job_name---------------------")
            item = MyspiderItem()

            # Job title
            job_name = response.xpath("//div[@class='top-fixed-box'][1]/div[@class='fixed-inner-box']/div[@class='inner-left fl']/h1/text()").extract_first()
            if job_name:
                item['name'] = job_name

            # Salary; strip() removes the whitespace left around the inline icon
            job_salary = response.xpath("//ul[@class='terminal-ul clearfix']/li[1]/strong/text()").extract_first('').strip()
            if job_salary:
                item['salary'] = job_salary

            # Company
            job_company = response.xpath("//div[@class='top-fixed-box'][1]/div[@class='fixed-inner-box']/div[@class='inner-left fl']/h2/a/text()").extract_first()
            if job_company:
                item['company'] = job_company

            # Posting date
            job_day = response.xpath("//strong/span[@id='span4freshdate']/text()").extract_first()
            if job_day:
                item['day'] = job_day

            # Required experience
            job_experience = response.xpath("//ul[@class='terminal-ul clearfix']/li[5]/strong/text()").extract_first()
            if job_experience:
                item['experience'] = job_experience

            # Work location
            job_area = response.xpath("//ul[@class='terminal-ul clearfix']/li[2]/strong/a/text()").extract_first()
            if job_area:
                item['area'] = job_area

            # Number of openings
            job_number = response.xpath("//ul[@class='terminal-ul clearfix']/li[7]/strong/text()").extract_first()
            if job_number:
                item['number'] = job_number

            # Job type (full-time / part-time)
            job_nature = response.xpath("//ul[@class='terminal-ul clearfix']/li[4]/strong/text()").extract_first()
            if job_nature:
                item['nature'] = job_nature

            # Required education
            education_background = response.xpath("//ul[@class='terminal-ul clearfix']/li[6]/strong/text()").extract_first()
            if education_background:
                item['education'] = education_background

            # Job description: take the full text of the first description block
            job_description = response.xpath("//div[@class='terminalpage-main clearfix']/div[@class='tab-cont-box']/div[@class='tab-inner-cont'][1]")
            if len(job_description) > 0:
                item['description'] = job_description.xpath("string(.)").extract()[0].strip()

            # Debug prints kept from development:
            # print(job_name); print(job_salary); print(job_company); print(job_day)
            # print(job_experience); print(job_area); print(job_number); print(job_nature)
            # print(education_background); print(item['description'])

            # Hand the item over to the pipelines module for processing
            yield item
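
The spider imports MyspiderItem from the project's items.py and fills the fields name, salary, company, day, experience, area, number, nature, education and description. That file is not shown in the post; a minimal sketch of what it could look like, assuming exactly those field names, is:

    # items.py -- minimal sketch; field names taken from the spider above (layout assumed)
    import scrapy

    class MyspiderItem(scrapy.Item):
        name = scrapy.Field()         # job title
        salary = scrapy.Field()       # salary range
        company = scrapy.Field()      # company name
        day = scrapy.Field()          # posting date
        experience = scrapy.Field()   # required experience
        area = scrapy.Field()         # work location
        number = scrapy.Field()       # number of openings
        nature = scrapy.Field()       # full-time / part-time
        education = scrapy.Field()    # required education
        description = scrapy.Field()  # job description text

The closing comment hands the yielded item to the pipelines module, which the post also omits. Below is a hypothetical pipeline that writes each job as one line of JSON; the class name, output file name and ITEM_PIPELINES path are assumptions, not the author's code:

    # pipelines.py -- hypothetical pipeline: append each item as one JSON line
    import json
    import codecs

    class ZhilianJsonPipeline(object):

        def open_spider(self, spider):
            # ensure_ascii=False below keeps the Chinese text readable in the output file
            self.file = codecs.open("zhilian_jobs.jsonl", "w", encoding="utf-8")

        def process_item(self, item, spider):
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
            return item

        def close_spider(self, spider):
            self.file.close()

    # settings.py -- the pipeline must be enabled there, e.g. (module path assumed):
    # ITEM_PIPELINES = {"myspider.pipelines.ZhilianJsonPipeline": 300}

With an items file and a pipeline in place, the spider is run from the project root with scrapy crawl zhilian1.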
    

Author: 陳CHEN
Link: https://www.jianshu.com/p/03bf988c88cb
Source: 简书 (Jianshu)
Copyright belongs to the author; please contact the author for authorization and credit the source before reposting.
