Scraping 51job with Scrapy

Author: whong736 | Published 2018-03-17 07:24

    1. Create the project: qcwy

    scrapy startproject qcwy
    
    cd qcwy
    
    scrapy genspider Qcwyjob 51job.com
    
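    After these commands the generated project should look roughly like this (the standard Scrapy layout; minor differences are possible across versions):

    qcwy/
        scrapy.cfg            # deploy configuration
        qcwy/
            __init__.py
            items.py          # item definitions (step 2)
            middlewares.py
            pipelines.py      # item pipeline (step 4)
            settings.py       # project settings (step 5)
            spiders/
                __init__.py
                Qcwyjob.py    # spider created by genspider (step 3)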

    2. Open the qcwy project in PyCharm and define the Item first, listing the fields to be collected

    Complete code:

    import scrapy
    
    class QcwyItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
    
        Positionname = scrapy.Field()  # position title
        Companyname = scrapy.Field()   # company name
        Salary = scrapy.Field()        # salary / benefits
        Workplace = scrapy.Field()     # work location
        Posttime = scrapy.Field()      # posting date
        Experience = scrapy.Field()    # required work experience
        Xueli = scrapy.Field()         # required education level
        Number = scrapy.Field()        # number of openings
        Link = scrapy.Field()          # detail-page URL
    
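    A Scrapy Item behaves like a dict restricted to the declared fields, which is why Field() with no arguments is enough here. A quick illustration (assuming the item module above is importable):

    from qcwy.items import QcwyItem

    item = QcwyItem()
    item['Positionname'] = '产品经理'   # assigning a declared field works like a dict
    print(item['Positionname'], dict(item))
    # item['foo'] = 1 would raise KeyError, because 'foo' was never declared as a Field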

    3. Write the spider file

    # -*- coding: utf-8 -*-
    import scrapy
    from qcwy.items import QcwyItem
    
    
    class QcwyjobSpider(scrapy.Spider):
        name = 'Qcwyjob'
        allowed_domains = ['51job.com']
        start_urls = [
            'http://search.51job.com/list/030200,000000,0000,00,9,07%252C08,%25E4%25BA%25A7%25E5%2593%2581%25E7%25BB%258F%25E7%2590%2586%2B,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=4&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']
    
        def parse(self, response):
            # grab all job-listing divs on the results page
            jobs = response.xpath(".//div[@class='el']")[4:]
            for job in jobs:
                item = QcwyItem()
    
                item['Positionname'] = job.xpath(".//p/span/a/text()").extract()[0].strip()
                item['Companyname'] = job.xpath(".//span[@class='t2']/a/text()").extract()[0]
                item['Workplace'] = job.xpath(".//span[@class='t3']/text()").extract()[0]
                item['Link'] = job.xpath(".//p/span/a/@href").extract()[0]
                try:
                    item['Salary'] = job.xpath(".//span[@class='t4']/text()").extract()[0]
                except IndexError:
                    item['Salary'] = '面议'  # some postings omit the salary, so the empty list would raise IndexError; fall back to "negotiable"
                item['Posttime'] = job.xpath(".//span[@class='t5']/text()").extract()[0]
                url = item['Link']  # reuse the detail-page URL already extracted
                yield scrapy.Request(url, callback=self.parse_detail, dont_filter=True, meta={'key': item})
            # follow the next-page link; stop when there is none
            next_page = response.xpath(".//li[@class='bk'][2]/a/@href").extract_first()
            if next_page:
                yield scrapy.Request(next_page, callback=self.parse)
    
        def parse_detail(self, response):
            for info in response.xpath(".//div[@class='t1']"):
                try:
                    item = response.meta['key']
                    item['Experience'] = info.xpath(".//span[@class='sp4'][1]/text()").extract()[0]  # work experience
                    item['Xueli'] = info.xpath(".//span[@class='sp4'][2]/text()").extract()[0]  # education requirement
                    item['Number'] = info.xpath(".//span[@class='sp4'][3]/text()").extract()[0]  # number of openings
                except IndexError:
                    continue  # skip detail blocks missing any of the three spans
                yield item
    
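    Before running the whole spider, it is worth checking the XPath expressions interactively with scrapy shell. A quick session might look like this (pass the start_urls entry from the spider as the argument; 51job may change its markup at any time):

    scrapy shell "<start_url from QcwyjobSpider>"
    >>> jobs = response.xpath(".//div[@class='el']")[4:]
    >>> jobs[0].xpath(".//p/span/a/text()").extract_first()             # position name
    >>> jobs[0].xpath(".//span[@class='t2']/a/text()").extract_first()  # company name
    >>> jobs[0].xpath(".//p/span/a/@href").extract_first()              # detail-page link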

    4. Write the pipeline file

    import pymysql
    
    class QcwyPipeline(object):
        def process_item(self, item, spider):
            conn = pymysql.connect(host="127.0.0.1", user="root", passwd="654321", db="qcwy", charset='utf8')
            cursor = conn.cursor()
            cursor.execute('set names utf8')  # make sure this session uses utf8
            cursor.execute('set autocommit=1')  # enable autocommit
            Positionname = item["Positionname"]
            Companyname = item["Companyname"]
            Workplace = item["Workplace"]
            Posttime = item["Posttime"]
            Experience = item["Experience"]
            Xueli = item["Xueli"]
            Salary = item["Salary"]
            Number = item["Number"]
            Link = item["Link"]
            sql = "insert into jobs(positionname,companyname,workplace,posttime,experience,xueli,salary,number,link) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            param = (Positionname, Companyname, Workplace, Posttime, Experience,Xueli,Salary,Number,Link)
            print(param)
            cursor.execute(sql, param)
            conn.commit()
            cursor.close()
            conn.close()
            return item
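
    The pipeline above opens and closes a MySQL connection for every single item, which works but is slow. A common variant is to open one connection in open_spider and release it in close_spider; a minimal sketch of that approach, using the same table and credentials as above:

    import pymysql

    class QcwyPipeline(object):
        def open_spider(self, spider):
            # one connection for the whole crawl
            self.conn = pymysql.connect(host="127.0.0.1", user="root", passwd="654321",
                                        db="qcwy", charset='utf8')
            self.cursor = self.conn.cursor()

        def close_spider(self, spider):
            self.cursor.close()
            self.conn.close()

        def process_item(self, item, spider):
            sql = ("insert into jobs(positionname,companyname,workplace,posttime,"
                   "experience,xueli,salary,number,link) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            param = (item["Positionname"], item["Companyname"], item["Workplace"],
                     item["Posttime"], item["Experience"], item["Xueli"],
                     item["Salary"], item["Number"], item["Link"])
            self.cursor.execute(sql, param)
            self.conn.commit()
            return item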
    
    
    

    5. Edit settings.py

    
    BOT_NAME = 'qcwy'
    
    SPIDER_MODULES = ['qcwy.spiders']
    NEWSPIDER_MODULE = 'qcwy.spiders'
    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    COOKIES_ENABLED = False
    DEFAULT_REQUEST_HEADERS = {
    
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Referer': 'http://www.51job.com/',
    
    }
    ITEM_PIPELINES = {
       'qcwy.pipelines.QcwyPipeline': 300,
    }
    
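    To confirm the project is actually picking these values up, Scrapy can print a single setting from the command line:

    scrapy settings --get DOWNLOAD_DELAY
    scrapy settings --get ROBOTSTXT_OBEY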

    6. To write the scraped content to a database, create the database qcwy

    Create the jobs table:

    create database if not exists qcwy default character set utf8;
    use qcwy;

    create table jobs(id int AUTO_INCREMENT PRIMARY KEY, positionname VARCHAR(200), companyname VARCHAR(200), salary VARCHAR(200),
    workplace VARCHAR(200), posttime VARCHAR(200), experience VARCHAR(200), xueli VARCHAR(200), number VARCHAR(50), link VARCHAR(500));
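
    Once the spider has run (step 7), a quick query confirms that rows are landing in the table:

    SELECT positionname, companyname, salary, workplace FROM jobs ORDER BY id DESC LIMIT 10;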
    
    

    7. Run the spider

    scrapy crawl Qcwyjob
     
    
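    Besides the MySQL pipeline, Scrapy's feed export is a handy sanity check; the same crawl can also dump the items to a file:

    scrapy crawl Qcwyjob -o jobs.json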

    After 400-odd records had been scraped, the site started refusing access.
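
    That suggests the crawl rate is still too aggressive for the site. One mitigation worth trying (untested against 51job's current limits) is to enable Scrapy's AutoThrottle extension and raise the delay in settings.py:

    # additional settings.py entries -- values are guesses, tune as needed
    AUTOTHROTTLE_ENABLED = True
    AUTOTHROTTLE_START_DELAY = 5
    AUTOTHROTTLE_MAX_DELAY = 60
    AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    DOWNLOAD_DELAY = 5               # raise the base delay from 3 to 5 seconds
    RANDOMIZE_DOWNLOAD_DELAY = True
    RETRY_HTTP_CODES = [403, 429]    # retry responses that typically signal a block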

    Reference:
    http://blog.csdn.net/DDCooper/article/details/79217499

