Scraping 51job with Scrapy

Author: whong736 | Published 2018-03-17 07:24

1. Create the project: qcwy

scrapy startproject qcwy

cd qcwy

scrapy genspider Qcwyjob 51job.com
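
These commands should leave a project layout roughly like this (a sketch; the exact files generated depend on your Scrapy version):

qcwy/
├── scrapy.cfg
└── qcwy/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── Qcwyjob.py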

2. Open the qcwy project in PyCharm and write the item first, listing the fields to collect

Full code:

import scrapy

class QcwyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    Positionname = scrapy.Field()  # job title
    Companyname = scrapy.Field()   # company name
    Salary = scrapy.Field()        # salary / benefits
    Workplace = scrapy.Field()     # work location
    Posttime = scrapy.Field()      # posting date
    Experience = scrapy.Field()    # work experience required
    Xueli = scrapy.Field()         # education requirement
    Number = scrapy.Field()        # number of openings
    Link = scrapy.Field()          # detail-page URL
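
Scrapy items behave like dicts, so the field definitions can be sanity-checked from a Python shell in the project root (a minimal sketch; the sample values are made up):

from qcwy.items import QcwyItem

item = QcwyItem(Positionname='产品经理', Salary='面议')
print(dict(item))  # -> {'Positionname': '产品经理', 'Salary': '面议'}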

3. Write the spider

# -*- coding: utf-8 -*-
import scrapy
from qcwy.items import QcwyItem


class QcwyjobSpider(scrapy.Spider):
    name = 'Qcwyjob'
    allowed_domains = ['51job.com']
    start_urls = [
        'http://search.51job.com/list/030200,000000,0000,00,9,07%252C08,%25E4%25BA%25A7%25E5%2593%2581%25E7%25BB%258F%25E7%2590%2586%2B,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=4&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']

    def parse(self, response):
        # grab every job-listing div; the first few 'el' divs are header rows, hence the [4:] slice below
        jobs = response.xpath(".//div[@class='el']")[4:]
        for job in jobs:
            item = QcwyItem()

            item['Positionname'] = job.xpath(".//p/span/a/text()").extract()[0].strip()
            item['Companyname'] = job.xpath(".//span[@class='t2']/a/text()").extract()[0]
            item['Workplace'] = job.xpath(".//span[@class='t3']/text()").extract()[0]
            item['Link'] = job.xpath(".//p/span/a/@href").extract()[0]
            try:
                item['Salary'] = job.xpath(".//span[@class='t4']/text()").extract()[0]
            except IndexError:
                item['Salary'] = '面议'  # some listings leave the salary blank, so extract() returns an empty list
            item['Posttime'] = job.xpath(".//span[@class='t5']/text()").extract()[0]
            # the remaining fields live on the detail page; pass the item along via meta
            yield scrapy.Request(item['Link'], callback=self.parse_detail, dont_filter=True, meta={'key': item})
        # follow the "next page" link; on the last page the xpath matches nothing
        next_page = response.xpath(".//li[@class='bk'][2]/a/@href").extract()
        if next_page:
            yield scrapy.Request(next_page[0], callback=self.parse)

    def parse_detail(self, response):
        for info in response.xpath(".//div[@class='t1']"):
            try:
                item = response.meta['key']
                item['Experience'] = info.xpath(".//span[@class='sp4'][1]/text()").extract()[0]  # work experience
                item['Xueli'] = info.xpath(".//span[@class='sp4'][2]/text()").extract()[0]  # education requirement
                item['Number'] = info.xpath(".//span[@class='sp4'][3]/text()").extract()[0]  # number of openings
            except IndexError:
                continue
            yield item
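
Before running the full crawl, the XPath expressions above can be tested interactively with scrapy shell against the same listing URL (results depend on the live page):

scrapy shell "http://search.51job.com/list/030200,000000,0000,00,9,07%252C08,%25E4%25BA%25A7%25E5%2593%2581%25E7%25BB%258F%25E7%2590%2586%2B,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=4&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
>>> jobs = response.xpath(".//div[@class='el']")[4:]
>>> jobs[0].xpath(".//p/span/a/text()").extract()[0].strip()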

4. Write the pipeline

import pymysql


class QcwyPipeline(object):
    def open_spider(self, spider):
        # open one connection for the whole crawl instead of reconnecting per item
        self.conn = pymysql.connect(host="127.0.0.1", user="root", passwd="654321",
                                    db="qcwy", charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = ("insert into jobs(positionname,companyname,workplace,posttime,"
               "experience,xueli,salary,number,link) "
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        # parameterized query: pymysql escapes the values, so odd characters
        # in job titles cannot break the statement
        param = (item["Positionname"], item["Companyname"], item["Workplace"],
                 item["Posttime"], item["Experience"], item["Xueli"],
                 item["Salary"], item["Number"], item["Link"])
        self.cursor.execute(sql, param)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
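
To confirm rows are actually landing in MySQL, a minimal check from a Python shell (assuming the same credentials the pipeline uses) is:

import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", passwd="654321", db="qcwy", charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) FROM jobs")
    print("rows stored:", cursor.fetchone()[0])
conn.close()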


5. Write the settings


BOT_NAME = 'qcwy'

SPIDER_MODULES = ['qcwy.spiders']
NEWSPIDER_MODULE = 'qcwy.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3  # throttle requests; 51job refuses access to aggressive crawlers
COOKIES_ENABLED = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Referer': 'http://www.51job.com/',
}
ITEM_PIPELINES = {
   'qcwy.pipelines.QcwyPipeline': 300,
}

6. To write the scraped items into MySQL, create the qcwy database
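
In the MySQL client that is (utf8 chosen to match the charset the pipeline connects with):

create database qcwy default character set utf8;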

Then create the jobs table:

create table jobs(
    id int AUTO_INCREMENT PRIMARY KEY,
    positionname VARCHAR(200),
    companyname VARCHAR(200),
    salary VARCHAR(200),
    workplace VARCHAR(200),
    posttime VARCHAR(200),
    experience VARCHAR(200),
    xueli VARCHAR(200),
    number VARCHAR(50),
    link VARCHAR(500)
);

7. Run the spider

scrapy crawl Qcwyjob
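
For a quick look at the results without touching MySQL, Scrapy's built-in feed export can also dump the items to a file:

scrapy crawl Qcwyjob -o jobs.json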
 

After scraping 400-odd records, the site started refusing access.
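
One thing that might help, beyond the DOWNLOAD_DELAY already set above, is rotating the User-Agent on every request with a small downloader middleware. A sketch (not part of the original project; the extra UA strings are just examples):

# qcwy/middlewares.py -- hypothetical RandomUserAgentMiddleware
import random

USER_AGENTS = [
    # the UA already used in settings.py, plus a couple of alternatives
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
]

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # pick a different browser identity for every outgoing request
        request.headers['User-Agent'] = random.choice(USER_AGENTS)

Then enable it in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'qcwy.middlewares.RandomUserAgentMiddleware': 543,
}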

Reference:
http://blog.csdn.net/DDCooper/article/details/79217499


