1. Create the project: qcwy
scrapy startproject qcwy
cd qcwy
scrapy genspider Qcwyjob 51job.com
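After these two commands, the generated project should look roughly like this (the standard layout produced by scrapy startproject; file names can differ slightly between Scrapy versions):

qcwy/
├── scrapy.cfg
└── qcwy/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── Qcwyjob.py   # created by "scrapy genspider Qcwyjob 51job.com"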
2. Open the qcwy project in PyCharm and write the item first, listing the fields to collect
Full code:
import scrapy


class QcwyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Positionname = scrapy.Field()  # job title
    Companyname = scrapy.Field()   # company name
    Salary = scrapy.Field()        # salary / benefits
    Workplace = scrapy.Field()     # work location
    Posttime = scrapy.Field()      # posting date
    Experience = scrapy.Field()    # work experience required
    Xueli = scrapy.Field()         # education required
    Number = scrapy.Field()        # number of openings
    Link = scrapy.Field()          # job detail URL
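A QcwyItem behaves like a dictionary, which is exactly how the spider below fills it; a quick illustration (the values are made up):

item = QcwyItem()
item['Positionname'] = '产品经理'
item['Salary'] = '1.5-2万/月'
print(dict(item))   # {'Positionname': '产品经理', 'Salary': '1.5-2万/月'}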
3. Write the spider file
# -*- coding: utf-8 -*-
import scrapy
from qcwy.items import QcwyItem


class QcwyjobSpider(scrapy.Spider):
    name = 'Qcwyjob'
    allowed_domains = ['51job.com']
    start_urls = [
        'http://search.51job.com/list/030200,000000,0000,00,9,07%252C08,%25E4%25BA%25A7%25E5%2593%2581%25E7%25BB%258F%25E7%2590%2586%2B,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=4&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']

    def parse(self, response):
        # every job listing row is a div.el; the first few are table headers
        jobs = response.xpath(".//div[@class='el']")[4:]
        for job in jobs:
            item = QcwyItem()
            item['Positionname'] = job.xpath(".//p/span/a/text()").extract()[0].strip()
            item['Companyname'] = job.xpath(".//span[@class='t2']/a/text()").extract()[0]
            item['Workplace'] = job.xpath(".//span[@class='t3']/text()").extract()[0]
            item['Link'] = job.xpath(".//p/span/a/@href").extract()[0]
            try:
                item['Salary'] = job.xpath(".//span[@class='t4']/text()").extract()[0]
            except IndexError:
                # some companies omit the salary, so the XPath returns an empty list
                item['Salary'] = '面议'
            item['Posttime'] = job.xpath(".//span[@class='t5']/text()").extract()[0]
            # follow the detail page and carry the partly filled item along in meta
            yield scrapy.Request(item['Link'], callback=self.parse_detail,
                                 dont_filter=True, meta={'key': item})
        # pagination: the second li.bk holds the "next page" link (absent on the last page)
        next_page = response.xpath(".//li[@class='bk'][2]/a/@href").extract_first()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_detail(self, response):
        for info in response.xpath(".//div[@class='t1']"):
            try:
                item = response.meta['key']
                item['Experience'] = info.xpath(".//span[@class='sp4'][1]/text()").extract()[0]  # work experience
                item['Xueli'] = info.xpath(".//span[@class='sp4'][2]/text()").extract()[0]       # education requirement
                item['Number'] = info.xpath(".//span[@class='sp4'][3]/text()").extract()[0]      # number of openings
            except IndexError:
                continue
            yield item
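The hrefs on this listing page are absolute URLs, but if a relative link ever appears, response.urljoin resolves it against the current page. A minimal sketch of the two request lines with that guard added (everything else in parse stays the same):

# inside parse(), after the item has been filled
yield scrapy.Request(response.urljoin(item['Link']), callback=self.parse_detail,
                     dont_filter=True, meta={'key': item})

# pagination, after the for loop
next_page = response.xpath(".//li[@class='bk'][2]/a/@href").extract_first()
if next_page:
    yield scrapy.Request(response.urljoin(next_page), callback=self.parse)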
4. Write the pipeline file
import pymysql


class QcwyPipeline(object):
    def process_item(self, item, spider):
        conn = pymysql.connect(host="127.0.0.1", user="root", passwd="654321",
                               db="qcwy", charset='utf8')
        cursor = conn.cursor()
        cursor.execute('set names utf8')    # force utf8 on the connection
        cursor.execute('set autocommit=1')  # enable autocommit
        Positionname = item["Positionname"]
        Companyname = item["Companyname"]
        Workplace = item["Workplace"]
        Posttime = item["Posttime"]
        Experience = item["Experience"]
        Xueli = item["Xueli"]
        Salary = item["Salary"]
        Number = item["Number"]
        Link = item["Link"]
        sql = ("insert into jobs(positionname,companyname,workplace,posttime,"
               "experience,xueli,salary,number,link) "
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        param = (Positionname, Companyname, Workplace, Posttime, Experience,
                 Xueli, Salary, Number, Link)
        print(param)
        cursor.execute(sql, param)
        conn.commit()
        cursor.close()
        conn.close()
        return item
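Opening and closing a MySQL connection for every single item works, but it gets slow once the crawl grows. Below is a minimal sketch of an alternative pipeline that keeps one connection for the whole run, using Scrapy's open_spider/close_spider hooks (the class name QcwyMySQLPipeline is just an example; the credentials and table are assumed to be the same as above). To use it, register it in ITEM_PIPELINES instead of QcwyPipeline.

import pymysql


class QcwyMySQLPipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts: open a single shared connection
        self.conn = pymysql.connect(host="127.0.0.1", user="root", passwd="654321",
                                    db="qcwy", charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = ("insert into jobs(positionname,companyname,workplace,posttime,"
               "experience,xueli,salary,number,link) "
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        self.cursor.execute(sql, (item["Positionname"], item["Companyname"],
                                  item["Workplace"], item["Posttime"],
                                  item["Experience"], item["Xueli"],
                                  item["Salary"], item["Number"], item["Link"]))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        # called once when the spider finishes: clean up
        self.cursor.close()
        self.conn.close()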
5. Write the settings
BOT_NAME = 'qcwy'
SPIDER_MODULES = ['qcwy.spiders']
NEWSPIDER_MODULE = 'qcwy.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Referer': 'http://www.51job.com/',
}
ITEM_PIPELINES = {
    'qcwy.pipelines.QcwyPipeline': 300,
}
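DOWNLOAD_DELAY already spaces requests a fixed 3 seconds apart. Optionally, Scrapy's AutoThrottle extension can adjust the delay based on how the server responds; a minimal sketch of the extra settings (the values are illustrative, not tuned for 51job):

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 3           # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 15            # upper bound when the server is slow
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # average concurrent requests per remote server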
6. To write the scraped content to the database, create the qcwy database
create database qcwy default character set utf8;
Create the jobs table (note the link column, which the pipeline inserts into):
create table jobs(id int AUTO_INCREMENT PRIMARY KEY, positionname VARCHAR(200), companyname VARCHAR(200) UNIQUE, salary VARCHAR(200),
workplace VARCHAR(200), posttime VARCHAR(200), experience VARCHAR(200), xueli VARCHAR(200), number VARCHAR(50), link VARCHAR(500));
7. Run the spider
scrapy crawl Qcwyjob
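To confirm the pipeline is really writing rows, a quick check can be run from a Python shell while the crawl is going (a minimal sketch, assuming the same local MySQL credentials as the pipeline):

import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", passwd="654321",
                       db="qcwy", charset='utf8')
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM jobs")
print(cursor.fetchone()[0])            # number of rows stored so far
cursor.execute("SELECT positionname, salary, workplace FROM jobs LIMIT 5")
for row in cursor.fetchall():          # peek at a few records
    print(row)
cursor.close()
conn.close()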
After scraping 400-odd records, the site started refusing access.
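One common mitigation is to rotate the User-Agent instead of sending the identical header on every request. Below is a minimal sketch of a downloader middleware that does this; the class name and the list of agents are illustrative, and it is enabled by adding an entry to DOWNLOADER_MIDDLEWARES in settings.py:

# qcwy/middlewares.py
import random


class RandomUserAgentMiddleware(object):
    # small pool of browser User-Agent strings; extend as needed
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    ]

    def process_request(self, request, spider):
        # pick a random User-Agent for every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)

# in settings.py:
# DOWNLOADER_MIDDLEWARES = {
#     'qcwy.middlewares.RandomUserAgentMiddleware': 400,
# }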
Reference:
http://blog.csdn.net/DDCooper/article/details/79217499