Scrapy
Install Scrapy and its dependencies:
pip install scrapy
pip install pyOpenSSL
pip install cryptography
pip install CFFI
pip install lxml
pip install cssselect
pip install Twisted
Create a spider project
scrapy startproject ZhipinSpider
Generate a spider inside the project
scrapy genspider job_position "zhipin.com"
Project structure:
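After the two commands above, the project follows the standard Scrapy layout:

ZhipinSpider/
    scrapy.cfg                 # deployment configuration
    ZhipinSpider/              # the project's Python module
        __init__.py
        items.py               # item definitions
        middlewares.py         # spider / downloader middlewares
        pipelines.py           # item pipelines
        settings.py            # project settings
        spiders/
            __init__.py
            job_position.py    # the spider generated by the genspider command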
items.py: defines the item fields to be scraped
pipelines.py: processes the scraped items
settings.py: the project configuration file
First, experiment with the data in the Scrapy shell:
scrapy shell -s USER_AGENT="xx" https://www.zhipin.com/c101280100/h_101280100/
Setting USER_AGENT this way makes Scrapy masquerade as a browser.
XPath syntax
/   selects from the root node
//  selects matching nodes anywhere in the document
.   the current node
..  the parent node
@   selects attributes
//div[@title="xxx"]/div
extract() extracts the content of the matched nodes.
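A quick sanity check inside the shell session opened above might look like this; the job-primary / job-title class names come from the spider code later in this post and are assumptions about the page's markup:

>>> response.xpath('//div[@class="job-primary"]').extract_first()    # HTML of the first job card
>>> response.xpath('//div[@class="job-title"]/text()').extract()     # all job titles on the page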
CSS selectors
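Scrapy also accepts CSS selectors through response.css(); rough CSS equivalents of the XPath queries above (under the same assumption about class names) look like this:

>>> response.css('div.job-primary div.job-title::text').extract_first()   # first job title
>>> response.css('div.info-primary h3 a::attr(href)').extract()           # detail-page links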
items.py
import scrapy

class ZhipinspiderItem(scrapy.Item):
    # job title
    title = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # hiring company
    company = scrapy.Field()
    # URL of the job detail page
    url = scrapy.Field()
    # work location
    work_addr = scrapy.Field()
    # industry
    industry = scrapy.Field()
    # company size
    company_size = scrapy.Field()
    # recruiter
    recruiter = scrapy.Field()
    # publish date
    publish_date = scrapy.Field()
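A ZhipinspiderItem behaves like a dict, so fields are read and written with subscript access; a tiny illustrative snippet (the values are placeholders):

item = ZhipinspiderItem(title='Python Engineer', salary='15K-25K')
item['company'] = 'Example Co.'                  # set a declared field
print(item['title'], item.get('company_size'))   # .get() returns None for fields never set
# assigning to a field that is not declared as a Field raises KeyError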
spiders/job_position.py
import scrapy
from ZhipinSpider.items import ZhipinspiderItem

class JobPositionSpider(scrapy.Spider):
    # name of this spider
    name = 'job_position'
    # domains this spider is allowed to crawl
    allowed_domains = ['zhipin.com']
    # list of start URLs for this spider
    start_urls = ['https://www.zhipin.com/c101280100/h_101280100/']

    # this method extracts the information contained in the response;
    # response is what the downloader returns for each URL in start_urls
    def parse(self, response):
        # iterate over every //div[@class="job-primary"] node on the page
        for job_primary in response.xpath('//div[@class="job-primary"]'):
            item = ZhipinspiderItem()
            # match the ./div[@class="info-primary"] node under the current node,
            # i.e. the <div.../> element that holds the job information
            info_primary = job_primary.xpath('./div[@class="info-primary"]')
            item['title'] = info_primary.xpath('./h3/a/div[@class="job-title"]/text()').extract_first()
            item['salary'] = info_primary.xpath('./h3/a/span[@class="red"]/text()').extract_first()
            item['work_addr'] = info_primary.xpath('./p/text()').extract_first()
            item['url'] = info_primary.xpath('./h3/a/@href').extract_first()
            # match the ./div[@class="info-company"]/div[@class="company-text"] node,
            # i.e. the <div.../> element that holds the company information
            company_text = job_primary.xpath('./div[@class="info-company"]/div[@class="company-text"]')
            item['company'] = company_text.xpath('./h3/a/text()').extract_first()
            company_info = company_text.xpath('./p/text()').extract()
            if company_info and len(company_info) > 0:
                item['industry'] = company_info[0]
            if company_info and len(company_info) > 2:
                item['company_size'] = company_info[2]
            # match the ./div[@class="info-publis"] node,
            # i.e. the <div.../> element that holds the publisher information
            info_publis = job_primary.xpath('./div[@class="info-publis"]')
            item['recruiter'] = info_publis.xpath('./h3/text()').extract_first()
            item['publish_date'] = info_publis.xpath('./p/text()').extract_first()
            yield item
        # extract the link to the next page
        new_links = response.xpath('//div[@class="page"]/a[@class="next"]/@href').extract()
        if new_links and len(new_links) > 0:
            # take the next-page link
            new_link = new_links[0]
            # send another request to fetch the next page
            yield scrapy.Request("https://www.zhipin.com" + new_link, callback=self.parse)
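As a side note, the manual string concatenation for the next-page URL can be avoided with response.follow(), which resolves relative links against the current page. A sketch of that variant (an alternative ending for parse(), not the code used above):

    next_href = response.xpath('//div[@class="page"]/a[@class="next"]/@href').extract_first()
    if next_href:
        # response.follow() turns the relative href into an absolute URL automatically
        yield response.follow(next_href, callback=self.parse)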
pipelines.py
class ZhipinspiderPipeline(object):
    def process_item(self, item, spider):
        print("Job title:", item['title'])
        print("Salary:", item['salary'])
        print("Location:", item['work_addr'])
        print("Detail URL:", item['url'])
        print("Company:", item['company'])
        print("Industry:", item['industry'])
        # .get() avoids a KeyError when the field was never populated
        print("Company size:", item.get('company_size'))
        print("Recruiter:", item['recruiter'])
        print("Publish date:", item['publish_date'])
        # pass the item on to any later pipeline
        return item
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for ZhipinSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ZhipinSpider'

SPIDER_MODULES = ['ZhipinSpider.spiders']
NEWSPIDER_MODULE = 'ZhipinSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ZhipinSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# Configure the default request headers so the crawler looks like a real browser
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ZhipinSpider.middlewares.ZhipinspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ZhipinSpider.middlewares.ZhipinspiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Enable the item pipeline defined in pipelines.py
ITEM_PIPELINES = {
    'ZhipinSpider.pipelines.ZhipinspiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Run the spider
scrapy crawl job_position
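The items can also be written straight to a file with Scrapy's built-in feed export; the output file name here is just an example:

scrapy crawl job_position -o job_positions.json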
Storing the data in a database: pipelines.py
# Import the module used to access MySQL
import mysql.connector

class ZhipinspiderPipeline(object):
    # the constructor opens the database connection
    def __init__(self):
        self.conn = mysql.connector.connect(user='root', password='32147',
            host='localhost', port='3306',
            database='python', use_unicode=True)
        self.cur = self.conn.cursor()

    # the close_spider callback releases the database resources
    def close_spider(self, spider):
        print('---------- closing database resources -----------')
        # close the cursor
        self.cur.close()
        # close the connection
        self.conn.close()

    def process_item(self, item, spider):
        self.cur.execute("INSERT INTO job_inf VALUES(null, %s, %s, %s, %s, %s, "
            "%s, %s, %s, %s)",
            (item['title'], item['salary'], item['company'],
             item['url'], item['work_addr'], item['industry'],
             item.get('company_size'), item['recruiter'], item['publish_date']))
        self.conn.commit()
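The INSERT above assumes a job_inf table already exists in the python database, with an auto-increment primary key followed by nine string columns. A sketch of how such a table could be created; the column names and lengths are assumptions inferred from the INSERT, not taken from the original post:

import mysql.connector

conn = mysql.connector.connect(user='root', password='32147',
    host='localhost', port='3306', database='python')
cur = conn.cursor()
# one auto-increment id plus nine VARCHAR columns, matching the nine %s placeholders
cur.execute("""
    CREATE TABLE IF NOT EXISTS job_inf (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255), salary VARCHAR(255), company VARCHAR(255),
        url VARCHAR(500), work_addr VARCHAR(255), industry VARCHAR(255),
        company_size VARCHAR(255), recruiter VARCHAR(255), publish_date VARCHAR(255)
    )
""")
conn.commit()
cur.close()
conn.close()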
Dealing with anti-crawler measures
Rotate the IP address (use proxies): middlewares.py
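A minimal sketch of a downloader middleware that routes each request through a proxy; the class name RandomProxyMiddleware and the PROXIES list are illustrative placeholders, and the middleware still has to be enabled in DOWNLOADER_MIDDLEWARES:

# middlewares.py (sketch)
import random

PROXIES = [
    'http://127.0.0.1:8888',   # placeholder proxy addresses
    'http://127.0.0.1:8889',
]

class RandomProxyMiddleware(object):
    # called for every outgoing request
    def process_request(self, request, spider):
        # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy']
        request.meta['proxy'] = random.choice(PROXIES)

# settings.py (enable the middleware)
# DOWNLOADER_MIDDLEWARES = {
#     'ZhipinSpider.middlewares.RandomProxyMiddleware': 543,
# }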
Disable cookies: in settings.py set
COOKIES_ENABLED = False
Ignore robots.txt rules: set ROBOTSTXT_OBEY = False in settings.py.
Limit the request rate
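Concretely, this means enabling the delay / AutoThrottle settings that are commented out in the settings.py shown above; the values below are illustrative:

DOWNLOAD_DELAY = 2              # seconds to wait between requests to the same site
AUTOTHROTTLE_ENABLED = True     # let Scrapy adjust the delay based on server load
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60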