Reference: http://www.jtahstu.com/blog/scrapy_zhipin_spider.html
This version adds support for crawling multiple cities.
Spider file: BoosZhiPin_Spider.py
Path: zhaopin/zhaopin/spiders/BoosZhiPin_Spider.py
import scrapy
from ..items import BoosZhiPinItem
import time
import json
from furl import furl

'''
Purpose: crawl job postings from BOSS直聘 (zhipin.com)
Inputs: the target cities and the job keyword to search for
Run with: scrapy crawl BoosZhiPin
'''


class BoosZhiPin(scrapy.Spider):
    name = 'BoosZhiPin'  # spider name used on the command line
    allowed_domains = ['www.zhipin.com']  # with OffsiteMiddleware enabled, URLs whose domain is not listed here are not followed
    start_urls = ['https://www.zhipin.com/wapi/zpCommon/data/city.json']  # initial URL: the city-code lookup
    city_name = ['乌鲁木齐', '喀什']  # cities to crawl
    city_code_list = []  # resolved city codes
    query = 'python'  # job keyword to search for
    F = furl('https://www.zhipin.com/job_detail/?')  # URL template
    # request headers, pretending to be a browser
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

    def parse(self, response):
        self.get_city_code(response)  # resolve the city codes
        for c in self.city_code_list:  # build one request per resolved city code
            yield self.request_city(c)
    # resolve city codes from city.json
    def get_city_code(self, response):
        city_data = json.loads(response.text)
        for city_name in self.city_name:
            for area in city_data['zpData']['cityList']:  # loop over the top-level regions
                for city in area['subLevelModelList']:  # loop over the cities in that region
                    if city['name'] == city_name:  # keep only the cities we want to crawl
                        self.city_code_list.append(str(city['code']))
    # build the request for one page of one city
    def request_city(self, city_code, page=0):
        '''Build the request object for a specific city, advancing to the next page'''
        page += 1
        url_data = {
            'city': city_code,
            'query': self.query,
            'page': page
        }
        # URL of the listing page to crawl,
        # e.g. https://www.zhipin.com/job_detail/?city=<city_code>&query=python&page=1
        url = self.F.copy().add(url_data).url
        req = scrapy.Request(url, callback=self.get_data, dont_filter=False, headers=self.headers)
        # pass extra data via meta; the callback can read it back from response.meta
        req.meta['city_code'] = city_code
        req.meta['page'] = page
        return req
    # extract the job data from one listing page
    def get_data(self, response):
        job_list = response.css('div.job-list > ul > li')
        for job in job_list:
            item = BoosZhiPinItem()
            job_primary = job.css('div.job-primary')
            item['pid'] = job.css(
                'div.info-primary > h3 > a::attr(data-jobid)').extract_first().strip()
            item["positionName"] = job_primary.css(
                'div.info-primary > h3 > a::text').extract_first().strip()
            item["salary"] = job_primary.css(
                'div.info-primary > h3 > a > span::text').extract_first().strip()
            info_primary = job_primary.css(
                'div.info-primary > p::text').extract()
            item['city'] = info_primary[0].strip()
            item['workYear'] = info_primary[1].strip()
            item['education'] = info_primary[2].strip()
            item['companyShortName'] = job_primary.css(
                'div.info-company > div.company-text > h3 > a::text'
            ).extract_first().strip()
            company_infos = job_primary.css(
                'div.info-company > div.company-text > p::text').extract()
            if len(company_infos) == 3:  # some postings only carry two fields here, so guard the indexing
                item['industryField'] = company_infos[0].strip()
                item['financeStage'] = company_infos[1].strip()
                item['companySize'] = company_infos[2].strip()
            item['positionLables'] = job.css(
                'li > div.job-tags > span::text').extract()
            item['time'] = job.css('span.time::text').extract_first()
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            yield item
        city_code = response.meta['city_code']
        page = response.meta['page']
        if job_list:  # keep going only while the page still returns results
            # request the next page; request_city() increments the page number itself,
            # so passing the current page here avoids skipping one
            time.sleep(5)  # can be removed if you rotate enough IPs
            yield self.request_city(city_code, page=page)
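For reference, get_city_code() assumes the city.json response is shaped roughly like the dict below. The structure is inferred from the parsing code above; the names and codes are purely illustrative, so check the live response before relying on them.

# Assumed (illustrative) shape of https://www.zhipin.com/wapi/zpCommon/data/city.json
city_json_example = {
    "zpData": {
        "cityList": [
            {
                "name": "新疆",  # top-level region
                "subLevelModelList": [
                    {"name": "乌鲁木齐", "code": 101130100},  # city-level entry: name + code (code values made up here)
                    {"name": "喀什", "code": 101131000},
                ],
            },
        ],
    },
}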
Items file: items.py
Path: zhaopin/zhaopin/items.py
import scrapy


class BoosZhiPinItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pid = scrapy.Field()
    positionName = scrapy.Field()
    positionLables = scrapy.Field()
    workYear = scrapy.Field()
    salary = scrapy.Field()
    city = scrapy.Field()
    education = scrapy.Field()
    companyShortName = scrapy.Field()
    industryField = scrapy.Field()
    financeStage = scrapy.Field()
    companySize = scrapy.Field()
    time = scrapy.Field()
    updated_at = scrapy.Field()
Settings file: settings.py
Path: zhaopin/zhaopin/settings.py
BOT_NAME = 'zhaopin'

SPIDER_MODULES = ['zhaopin.spiders']
NEWSPIDER_MODULE = 'zhaopin.spiders'

ROBOTSTXT_OBEY = False

# If you are using MongoDB, uncomment these:
# ITEM_PIPELINES = {
#     'zhaopin.pipelines.ZhaopinPipeline': 300,
# }
# MONGO_HOST = "127.0.0.1"          # host IP
# MONGO_PORT = 27017                # port
# MONGO_DB = "scrapy_mongo"         # database name
# MONGO_COLL = "scrapy_collection"  # collection name
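A side note while we are in settings.py: the spider above throttles itself with time.sleep(5) inside get_data(). Scrapy has built-in settings for the same purpose, so an alternative (just a sketch, not part of the original project) is to drop the sleep and add:

DOWNLOAD_DELAY = 5               # wait roughly 5 seconds between requests
RANDOMIZE_DOWNLOAD_DELAY = True  # jitter the delay between 0.5x and 1.5x of DOWNLOAD_DELAY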
Alright, you can run it now. Make sure the terminal is in the project root directory:

[Screenshot: running the spider]

scrapy crawl BoosZhiPin

[Screenshot: data in MongoDB]
[Screenshot: terminal output]
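If you do not have MongoDB set up yet, Scrapy's built-in feed export can dump the scraped items straight to a file instead; for example:

scrapy crawl BoosZhiPin -o jobs.json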
(Read this part if you want to store the data in MongoDB.)
Uncomment the MongoDB-related lines in settings.py, then add the code below to pipelines.py.
Be aware that Scrapy 1.6 and 1.7 differ here: 1.7 removed the scrapy.conf module, which was replaced by

from scrapy.utils.project import get_project_settings

Scrapy 1.7 version:
import pymongo
from scrapy.utils.project import get_project_settings


class ZhaopinPipeline(object):
    def __init__(self):
        settings = get_project_settings()
        # connect to the database
        client = pymongo.MongoClient(host=settings.get('MONGO_HOST'), port=settings.get('MONGO_PORT'))
        self.db = client[settings.get('MONGO_DB')]       # handle to the database
        self.coll = self.db[settings.get('MONGO_COLL')]  # handle to the collection
        # if the database requires authentication:
        # self.db.authenticate(settings.get('MONGO_USER'), settings.get('MONGO_PSW'))

    def process_item(self, item, spider):
        postItem = dict(item)       # convert the item to a plain dict
        self.coll.insert(postItem)  # insert one record into the collection
        return item  # returning the item lets Scrapy keep logging it; optional
Scrapy 1.6 version:
import pymongo
from scrapy.conf import settings


class ZhaopinPipeline(object):
    def __init__(self):
        # connect to the database
        client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
        self.db = client[settings['MONGO_DB']]       # handle to the database
        self.coll = self.db[settings['MONGO_COLL']]  # handle to the collection
        # if the database requires authentication:
        # self.db.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])

    def process_item(self, item, spider):
        postItem = dict(item)       # convert the item to a plain dict
        self.coll.insert(postItem)  # insert one record into the collection
        return item  # returning the item lets Scrapy keep logging it; optional
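One more caveat, in case you run this on a recent environment: pymongo deprecated Collection.insert() in the 3.x series and removed it in 4.0. On a newer pymongo, the same process_item can be written with insert_one(), roughly like this:

    def process_item(self, item, spider):
        post_item = dict(item)           # convert the item to a plain dict
        self.coll.insert_one(post_item)  # pymongo 3.x+ API for inserting a single document
        return item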