scrapy BOSS直聘 (multi-city scraping)

Author: Xmaxdev | Published 2019-07-21 12:21

    Reference: http://www.jtahstu.com/blog/scrapy_zhipin_spider.html
    This version extends that spider to scrape multiple cities.

    [Figure: project directory layout]

    Spider file: BoosZhiPin_Spider.py
    Path: zhaopin/zhaopin/spiders/BoosZhiPin_Spider.py

    import scrapy
    from ..items import BoosZhiPinItem
    import time
    import json
    from furl import furl
    
    '''
    Purpose: scrape job listings from BOSS直聘
    Inputs: a list of cities and a job query
    Run with: scrapy crawl BoosZhiPin
    '''
    
    
    class BoosZhiPin(scrapy.Spider):
        name = 'BoosZhiPin'  # spider name used by "scrapy crawl"
        allowed_domains = ['www.zhipin.com']  # with OffsiteMiddleware enabled, URLs outside these domains are not followed
        start_urls = ['https://www.zhipin.com/wapi/zpCommon/data/city.json']  # initial URL: the city-code lookup JSON
        city_name = ['乌鲁木齐', '喀什']  # cities to scrape
        city_code_list = []  # resolved city codes
        query = 'python'  # job keyword to search for
        F = furl('https://www.zhipin.com/job_detail/?')  # URL template for listing pages
    
        # request headers: masquerade as a browser
        headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    
        def parse(self, response):
            self.get_city_code(response)  # resolve city codes from city.json
            for c in self.city_code_list:  # build one paged request per city code
                yield self.request_city(c)
    
        # resolve the city codes for the wanted cities
        def get_city_code(self, response):
            city_code = json.loads(response.text)  # response.text supersedes the deprecated body_as_unicode()
            for city_name in self.city_name:
                for area in city_code['zpData']['cityList']:  # loop over regions
                    for city in area['subLevelModelList']:  # loop over each region's cities
                        if city['name'] == city_name:  # match a wanted city and keep its code
                            self.city_code_list.append(str(city['code']))
    
        # build the request for one page of one city
        def request_city(self, city_code, page=0):
            '''Build the request object for a specific city's listing page.'''
            page += 1
            url_data = {
                'city': city_code,
                'query': self.query,
                'page': page
            }
            # URL of the listing page to scrape
            url = self.F.copy().add(url_data).url
            req = scrapy.Request(url, callback=self.get_data, dont_filter=False, headers=self.headers)
            # pass extra data via meta; the callback reads it from response.meta
            req.meta['city_code'] = city_code
            req.meta['page'] = page
            return req
    
        # parse one listing page
        def get_data(self, response):
            job_list = response.css('div.job-list > ul > li')
            for job in job_list:
                item = BoosZhiPinItem()
                job_primary = job.css('div.job-primary')
                item['pid'] = job.css(
                    'div.info-primary > h3 > a::attr(data-jobid)').extract_first().strip()
                item["positionName"] = job_primary.css(
                    'div.info-primary > h3 > a::text').extract_first().strip()
                item["salary"] = job_primary.css(
                    'div.info-primary > h3 > a > span::text').extract_first().strip()
                info_primary = job_primary.css(
                    'div.info-primary > p::text').extract()
                item['city'] = info_primary[0].strip()
                item['workYear'] = info_primary[1].strip()
                item['education'] = info_primary[2].strip()
                item['companyShortName'] = job_primary.css(
                    'div.info-company > div.company-text > h3 > a::text'
                ).extract_first().strip()
                company_infos = job_primary.css(
                    'div.info-company > div.company-text > p::text').extract()
                if len(company_infos) == 3:  # some listings carry only two of these fields, so guard the unpacking
                    item['industryField'] = company_infos[0].strip()
                    item['financeStage'] = company_infos[1].strip()
                    item['companySize'] = company_infos[2].strip()
                item['positionLables'] = job.css(
                    'li > div.job-tags > span::text').extract()
                item['time'] = job.css('span.time::text').extract_first()
                item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                yield item
    
            city_code = response.meta['city_code']
            page = response.meta['page']
            if job_list:  # only continue while the page returned results
                time.sleep(5)  # throttle between pages; remove if you rotate enough IPs
                # request_city increments the page itself, so pass the current page
                yield self.request_city(city_code, page=page)
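
    Based on how get_city_code walks the response, city.json is shaped roughly as
    sketched below; the region name and codes here are illustrative placeholders,
    not values taken from the real API:

    city_json = {
        'zpData': {
            'cityList': [
                {'name': '新疆',  # region; placeholder
                 'subLevelModelList': [
                     {'name': '乌鲁木齐', 'code': 101130100},  # placeholder codes
                     {'name': '喀什', 'code': 101131000},
                 ]}
            ]
        }
    }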
    
    
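    For reference, a standalone sketch of how the furl template's copy().add() call
    in request_city assembles each page URL (the city code is a placeholder):

    from furl import furl

    F = furl('https://www.zhipin.com/job_detail/?')
    url = F.copy().add({'city': '101130100', 'query': 'python', 'page': 1}).url
    # url == 'https://www.zhipin.com/job_detail/?city=101130100&query=python&page=1'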

    Items file: items.py
    Path: zhaopin/zhaopin/items.py

    import scrapy


    class BoosZhiPinItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        pid = scrapy.Field()
        positionName = scrapy.Field()
        positionLables = scrapy.Field()
        workYear = scrapy.Field()
        salary = scrapy.Field()
        city = scrapy.Field()
        education = scrapy.Field()
        companyShortName = scrapy.Field()
        industryField = scrapy.Field()
        financeStage = scrapy.Field()
        companySize = scrapy.Field()
        time = scrapy.Field()
        updated_at = scrapy.Field()
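
    scrapy.Item instances support the dict interface, which is what the pipelines
    shown later rely on when they call dict(item); a quick illustration:

    item = BoosZhiPinItem()
    item['positionName'] = 'python工程师'
    item['salary'] = '15k-25k'
    print(dict(item))  # {'positionName': 'python工程师', 'salary': '15k-25k'}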
    

    Settings file: settings.py
    Path: zhaopin/zhaopin/settings.py

    BOT_NAME = 'zhaopin'
    SPIDER_MODULES = ['zhaopin.spiders']
    NEWSPIDER_MODULE = 'zhaopin.spiders'
    ROBOTSTXT_OBEY = False
    # Uncomment to enable the MongoDB pipeline
    #ITEM_PIPELINES = {
    #   'zhaopin.pipelines.ZhaopinPipeline': 300,
    #}
    # MONGO_HOST = "127.0.0.1"  # MongoDB host IP
    # MONGO_PORT = 27017  # port
    # MONGO_DB = "scrapy_mongo"  # database name
    # MONGO_COLL = "scrapy_collection"  # collection name
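
    As an alternative to the blocking time.sleep(5) in the spider's get_data, Scrapy's
    built-in throttling can be configured here instead; a minimal sketch:

    DOWNLOAD_DELAY = 5  # seconds between requests to the same site
    RANDOMIZE_DOWNLOAD_DELAY = True  # vary the delay (0.5x-1.5x) so it looks less robotic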
    

    Now it is ready to run; make sure your terminal is in the project root directory:


    scrapy crawl BoosZhiPin
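    If you just want the data in a file rather than MongoDB, Scrapy's feed exports can
    write the items directly, e.g. scrapy crawl BoosZhiPin -o jobs.json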
    
    [Figures: scraped data in MongoDB; terminal output]

    If you want to store the data in MongoDB, read on. Uncomment the MongoDB section
    in settings.py, then add the pipeline below to pipelines.py.
    Note that Scrapy 1.6 and 1.7 differ here: 1.7 removed the scrapy.conf module,
    which is replaced by

    from scrapy.utils.project import get_project_settings
    

    Scrapy 1.7:

    import pymongo
    from scrapy.utils.project import get_project_settings


    class ZhaopinPipeline(object):

        def __init__(self):
            settings = get_project_settings()
            # connect to MongoDB
            client = pymongo.MongoClient(host=settings.get('MONGO_HOST'), port=settings.get('MONGO_PORT'))
            self.db = client[settings.get('MONGO_DB')]  # handle to the database
            self.coll = self.db[settings.get('MONGO_COLL')]  # handle to the collection
            # if the database requires authentication:
            # self.db.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])

        def process_item(self, item, spider):
            postItem = dict(item)  # convert the item to a plain dict
            self.coll.insert_one(postItem)  # insert one record; insert() is deprecated in pymongo 3
            return item  # returning the item also echoes it to the console; optional
    
    

    Scrapy 1.6:

    import pymongo
    from scrapy.conf import settings


    class ZhaopinPipeline(object):
        def __init__(self):
            # connect to MongoDB
            client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
            self.db = client[settings['MONGO_DB']]  # handle to the database
            self.coll = self.db[settings['MONGO_COLL']]  # handle to the collection
            # if the database requires authentication:
            # self.db.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])

        def process_item(self, item, spider):
            postItem = dict(item)  # convert the item to a plain dict
            self.coll.insert_one(postItem)  # insert one record
            return item  # returning the item also echoes it to the console; optional
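
    To sanity-check the inserts afterwards, a quick standalone pymongo query; the
    host, port, and names here assume the commented defaults from settings.py above:

    import pymongo

    client = pymongo.MongoClient('127.0.0.1', 27017)
    coll = client['scrapy_mongo']['scrapy_collection']
    print(coll.count_documents({}))  # number of stored job records
    print(coll.find_one({}, {'positionName': 1, 'salary': 1}))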
    
