美文网首页
爬拉勾网的数据分析职位

爬拉勾网的数据分析职位

作者: ygquincy | 来源:发表于2018-08-13 11:27 被阅读0次
    import requests
    import json
    from fake_useragent import UserAgent
    from multiprocessing import Pool
    from functools import partial
    import pymongo
    
    # MongoDB connection settings: server host and target database name.
    mongo_url = 'localhost'
    mongo_db = 'lagou'

    # The client is created at import time, before the worker pool is started.
    # NOTE(review): connect=False presumably defers the actual connection until
    # first use so each worker process opens its own -- confirm against the
    # PyMongo MongoClient documentation.
    client = pymongo.MongoClient(mongo_url, connect=False)
    db = client[mongo_db]

    # Search endpoint template; the {} placeholder is filled with the city name
    # via url.format(city) in get_page().
    # NOTE(review): the spelling "needAddtionalResult" is kept exactly as
    # written; presumably it matches the parameter name the site expects.
    url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'
    ua = UserAgent()
    # Headers sent with every POST issued by get_page().
    headers = {
        # Empty in this listing.
        # NOTE(review): presumably a real session cookie must be supplied
        # before running -- confirm.
        'Cookie': '',
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
        # ua.random is read once, when this dict is built, so every request
        # made with this dict carries the same User-Agent value.
        'User-Agent': ua.random
    }
    
    
    def get_page(page, city):
        """Fetch one page of '数据分析' job search results for a city.

        Args:
            page: Result page number, sent as the ``pn`` form field.
            city: City name substituted into the ``city`` query parameter of
                the module-level ``url`` template.

        Returns:
            The response body as text. It is not validated here; the caller
            is expected to parse it as JSON.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                does not complete within the timeout.
        """
        postdata = {
            # This key was previously misspelled as 'fitst', so the intended
            # "first" flag was never actually sent to the endpoint.
            'first': 'true',
            'pn': page,
            'kd': '数据分析',
        }
        # Without a timeout, requests waits indefinitely on a stalled
        # connection, which would block a pool worker forever.
        response = requests.post(
            url.format(city),
            data=postdata,
            headers=headers,
            timeout=10,
        )
        return response.text
    
    def parse_page(html):
        """Yield one flat record per job posting in a search response.

        The postings are read from ``content -> positionResult -> result`` in
        the decoded JSON. If any of those levels is missing or has an
        unexpected type (for example an error payload without ``content``),
        nothing is yielded instead of raising ``AttributeError``.

        Args:
            html: Response body text containing a JSON document.

        Yields:
            dict: A record with the keys ``company``, ``id``, ``size``,
            ``district``, ``position``, ``workyear``, ``education``,
            ``createtime``, ``field`` and ``salary``. Fields absent from a
            posting are ``None``.

        Raises:
            json.JSONDecodeError: If ``html`` is not valid JSON.
        """
        # Decode the JSON text into Python objects.
        node = json.loads(html)
        # Walk down the expected nesting one level at a time; a missing or
        # non-dict level turns the cursor into None rather than crashing.
        for key in ('content', 'positionResult', 'result'):
            node = node.get(key) if isinstance(node, dict) else None
        if not isinstance(node, list):
            return

        for item in node:
            company = item.get('companyFullName')
            yield {
                # Only strip when a name is actually present; a missing name
                # stays None instead of raising on None.strip().
                'company': company.strip() if isinstance(company, str) else company,
                'id': item.get('companyId'),
                'size': item.get('companySize'),
                'district': item.get('district'),
                'position': item.get('positionName'),
                'workyear': item.get('workYear'),
                'education': item.get('education'),
                'createtime': item.get('createTime'),
                'field': item.get('industryField'),
                'salary': item.get('salary'),
            }
    
    def save_to_mongo(item):
        """Insert one record into the '数据分析' collection of the module-level ``db``.

        Prints '存储成功' when the server acknowledges the write.

        Args:
            item: The document (a dict) to store.
        """
        # Collection.insert() was deprecated in PyMongo 3.0 and removed in
        # 4.0; insert_one() is the supported single-document replacement.
        result = db['数据分析'].insert_one(item)
        # InsertOneResult objects are always truthy, so test the
        # acknowledgement flag explicitly rather than the result itself.
        if result.acknowledged:
            print('存储成功')
    
    
    def main(page, city):
        """Download one result page for ``city``, parse it and store each record.

        Args:
            page: Result page number passed through to ``get_page``.
            city: City name passed through to ``get_page``.
        """
        records = parse_page(get_page(page, city))
        for record in records:
            save_to_mongo(record)
    
    if __name__ == '__main__':
        # Scrape result pages 1 through 30 for Shanghai, one page per task.
        par_main = partial(main, city='上海')
        # The context manager shuts the pool down when the block exits, so
        # worker processes are not left behind. map() blocks until every
        # page has been processed, so no work is cut short by the teardown.
        with Pool() as pool:
            pool.map(par_main, range(1, 31))
    

    相关文章

      网友评论

          本文标题:爬拉勾网的数据分析职位

          本文链接:https://www.haomeiwen.com/subject/ygbybftx.html