美文网首页
爬拉勾网的数据分析职位

爬拉勾网的数据分析职位

作者: ygquincy | 来源:发表于2018-08-13 11:27 被阅读0次
import requests
import json
from fake_useragent import UserAgent
from multiprocessing import Pool
from functools import partial
import pymongo

# MongoDB connection settings: server host and the database this script writes to.
mongo_url = 'localhost'
mongo_db = 'lagou'

# The client is created at import time with connect=False.
# NOTE(review): presumably this defers the actual connection until first use so
# that each multiprocessing worker opens its own connection — confirm against
# the PyMongo documentation.
client = pymongo.MongoClient(mongo_url, connect=False)
db = client[mongo_db]

# Ajax endpoint for the position search; the `{}` placeholder is filled with the
# city name through str.format() in get_page().
url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'
ua = UserAgent()
# Headers sent with every POST request made by get_page().
# - 'Cookie' is left empty here. NOTE(review): the site may require a real
#   session cookie — confirm.
# - 'Referer' points at the job-list page for the URL-encoded keyword "数据分析".
# - 'User-Agent' is read from `ua.random` once, when this dict is built, so the
#   same value is reused for all requests issued through this dict.
headers = {
    'Cookie': '',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
    'User-Agent': ua.random
}


def get_page(page, city):
    """Fetch one page of '数据分析' search results and return the raw body text.

    Args:
        page: Page number, sent as the ``pn`` form field.
        city: City name substituted into the ``city`` query parameter of ``url``.

    Returns:
        The response body as text; the caller parses it as JSON.

    Raises:
        requests.exceptions.RequestException: On connection failure or when the
            server does not answer within the timeout.
    """
    postdata = {
        # This key was previously misspelled 'fitst', so the intended 'first'
        # form field was never actually sent.
        'first': 'true',
        'pn': page,
        'kd': '数据分析',
    }
    # Without a timeout a stalled connection would block a pool worker forever.
    response = requests.post(
        url.format(city), data=postdata, headers=headers, timeout=10
    )
    return response.text

def parse_page(html):
    """Parse a position-search JSON response and yield one record per job.

    The job list is read from ``content -> positionResult -> result``. If any
    level of that path is missing or null (for example when the server replies
    with an error payload instead of search results), nothing is yielded
    instead of raising ``AttributeError``/``TypeError``.

    Args:
        html: Response body text containing a JSON document.

    Yields:
        dict with the keys ``company``, ``id``, ``size``, ``district``,
        ``position``, ``workyear``, ``education``, ``createtime``, ``field``
        and ``salary``. A field absent from the source item is ``None``.

    Raises:
        json.JSONDecodeError: If ``html`` is not valid JSON.
    """
    # Decode the JSON text into a dict.
    payload = json.loads(html)
    if not isinstance(payload, dict):
        return

    # Each step falls back to an empty container so a missing/null level in the
    # response simply results in zero records.
    position_result = (payload.get('content') or {}).get('positionResult') or {}
    results = position_result.get('result') or []

    for item in results:
        company = item.get('companyFullName')
        # Only strip when a name is actually present; a missing name stays None.
        if isinstance(company, str):
            company = company.strip()
        yield {
            'company': company,
            'id': item.get('companyId'),
            'size': item.get('companySize'),
            'district': item.get('district'),
            'position': item.get('positionName'),
            'workyear': item.get('workYear'),
            'education': item.get('education'),
            'createtime': item.get('createTime'),
            'field': item.get('industryField'),
            'salary': item.get('salary'),
        }

def save_to_mongo(item):
    """Insert one parsed job record into the '数据分析' collection.

    Args:
        item: Record dict as yielded by ``parse_page``.
    """
    # Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0;
    # insert_one() is the supported single-document API.
    result = db['数据分析'].insert_one(item)
    if result.inserted_id is not None:
        print('存储成功')


def main(page, city):
    """Download one result page for ``city``, parse it, and store every record.

    Args:
        page: Page number passed through to ``get_page``.
        city: City name passed through to ``get_page``.
    """
    records = parse_page(get_page(page, city))
    for record in records:
        save_to_mongo(record)

if __name__ == '__main__':
    # Crawl pages 1-30 for the city '上海', one page per task, across a pool of
    # worker processes.
    par_main = partial(main, city='上海')
    # The context manager shuts the pool down once the blocking map() returns,
    # instead of leaving the worker processes to be cleaned up implicitly.
    with Pool() as pool:
        pool.map(par_main, range(1, 31))

相关文章

网友评论

      本文标题:爬拉勾网的数据分析职位

      本文链接:https://www.haomeiwen.com/subject/ygbybftx.html