美文网首页
爬虫:拉勾职位

爬虫:拉勾职位

作者: Tim_Chen | 来源:发表于2016-11-10 19:51 被阅读0次
        #main.py
    
        import requests
        import json
        import pymongo
        from multiprocessing import Pool
        import time
        import random
    
    
        # Endpoint that receives the search POST issued by postonepage().
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

        # Request headers sent along with every POST to `url`.
        headers = {
            # Content negotiation.
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            # NOTE(review): fixed length value; presumably copied from a captured
            # browser request -- confirm it is still wanted.
            'Content-Length': '65',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            # Left empty here; fill in a session cookie if the site requires one.
            'Cookie': '',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88?px=default&city=%E5%85%A8%E5%9B%BD',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
            # Header names are spelled "Anit" in the original; kept as-is.
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest',
        }
    
    
        def postonepage(pn):
            """Fetch one page of search results and store the positions in MongoDB.

            POSTs the search form for page ``pn`` to the module-level ``url`` using
            the module-level ``headers``, reads ``content.positionResult`` from the
            JSON reply, and inserts one document per entry into the ``position``
            collection of the ``Lagou`` database on ``localhost:27017``.

            Args:
                pn: Page number, sent as the ``pn`` form field.

            Returns:
                None. Failures are reported on stdout rather than raised, so one
                bad page does not abort the surrounding ``Pool.map`` run.
            """
            # Fields copied unchanged from each position record into the stored
            # document, in the order they are written.
            fields = (
                'createTime', 'positionId', 'positionName', 'firstType',
                'secondType', 'education', 'city', 'salary', 'jobNature',
                'workYear',
                'companyId', 'companyFullName', 'financeStage', 'companySize',
                'industryField',
            )
            print(pn)
            postdata = {
                'first': 'false',
                'pn': pn,
                'kd': '数据分析师'
            }
            client = pymongo.MongoClient('localhost', 27017)
            try:
                position = client['Lagou']['position']
                # A timeout keeps a stalled connection from blocking the worker
                # process indefinitely.
                wbdata = requests.post(url, headers=headers, data=postdata,
                                       timeout=30)
                wbdata.raise_for_status()
                jdata = wbdata.json()
                positionResult = jdata['content']['positionResult']
                docs = []
                for j in positionResult:
                    i = j['position']
                    docs.append({key: i[key] for key in fields})
                # insert_many rejects an empty list, so only call it with data.
                if docs:
                    position.insert_many(docs)
                print('success')
            except Exception as exc:
                # Best-effort: report which page failed and why, then carry on.
                print('one error occurred', pn, repr(exc))
            finally:
                # Release the connection opened for this page.
                client.close()
            # Throttle after every attempt, including failed ones, so error
            # responses do not turn into rapid-fire requests.
            time.sleep(random.randint(4, 8))
    
    
        if __name__ == '__main__':
            # Crawl pages 1 through 499 with four worker processes. The context
            # manager tears the pool down once map() has returned, so worker
            # processes are not left behind.
            with Pool(processes=4) as pool:
                pool.map(postonepage, range(1, 500))
    

    相关文章

      网友评论

          本文标题:爬虫:拉勾职位

          本文链接:https://www.haomeiwen.com/subject/fqmxpttx.html