import requests
import json
from fake_useragent import UserAgent
from multiprocessing import Pool
from functools import partial
import pymongo
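# MongoDB setup; connect=False defers the actual connection so it is not shared across the Pool's worker processes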
mongo_url = 'localhost'
mongo_db = 'lagou'
client = pymongo.MongoClient(mongo_url, connect=False)
db = client[mongo_db]
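# Lagou job-search Ajax endpoint; {} is formatted with the target city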
url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'
ua = UserAgent()
headers = {
    # the Ajax endpoint may reject requests without a valid Cookie; paste one from a logged-in browser session if needed
    'Cookie': '',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
    'User-Agent': ua.random
}
def get_page(page, city):
    # POST the search form for one page of results in the given city
    postdata = {
        'first': 'true',
        'pn': page,
        'kd': '数据分析'
    }
    response = requests.post(url.format(city), data=postdata, headers=headers)
    html = response.text
    return html
def parse_page(html):
    # parse the JSON response into a dict and pull out the fields we want
    content = json.loads(html)
    results = content.get('content').get('positionResult').get('result')
    for item in results:
        company = item.get('companyFullName').strip()
        id = item.get('companyId')
        size = item.get('companySize')
        district = item.get('district')
        position = item.get('positionName')
        workyear = item.get('workYear')
        education = item.get('education')
        createtime = item.get('createTime')
        field = item.get('industryField')
        salary = item.get('salary')
        yield {
            'company': company,
            'id': id,
            'size': size,
            'district': district,
            'position': position,
            'workyear': workyear,
            'education': education,
            'createtime': createtime,
            'field': field,
            'salary': salary
        }
def save_to_mongo(item):
    # insert() is deprecated in recent pymongo releases; use insert_one()
    if db['数据分析'].insert_one(item):
        print('Saved to MongoDB')
def main(page, city):
    html = get_page(page, city)
    for item in parse_page(html):
        save_to_mongo(item)
if __name__ == '__main__':
    pool = Pool()
    # bind the city argument so pool.map only has to vary the page number
    par_main = partial(main, city='上海')
    pool.map(par_main, range(1, 31))
    pool.close()
    pool.join()