# -*- coding: utf-8 -*-
import requests
import json
import time
from bs4 import BeautifulSoup
def crawl_detail(id):
    url = 'https://www.lagou.com/jobs/%s.html' % id
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88?labelWords=sug&fromSearch=true&suginput=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    req = requests.get(url, headers=headers)
    # parse the page with the lxml parser
    soup = BeautifulSoup(req.content, 'lxml')
    job_bt = soup.find('dd', attrs={'class': 'job_bt'})
    # .text strips tags such as h3 and keeps only the text content
    return job_bt.text
def main():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88?labelWords=sug&fromSearch=true&suginput=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest'
    }
    # form data for the first page (superseded by form_data inside the loop below)
    data = {
        'first': 'true',
        'pn': '1',
        'kd': '数据分析师'
    }
    positions = []
    for x in range(1, 31):
        form_data = {
            'first': 'true',
            'pn': x,
            'kd': '数据分析师'  # search keyword: "data analyst"
        }
        # POST to the real Ajax URL with the headers above; unlike a GET request,
        # a POST carries a Form Data body, which must be sent along
        result = requests.post('https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0',
                               headers=headers, data=form_data)
        json_result = result.json()
        print('-' * 30)
        print(json_result)
        print('-' * 30)
        page_positions = json_result['content']['positionResult']['result']
        # pull the fields we need from every position on this page
        for position in page_positions:
            position_dict = {
                'position_name': position['positionName'],
                'education_level': position['education'],
                'company_Name': position['companyShortName'],
                'salary': position['salary'],
                'workYear': position['workYear'],
                'companyFullName': position['companyFullName']
            }
            # keep positionId separately: it is needed to build the detail-page URL
            position_id = position['positionId']
            position_detail = crawl_detail(position_id)
            position_dict['position_detail'] = position_detail
            positions.append(position_dict)
            # pause between detail-page requests
            time.sleep(3)
    line = json.dumps(positions, ensure_ascii=False)
    with open('lagou.json', 'wb+') as fp:
        fp.write(line.encode('utf-8'))

if __name__ == '__main__':
    main()
Note the difference between a list's append() and extend():
append() adds a single element to the list;
extend() takes another list (or any iterable) and appends each of its elements.
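For example, a minimal sketch with made-up values, reusing the positions list from the script above:

positions = [{'position_name': 'a'}]
positions.append({'position_name': 'b'})    # adds one element -> len(positions) == 2
positions.extend([{'position_name': 'c'},
                  {'position_name': 'd'}])  # adds each element of the other list -> len(positions) == 4
print(len(positions))                       # 4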
