# _*_ coding:utf-8 _*_
import json
import time

import requests
from bs4 import BeautifulSoup
def crawl_detail(id):  # NOTE: `id` shadows the builtin; kept for caller compatibility
    """Fetch one Lagou job-detail page and return the job-description text.

    Args:
        id: the ``positionId`` of a job posting; interpolated into the
            detail-page URL ``https://www.lagou.com/jobs/<id>.html``.

    Returns:
        str: the plain text of the ``dd.job_bt`` description element —
        ``.text`` strips inner tags such as ``<h3>`` and keeps only strings.

    Raises:
        AttributeError: if the page contains no ``dd.job_bt`` element
        (``soup.find`` returns ``None``), e.g. when the anti-crawler
        page is served instead of the job posting.
    """
    url = 'https://www.lagou.com/jobs/%s.html' % id
    # Browser-like headers (Referer in particular) are required, or Lagou
    # serves an anti-crawler page instead of the job posting.
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88?labelWords=sug&fromSearch=true&suginput=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    req = requests.get(url, headers=headers)
    # Parse with the lxml backend (fast and lenient).
    soup = BeautifulSoup(req.content, 'lxml')
    job_bt = soup.find('dd', attrs={'class': 'job_bt'})
    # .text removes the markup (h3 etc.) and returns only the strings.
    return job_bt.text
def main():
    """Crawl 30 result pages of Lagou '数据分析师' listings plus each job's
    detail page, and dump everything to ``lagou.json``.

    Side effects:
        - Issues one POST per listing page and one GET per job posting.
        - Writes ``lagou.json`` (UTF-8 JSON array) in the working directory.
        - Prints each page's raw JSON response for debugging.
    """
    # The positionAjax endpoint is an XHR API: it needs the XHR headers and
    # the search-page Referer, otherwise Lagou rejects the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88?labelWords=sug&fromSearch=true&suginput=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest'
    }
    positions = []
    for page in range(1, 31):
        # POST body (Form Data): pn is the page number, kd the search keyword.
        form_data = {
            'first': 'true',
            'pn': page,
            'kd': '数据分析师'
        }
        # Must be POST (not GET) and must carry the Form Data body.
        result = requests.post(
            'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0',
            headers=headers, data=form_data)
        json_result = result.json()
        print('-' * 30)
        print(json_result)
        print('-' * 30)
        page_positions = json_result['content']['positionResult']['result']
        # Collect the summary fields, then fetch each detail page.
        for position in page_positions:
            position_dict = {
                'position_name': position['positionName'],
                'education_level': position['education'],
                'company_Name': position['companyShortName'],
                'salary': position['salary'],
                'workYear': position['workYear'],
                'companyFullName': position['companyFullName']
            }
            # positionId identifies the detail page to crawl.
            position_id = position['positionId']
            position_dict['position_detail'] = crawl_detail(position_id)
            positions.append(position_dict)
            # Throttle requests so the anti-crawler does not ban us.
            time.sleep(3)
    line = json.dumps(positions, ensure_ascii=False)
    # Write bytes explicitly so the file is UTF-8 regardless of locale.
    with open('lagou.json', 'wb+') as fp:
        fp.write(line.encode('utf-8'))
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()
注意,list类型的append()和extend()的区别:
append()向list末尾添加单个元素(参数可以是任意对象,整体作为一个元素加入);
extend()接受一个可迭代对象(不限于list),把其中的元素逐个追加到list末尾。
以下参考别人的做法(爬取评论数据的示例):

    for url in urlList:
        cont = requests.get(url, headers=headers, cookies=cookie).content
        # 这点非常重要:这里的正则表达式匹配以数字或字母开头、后面紧跟()中的内容
        rex = re.compile(r'\w+[(]{1}(.*)[)]{1}')
        content = rex.findall(cont)[0]  # 取出括号中的内容,这是一个标准的json格式数据
        con = json.loads(content)
        # 获取评论列表(响应中还有很多其他不需要的字段)
        commentList = con['comments']
        for item in commentList:
            id = item['id']
            content = item['content'].strip()
            referenceName = item['referenceName']
            productColor = item['productColor']

一个技巧:可以先把()内的数据粘贴到在线json解析网站(http://www.bejson.com/)查看json数据的排列格式,这样就更清楚该怎么提取数据了。

(三)参考文献:
1、使用python抓取并分析数据—人人贷(urllib):http://bluewhale.cc/2016-12-11/use-python-crawl-and-analysis-data-renrendai-urllib-html.html#ixzz4dKuFHIgb
2、http://blog.csdn.net/yan_xing_an/article/details/46892105

作者:guoery
网友评论