首先先尝试爬取沈阳python相关岗位信息
# Crawl Lagou python job info (attempt 1: plain urllib GET on the list page)
def spider_lagou():
    """Fetch the Lagou python-job list page for Shenyang and print the raw HTML.

    Side effects: prints the decoded response body; no return value.
    """
    from urllib import request, parse
    import ssl
    # The site's certificate is reported as untrusted; disable verification
    # so urlopen does not raise on the HTTPS handshake.
    ssl._create_default_https_context = ssl._create_unverified_context
    # FIX: in the original paste this string literal was broken across two
    # lines (a SyntaxError for a plain quoted string) — rejoined onto one line.
    url = "https://www.lagou.com/jobs/list_python/p-city_44?&cl=false&fromSearch=true&labelWords=&suginput="
    headers = {"User-Agent":" Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"}
    req = request.Request(url, headers=headers)
    resp = request.urlopen(req).read().decode("utf-8")
    print(resp)


if __name__ == '__main__':
    spider_lagou()
此时我们得到爬取数据,但是并不能在得到的数据内查找到岗位信息
通过观察可以发现,此网站的职位信息并不在 https://www.lagou.com/jobs/list_python/p-city_44?&cl=false&fromSearch=true&labelWords=&suginput= 返回的页面内,而是通过 https://www.lagou.com/jobs/positionAjax.json?city=%E6%B2%88%E9%98%B3&needAddtionalResult=false 这个接口传入的,于是修改以下代码:
# Crawl Lagou python job info (attempt 2: POST straight to the Ajax endpoint)
def spider_lagou():
    """POST the search parameters to Lagou's positionAjax endpoint and print the reply.

    NOTE(review): without the cookies obtained from the list page, the server
    answers with the anti-crawler JSON ("您操作太频繁...") instead of job data,
    which is why this version is superseded later in the file.
    """
    from urllib import request, parse
    import ssl
    # Skip certificate verification (cert is reported as untrusted locally).
    ssl._create_default_https_context = ssl._create_unverified_context
    # city=%E6%B2%88%E9%98%B3 is the URL-encoded "沈阳" (Shenyang).
    url = "https://www.lagou.com/jobs/positionAjax.json?city=%E6%B2%88%E9%98%B3&needAddtionalResult=false"
    headers = {"User-Agent":" Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"}
    # Search form: first page, keyword "python".
    data = {"first": "true", "pn": 1, "kd": "python"}
    req = request.Request(url, headers=headers, data=parse.urlencode(data).encode("utf-8"), method="POST")
    resp = request.urlopen(req).read().decode("utf-8")
    print(resp)


if __name__ == '__main__':
    spider_lagou()
运行后得到提示:{"status":false,"msg":"您操作太频繁,请稍后再访问","clientIp":"182.200.18.29","state":2402}。随后把参数更详细地添加进去,发现也不好用。查资料(https://blog.csdn.net/m0_43400362/article/details/88396490)得知可以用 requests,那么我也尝试使用 requests。
拉勾的网页加载时有一个 url 专门返回除了招聘信息以外的其它内容;加载招聘信息时会产生另外一个 ajax 请求,该请求返回的才是想要爬取的内容。只需要先发送主请求,之后用 requests.Session() 建立 session,建立完成后通过 session 获取 cookie,拿到 cookie 就可以直接用。
import requests

# A requests Session carries cookies and other parameters across requests:
# after one request succeeds through the session (e.g. a login), subsequent
# requests made through the same session object reuse the stored cookies.
s = requests.Session()
优化后可以正常爬取拉勾网职位信息
下面是代码(爬取后存入excel)
# Crawl Lagou python job info (working version: session cookies + Ajax POST)
def spider_lagou():
    """Scrape pages 1-2 of Shenyang python jobs from Lagou's Ajax endpoint.

    For each page a fresh session first GETs the normal list page to obtain
    valid cookies, then POSTs the search form to the Ajax endpoint with those
    cookies — this is what gets past the "您操作太频繁" anti-crawler check.

    Returns:
        list[dict]: one dict per position with Chinese-labelled fields
        (company name, title, salary, company size, skills, district).
    """
    import requests
    import time
    import json
    import ssl
    # The certificate shows as untrusted in a browser but can be ignored.
    ssl._create_default_https_context = ssl._create_unverified_context
    url_start = "https://www.lagou.com/jobs/list_python/p-city_44?&cl=false&fromSearch=true&labelWords=&suginput="
    url_parse = "https://www.lagou.com/jobs/positionAjax.json?city=%E6%B2%88%E9%98%B3&needAddtionalResult=false"
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
               "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
               "Referer": "https://www.lagou.com/jobs/list_python/p-city_44?&cl=false&fromSearch=true&labelWords=&suginput="
               }
    info_list = []
    for page in range(1, 3):
        data = {"first": "true", "pn": str(page), "kd": "python"}
        # A fresh session per page so the cookies are newly issued each time.
        s = requests.Session()
        s.get(url_start, headers=headers, timeout=3)  # visit the list page to obtain cookies
        cookie = s.cookies  # cookies issued for this visit
        resp = s.post(url_parse, data=data, headers=headers, cookies=cookie, timeout=3)  # fetch this page's JSON
        time.sleep(5)  # pause between pages; rapid requests trigger the rate limiter
        resp.encoding = resp.apparent_encoding
        text = json.loads(resp.text)
        info = text["content"]["positionResult"]["result"]
        for i in info:
            info_d = {}
            info_d['公司名称'] = i["companyFullName"]
            info_d['职位名称'] = i["positionName"]
            info_d['工资'] = i["salary"]
            info_d['公司规模'] = i["companySize"]
            info_d['技能要求'] = i["skillLables"]
            info_d['公司位置'] = i["district"]
            info_list.append(info_d)
    print(info_list)
    return info_list
def save_excel(info_list):
    """Write the scraped job dicts to 职位信息.xls, one row per job.

    The first dict's keys become the header row; every dict is assumed to
    share the same key set (true for spider_lagou's output).

    Args:
        info_list: list of dicts as produced by spider_lagou().
    """
    import xlwt
    workbook = xlwt.Workbook()
    ws = workbook.add_sheet("information")
    style = xlwt.XFStyle()
    # Empty scrape: still save a (header-less) workbook rather than crash
    # on info_list[0] below.
    if not info_list:
        workbook.save('职位信息.xls')
        return
    # Header row — written once from the first record. (The original rewrote
    # the header once per record, shadowing its loop variable `i` in the
    # process; the result was the same row written len(info_list) times.)
    keys = list(info_list[0].keys())
    for col, key in enumerate(keys):
        ws.write(0, col, key, style=style)
    # Data rows start at row 1, below the header.
    for row, record in enumerate(info_list, start=1):
        for key, value in record.items():
            # FIX: xlwt cannot serialize a list cell (the 技能要求 field is a
            # list of skill labels) — flatten lists to comma-joined strings.
            if isinstance(value, list):
                value = ",".join(str(v) for v in value)
            ws.write(row, keys.index(key), value, style=style)
    # Save the workbook.
    workbook.save('职位信息.xls')
if __name__ == '__main__':
    # Scrape first, then persist the results to the spreadsheet.
    jobs = spider_lagou()
    save_excel(jobs)
沈阳python相关职位信息
网友评论