# Source: https://blog.csdn.net/qq_41562377/article/details/89703431
# Import the required packages
import random
import json
import requests
import time
# Output text file, opened once at import time in write mode (any existing
# 'lagou.txt' is truncated) with UTF-8 encoding; req() appends one JSON line
# per crawled page to this handle.
lagou=open('lagou.txt','w',encoding='utf-8')
def _build_headers(user_agent):
    """Return the request headers sent with both the GET and the POST."""
    # Header names/values must not contain the stray spaces that the
    # browser-devtools copy introduced ('Content - Type', 'https: // ...'),
    # otherwise the server never sees the intended headers.
    # 'Content-Length' is deliberately omitted: requests computes it from the
    # encoded form body, and a hard-coded value would be wrong for most pages.
    return {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Pragma': 'no-cache',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'User-Agent': str(user_agent),
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': 'None',
        'X-Requested-With': 'XMLHttpRequest',
    }


def req(page, max_page=None):
    """Crawl Lagou search-result pages and append them to the ``lagou`` file.

    Starting at ``page``, every page is fetched with a fresh session: a GET to
    the search page to obtain cookies, followed by a POST to the Ajax endpoint
    using those cookies. The ``content`` member of each JSON response is
    written to the module-level ``lagou`` file as one JSON line.

    The crawl is iterative (the original recursed without a stop condition)
    and ends when the response carries no ``content``, when the page appears
    to hold no results, or after ``max_page`` has been fetched.

    Args:
        page: 1-based number of the first page to fetch.
        max_page: Optional last page number to fetch (inclusive). ``None``
            means "continue until the site stops returning data".

    Returns:
        The number of pages written to the output file.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        ValueError: If the Ajax response body is not valid JSON.
    """
    first_url = 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput='
    second_url = "https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false"
    # Pool of browser User-Agent strings; one is picked at random per page.
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.1; rv:50.0) Gecko/20100101 Firefox/50.0',
        'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; Trident/5.0)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
        'Mozilla/5.0 (iPad; CPU OS 10_1_1 like Mac OS X) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0 Mobile/14B100 Safari/602.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:49.0) Gecko/20100101 Firefox/49.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0',
    ]

    saved = 0
    while max_page is None or page <= max_page:
        # random.choice covers the whole list; the previous randint(0, 9)
        # could never select the last entry.
        headers = _build_headers(random.choice(user_agents))
        # The form's 'first' flag is 'true' only for page 1.
        datas = {
            'first': 'true' if page == 1 else 'false',
            'pn': page,
            'kd': '数据挖掘',
        }

        # A new session per page refreshes the cookies before every Ajax
        # call (slow, but this is what lets the crawl get through). The
        # session stores the cookies from the GET and sends them with the
        # POST; the context manager releases its connections afterwards.
        with requests.Session() as s:
            s.get(url=first_url, headers=headers, timeout=4)
            html = s.post(url=second_url, headers=headers, data=datas, timeout=3)

        content = json.loads(html.text).get('content')
        if not content:
            # No payload in the response: nothing to save, stop instead of
            # writing 'null' lines and requesting further pages forever.
            break

        # NOTE(review): assumes the result rows live at
        # content['positionResult']['result'] — confirm against a live
        # response. The check only fires when that key exists and is an
        # empty list, so a different schema never stops the crawl early.
        position_result = content.get('positionResult') if isinstance(content, dict) else None
        if isinstance(position_result, dict) and position_result.get('result') == []:
            break

        lagou.write(json.dumps(content, ensure_ascii=False))  # keep non-ASCII text readable
        lagou.write('\n')
        lagou.flush()  # a later failure must not lose pages already fetched
        saved += 1

        page += 1
        time.sleep(5)  # pause between pages to avoid hammering the site
    return saved
# Script entry point: crawl starting from the first result page.
if __name__ == '__main__':
    try:
        req(1)
    finally:
        # The module-level output file is opened at import time and was never
        # closed; close it here so buffered data reaches disk even when the
        # crawl ends with an exception.
        lagou.close()
# (End of the code listing; the original blog page continued with its reader-comments section, "网友评论".)