美文网首页
2019-07-30 拉勾 反爬虫

2019-07-30 拉勾 反爬虫

作者: 年画儿 | 来源:发表于2019-07-30 17:59 被阅读0次

    https://blog.csdn.net/qq_41562377/article/details/89703431

    #添加包
    
    import random
    import json
    import requests
    import time
    
    # Output file shared by the whole script: opened once for writing (truncating
    # any previous content) as UTF-8 text so Chinese job data is stored readably.
    lagou = open('lagou.txt', mode='w', encoding='utf-8')
    
    def req(page, max_page=None):
        """Scrape Lagou job-search result pages and append them to ``lagou``.

        Starting at ``page``, every page is fetched with a fresh session: a GET
        to the search page collects cookies, then a POST to the Ajax endpoint
        returns JSON. The ``content`` member of each response is written to the
        module-level ``lagou`` file as one JSON document per line.

        Pages are fetched in a loop rather than by self-recursion, so a long
        crawl cannot exhaust the interpreter's recursion limit.

        Args:
            page: 1-based number of the first page to fetch.
            max_page: Optional last page to fetch (inclusive). ``None`` keeps
                going until a response carries no usable ``content``.

        Raises:
            requests.RequestException: On network failure or timeout.
            ValueError: If the Ajax response body is not valid JSON.
        """
        first_url = 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput='

        second_url = "https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false"

        # Pool of browser User-Agent strings; one is picked at random per page.
        user_agents = [
            'Mozilla/5.0 (Windows NT 6.1; rv:50.0) Gecko/20100101 Firefox/50.0',
            'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
            'Mozilla/5.0 (X11; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;  Trident/5.0)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
            'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
            'Mozilla/5.0 (iPad; CPU OS 10_1_1 like Mac OS X) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0 Mobile/14B100 Safari/602.1',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:49.0) Gecko/20100101 Firefox/49.0',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0',
        ]

        # Browser-like request headers. The header names and values previously
        # contained stray spaces (e.g. 'Content - Type'), so they were sent as
        # unrelated, meaningless headers. Content-Length is intentionally not
        # set: the HTTP library computes it from the actual form body.
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Pragma': 'no-cache',
            'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
            # NOTE(review): the 'Anit' spelling is kept from the original code;
            # presumably it mirrors the header the site itself sends - confirm.
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest',
        }

        start_page = page
        while max_page is None or page <= max_page:
            if page != start_page:
                time.sleep(5)  # throttle between pages to stay under the rate limit

            # random.choice covers the whole pool (randint(0, 9) skipped the last entry).
            headers['User-Agent'] = random.choice(user_agents)

            # The form field 'first' is 'true' only for page 1.
            datas = {
                'first': 'true' if page == 1 else 'false',
                'pn': page,
                'kd': '数据挖掘',
            }

            # A new session per page: the GET obtains fresh cookies, and the
            # session replays them automatically on the POST. The context
            # manager releases the connection when the page is done.
            with requests.Session() as s:
                s.get(url=first_url, headers=headers, timeout=4)
                html = s.post(url=second_url, headers=headers, data=datas, timeout=3)

            content = json.loads(html.text).get('content')
            if not content:
                # No payload (for example a refusal message instead of data):
                # stop instead of writing 'null' lines forever.
                break

            # ensure_ascii=False keeps Chinese text readable in the output file.
            lagou.write(json.dumps(content, ensure_ascii=False) + '\n')

            # NOTE(review): assumes the job list lives at
            # content['positionResult']['result'] and is empty past the last
            # page - verify against a live response.
            position_result = content.get('positionResult') if isinstance(content, dict) else None
            if isinstance(position_result, dict) and not position_result.get('result'):
                break

            page += 1
    
    if __name__=='__main__':
        # Script entry point: crawl starting from page 1. The output file is
        # closed in ``finally`` so everything written so far is flushed to
        # lagou.txt even when the crawl stops with an exception.
        try:
            req(1)
        finally:
            lagou.close()
    
    

    相关文章

      网友评论

          本文标题:2019-07-30 拉勾 反爬虫

          本文链接:https://www.haomeiwen.com/subject/jtqorctx.html