
Basic operations of the requests library for Python web scraping

Author: Pickupthesmokes | Published 2018-12-23 15:54

    1. requests_post: POST requests

        import requests
    
        # url: the target URL
        # data=None: the form data to upload with the POST request
    
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    
        form_data = {
            'first': 'true',
            'pn': 1,
            'kd': 'python',
        }
    
        # set the request headers
        req_header = {
            'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        }
    
        response = requests.post(url,data=form_data,headers=req_header)
    
        print(response.status_code)
    
        print(response.text)
    
        # response.json() parses the returned JSON string into a Python data type
        data = response.json()
        print(type(data))
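
    Note: if the server returns something other than valid JSON (for example an
    anti-scraping page instead of the expected API payload), response.json()
    raises a ValueError. A minimal guard, reusing the response object above:

        try:
            data = response.json()
        except ValueError:
            # the body was not valid JSON; inspect the raw text instead
            print('non-JSON response:', response.text[:200])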
    

    2. requests_post_file: file upload

       import requests
    
       # file upload
       # test endpoint
       url = 'https://httpbin.org/post'
       files = {
           'file': open('cookies.txt', 'rb')  # binary mode is recommended for uploads
       }
    
       response = requests.post(url,files=files)
    
       print(response.status_code)
       print(response.text)
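
    requests also accepts a tuple per file, which lets you set the upload
    filename and content type explicitly. A minimal sketch (the MIME type here
    is just an example):

       files = {
           'file': ('cookies.txt', open('cookies.txt', 'rb'), 'text/plain')
       }
       response = requests.post(url, files=files)
       print(response.status_code)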
    

    3. requests_auth: HTTP client authentication

       # web client authentication
       import requests
    
       # set the authentication credentials
       auth = ('username','password')
    
       url = 'http://192.168.1.110'
    
       response = requests.get(url,auth=auth)
    
       print(response.status_code)
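
    The (username, password) tuple above is shorthand for HTTP Basic auth. For
    sites that use Digest authentication instead, requests provides
    HTTPDigestAuth; a sketch against the same placeholder URL:

       from requests.auth import HTTPDigestAuth

       response = requests.get(url, auth=HTTPDigestAuth('username', 'password'))
       print(response.status_code)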
    

    4. requests_cookie: simulated login with cookies

       # using cookies with requests
    
       import requests
       # from inspecting the login request to
       # https://www.douban.com/accounts/login
       # without a captcha, the form fields are:
       # source: index_nav
       # form_email: 18518753265
       # form_password: ljh12345678
    
       # with a captcha, the form fields are:
       # source: index_nav
       # form_email: 18518753265
       # form_password: ljh12345678
       # captcha-solution: blade
       # captcha-id: 5IBtw5wm2riyrIrnV3utwUPt:en
    
       url = 'https://www.douban.com/accounts/login'
    
       form_data = {
           'source': 'index_nav',
           'form_email': '18518753265',
           'form_password': 'ljh12345678',
           'captcha-solution': 'violent',
           'captcha-id': 'AuKNJ1FIktyrmpljJ6WAzXo3:en'
       }
    
       # set the request headers
       req_header = {
           'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
       }
    
       # send the login request
       response = requests.post(url,headers=req_header,data=form_data)
    
       # response.cookies holds the cookie information
       print('cookies after the simulated login:', response.cookies)
       print(type(response.cookies))
       print(response.headers)
    
       with open('douban.html','w') as file:
           file.write(response.text)
    
       # requests.utils.cookiejar_from_dict(): convert a dict into a CookieJar
       # requests.utils.dict_from_cookiejar(): convert a CookieJar into a dict
       cookies_dict = requests.utils.dict_from_cookiejar(response.cookies)
       print(cookies_dict)

       # after a successful login, visit the profile page; getting the profile
       # content back shows the cookie really was saved and is carried on the
       # next request
       url = 'https://www.douban.com/people/175417123/'
       # pass the cookies parameter to simulate a logged-in user's request
       response = requests.get(url,headers=req_header,cookies=cookies_dict)
    
       if response.status_code == 200:
           with open('douban1.html','w') as file:
               file.write(response.text)
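
    The reverse helper mentioned above also works: a plain dict can be turned
    back into a CookieJar before sending. A minimal sketch reusing cookies_dict
    from the code above:

       cookie_jar = requests.utils.cookiejar_from_dict(cookies_dict)
       response = requests.get(url, headers=req_header, cookies=cookie_jar)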
    

    5. requests_proxies: proxies

      # setting a proxy with the requests module
     import requests
    
      # NOTE: a dict cannot hold duplicate keys (later entries silently
      # overwrite earlier ones), so give exactly one proxy per scheme;
      # an authenticated proxy uses the 'scheme://username:password@ip:port' form
      proxies = {
          'http': 'http://219.238.186.188:8118',
          'https': 'https://222.76.204.110:808',
          # 'http': 'http://username:password@ip:port',
          # 'https': 'https://username:password@ip:port',
      }
    
     url = 'https://httpbin.org/get'
    
     response = requests.get(url,proxies=proxies,timeout=10)
    
     print(response.text)
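
    requests can also route traffic through a SOCKS proxy, provided the
    optional dependency is installed (pip3 install requests[socks]). The
    address below is a placeholder:

      proxies = {
          'http': 'socks5://127.0.0.1:1080',
          'https': 'socks5://127.0.0.1:1080',
      }
      response = requests.get('https://httpbin.org/get', proxies=proxies, timeout=10)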
    

    6. requests_session: sessions

      # requests.Session(): maintains a session so certain parameters persist across requests
    
    
      import requests
    
      # instantiate a Session
      session = requests.Session()
    
      # target url
      url = 'https://www.douban.com/accounts/login'
    
      form_data = {
          'source': 'index_nav',
          'form_email': '18518753265',
          'form_password': 'ljh12345678',
          'captcha-solution': 'stamp',
          'captcha-id': 'b3dssX515MsmNaklBX8uh5Ab:en'
      }
    
      # set the request headers
      req_header = {
          'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
      }
    
      # send the login request through the session
      response = session.post(url,headers=req_header,data=form_data)
    
      if response.status_code == 200:
    
          # visit the profile page:
          url = 'https://www.douban.com/people/175417123/'
    
          response = session.get(url,headers = req_header)
    
          if response.status_code == 200:
    
              with open('douban3.html','w') as file:
    
                  file.write(response.text)
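
    Anything set on the session object itself persists across every request it
    makes, so headers only need to be configured once. A minimal sketch reusing
    req_header and form_data from above:

      session = requests.Session()
      # sent automatically with every subsequent request on this session
      session.headers.update(req_header)

      session.post('https://www.douban.com/accounts/login', data=form_data)
      session.get('https://www.douban.com/people/175417123/')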
    

    7. Using requests

      # pip3 install requests
      # the requests module is built on top of urllib3; it provides everything
      # urllib can do, with a much simpler, cleaner API
    
      import requests
    
      # url = 'http://www.baidu.com/'
      url = 'http://www.sina.com'
      # url: the target URL to request
      # params: query parameters appended to a GET request
      """
      :param method: the type of request to send
      :param url: the target URL to request
      :param params: query parameters appended to a GET request
      :param data: Dictionary, form data for a POST request
      :param json: JSON payload, similar in effect to data above
      :param headers: (optional) Dictionary of request headers
      :param cookies: (optional) Dict or CookieJar object (cookies to send, simulating a user)
      :param files: files to upload
      :param auth: credentials for sites that require authentication (username and password)
      :param timeout: request timeout in seconds
      :param allow_redirects: bool, whether to follow redirects
      :param proxies: (optional) Dictionary of proxies
      :param verify: Defaults to ``True``, meaning SSL certificates are verified; set False to skip verification
      """
      req_header = {
          'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
      }
      params = {
          'wd':'豆瓣'
      }
      # response = requests.get(url,params=params,headers=req_header)
      response = requests.get(url,headers=req_header)
      response.encoding='utf-8'
    
      # information extracted from the response
      # (response.text is the decoded string)
      html = response.text
    
      """
      #如果使用response.text出现了乱码
      方式一
      #response.content.decode('')
      方式二
      response.encoding=''设置编码类型
      """
    
      # get the response body as bytes
      b_html = response.content
      # get the status code
      code = response.status_code
      # get the response headers
      response_headers = response.headers
      # the request headers that were sent
      req_headers = response.request.headers
      # get the URL of the current request
      current_url = response.url
      # response.json(): parses a JSON string into a Python data type
      print(code)
      print(html)
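
    The params argument from the parameter list above is easiest to see against
    httpbin.org, which echoes back the query string it receives:

      params = {'wd': '豆瓣'}
      response = requests.get('https://httpbin.org/get', params=params)
      print(response.url)     # the dict is URL-encoded and appended as ?wd=...
      print(response.json())  # httpbin echoes the parsed query arguments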
    

    8. requests: SSL certificate verification issues

    import requests
    
    url = 'https://www.baidu.com/'
    
    # verify defaults to True, meaning the server's CA certificate is checked;
    # if a request fails with an SSL certificate error,
    # just set verify to False
    response = requests.get(url,verify=False)
    
    print(response.status_code)
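
    With verify=False, urllib3 emits an InsecureRequestWarning on every
    request. If that warning is noise, it can be silenced explicitly; a minimal
    sketch:

    import urllib3

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    response = requests.get(url, verify=False)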
    
