美文网首页计算机+技术+世界
爬虫基础:Requests库

爬虫基础:Requests库

作者: xuzhougeng | 来源:发表于2017-05-01 09:21 被阅读72次

    Requests库

    Requests使用Python语言编写,基于urllib,采用Apache2 Licensed开源协议的HTTP库。
    它比urllib更加方便,可以节约我们大量的工作,完全满足HTTP测试需求。存在大量的API方便调用。

    一句话--Python实现的简单易用的HTTP库。基本上能用urllib的部分都可以用Requests代替。

    安装

    pip install requests
    

    用法

    实例引入

    import requests

    # Issue a GET request and inspect the Response object it returns.
    resp = requests.get("https://www.baidu.com/")
    print(type(resp))
    # HTTP status code of the reply
    print(resp.status_code)
    # Body decoded as text
    print(resp.text)
    print(resp.cookies)
    

    各种请求方式

    import requests

    # httpbin.org is a request/response testing service.
    # Each HTTP verb has a matching top-level helper in requests.
    base = 'http://httpbin.org'
    requests.post(base + '/post')
    requests.put(base + '/put')
    requests.delete(base + '/delete')
    requests.head(base + '/get')
    requests.options(base + '/get')
    

    基本GET请求

    基本写法:

    import requests

    # Plain GET; httpbin echoes the request back as JSON text.
    resp = requests.get("http://httpbin.org/get")
    print(resp.text)
    

    带参数GET请求

    # The query string is the ?name=germey&age=22 part of the URL.
    # Writing it into the URL by hand is clumsy and error-prone.
    import requests

    resp = requests.get("http://httpbin.org/get?name=germey&age=22")
    print(resp.text)

    # Cleaner: hand requests a dict and let it build the query string.
    import requests

    payload = {'name': 'germey', 'age': 22}
    resp = requests.get("http://httpbin.org/get", params=payload)
    print(resp.text)
    

    解析json:自带了json方法

    import requests
    import json

    resp = requests.get("http://httpbin.org/get")
    # resp.text is a plain str of JSON ...
    print(type(resp.text))
    # ... and resp.json() decodes it — equivalent to json.loads(resp.text).
    print(resp.json())
    print(json.loads(resp.text))
    print(type(resp.json()))
    

    获取二进制数据(图片,视频,音频)

    import requests

    # Binary payloads (images, video, audio) live in response.content
    # as bytes; response.text would try to decode them as text.
    response = requests.get("https://github.com/favicon.ico")
    print(type(response.text), type(response.content))

    ## Save the raw bytes to disk. The with-statement closes the file
    ## automatically — the original also called f.close() inside the
    ## with block, which is redundant and has been removed.
    with open('favicon.ico', 'wb') as f:
        f.write(response.content)
    

    添加headers

    # Without browser-like headers some sites refuse the request outright —
    # e.g. Baidu search returns no results, Zhihu explore blocks you.
    ## https://www.baidu.com/s?wd=风景
    ## https://www.zhihu.com/explore
    import requests

    query = {'wd': '风景'}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Host': 'www.baidu.com',
    }
    resp = requests.get("https://www.baidu.com/s", params=query, headers=headers)
    print(resp.text)
    

    基本POST请求

    基本是以form表单形式提交

    import requests

    # data= submits the payload as an HTML form
    # (Content-Type: application/x-www-form-urlencoded).
    form = {'name': 'germey', 'age': '22'}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    }
    resp = requests.post("http://httpbin.org/post", data=form, headers=headers)
    print(resp.json())
    
    

    响应

    response属性

    import requests

    # Commonly used attributes of a Response object.
    response = requests.get('http://www.jianshu.com')
    print(type(response.status_code), response.status_code)
    print(type(response.headers), response.headers)
    print(type(response.cookies), response.cookies)
    print(type(response.url), response.url)
    # Original line was missing the closing parenthesis (SyntaxError); fixed.
    print(type(response.history), response.history)
    
    

    状态码判断

    import requests

    response = requests.get('http://www.jianshu.com')
    # requests.codes.ok == 200. A plain if/else reads better than the
    # original conditional-expression-used-as-a-statement.
    if response.status_code != requests.codes.ok:
        exit()
    print("Request successful")
    

    高级操作

    文件上传

    # File upload (multipart/form-data)
    import requests

    # Use a context manager so the file handle is always closed —
    # the original opened the file inline and leaked the handle.
    with open('favicon.ico', 'rb') as fh:
        response = requests.post('http://httpbin.org/post', files={'file': fh})
    print(response.text)
    

    获取cookies

    import requests

    resp = requests.get('https://www.baidu.com/')
    print(resp.cookies)
    # RequestsCookieJar supports dict-style iteration over its items.
    for name, value in resp.cookies.items():
        print(name + '=' + value)
    

    会话维持: Session

    ## Simulating a login session
    import requests

    # Two bare requests.get calls are independent requests: the cookie
    # set by the first one is gone by the time the second runs.
    requests.get('http://httpbin.org/cookies/set/number/123456789')
    resp = requests.get('http://httpbin.org/cookies')
    print(resp.text)

    # A Session object keeps cookies across calls, so the two requests
    # below belong to the same conversation.
    sess = requests.Session()
    sess.get('http://httpbin.org/cookies/set/number/123456789')
    resp = sess.get('http://httpbin.org/cookies')
    print(resp.text)
    

    证书验证:cert or verify

    # Accessing 12306 directly used to fail certificate verification.
    import requests
    from requests.packages import urllib3

    # The unverified request raises SSLError; in the original script this
    # uncaught exception stopped the demo before the workarounds ran.
    try:
        response = requests.get('https://www.12306.cn')
        print(response.status_code)
    except requests.exceptions.SSLError as err:
        print('SSL verification failed:', err)

    # Option 1: disable SSL verification for this request, and silence
    # the InsecureRequestWarning urllib3 emits for unverified requests.
    urllib3.disable_warnings()
    response = requests.get('https://www.12306.cn', verify=False)
    print(response.status_code)

    # Option 2: supply a client certificate/key pair explicitly.
    response = requests.get('https://www.12306.cn',
                            cert=('path/server.crt', 'path/key'))
    
    

    代理设置:proxies

    # Plain HTTP and HTTPS proxies.
    import requests

    # The original dict had three defects: a duplicate 'http' key (the
    # second silently overwrote the first), an 'htpps' scheme typo, and
    # a missing comma between entries (SyntaxError).
    proxies = {
        'http': 'http://127.0.0.1:9743',
        'https': 'https://127.0.0.1:9743',
        # For a proxy requiring authentication use the form:
        # 'http': 'http://user:password@127.0.0.1:9743',
    }
    # Must be the proxies= keyword argument; the original passed the
    # literal string 'proxies=proxies', which requests ignores as params.
    response = requests.get('https://www.taobao.com', proxies=proxies)
    print(response.status_code)

    # For a SOCKS proxy (e.g. shadowsocks), install the socks extra first:
    # pip3 install "requests[socks]"
    import requests

    proxies = {
        # Scheme is socks5:// — the original omitted the slashes.
        'http': 'socks5://127.0.0.1:9742',
        'https': 'socks5://127.0.0.1:9742',
    }
    response = requests.get('https://www.baidu.com', proxies=proxies)
    
    

    超时设置:timeout

    import requests
    from requests.exceptions import ReadTimeout

    # requests raises ReadTimeout when the server takes longer than the
    # timeout= limit; catch the specific exception class it documents.
    try:
        resp = requests.get('http://httpbin.org/get', timeout=0.1)
        print(resp.status_code)
    except ReadTimeout:
        print('Timeout')
    

    认证设置:auth

    import requests
    from requests.auth import HTTPBasicAuth

    # The explicit HTTPBasicAuth object and the (user, password) tuple
    # shorthand are equivalent ways to send basic auth credentials.
    r1 = requests.get('http://120.27.34.24:9001', auth=HTTPBasicAuth('user', '123'))
    r2 = requests.get('http://120.27.34.24:9001', auth=('user', '123'))
    print(r2.status_code)
    

    异常处理

    # Catch exceptions from most specific subclass to base class.
    import requests
    from requests.exceptions import (ConnectionError, HTTPError,
                                     ReadTimeout, RequestException)

    try:
        response = requests.get('http://httpbin.org/get', timeout=0.1)
        print(response.status_code)
    except ReadTimeout:
        print('Timeout')
    except ConnectionError:
        # Must be requests.exceptions.ConnectionError — the builtin
        # ConnectionError the original caught is a different class and
        # does not catch requests' connection failures.
        print('ConnectionError')
    except RequestException:
        print('Error')
    

    推荐阅读:http://blog.csdn.net/qq_18863573/article/details/52775130

    相关文章

      网友评论

        本文标题:爬虫基础:Requests库

        本文链接:https://www.haomeiwen.com/subject/hssftxtx.html