Web Scraping: Using the urllib Module

Author: 烤奶要加冰 | Published 2019-10-16 12:19

    Note: urllib is part of the Python standard library; there is nothing to install, just import it at the top of your .py file.

    Imports

    import urllib.request
    import urllib.parse    # for urlencode()
    import urllib.error    # for URLError / HTTPError

    Syntax 1

    urllib.request.urlopen(url, data, timeout)

    • url: the URL to request;
    • data: the request body; if supplied, the request is sent as POST;
    • timeout: timeout in seconds; the request fails if it takes longer.
      Example:
      # Case 1: passing only the url
      url = 'http://www.baidu.com'
      response = urllib.request.urlopen(url)
      print(response)
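      # response is an http.client.HTTPResponse object; besides read(),
      # it exposes the status code and the response headers
      print(response.status)        # e.g. 200
      print(response.getheaders())  # list of (name, value) tuples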
      # read() returns the response body; the default type is bytes
      text = response.read().decode('utf-8')
      
      # Case 2: passing both the url and the parameters
      # Parameters embedded in the URL itself make this a GET request
      url = 'http://www.baidu.com/s?wd=python'
      response = urllib.request.urlopen(url)
      text = response.read().decode('utf-8')
      
      # Parameters passed via the data argument of urlopen() make this a POST request
      url = 'http://www.baidu.com/s'
      data = {
          'wd': 'python'
      }
      # urlencode() converts the dict into key=value form
      data = urllib.parse.urlencode(data)
      # bytes() converts data to the bytes type that urlopen() requires
      data = bytes(data, encoding='utf-8')
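      # equivalently: data = data.encode('utf-8')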
      response = urllib.request.urlopen(url, data)
      text = response.read().decode('utf-8')
      
      # Case 3: passing the timeout parameter
      url = 'http://www.baidu.com'
      try:
          response = urllib.request.urlopen(url, timeout=0.01)
          text = response.read().decode('utf-8')
          print(text)
      except urllib.error.URLError as e:
          print('request timed out')
      
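    For GET requests the query string does not have to be written by hand either; urllib.parse.urlencode can build it from a dict. A minimal sketch of the same search as above:

    # Building a GET query string with urlencode()
    params = urllib.parse.urlencode({'wd': 'python'})
    url = 'http://www.baidu.com/s?' + params
    response = urllib.request.urlopen(url)
    text = response.read().decode('utf-8')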

    Syntax 2

    request = urllib.request.Request(url, data, headers, method)
    response = urllib.request.urlopen(request)

    Example:

    # request = urllib.request.Request(url, data, headers, method)
    # urllib.request.urlopen() accepts the Request object
    
    # Request(url): passing only the url
    url = 'https://movie.douban.com/top250'
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    text = response.read().decode('utf-8')
    
    # Request(url, data): passing both the url and the data
    # data must be bytes; supplying it makes this a POST request
    url = 'http://www.baidu.com/s'
    data = {
        'wd': 'python'
    }
    data = urllib.parse.urlencode(data)
    data = bytes(data, encoding='utf-8')
    request = urllib.request.Request(url, data)
    response = urllib.request.urlopen(request)
    text = response.read().decode('utf-8')
    
    # Request(url, headers=headers): headers is the request-header dict,
    # e.g. the User-Agent field, which a server can inspect to decide
    # whether the request comes from a crawler or a human browser
    
    url = 'http://httpbin.org/get'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    text = response.read().decode('utf-8')
    print(text)
    
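    The method parameter from the Request signature above is not exercised in these examples. A minimal sketch, using http://httpbin.org/put as a neutral test endpoint (httpbin echoes the request back):

    # Explicitly choosing the HTTP method with Request(method=...)
    url = 'http://httpbin.org/put'
    data = bytes(urllib.parse.urlencode({'wd': 'python'}), encoding='utf-8')
    request = urllib.request.Request(url, data=data, method='PUT')
    response = urllib.request.urlopen(request)
    print(response.read().decode('utf-8'))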

    Syntax 3

    proxy_handler = urllib.request.ProxyHandler(proxies=proxies)
    opener = urllib.request.build_opener(proxy_handler)
    res = opener.open(request)  # or: res = opener.open(url)

    # Route requests through a proxy IP
    url = 'http://httpbin.org/get'
    proxies = {
        'http': 'http://113.120.63.179:9999'
    }
    proxy_handler = urllib.request.ProxyHandler(proxies=proxies)
    opener = urllib.request.build_opener(proxy_handler)
    # Two ways to set the User-Agent on requests made through the opener
    # Method 1: attach headers to the opener itself
    opener.addheaders = [('User-Agent', '')]  # fill in a real UA string here
    response = opener.open(url)
    # Method 2: set headers on a Request object (headers dict defined above)
    request = urllib.request.Request(url, headers=headers)
    response = opener.open(request)
    
    text = response.read().decode('utf-8')
    print(text)
    
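    When scraping it also helps to tell HTTP errors apart from connection failures. A minimal sketch using urllib.error; HTTPError is a subclass of URLError, so it must be caught first (http://httpbin.org/status/404 is a test endpoint that always returns 404):

    url = 'http://httpbin.org/status/404'
    try:
        response = urllib.request.urlopen(url, timeout=5)
        print(response.read().decode('utf-8'))
    except urllib.error.HTTPError as e:
        # the server responded, but with an error status code
        print('HTTP error:', e.code, e.reason)
    except urllib.error.URLError as e:
        # the request never completed: DNS failure, refused connection, timeout
        print('connection error:', e.reason)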
