A Look at the Fairly Basic urllib Library

Author: 爱吃西瓜的番茄酱 | Published 2018-05-30 21:08

    What is the urllib library?

    urllib is Python's built-in HTTP request library. It consists of:

    • urllib.request: the request module
    • urllib.error: the exception handling module
    • urllib.parse: the URL parsing module
    • urllib.robotparser: the robots.txt parsing module (see the short sketch after this list)
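
    Of these four, urllib.robotparser is the only one not demonstrated later in this article. A minimal sketch of how it might be used (the target site here is just for illustration):

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url('http://www.baidu.com/robots.txt')  # point the parser at the site's robots.txt
    rp.read()  # fetch and parse the file
    # can_fetch() reports whether the given user agent is allowed to crawl the URL
    print(rp.can_fetch('Python-urllib', 'http://www.baidu.com/'))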

    Changes from Python 2

    In Python 2.x this library was known as urllib2; in Python 3.x, urllib2 was merged into the urllib package, which is split into submodules such as:

    • urllib.request
    • urllib.parse
    • urllib.error

    Python 2

    import urllib2
    response = urllib2.urlopen('http://www.baidu.com')
    

    Python 3

    import urllib.request
    response = urllib.request.urlopen('http://www.baidu.com')
    

    The urlopen function

    Function signature

    # Function signature:
    urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
    # The main parameters are the request URL, the data payload and the timeout setting
    

    Basic example

    import urllib.request
    
    response = urllib.request.urlopen('http://www.baidu.com')
    print(response.read().decode('utf-8'))
    # read() returns the body of the response as bytes
    # decode('utf-8') decodes those bytes using UTF-8
    # (conversely, encoding='utf-8' means encoding text into UTF-8 bytes, as with bytes() below)
    

    The output is the HTML source of the Baidu home page; it is too long to paste here.
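
    As an aside, hard-coding 'utf-8' is not always safe. If you prefer, the charset can be read from the response's Content-Type header, falling back to UTF-8 when the server does not declare one; a small sketch:

    import urllib.request

    response = urllib.request.urlopen('http://www.baidu.com')
    # get_content_charset() parses the charset out of the Content-Type header
    charset = response.headers.get_content_charset() or 'utf-8'
    print(response.read().decode(charset)[:200])  # print only the first 200 characters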

    Passing data to urlopen

    import urllib.request
    import urllib.parse
    
    data = bytes(urllib.parse.urlencode({'word':'hello'}), encoding='utf-8')
    # pass data in the urlopen call; this turns the request into a POST
    # http://httpbin.org is a site for testing HTTP requests
    response = urllib.request.urlopen('http://httpbin.org/post', data=data)
    print(response.read())
    
    b'{"args":{},"data":"","files":{},"form":{"word":"hello"},"headers":{"Accept-Encoding":"identity","Connection":"close","Content-Length":"10","Content-Type":"application/x-www-form-urlencoded","Host":"httpbin.org","User-Agent":"Python-urllib/3.6"},"json":null,"origin":"117.139.10.7","url":"http://httpbin.org/post"}\n'
    

    Setting the timeout parameter

    import urllib.request
    
    # set the timeout parameter (in seconds)
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
    print(response.read())
    
    b'{"args":{},"headers":{"Accept-Encoding":"identity","Connection":"close","Host":"httpbin.org","User-Agent":"Python-urllib/3.6"},"origin":"117.139.10.7","url":"http://httpbin.org/get"}\n'
    
    import urllib.request
    import urllib.error
    import socket
    
    try:
        response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
    except urllib.error.URLError as e:
        if isinstance(e.reason, socket.timeout):
            print('TIME OUT')
    
    
    TIME OUT
    

    The response

    Response type

    import urllib.request
    
    response = urllib.request.urlopen('http://www.python.org')
    print(type(response))
    
    <class 'http.client.HTTPResponse'>
    

    Status code and response headers

    import urllib.request
    
    response = urllib.request.urlopen('https://www.python.org')
    print(response.status)  # print the status code
    print(response.getheaders())
    print(response.getheader('Server'))
    
    200
    [('Server', 'nginx'), ('Content-Type', 'text/html; charset=utf-8'), ('X-Frame-Options', 'SAMEORIGIN'), ('x-xss-protection', '1; mode=block'), ('X-Clacks-Overhead', 'GNU Terry Pratchett'), ('Via', '1.1 varnish'), ('Content-Length', '48703'), ('Accept-Ranges', 'bytes'), ('Date', 'Tue, 29 May 2018 10:57:05 GMT'), ('Via', '1.1 varnish'), ('Age', '932'), ('Connection', 'close'), ('X-Served-By', 'cache-iad2148-IAD, cache-lax8633-LAX'), ('X-Cache', 'HIT, HIT'), ('X-Cache-Hits', '1, 14'), ('X-Timer', 'S1527591425.014404,VS0,VE0'), ('Vary', 'Cookie'), ('Strict-Transport-Security', 'max-age=63072000; includeSubDomains')]
    nginx
    

    Request

    # urlopen by itself cannot attach custom headers
    # a Request object can carry headers and other information
    import urllib.request
    
    request = urllib.request.Request('https://www.python.org')
    response = urllib.request.urlopen(request)
    print(response.code)
    
    200
    

    A Request carrying data and headers

    from urllib import request, parse
    
    url = 'http://httpbin.org/post'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    form = {
        'name': 'germey'
    }
    data = bytes(parse.urlencode(form), encoding='utf-8')
    req = request.Request(url=url, data=data, headers=headers, method='POST')
    response = request.urlopen(req)
    print(response.read().decode('utf-8'))
    
    {"args":{},"data":"","files":{},"form":{"name":"germey"},"headers":{"Accept-Encoding":"identity","Connection":"close","Content-Length":"11","Content-Type":"application/x-www-form-urlencoded","Host":"httpbin.org","User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"},"json":null,"origin":"117.139.10.7","url":"http://httpbin.org/post"}
    

    cookie

    # cookies are used to keep login state across requests
    import http.cookiejar, urllib.request
    
    cookie = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    for item in cookie:
        print(item.name+ " = " +item.value)
    
    BAIDUID = E2078AB08DD6A6FE566A65305B8E1944:FG=1
    BIDUPSID = E2078AB08DD6A6FE566A65305B8E1944
    H_PS_PSSID = 1460_21080_26430
    PSTM = 1527595557
    BDSVRTM = 0
    BD_HOME = 0
    

    Saving cookie information

    import http.cookiejar, urllib.request
    
    # save the cookie information to a text file
    filename = 'cookie.txt'
    cookie = http.cookiejar.MozillaCookieJar(filename) # the Mozilla/Netscape cookies.txt file format
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    cookie.save(ignore_discard=True, ignore_expires=True) # use the save method to write the cookies to the file
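
    The file format is a choice rather than a requirement: http.cookiejar also provides LWPCookieJar, which saves in libwww-perl format but is otherwise used the same way. A sketch along the same lines (the file name is just an example):

    import http.cookiejar, urllib.request

    filename = 'cookie_lwp.txt'  # example file name
    cookie = http.cookiejar.LWPCookieJar(filename)  # libwww-perl (LWP) cookie file format
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    cookie.save(ignore_discard=True, ignore_expires=True)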
    

    Using the load method to read saved cookie information

    import http.cookiejar, urllib.request
    
    cookie = http.cookiejar.MozillaCookieJar()
    # use the load method to read the previously saved cookie information
    # and send these cookies along with the next request
    cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    print(response.code)
    
    200
    

    Exception handling

    from urllib import request, error
    
    try: 
        response = request.urlopen('http://www.jianshu.com/index.html')
    except error.URLError as e:
        print(e.reason)
    
    Forbidden
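
    For an HTTP error response such as this 403, urllib actually raises urllib.error.HTTPError, a subclass of URLError that additionally carries the status code and the response headers. A common pattern is to catch HTTPError first and fall back to URLError:

    from urllib import request, error

    try:
        response = request.urlopen('http://www.jianshu.com/index.html')
    except error.HTTPError as e:
        # HTTPError exposes the status code, the reason phrase and the response headers
        print(e.code, e.reason)
        print(e.headers)
    except error.URLError as e:
        # URLError covers lower-level failures such as DNS errors or refused connections
        print(e.reason)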
    

    Checking the specific type of the exception

    import socket
    import urllib.request
    import urllib.error
    
    try:
        response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
    except urllib.error.URLError as e:  # check the specific type of the exception
        print(type(e.reason))
        if isinstance(e.reason, socket.timeout):
            print('TIME OUT')
    
    <class 'socket.timeout'>
    TIME OUT
    

    urlparse

    Function signature

    # Function signature
    urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
    # the scheme parameter is the default protocol to assume when the URL does not specify one
    

    Example:

    from urllib.parse import urlparse
    
    result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
    print(type(result), result)
    
    <class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
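
    The scheme and allow_fragments parameters from the signature above can be seen in a quick sketch; the exact output is easy to verify by running it:

    from urllib.parse import urlparse

    # scheme is only a default: it is used when the URL itself carries no protocol
    print(urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https'))
    # with allow_fragments=False the '#comment' part is not split out as a fragment
    print(urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False))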
    

    urlunparse

    # urlunparse is the inverse of urlparse and can be used to assemble a URL from its parts
    from urllib.parse import urlunparse
    
    data = ['http', 'www.baidu.com', 'index.html', 'user', 'id=5', 'comment']
    print(urlunparse(data))
    
    http://www.baidu.com/index.html;user?id=5#comment
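
    Since urlunparse is the inverse of urlparse, a URL like the one above survives a round trip through both functions:

    from urllib.parse import urlparse, urlunparse

    url = 'http://www.baidu.com/index.html;user?id=5#comment'
    print(urlunparse(urlparse(url)) == url)  # True for a URL of this shape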
    

    urljoin

    # used to join URLs
    from urllib.parse import urljoin
    
    # the second URL takes precedence: any scheme, host or path it supplies overrides the base URL, and missing parts are filled in from the base
    print(urljoin('http://www.baidu.com', 'FAQ.html'))
    print(urljoin('http://www.baidu.com', 'https://www.baidu.com/FAQ.html'))
    print(urljoin('http://www.baidu.com', 'https://www.jianshu.com/u/13b5875d0a63'))
    print(urljoin('https://www.jianshu.com', 'u/13b5875d0a63'))
    
    http://www.baidu.com/FAQ.html
    https://www.baidu.com/FAQ.html
    https://www.jianshu.com/u/13b5875d0a63
    https://www.jianshu.com/u/13b5875d0a63
    

    urlencode

    # urlencode converts a dict into URL query parameters for a GET request
    from urllib.parse import urlencode
    
    params = {
        "name": "gemmry",
        'age': 22
    }
    base_url = 'http://www.baidu.com?'
    url = base_url + urlencode(params)
    print(url)
    
    http://www.baidu.com?name=gemmry&age=22
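
    Going the other way, urllib.parse also offers parse_qs, which turns a query string back into a dict (each value is wrapped in a list, since a parameter may appear more than once):

    from urllib.parse import parse_qs

    query = 'name=gemmry&age=22'
    print(parse_qs(query))
    # {'name': ['gemmry'], 'age': ['22']}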
    

    That covers the commonly used functions of the urllib library. It is honestly still a bit cumbersome; the most convenient HTTP request library is of course requests, which we will look at next time.

    Learn a little every day, improve a little every day.
