- 什么是Urllib
- Python内置的http请求库
- urllib库常用方法
urllib.request 请求模块
urllib.error 异常处理模块
urllib.parse url解析模块
urllib.robotparser robots.txt解析模块
- Python2和Python3比较
Python2
>>> import urllib2
>>> response = urllib2.urlopen('http://www.baidu.com')
Python3
>>> import urllib.request
>>> response = urllib.request.urlopen('http://www.baidu.com')
- 用法讲解
- urlopen
get类型的操作
from urllib import request
response = request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))#获取响应体的内容
post类型的请求
from urllib import parse
data = bytes(parse.urlencode({'word':'hello'}),encoding = 'utf8')
response1 = request.urlopen('http://httpbin.org/post',data = data)#http://httpbin.org/是一个做http测试的网站
print(response1.read())
- 响应
响应类型
print(type(response))
状态码、响应头
print(response.status)#状态码
print(response.getheaders())#响应头
print(response.getheader('Set-Cookie'))
- Request
from urllib import request
from urllib import parse,error
request1 = request.Request('http://python.org/')#此步骤为请求,对比urllib的使用可知可省略
response = request.urlopen(request1)
print(response.read().decode('utf-8'))
from urllib import parse,request,error
import socket
url = 'http://httpbin.org/post'#构造一个POST请求
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64)',
'Host':'httpbin.org'
}
dict1 = {
'name':'Germey'
}
data = bytes(parse.urlencode(dict1),encoding='utf8')#formdata数据
req = request.Request(url = url,data = data,headers = headers,method = 'POST')#构造一个Request对象
response = request.urlopen(req)
print(response.read().decode('utf-8'))#输出结构中可以看出我们前面所构造的headers和dict1
下面是另一种形式的post请求的方式
req1 = request.Request(url = url,data = data,method = 'POST')
req1.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64)')#使用add_header添加
response = request.urlopen(req1)
print(response.read().decode('utf-8'))
- Handler
from urllib import request
proxy_handler = request.ProxyHandler(
{'http':'http://127.0.0.1:9743',
'https':'https://127.0.0.1:9743'
})#此IP为过期IP,最近我的途径被封了,无法为大家展示><sorry
opener = request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')
print(response.read())
- Cookie
from urllib import request
from http import cookiejar
cookie =cookiejar.CookieJar()#创建一个CookieJar对象用于存放cookie
handler = request.HTTPCookieProcessor(cookie)
opener = request.build_opener(handler)
response =opener.open('http://www.baidu.com')
for item in cookie:
print(item.name+'='+item.value)
- 异常处理
from urllib import request,error
#我们试着访问一个不存在的网址
try:
response = request.urlopen('http://www.cuiqingcai.com/index.html')#http://www.cuiqingcai.com/此链接为崔老师的个人博客
except error.URLError as e:
print(e.reason)#通过审查可以查到我们捕捉的异常是否与之相符
- URL解析
from urllib.parse import urlparse
result = urlparse('https://www.baidu.com/s?wd=urllib&ie=UTF-8')
print(type(result),result) #<class 'urllib.parse.ParseResult'>
#无协议类型指定,自行添加的情况
result = urlparse('www.baidu.com/s?wd=urllib&ie=UTF-8',scheme = 'https')
print(result)
#有指定协议类型,添加的情况
result1 = urlparse('http://www.baidu.com/s?wd=urllib&ie=UTF-8',scheme = 'https')
print(result1)
#allow_fragments参数使用
result1 = urlparse('http://www.baidu.com/s?#comment',allow_fragments = False)
result2 = urlparse('http://www.baidu.com/s?wd=urllib&ie=UTF-8#comment',allow_fragments = False)
print(result1,result2)#allow_fragments=False表示不把#后面的内容单独解析为fragment,原本属于fragment位置的内容会向前拼接到query(或path)中,可以对比result1和result2的区别
网友评论