urllib
python2 - urllib2
python3 - urllib
import urllib.request

# Basic GET with urllib. urlopen also accepts a Request object:
#   req = urllib.request.Request("http://www.baidu.com")
#   resp = urllib.request.urlopen(req)
resp = urllib.request.urlopen("http://www.baidu.com")
print(resp.read())  # bytes; call .decode() for str
带请求头的get请求:
import urllib.request

# GET with custom request headers.
# NOTE(review): `url` and the header values are placeholders the
# reader is expected to fill in -- this snippet does not define `url`.
headers = {}
req = urllib.request.Request(url, headers=headers)
resp = urllib.request.urlopen(req)
# .read() yields bytes; chain .decode() to get str
print(resp.read())
post请求:
import urllib.request
import urllib.parse

# POST with urllib: form data must be url-encoded, then encoded to
# bytes, before being passed as the Request body.
headers = {}
form = {"email": "efd@123.com", "password": "123"}
payload = urllib.parse.urlencode(form).encode('utf-8')
req = urllib.request.Request('http://www.renren.com/PLogin.do', data=payload, headers=headers)
resp = urllib.request.urlopen(req)
print(resp.read())
requests模块
requests的底层实现是urllib,python2和python3通用,可以自动解压响应内容
import requests

# Minimal GET with requests. .text is the body decoded with the
# encoding requests inferred from the response.
url = 'http://www.baidu.com'
resp = requests.get(url)
print(resp.text)
response的基本属性:
response.text 响应体str 编码方式 response.encoding='gbk'
response.content 响应体 bytes 解码方式 response.content.decode('utf8')
response.status_code 状态码
response.request.headers 响应对应的请求头
response.headers 响应头
response.request._cookies 响应对应请求的cookie
response.cookies 响应的cookie(经过set-cookie动作)
获取网页源的方法:
# Try these in order to decode a response body; together they cover
# the common cases (utf-8 first, then gbk, then the encoding that
# requests inferred itself).
response.content.decode()
response.content.decode('gbk')
response.text
将网络图片下载到本地:
import requests

# Download a binary resource (an image) and write it to disk.
url = 'https://www.baidu.com/img/bd_logo1.png'
resp = requests.get(url)  # body is binary image data
with open('baidu.png', 'wb') as f:
    f.write(resp.content)
模拟浏览器,发送带请求头的请求:
import requests

# Send a browser-like User-Agent so the server treats us as a normal
# browser instead of an unknown client.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
url = 'http://www.baidu.com'
response = requests.get(url, headers=headers)
print(response.request.headers)  # the headers that were actually sent
带参数的get请求:(有时候在url地址中很多参数是没有用的,可以多尝试,只保留有效的参数)
# 方式一
import requests
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
url = 'https://www.baidu.com/s?wd=bilibili'
reponse = requests.get(url, headers=headers)
# 方式二
import requests
url = 'https://www.baidu.com/s?'
kw = {'wd': 'bibili'}
response = requests.get(url, headers=headers, params=kw)
print(response.content)
post请求:
import requests

# POST a form body. Bug fix: the headers dict was created but never
# bound to a name, so `headers=headers` below raised NameError.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
data = {'query': '好', 'from': 'zh', 'to': 'en'}
response = requests.post("http://fanyi.baidu.com/basetrans", data=data, headers=headers)
关于代理
正向代理:对于浏览器知道服务器的真实地址,例如VPN
反向代理:浏览器不知道服务器的真实地址,例如nginx
import requests

# Route the request through an HTTP / HTTPS proxy.
# NOTE(review): 123.333.445.444 is a placeholder, not a routable IP
# (octets exceed 255) -- substitute a real proxy address.
proxies = {
    'http': 'http://123.333.445.444:23434',
    'https': 'https://123.333.445.444:23434',
}
requests.get('http://www.baidu.com', proxies=proxies)
cookies的使用:
# Send cookies either inside the Cookie request header or via the
# cookies= keyword argument. Bug fix: removed a stray extra `}` that
# made the original snippet a syntax error.
# NOTE(review): `url` is a placeholder -- this snippet does not define it.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Cookie": " Pycharm-26c2d973=dbb9b300-2483-478f-9f5a-16ca4580177e; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1512607763; Pycharm-26c2d974=f645329f-338e-486c-82c2-29e2a0205c74; _xsrf=2|d1a3d8ea|c5b07851cbce048bd5453846445de19d|1522379036",
}
cookies = {'name': 'value'}
requests.get(url, headers=headers, cookies=cookies)
session:
# A Session persists cookies (and reuses connections) across requests.
# Improvement: requests.Session() is the documented class;
# lowercase requests.session() is a legacy alias.
session = requests.Session()
response = session.get(url, headers=headers)
其他参数:
import requests

# verify=False skips TLS certificate verification; timeout=3 aborts
# the request after 3 seconds. Bug fix: the keyword was misspelled
# `verfy`, which raises TypeError: unexpected keyword argument.
response = requests.get(url, verify=False, timeout=3)
from retrying import retry
@retry(stop_max_attempt_number=3)
def _parse_url(url):
    """GET `url` and return the response.

    Raises (letting @retry re-attempt, at most 3 times) unless the
    server answers with status 200 within 3 seconds.
    """
    # Bug fix: was assigned to `reponse`, so the assert below always
    # failed with NameError instead of checking the status code.
    response = requests.get(url, timeout=3)
    assert response.status_code == 200
    return response
def parse_url(url):
    """Fetch `url` through _parse_url; on any failure print the
    error and return None instead of raising."""
    try:
        return _parse_url(url)
    except Exception as err:
        print(err)
        return None
网友评论