1. urllib.request
HTTP requests and responses
1.1 urllib.request.urlopen (send a request)
from urllib import request
url = "https://www.bing.com"
res = request.urlopen(url)  # optionally: request.urlopen(url, timeout=10)
# urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
# *, cafile=None, capath=None, cadefault=False, context=None)
"""
data是发送的数据;None则发起get请求;否则发起POST请求
timeout 设置超时时间,默认一直到访问成功;超时报错:socket.timeout: The read operation timed out
返回 http.client.HTTPResponse object 类文件对象
"""
print(res)
print(res.closed)  # False; it is a file-like object
with res:  # ensures the response is closed on exit
    print(type(res))      # <class 'http.client.HTTPResponse'>
    print(res.status)     # 200
    print(res._method)    # GET; _method is a private attribute, not part of the public API
    print(res.read())     # read the response body as bytes
    print(res.info())     # for an HTTP request, returns the response headers
    print(res.url)        # https://cn.bing.com/; the URL after redirection
    print(res.geturl())   # the URL actually retrieved, i.e. the response object's URL
    print(res.getcode())  # 200; the HTTP status code of the response
print(res.closed)  # True
# inside the with block, res.read().decode("utf-8") would give the HTML as a str instead of bytes
Note:
string.encode("utf-8")  # str -> bytes
bytes.decode("utf-8")   # bytes -> str
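A quick round-trip sketch of the two calls above:
s = "百草"
b = s.encode("utf-8")     # b'\xe7\x99\xbe\xe8\x8d\x89'
print(b.decode("utf-8"))  # 百草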
- Test site: fetch it and see the request headers the server received
from urllib.request import urlopen
url = "http://httpbin.org/get"
resp = urlopen(url)
with resp:
    print(resp.read())
# --------------------------------------------------------
# Output:
{
  "args": {},
  "headers": {
    "Accept-Encoding": "identity",
    "Host": "httpbin.org",
    "User-Agent": "Python-urllib/3.9",
    "X-Amzn-Trace-Id": "Root=1-62cfee93-2dd9d6045c7e5f4620262d67"
  },
  "origin": "202.103.156.155",
  "url": "http://httpbin.org/get"
}
Note:
httpbin.org is a site for testing HTTP requests and responses: it echoes back cookies, the client IP, headers, login credentials and more, and supports GET, POST and other methods, which makes it very handy for web development and testing.
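For example, http://httpbin.org/user-agent echoes back only the User-Agent it received, which is a quick way to check what UA a request actually carries (a minimal sketch):
from urllib.request import urlopen
with urlopen("http://httpbin.org/user-agent") as resp:
    print(resp.read().decode("utf-8"))  # e.g. {"user-agent": "Python-urllib/3.x"}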
1.2 urllib.request.Request (setting the crawler's User-Agent)
- Request header fields to pay attention to:
  - Referer: where the visit came from (some large sites use the Referer header for anti-hotlinking, so a crawler should mimic it)
  - User-Agent: which browser is visiting (must be added, otherwise the request is treated as a crawler)
  - Cookie: remember to carry it in the request headers
Note: the response may carry several Set-Cookie headers; they tell the browser to store the cookies, e.g.
set-cookie: SSO_USER_TOKEN=***; Path=/; Domain=kong.net; Max-Age=2592000; Expires=Sat, 13 Aug 2022 06:45:57 GMT; Secure; HttpOnly
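A small sketch for listing a response's Set-Cookie headers; since there can be several, iterate over all header pairs (the Baidu homepage is just an example of a site that sets cookies):
from urllib.request import urlopen
res = urlopen("https://www.baidu.com")
with res:
    for name, value in res.getheaders():  # all (header, value) pairs of the response
        if name.lower() == "set-cookie":
            print(value)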
"""
urlopen不能构造HTTP的请求。例如User-agent
class OpenerDirector:
def __init__(self):
client_version = "Python-urllib/%s" % __version__
self.addheaders = [('User-agent', client_version)] # 添加request headers
若网站是反爬虫的,则需要把爬虫伪装成浏览器。即复制浏览器中的UA,用来伪装
如 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36
class Request:
def __init__(self, url, data=None, headers={}, # headers是默认空
origin_req_host=None, unverifiable=False,
method=None)
"""
from urllib.request import urlopen, Request
ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
req = Request(url, headers={"User-agent": ua})
# or: req.add_header("User-agent", ua)
res = urlopen(req)
print(res.closed)  # False; file-like object
with res:  # ensures the response is closed on exit
    # print(res.read())  # read the response body
    print(res.info())    # the response headers
print(res.closed) # True
- Disguising the UA
A UA that never changes is easy to detect, so consider rotating it.
- Online UA tool: https://useragent.buyaocha.com/
# Method 1: pick a UA at random from a hand-maintained list
import random
ua_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49"
]
ua = random.choice(ua_list)
print(ua)
# Method 2: the faker module
from faker import Faker
fake = Faker(locale="zh_CN")
ua = fake.user_agent()
print(ua)
1.3 urllib.request.urlretrieve
from urllib.request import urlretrieve  # download a resource from the network to a local file
urlretrieve(url, img_name)  # download the resource at url into the local file img_name
"""
def urlretrieve(url, filename=None, reporthook=None, data=None):
Retrieve a URL into a temporary location on disk.
"""
# In the Douban example below, download the images straight to disk, replacing the urlopen + open combination
def upload_img2(covers):
    """Method 2: save the cover images locally"""
    from urllib.request import urlretrieve
    for i in range(len(covers)):
        img_url = covers[i]
        img_name = f'{i+1}_{titles[i]}_{img_url.split("/")[-1]}'
        img_full_name = os.path.join(img_path, img_name)
        try:
            urlretrieve(img_url, img_full_name)
        except Exception as e:
            print(f"Error: {e}")
A third way to download a web page
# third way: build an opener that handles cookies
from urllib.request import build_opener, install_opener, urlopen, HTTPCookieProcessor
from http import cookiejar
cookie = cookiejar.CookieJar()
# give urllib the ability to handle cookies
url = "http://www.baidu.com"
opener = build_opener(HTTPCookieProcessor(cookie))
install_opener(opener)
resp = urlopen(url)
with resp:
    with open("baidu.html", "wb") as f:
        f.write(resp.read())
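After the request, the CookieJar holds whatever cookies the server set, and it can be iterated directly (a small follow-up sketch on the same cookie object):
for c in cookie:  # http.cookiejar.Cookie objects
    print(c.name, c.value, c.domain)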
2. urllib.parse
from urllib import parse
# parse handles URL encoding and decoding
u = parse.urlencode("http://www.magedu.com/python")
# TypeError: not a valid non-string sequence or mapping object -- urlencode expects a dict or a sequence of pairs, not a bare string
# Request builds the request object
# urlopen sends the request
urlencode(query, doseq=False, safe='', encoding=None, errors=None,
          quote_via=quote_plus):
# the first argument is a dict or a sequence of two-element tuples (a doseq example follows the dict example below)
d = {
    "id": 1001,
    "name": "百草"
}
u = parse.urlencode(d) # id=1001&name=%E7%99%BE%E8%8D%89
print(u)
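The doseq parameter in the signature above controls how list values are expanded; a small sketch (the tag values are made up):
d3 = {"tag": ["热门", "美剧"]}
print(parse.urlencode(d3, doseq=True))  # tag=%E7%83%AD%E9%97%A8&tag=%E7%BE%8E%E5%89%A7, one key=value pair per list element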
url = "https://www.google.com/search?q=baid&oq=baid&aqs=chrome&sourceid=chrome&ie=UTF-8"
d2 = {
    "id": 1001,
    "q": "百草",
    "url": "https://www.google.com/search"
}
u2 = parse.urlencode(d2)
print(u2) # id=1001&q=%E7%99%BE%E8%8D%89&url=https%3A%2F%2Fwww.google.com%2Fsearch
# the percent-encoded (hexadecimal) form
"https://www.baidu.com/s?wd=%E7%99%BE%E8%8D%89"  # a Baidu search for 百草
# Example
p = parse.urlencode({"wd": "百草"}) # wd=%E7%99%BE%E8%8D%89
url = "https://www.baidu.com/s?{}".format(p)
print(url) # https://www.baidu.com/s?wd=%E7%99%BE%E8%8D%89
print("百草".encode("utf-8")) # b'\xe7\x99\xbe\xe8\x8d\x89'
print(parse.unquote(p))  # wd=百草, i.e. decoded
# In general: encode the string to bytes with the required charset, then prefix each byte's two-digit hex value with a percent sign
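Going the other way, parse can also split a URL and decode its query string back into a dict; a minimal sketch using urlparse and parse_qs:
parts = parse.urlparse("https://www.baidu.com/s?wd=%E7%99%BE%E8%8D%89&ie=UTF-8")
print(parts.netloc)                 # www.baidu.com
print(parts.query)                  # wd=%E7%99%BE%E8%8D%89&ie=UTF-8
print(parse.parse_qs(parts.query))  # {'wd': ['百草'], 'ie': ['UTF-8']}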
3. Examples
The most commonly used HTTP methods: GET and POST
GET: the data is carried in the URL, i.e. in the header part of the HTTP message
POST: the data goes in the body of the HTTP message as key=value pairs, with multiple parameters joined by &
3.1 GET request
# build the URL
base_url = "https://www.baidu.com/s"
p = {
    "wd": "百草"
}
u = parse.urlencode(p)
url = "{}?{}".format(base_url, u) # url=https://www.baidu.com/s?wd=%E7%99%BE%E8%8D%89
# disguise the UA
from faker import Faker
fake = Faker(locale="zh_CN")
ua = fake.user_agent()
from urllib.request import urlopen, Request
# GET request
req = Request(url, headers={"User-agent": ua})
resp = urlopen(req)
with resp:
    # text = resp.read()
    # save to a local file
    with open("1.html", "wb+") as f:
        f.write(resp.read())
        f.flush()  # flush the buffer
Crawl result:
3.2 POST request
# test site: http://httpbin.org/
from urllib.request import Request, urlopen
from urllib.parse import urlencode
from faker import Faker
import simplejson
url = "http://httpbin.org/post"
ua = Faker(locale="zh_CN").user_agent()  # random browser UA, as in section 1.2
req = Request(url)
req.add_header("user-agent", ua)
p = {
    "name": "百草",
    "age": "6"
}
data = urlencode(p)
print(data)  # name=%E7%99%BE%E8%8D%89&age=6
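The data built above is still a str; to actually send the POST it has to be encoded to bytes and passed to urlopen, and httpbin echoes the form fields back as JSON (a minimal sketch continuing the code above):
resp = urlopen(req, data=data.encode("utf-8"))  # passing data makes this a POST request
with resp:
    result = simplejson.loads(resp.read())
    print(result["form"])  # {'age': '6', 'name': '百草'}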
3.3 Handling JSON data
import os
from urllib.request import Request, urlopen
from urllib import parse
# JSON handling
import jsonpath
import simplejson
from simplejson0713.get_files import img_path  # project-local helper: directory to save images into
"""
Example: the Douban site
"""
# helper: build a random UA and send the request
def get_url(url, data=None):
    """
    Send a request
    :param url:
    :return:
    """
    from faker import Faker
    fake = Faker(locale="zh_CN")
    ua = fake.user_agent()
    # build and send the request
    req = Request(url, headers={
        "User-agent": ua
    })
    res = urlopen(req, data=data)
    return res
# url = "https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&page_limit=500&page_start=0"
base_url = "https://movie.douban.com/j/search_subjects"
u = {
    "type": "tv",
    "tag": "热门",
    "page_limit": 10,
    "page_start": 0
}
url = "{}?{}".format(base_url, parse.urlencode(u))
res = get_url(url)
with res:
    text = res.read()
    subject = simplejson.loads(text)
    print(subject)
# with open("douban.html", "wb") as f:
# f.write(text)
# import json
#
# import jsonpath
#
# with open("douban.html", "r", encoding="utf-8") as f:
# text = f.read()
# print(text)
# s = json.dumps(eval(text))
# p = jsonpath.jsonpath(s, "$..title")
# print(p)
covers = jsonpath.jsonpath(subject, "$..cover")  # cover image URLs
print(covers)
titles = jsonpath.jsonpath(subject, "$..title")  # titles
print(titles)
# What is AJAX
# img_url = "https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2875702766.webp"
for i in range(len(covers)):
    img_url = covers[i]
    res = get_url(img_url)
    img_name = f'{i}_{titles[i]}_{img_url.split("/")[-1]}'
    print(img_name)
    with res:
        text = res.read()  # read the file-like object's content
    try:
        with open(os.path.join(img_path, img_name), "wb") as f:
            f.write(text)
    except OSError as e:
        # OSError: [Errno 22] Invalid argument: 'C:\\img\\去他*的世界 第二季_p2573147295.jpg'
        # Windows file names cannot contain \ / : * ? " < > | (a small sanitizing helper is sketched after this loop)
        print(e)
        img_name = f'{i}_{img_url.split("/")[-1]}'
        print("Renamed to:", img_name)
        with open(os.path.join(img_path, img_name), "wb") as f:
            f.write(text)
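Instead of falling back to a name without the title in the except branch, the forbidden characters could be stripped up front; sanitize_name below is a hypothetical helper, not part of the original code:
import re

def sanitize_name(name):
    """Replace characters Windows forbids in file names with '_' (hypothetical helper)."""
    return re.sub(r'[\\/:*?"<>|]', "_", name)

# usage: img_name = sanitize_name(f'{i}_{titles[i]}_{img_url.split("/")[-1]}')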
3.4 Static pages vs. dynamic pages
- Static pages: the content is relatively fixed and no backend database is involved.
- Dynamic pages: built with dynamic techniques such as AJAX, ASP and JSP; parts of the page can be updated without reloading the whole page, and they interact with the server (for example, scrolling down automatically fetches more data from the server and renders it).
4. Ignoring insecure certificates
import ssl
# ignore insecure certificates
context = ssl._create_unverified_context()
res = urlopen(url, context=context)
# without it, an invalid certificate raises: ssl.CertificateError:……
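ssl._create_unverified_context() is a private helper; the same effect can be achieved with the public API by disabling hostname checking and certificate verification (a sketch, using the same url variable as above):
import ssl
from urllib.request import urlopen

context = ssl.create_default_context()
context.check_hostname = False       # must be turned off before setting CERT_NONE
context.verify_mode = ssl.CERT_NONE  # skip certificate verification
res = urlopen(url, context=context)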