Python crawler 3: the urllib library

Author: _百草_ | Published 2022-07-14 14:35

1、urllib.request

HTTP requests and responses

1.1 urllib.request.urlopen (making a request)

from urllib import request

url = "https://www.bing.com"
res = request.urlopen(url)  # optionally: request.urlopen(url, timeout=10)
# Signature:
# urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
#             *, cafile=None, capath=None, cadefault=False, context=None)
"""
data is the payload to send: None issues a GET request, anything else a POST request
timeout sets the timeout in seconds; by default it waits until the request completes.
    On timeout: socket.timeout: The read operation timed out
Returns an http.client.HTTPResponse object, a file-like object
"""
print(res)
print(res.closed)  # False; the response is a file-like object
with res:  # use it as a context manager so it is closed afterwards, like a file
    print(type(res))  # <class 'http.client.HTTPResponse'>
    print(res.status)  # 200
    print(res._method)  # GET; _method is a private attribute, not public API
    print(res.read())   # read the response body as bytes
    print(res.info())  # the response headers
    print(res.url)  # https://cn.bing.com/ ; the URL after redirects
    print(res.geturl())  # the URL actually retrieved (same as res.url)
    print(res.getcode())  # 200; the HTTP status code of the response

print(res.closed)  # True

# decode the bytes into str while the response is still open to get the HTML text:
# print(res.read().decode("utf-8"))

Note:
string.encode("utf-8")  # str -> bytes
bytes.decode("utf-8")  # bytes -> str
  • Test site: inspect the request headers the crawler sends
url = "http://httpbin.org/get"
resp = urlopen(url)
with resp:
    print(resp.read())
# --------------------------------------------------------
# Output:
{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.9", 
    "X-Amzn-Trace-Id": "Root=1-62cfee93-2dd9d6045c7e5f4620262d67"
  }, 
  "origin": "202.103.156.155", 
  "url": "http://httpbin.org/get"
}

Note: httpbin.org is a site for testing HTTP requests and responses: cookies, IP, headers, auth and more. It supports GET, POST and other methods, which makes it very handy for web development and testing.
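
For example, a minimal sketch hitting two of its endpoints, /user-agent and /ip:

from urllib.request import urlopen

for path in ("/user-agent", "/ip"):
    with urlopen("http://httpbin.org" + path) as resp:
        print(resp.read().decode("utf-8"))  # each endpoint echoes the matching request info as JSON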

1.2 urllib.request.Request (setting the crawler's User-Agent)

  • Request-header fields to watch:
  1. Referer: where the visit came from (many large sites use the Referer header in their anti-hotlinking strategy; a crawler needs to mimic it)
  2. User-Agent: identifies the visiting browser (set it, or the request may be flagged as a crawler)
  3. Cookie: remember to carry it in the request headers (see the sketch after this note)
    Note: the response headers may contain several Set-Cookie entries, which tell the browser to store the cookies, e.g.
    set-cookie: SSO_USER_TOKEN=***; Path=/; Domain=kong.net; Max-Age=2592000; Expires=Sat, 13 Aug 2022 06:45:57 GMT; Secure; HttpOnly
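
A minimal sketch of carrying a cookie by hand; the cookie value below is a made-up placeholder for one copied from the browser's developer tools:

from urllib.request import Request, urlopen

# hypothetical cookie string; in practice copy the real one from the browser
req = Request("http://httpbin.org/cookies",
              headers={"Cookie": "SSO_USER_TOKEN=xxx"})
with urlopen(req) as resp:
    print(resp.read().decode("utf-8"))  # httpbin echoes back the cookies it received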
"""
urlopen不能构造HTTP的请求。例如User-agent
class OpenerDirector:
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]  # 添加request headers
若网站是反爬虫的,则需要把爬虫伪装成浏览器。即复制浏览器中的UA,用来伪装  
如 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36

class Request:
    def __init__(self, url, data=None, headers={},  # headers是默认空
                 origin_req_host=None, unverifiable=False,
                 method=None)      
"""
from urllib.request import urlopen, Request

url = "http://httpbin.org/get"
ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
req = Request(url, headers={"User-agent": ua})
# or: req.add_header("User-agent", ua)
res = urlopen(req)
print(res.closed)  # False; file-like object
with res:  # closed on leaving the with block
    # print(res.read())   # the response body as bytes
    print(res.info())  # the response headers

print(res.closed)  # True
# Approach 1: pick a UA at random from a hand-made list
import random

ua_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49"
]
ua = random.choice(ua_list)
print(ua)
# Approach 2: the faker module (pip install faker)
from faker import Faker

fake = Faker(locale="zh_CN")
ua = fake.user_agent()
print(ua)

1.3 urllib.request.urlretrieve

from urllib.request import urlretrieve  # download a remote resource to a local file
urlretrieve(url, img_name)   # fetch the resource at url and save it as img_name
"""
def urlretrieve(url, filename=None, reporthook=None, data=None):
    Retrieve a URL into a temporary location on disk.
"""

# In the Douban example, download the images straight to disk, replacing urlopen + open
def upload_img2(covers):
    """Approach 2: save the images locally"""
    from urllib.request import urlretrieve
    for i in range(len(covers)):
        img_url = covers[i]
        # titles and img_path come from the Douban example in section 3.3
        img_name = f'{i+1}_{titles[i]}_{img_url.split("/")[-1]}'
        img_full_name = os.path.join(img_path, img_name)
        try:
            urlretrieve(img_url, img_full_name)
        except Exception as e:
            print(f"Error: {e}")

A third way to download a page

# a third way to download a page: an opener with cookie handling
from urllib.request import build_opener, install_opener, urlopen, HTTPCookieProcessor
from http import cookiejar
cookie = cookiejar.CookieJar()

# give urllib the ability to handle cookies
url = "http://www.baidu.com"
opener = build_opener(HTTPCookieProcessor(cookie))
install_opener(opener)  # from now on, plain urlopen goes through this opener
resp = urlopen(url)
with resp:
    with open("baidu.html", "wb") as f:
        f.write(resp.read())
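
To check that the jar actually captured something, iterate it after the request (continuing the snippet above; the cookie names depend on the site):

for c in cookie:
    print(c.name, "=", c.value)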

2、urllib.parse

from urllib import parse

# parse handles URL encoding and decoding
# (recall: Request builds the request, urlopen sends it)
# passing a plain string raises an error:
# parse.urlencode("http://www.magedu.com/python")
# TypeError: not a valid non-string sequence or mapping object

# Signature:
# urlencode(query, doseq=False, safe='', encoding=None, errors=None,
#           quote_via=quote_plus)
# the first argument is a dict or a sequence of two-element tuples

d = {
    "id": 1001,
    "name": "百草"
}
u = parse.urlencode(d)  # id=1001&name=%E7%99%BE%E8%8D%89
print(u)
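
The first argument can also be a sequence of two-element tuples, and doseq=True expands a list value into repeated keys; a quick sketch:

print(parse.urlencode([("id", 1001), ("name", "百草")]))  # id=1001&name=%E7%99%BE%E8%8D%89
print(parse.urlencode({"tag": ["a", "b"]}, doseq=True))   # tag=a&tag=b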
url = "https://www.google.com/search?q=baid&oq=baid&aqs=chrome&sourceid=chrome&ie=UTF-8"

d2 = {
    "id": 1001,
    "q": "百草",
    "url": "https://www.google.com/search"
}

u2 = parse.urlencode(d2)
print(u2)  # id=1001&q=%E7%99%BE%E8%8D%89&url=https%3A%2F%2Fwww.google.com%2Fsearch
# percent-encoded (hexadecimal) representation
"https://www.baidu.com/s?wd=%E7%99%BE%E8%8D%89"  # a Baidu search for 百草
# Example
p = parse.urlencode({"wd": "百草"})  # wd=%E7%99%BE%E8%8D%89
url = "https://www.baidu.com/s?{}".format(p)
print(url)  # https://www.baidu.com/s?wd=%E7%99%BE%E8%8D%89
print("百草".encode("utf-8"))  # b'\xe7\x99\xbe\xe8\x8d\x89'
print(parse.unquote(p))  # wd=百草, i.e. decoded
# In general the value is encoded to bytes per the required charset, and each
# byte is written as a percent sign followed by its two-digit hex value
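
That rule is easy to verify by hand; parse.quote applies the same encoding to a single value:

manual = "".join("%{:02X}".format(b) for b in "百草".encode("utf-8"))
print(manual)               # %E7%99%BE%E8%8D%89
print(parse.quote("百草"))  # %E7%99%BE%E8%8D%89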

3、Examples

The most common HTTP methods are GET and POST.
GET: the data travels in the URL itself, i.e. in the request line of the HTTP message
POST: the data sits in the body of the HTTP message as key=value pairs, with multiple parameters joined by &

3.1 GET request

# build the URL
from urllib import parse

base_url = "https://www.baidu.com/s"
p = {
    "wd": "百草"
}
u = parse.urlencode(p)
url = "{}?{}".format(base_url, u)  # url=https://www.baidu.com/s?wd=%E7%99%BE%E8%8D%89

# disguise the User-Agent
from faker import Faker
fake = Faker(locale="zh_CN")
ua = fake.user_agent()
from urllib.request import urlopen, Request

# GET request
req = Request(url, headers={"User-agent": ua})
resp = urlopen(req)
with resp:
    # text = resp.read()
    # save the page locally
    with open("1.html", "wb+") as f:
        f.write(resp.read())
        f.flush()  # flush the buffer

[Figure: crawl result]

3.2 POST request

# Test site: http://httpbin.org/
from urllib.request import Request, urlopen
from urllib.parse import urlencode
import simplejson

url = "http://httpbin.org/post"
req = Request(url)
req.add_header("user-agent", ua)  # ua from the faker example above

p = {
    "name": "百草",
    "age": "6"
}
data = urlencode(p)
print(data)  # name=%E7%99%BE%E8%8D%89&age=6
# passing data (as bytes) to urlopen turns the request into a POST
resp = urlopen(req, data=data.encode("utf-8"))
with resp:
    print(simplejson.loads(resp.read())["form"])  # httpbin echoes the form fields back

3.3 Handling JSON data

import os
from urllib.request import Request, urlopen
from urllib import parse
# JSON handling
import jsonpath
import simplejson
from simplejson0713.get_files import img_path  # local helper module; img_path is the directory to save images in

"""
Example: the Douban site
"""


# request helper with a faker-generated UA
def get_url(url, data=None):
    """
    Send a request and return the response
    :param url: the URL to request
    :param data: optional bytes payload; None means a GET request
    :return: the http.client.HTTPResponse
    """
    from faker import Faker
    fake = Faker(locale="zh_CN")
    ua = fake.user_agent()

    # build and send the request
    req = Request(url, headers={
        "User-agent": ua
    })
    res = urlopen(req, data=data)
    return res


# url = "https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&page_limit=500&page_start=0"
base_url = "https://movie.douban.com/j/search_subjects"
u = {
    "type": "tv",
    "tag": "热门",
    "page_limit": 10,
    "page_start": 0
}
url = "{}?{}".format(base_url, parse.urlencode(u))

res = get_url(url)
with res:
    text = res.read()
    subject = simplejson.loads(text)
    print(subject)
    # with open("douban.html", "wb") as f:
    #     f.write(text)
# earlier experiment, kept commented out; note that jsonpath must be given a
# parsed Python object, not a JSON string:
# import json
# import jsonpath
# with open("douban.html", "r", encoding="utf-8") as f:
#     text = f.read()
# subject = json.loads(text)
# p = jsonpath.jsonpath(subject, "$..title")
# print(p)

covers = jsonpath.jsonpath(subject, "$..cover")  # cover image URLs
print(covers)

titles = jsonpath.jsonpath(subject, "$..title")  # titles
print(titles)
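
jsonpath's "$.." is a recursive-descent operator; a minimal sketch on an inline dict shows how "$..title" collects every title regardless of nesting:

demo = {"subjects": [{"title": "A", "cover": "a.webp"},
                     {"title": "B", "cover": "b.webp"}]}
print(jsonpath.jsonpath(demo, "$..title"))  # ['A', 'B']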


# e.g. img_url = "https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2875702766.webp"
for i in range(len(covers)):
    img_url = covers[i]
    res = get_url(img_url)
    img_name = f'{i}_{titles[i]}_{img_url.split("/")[-1]}'
    print(img_name)
    with res:
        text = res.read()  # read the file-like object's content
        try:
            with open(os.path.join(img_path, img_name), "wb") as f:
                f.write(text)
        except OSError as e:
            # OSError: [Errno 22] Invalid argument: 'C:\\img\\去他*的世界 第二季_p2573147295.jpg'
            # Windows file names cannot contain \ / : * ? " < > |
            print(e)
            img_name = f'{i}_{img_url.split("/")[-1]}'  # drop the title to get a safe file name
            print("renamed to:", img_name)
            with open(os.path.join(img_path, img_name), "wb") as f:
                f.write(text)

3.4 Static vs. dynamic pages

  • Static page:
    the content is relatively fixed and requires no backend database
  • Dynamic page:
    built with dynamic techniques such as AJAX, ASP or JSP; parts of the page can update without reloading the whole thing, which involves talking to the server, e.g. scrolling down automatically fetches more data from the server and renders it (see the sketch after this list)
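
The Douban endpoint in 3.3 is exactly such an AJAX interface; a sketch of paging through it by bumping page_start (reusing u, base_url, get_url and simplejson from that example, and assuming the response keeps its "subjects" key):

for start in (0, 10, 20):
    u["page_start"] = start
    page_url = "{}?{}".format(base_url, parse.urlencode(u))
    with get_url(page_url) as res:
        print(start, len(simplejson.loads(res.read())["subjects"]))  # e.g. 10 subjects per page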

4、Ignoring insecure certificates

import ssl
from urllib.request import urlopen

# skip certificate verification (note: _create_unverified_context is a private helper)
context = ssl._create_unverified_context()
res = urlopen(url, context=context)
# without the context, an invalid certificate raises ssl.CertificateError: ……
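
The same can be done through the public API; a sketch that disables verification on a default context (badssl.com hosts deliberately broken test certificates):

import ssl
from urllib.request import urlopen

ctx = ssl.create_default_context()
ctx.check_hostname = False   # must be turned off before setting verify_mode
ctx.verify_mode = ssl.CERT_NONE
res = urlopen("https://self-signed.badssl.com/", context=ctx)
print(res.status)  # 200 despite the self-signed certificate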

