美文网首页
python3 urllib 爬虫基本使用

python3 urllib 爬虫基本使用

作者: XiWeidong | 来源:发表于2018-03-13 22:04 被阅读0次

    urllib提供了一系列用于操作URL的功能。
    urllib的request模块可以非常方便地抓取URL内容,也就是发送一个GET请求到指定的页面,然后返回HTTP的响应

    01 简单使用

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    import urllib.request
    
    # Fetch a page with a plain GET request and print the body.
    target_url = 'http://www.baidu.com'
    resp = urllib.request.urlopen(target_url)   # send the GET request
    print(resp.read().decode('utf-8'))          # decode the response body as UTF-8 and print it
    
    
    • read(), readline(), readlines(), fileno(), close():对HTTPResponse类型数据进行操作
    • info():返回HTTPMessage对象,表示远程服务器返回的头信息
    • getcode():返回Http状态码。如果是http请求,200请求成功完成、404网址未找到等等
    • geturl():返回请求的url

    02 GET 方法

     #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    import urllib.request
    import urllib.parse
    
    get_data = {'username': 'aaa', 'password': 'bbb'}          # 此处将GET的数据定义为一个字典
    get_data_encode = urllib.parse.urlencode(get_data)         # 将GET的数据进行编码
    
    request_url = 'http://www.baidu.com'              # 需要请求的URL地址
    request_url += '?' + get_data_encode                       # 追加GET参数到URL后面
    
    # https://www.zhihu.com/#signin?username=aaa&password=bbb
    print(request_url)
    
    # 发起请求
    response = urllib.request.urlopen(request_url)
    print(response.read().decode('utf-8'))         # 打印响应的文本,并进行UTF-8解码
    

    03 GET并获取header信息

    from urllib import request
    
    # Fetch a page and dump the status line, all response headers, and the body.
    with request.urlopen('http://www.baidu.com') as resp:
        body = resp.read()
        print('Status:', resp.status, resp.reason)
        for name, value in resp.getheaders():
            print('%s: %s' % (name, value))
        print('Data:', body.decode('utf-8'))
    

    04 POST 方法

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    import urllib.request
    import urllib.parse
    
    # POST fields as a dict. urlencode them, then encode the result to bytes:
    # passing a str raises "POST data should be bytes or an iterable of bytes.
    # It cannot be of type str."
    form = {'first': 'true', 'pn': 1, 'kd': 'Python'}
    payload = urllib.parse.urlencode(form).encode(encoding='utf-8')
    
    url = 'http://www.lagou.com/jobs/positionAjax.json?'
    
    # urlopen's second argument is the POST body (default None);
    # the third is the timeout in seconds (default socket._GLOBAL_DEFAULT_TIMEOUT).
    resp = urllib.request.urlopen(url, payload, 3)
    print(resp.read().decode('utf-8'))   # print the UTF-8 decoded response body
    
    
    from urllib import request, parse
    
    # Interactive login demo: read credentials, POST them to the weibo
    # mobile sign-in endpoint, and dump the response.
    print('Login to weibo.cn...')
    email = input('Email: ')
    passwd = input('Password: ')
    
    # Form fields as an ordered list of pairs so the encoded order is fixed.
    login_data = parse.urlencode([
        ('username', email),
        ('password', passwd),
        ('entry', 'mweibo'),
        ('client_id', ''),
        ('savestate', '1'),
        ('ec', ''),
        ('pagerefer', 'https://passport.weibo.cn/signin/welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F')
    ])
    
    # Mimic a mobile browser: Origin, User-Agent and Referer headers.
    req = request.Request('https://passport.weibo.cn/sso/login')
    for name, value in (
        ('Origin', 'https://passport.weibo.cn'),
        ('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25'),
        ('Referer', 'https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F'),
    ):
        req.add_header(name, value)
    
    # POST the UTF-8 encoded form and print status, headers and body.
    with request.urlopen(req, data=login_data.encode('utf-8')) as resp:
        print('Status:', resp.status, resp.reason)
        for name, value in resp.getheaders():
            print('%s: %s' % (name, value))
        print('Data:', resp.read().decode('utf-8'))
    

    05 使用Request 设置Headers属性

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    import urllib.request
    import urllib.parse
    
    # Custom request headers: spoof a browser User-Agent and set a Referer.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87',
        'Referer': 'http://www.lagou.com/jobs/positionAjax.json?',
    }
    
    # POST body: urlencode the field dict, then encode to bytes
    # (a plain str raises "POST data should be bytes or an iterable of bytes.
    # It cannot be of type str.")
    form = {'first': 'true', 'pn': 1, 'kd': 'Python'}
    body = urllib.parse.urlencode(form).encode(encoding='utf-8')
    
    url = 'http://www.lagou.com/zhaopin/Python/?labelWords=label'
    
    # Build a Request object so the custom headers are sent with the POST.
    req = urllib.request.Request(url, body, headers)
    
    resp = urllib.request.urlopen(req)
    print(resp.read().decode('utf-8'))   # print the UTF-8 decoded response body
    
    

    06 Proxy(代理)的设置

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    from urllib import request
    
    # Route HTTP traffic through a proxy server.
    url = 'http://www.xmgc360.com/project/test.php'
    proxy_handler = request.ProxyHandler({'http': '119.28.54.102:3389'})
    opener = request.build_opener(proxy_handler)   # build an opener that uses the proxy
    request.install_opener(opener)                 # install it as the global default opener
    
    resp = request.urlopen(url)            # this request now goes through the proxy
    print(resp.read().decode('utf-8'))     # print the UTF-8 decoded response body
    

    07 异常处理

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    from urllib import error, request
    
    # Demonstrate handling of the specific exceptions urlopen raises,
    # instead of a blanket `except Exception` that hides unrelated bugs.
    request_url = 'http://www.lagou.com/jobs/positionAjax.json?'
    proxy = request.ProxyHandler({'http': '127.0.0.1:8989'})     # proxy server (likely unreachable)
    opener = request.build_opener(proxy)                         # build an opener with the proxy
    request.install_opener(opener)                               # install as the global opener
    try:
        response = request.urlopen(request_url)
    except error.HTTPError as e:
        # The server answered with an error status; e.code is the HTTP status code.
        print(e.code, e.reason)
    except error.URLError as e:
        # Network-level failure, e.g. the proxy at 127.0.0.1:8989 refused the connection.
        print(e.reason)
    
    

    08 练习

    http://image.baidu.com/channel/listjson?pn=1&rn=30&tag1=%E6%98%8E%E6%98%9F&tag2=%E5%85%A8%E9%83%A8&ie=utf8

    抓取信息并保存到数据库

    相关文章

      网友评论

          本文标题:python3 urllib 爬虫基本使用

          本文链接:https://www.haomeiwen.com/subject/nlgtqftx.html