Python ☞ day 14

Author: 浪子彦卿L | Published 2018-06-07 11:09

    Python study notes: web crawlers

    The urllib module provides a set of functions for working with URLs.
    Fetching a web page with urllib
    import urllib.request
    
    # Send a request to the given URL and return the server's response (a file-like object)
    response = urllib.request.urlopen("http://www.baidu.com")
    
    # Read the entire response; the data read is assigned to a single variable (a bytes object)
    data = response.read()
    print(data)
    print(type(data))
    
    # Read a single line
    #data = response.readline()
    
    #Read the entire response; the data read is assigned to a list variable, one item per line
    #data = response.readlines()
    '''
    print(data)
    print(type(data))
    print(len(data))
    print(type(data[100].decode("utf-8")))
    '''
    
    
    
    #Write the fetched page to a file
    # with open(r"C:\Users\xlg\Desktop\Python-1704\day18\file\file1.html", "wb") as f:
    #     f.write(data)
    
    
    #response attributes
    #Return the response's header (meta) information
    print(response.info())
    
    #Return the HTTP status code
    print(response.getcode())
    #if response.getcode() == 200 or response.getcode() == 304:
        #process the page content here
    #    pass
    
    #Return the URL that was actually fetched
    print(response.geturl())
    
    Writing the fetched page directly to a file
    import urllib.request
    
    urllib.request.urlretrieve("http://www.baidu.com", filename=r"C:\Users\xlg\Desktop\Python-1704\day18\file\file2.html")
    
    #urlretrieve leaves some cached data behind while it runs
    
    #Clear that cache
    urllib.request.urlcleanup()
    
    Simulating a browser
    import urllib.request
    import random
    
    url = "http://www.baidu.com"
    
    '''
    #Simulated request headers
    headers = {
        "Accept" : "application/json, text/javascript, */*; q=0.01",
        "X-Requested-With" : "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Content-Type" : "application/x-www-form-urlencoded; charset=UTF-8"
    }
    #Build a request object carrying the headers
    req = urllib.request.Request(url,headers=headers)
    #Send the request
    response = urllib.request.urlopen(req)
    data = response.read().decode("utf-8")
    print(data)
    '''
    
    
    agentsList = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    agentStr = random.choice(agentsList)
    req = urllib.request.Request(url)
    #Add a User-Agent header to the request
    req.add_header("User-Agent", agentStr)
    response = urllib.request.urlopen(req)
    print(response.read().decode("utf-8"))
    
    Setting a timeout
    import urllib.request
    
    
    #If the page does not respond in time, the request is judged to have timed out and the page cannot be fetched
    for i in range(1, 100):
        try:
            response = urllib.request.urlopen("http://www.baidu.com", timeout=0.5)
            print(len(response.read().decode("utf-8")))
        except Exception:
            print("Request timed out, moving on to the next request")
    
    HTTP requests
    • Use case: exchanging messages between a client and a server (a brief urllib sketch of these methods follows this list)
      GET: passes information through the URL; the data to send can be appended directly to the URL
      POST: submits data to the server; a popular and relatively secure way to send data
      PUT: asks the server to store a resource, usually specifying where it should be stored
      DELETE: asks the server to delete a resource
      HEAD: requests only the HTTP headers of a resource
      OPTIONS: queries the request methods supported by the current URL
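    Only GET and POST are exercised in the notes below. As a reference, here is a minimal sketch of sending a different verb by passing method= to urllib.request.Request (available since Python 3.3); the endpoint httpbin.org is only an illustrative test server and is not part of the original notes.
    import urllib.request
    
    # Request accepts a method= argument for verbs other than GET/POST
    req = urllib.request.Request("http://httpbin.org/anything", method="HEAD")
    response = urllib.request.urlopen(req)
    
    # A HEAD request returns only headers, so the body is empty
    print(response.getcode())
    print(response.info())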
    GET requests
    '''
    Characteristics: the data is appended to the request path and passed to the server that way
    
    Advantage: fast
    
    Disadvantages: can only carry a small amount of data, and it is not secure
    '''
    
    import urllib.request
    url = "http://www.sunck.wang:8085/sunck"
    response = urllib.request.urlopen(url)
    data = response.read().decode("utf-8")
    print(data)
    print(type(data))
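    The example above requests a fixed URL. Since GET carries its data in the request path, a query string is normally built with urllib.parse.urlencode and appended to the URL. A minimal sketch, using Baidu's search keyword parameter wd as the illustrative example (not part of the original notes):
    import urllib.request
    import urllib.parse
    
    # Encode the parameters and append them to the path; this is how GET passes data
    params = urllib.parse.urlencode({"wd": "python"})
    url = "http://www.baidu.com/s?" + params
    
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0")
    response = urllib.request.urlopen(req)
    print(response.getcode())
    print(len(response.read()))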
    
    Parsing JSON data
    '''
    Concept: a format for saving data
    Use: JSON can be saved as a local .json file, and a JSON string can also be transmitted over the network; JSON is often described as a lightweight data-interchange format
    
    Structure of a JSON document
    {}     represents an object (a dict)
    []     represents an array (a list)
    :      separates a key from its value
    ,      separates two items
    '''
    import json
    
    jsonStr = '{"name":"sunck凯", "age":18, "hobby":["money","power","english"], "parames":{"a":1,"b":2}}'
    #Convert a JSON-formatted string into a Python object
    jsonData = json.loads(jsonStr)
    print(jsonData)
    print(type(jsonData))
    print(jsonData["hobby"])
    
    #Convert a Python object into a JSON-formatted string
    jsonData2 = {"name":"sunck凯", "age":18, "hobby":["money","power","english"], "parames":{"a":1,"b":2}}
    jsonStr2 = json.dumps(jsonData2)
    print(jsonStr2)
    print(type(jsonStr2))
    
    
    #Read a local JSON file
    path1 = r"C:\Users\xlg\Desktop\Python-1704\day18\Json\caidanJson.json"
    with open(path1, "rb") as f:
        data = json.load(f)
        print(data)
        #dict type
        print(type(data))
    
    
    #Write a local JSON file
    path2 = r"C:\Users\xlg\Desktop\Python-1704\day18\Json\test.json"
    jsonData3 = {"name":"sunck凯", "age":18, "hobby":["money","power","english"], "parames":{"a":1,"b":2}}
    with open(path2, "w") as f:
        json.dump(jsonData3, f)
    
    POST requests
    '''
    Characteristics: the parameters are packaged separately and sent in the request body
    
    Advantages: can carry a large amount of data, and is more secure (POST is recommended when modifying data on the server)
    
    Disadvantage: slower
    '''
    import urllib.request
    import urllib.parse
    
    url = "http://www.sunck.wang:8085/form"
    #Collect the data to send into a dict
    #The dict keys come from the target page, usually the name attributes of its input tags
    data = {
        "username":"sunck",
        "passwd":"666"
    }
    #URL-encode the data to send, and remember to encode it to bytes
    postData = urllib.parse.urlencode(data).encode("utf-8")
    #Build the request object carrying the POST data
    req = urllib.request.Request(url, postData)
    #Set a User-Agent and send the request
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36")
    response = urllib.request.urlopen(req)
    print(response.read().decode("utf-8"))
    
    Scraping data returned by a page's dynamic Ajax requests
    import urllib.request
    import ssl
    import json
    
    
    def ajaxCrawler(url):
        headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36"
        }
        req = urllib.request.Request(url, headers=headers)
    
        #Use ssl to create an unverified context (skip certificate verification)
        context = ssl._create_unverified_context()
        response = urllib.request.urlopen(req,context=context)
    
        jsonStr = response.read().decode("utf-8")
        jsonData = json.loads(jsonStr)
    
        return jsonData
    
    '''
    url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=20&limit=20"
    info = ajaxCrawler(url)
    print(info)
    '''
    
    
    for i in range(1, 11):
        url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start="+ str(i * 20)+"&limit=20"
        info = ajaxCrawler(url)
        print(len(info))
    
    Qiushibaike (糗事百科) crawler exercise
    import urllib.request
    import re
    
    
    def jokeCrawler(url):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36"
        }
    
        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
    
        HTML = response.read().decode("utf-8")
    
        pat = r'<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">'
        re_joke = re.compile(pat, re.S)
        divsList = re_joke.findall(HTML)
        #print(divsList)
        #print(len(divsList))
        dic = {}
        for div in divsList:
            #username
            re_u = re.compile(r"<h2>(.*?)</h2>", re.S)
            username = re_u.findall(div)
            username = username[0]
            #joke text
            re_d = re.compile(r'<div class="content">\n<span>(.*?)</span>', re.S)
            duanzi = re_d.findall(div)
            duanzi = duanzi[0]
    
            dic[username] = duanzi
    
        return dic
    
        #with open(r"C:\Users\xlg\Desktop\Python-1704\day18\file\file3.html", "w") as f:
        #    f.write(HTML)
    
    
    url = "https://www.qiushibaike.com/text/page/1/"
    info = jokeCrawler(url)
    for k, v in info.items():
        print(k + " says:\n" + v)
    
    
    
    #https://www.douban.com/group/topic/41562980/?start=0
    
