美文网首页
json模块、视频提取

json模块、视频提取

作者: Lonelyroots | 来源:发表于2022-07-13 22:50 被阅读0次

1. json模块

05_json模块、抖音视频无水印提取/01_json模块1.py:

"""

    爬虫中数据分类:
        结构化数据:
            json、xml
        非结构化数据:html
            处理方法:正则表达式、xpath、css选择器
    json模块是Python内置模块

"""

"""

    JSON的判断标准:
        1. JSON的key只能是字符串,不允许key为元组;Python的字典则允许任意可哈希对象作key
        2. JSON中的字符串必须使用双引号,不能使用单引号
    
    json.dumps()转JSON格式的字符串
    
    NOT JSON:
        {'a': 'A', 'c': 'C', 'b': 'B', ('2', 1): '元组', 22: '数字'}
    YES JSON:
        {"a": "A", "c": "C", "b": "B", "22": "数字"}

"""

import json

# Sample Python dict used as the serialisation input.
dict2 = dict(name="Tom", age=23)
# Printing the dict shows Python's own representation
# ({'name': 'Tom', 'age': 23}); it uses single quotes, so it is not JSON.
print(dict2)

# json.dumps() encodes a Python object as a JSON-formatted string.
data1 = json.dumps(dict2)
print(data1, type(data1), sep="\n")   # {"name": "Tom", "age": 23} then <class 'str'>

print("------------")

# Whatever the input type (list, int, str), json.dumps() returns a str.
data2, data3, data4 = (json.dumps(value) for value in ([], 2, "3"))
print(data2)                # []
for encoded in (data2, data3, data4):
    print(type(encoded))    # <class 'str'> each time

# Keys are inserted out of alphabetical order so that sort_keys has a
# visible effect.
data = dict(a="A", c="C", b="B")

# Render the same dict with three json.dumps() formatting options:
#   sort_keys=True         -> keys emitted in sorted order
#   indent=3               -> pretty-printed, 3 spaces per nesting level
#   separators=(',', ':')  -> compact form, no space after ',' or ':'
for label, options in (
    ("排序:", {"sort_keys": True}),
    ("缩进:", {"indent": 3}),
    ("紧凑:", {"separators": (',', ':')}),
):
    print(label, json.dumps(data, **options))

# A Python dict accepts a tuple as a key ...
data[('2', 1)] = "元组"
print(data)     # {'a': 'A', 'c': 'C', 'b': 'B', ('2', 1): '元组'}

# ... and an int as well. A list, by contrast, cannot be a dict key:
# data[[1, 2]] = ... would raise TypeError because lists are unhashable.
data[22] = "数字"
print(data)

# skipkeys=True drops keys JSON cannot represent (here the tuple) instead of
# raising TypeError. ensure_ascii=False (second pass) keeps non-ASCII
# characters as-is instead of \uXXXX escapes; True is the default.
for ascii_only in (True, False):
    print(json.dumps(data, skipkeys=True, ensure_ascii=ascii_only))


# Persist the dict to disk as JSON.
# json.dump() encodes `data` and writes it straight to the open file object;
# fp.write(json.dumps(...)) would have much the same effect, whereas
# fp.write(str(data)) would store Python's dict representation, not JSON.
with open("test.json", mode="w", encoding='utf-8') as json_file:
    json.dump(data, json_file, skipkeys=True, ensure_ascii=False)

05_json模块、抖音视频无水印提取/02_json模块2.py:

"""

    总结:
        json.dumps() 将 Python对象编码成 JSON字符串,严格按照JSON格式进行转换:PythonDict -> jsonStr
        json.loads() 将 已编码的 JSON字符串解码为Python对象 jsonStr -> PythonDict
        json.dump() 和 json.load() 需要传入文件对象(由 open() 打开),直接对文件进行写入/读取

"""

import json

# json.loads() only accepts text that is valid JSON (note the double quotes).
dict2 = '{"name":"Tom","age":23}'
# Decode the JSON text back into a Python object, here a dict.
data1 = json.loads(s=dict2)
print(data1)

# Read back the JSON file (expected to be the test.json written by the
# json.dump() example) and decode it in two ways.
with open("test.json", 'r', encoding='utf-8') as fp:
    # Read the text once and reuse it. Calling fp.read() a second time without
    # rewinding returns "" because the cursor is already at end-of-file, so it
    # would only report the type of an empty string.
    content = fp.read()
    print(content)          # e.g. {"a": "A", "c": "C", "b": "B", "22": "数字"}
    print(type(content))    # <class 'str'>

    # json.loads() decodes JSON text into a Python dict. The int key 22 was
    # stored as the string "22", so it is looked up as '22'.
    data2 = json.loads(content)
    print(data2['22'], type(data2))      # 数字 <class 'dict'>

    # json.load() reads from the file object itself, so rewind to the start.
    fp.seek(0)
    print(json.load(fp))      # {'a': 'A', 'c': 'C', 'b': 'B', '22': '数字'}

2. jsonpath模块

05_json模块、抖音视频无水印提取/03_jsonpath.py:

"""

    pip install jsonpath
        用来解析多层嵌套的json数据:jsonpath是一种信息抽取库
        
"""
import jsonpath

# Nested sample document used by the JSONPath queries.
# Note that "price" appears at several depths: top level, store, each book,
# and the bicycle.
data = {
    "price": 666,
    "store": {
        "author": "Lonelyroots",
        "product": "washing_machine",
        "price": 5.5,
        # Four books; only the two "fiction" entries carry an "isbn".
        "book": [
            {
                "category": "reference",
                "author": "Nigel Rees",
                "title": "Sayings of the Century",
                "price": 8.95,
            },
            {
                "category": "reference",
                "author": "Nigel Rees",
                "title": "Sayings of the Century",
                "price": 12.99,
            },
            {
                "category": "fiction",
                "author": "Herman Melville",
                "title": "Moby Dick",
                "isbn": "0-553-21311-3",
                "price": 8.99,
            },
            {
                "category": "fiction",
                "author": "Nigel Rees",
                "title": "Sayings of the Century",
                "isbn": "0-395-19395-8",
                "price": 22.99,
            },
        ],
        "bicycle": {
            "color": "red",
            "price": 19.95,
        },
    },
}

# Plain indexing works when the exact path is known.
print(data['store']['book'][0]['price'])    # 8.95

# Evaluate a series of JSONPath expressions against `data` and print each
# result. `$` is the root, `..` matches at any depth (use it when the nesting
# level is unknown) and `*` is a wildcard. The results quoted below are the
# ones recorded by the original author.
for expression in (
    '$..price',                 # every price at any depth: [666, 5.5, 8.95, 12.99, 8.99, 22.99, 19.95]
    '$.store.book[*].price',    # price of each book in store.book: [8.95, 12.99, 8.99, 22.99]
    '$.price',                  # top-level price only: [666]
    '$.store.*',                # every direct child of store
    '$.store..price',           # every price below store: [5.5, 8.95, 12.99, 8.99, 22.99, 19.95]
    '$..book[3]',               # the book at index 3 (isbn 0-395-19395-8, price 22.99)
    '$..book[(@.length-1)]',    # last book, form 1: index computed from the length
    '$..book[-1:]',             # last book, form 2: slice from the end
    '$..book[:2]',              # the books at indexes 0 and 1 (prices 8.95 and 12.99)
):
    print(jsonpath.jsonpath(data, expression))
print(js

相关文章

网友评论

      本文标题:json模块、视频提取

      本文链接:https://www.haomeiwen.com/subject/gttxirtx.html