美文网首页
BeautifulSoup包

BeautifulSoup包

作者: SodaCrush | 来源:发表于2021-01-16 14:05 被阅读0次
    1. 案例1
      节点选择器
    from bs4 import BeautifulSoup
    
    html = """
    <html><head><title>The Dormouse's story title</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """

    # Build the parse tree from the sample markup using the lxml parser.
    soup = BeautifulSoup(html, 'lxml')

    # Node selectors (soup.<tag name>) return only the FIRST matching tag,
    # even when the document contains several tags with that name.
    title_tag = soup.title
    first_p = soup.p
    for selected in (
        title_tag.string,
        title_tag,
        soup.head,
        first_p,
        soup.a,
        first_p.string,
        title_tag.string,
    ):
        print(selected)

    # .name is the name of the tag itself.
    print(title_tag.name)

    # .attrs exposes all attributes of the tag as a dictionary.
    p_attrs = first_p.attrs
    print(p_attrs)
    # Note: some attribute values are plain strings, others are lists of strings.
    print(p_attrs['name'])
    # Shorthand: index the tag directly instead of going through .attrs.
    print(first_p['name'])  # dromouse
    print(first_p['class'])  # ['title']

    # Node selectors can be chained to drill down into nested tags.
    head_title = soup.head.title
    print(head_title)
    print(head_title.string)
    
    2. 案例2
      CSS选择器
    from bs4 import BeautifulSoup
    
    
    html = """
    <div class="panel">
    <div class="panel-heading">
    <h4>Hello</h4>
    </div>
    <div class="panel-body">
    <ul class ="list" id="list-1" name="elements">
    <li class ="element">Foo</li>
    <li class="element">Bar</li>
    <li class="element">Jay</li>
    </ul>
    <ul class="list list-small" id="list-2">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    </ul>
    </div>
    </div>
    """

    soup = BeautifulSoup(html, 'lxml')

    # select() picks tags using CSS selector syntax.
    # Here: tags with class "panel-heading" nested inside a tag with class "panel".
    panel_headings = soup.select('.panel .panel-heading')
    print(panel_headings)

    # A selector that matches several tags yields all of them as a list.
    list_items = soup.select('ul li')
    print(list_items)
    first_list = soup.select('ul')[0]
    print(first_list)

    # Tags with class "element" inside the tag whose id is "list-2".
    list2_items = soup.select('#list-2 .element')
    print(list2_items)

    # Nested selection: call select() again on each tag that was matched.
    for ul_tag in soup.select('ul'):
        nested_items = ul_tag.select('li')
        print(nested_items)
        # Two equivalent ways to read an attribute: indexing the tag, or .attrs.
        print(ul_tag['id'])
        print(ul_tag.attrs['id'])

    # Two ways to read the text of each matched tag.
    for li_tag in soup.select('li'):
        item_text = li_tag.get_text()
        item_string = li_tag.string
        print('Get text', item_text)
        print('String', item_string)
    
    3. 关联选择
    from bs4 import BeautifulSoup
    
    html = """
    <html>
    <head>
    <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="story">
        Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id= "link1">
    <span>Elsie</span>
    </a>
    I am here
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
    and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    and they lived at the bottom of a well.
    </p>
    <p class="story">...</p>
    """

    soup = BeautifulSoup(html, 'lxml')
    # .contents returns the direct children of a tag as a list.
    print(soup.p.contents)
    print('x' * 50)
    print(list(enumerate(soup.p.children)))
    # .children exposes the same direct children as an iterator, so it must be
    # looped over (or wrapped in list()) to see the individual nodes.
    for i, child in enumerate(soup.p.children):
        print(i, child)


    # .descendants walks the subtree recursively and yields every descendant
    # node (children, grandchildren, ...), not just the direct children.
    for i, child in enumerate(soup.p.descendants):
        print(i, child)

    # .parent is the direct parent of a node, printed here with its content.
    print(soup.span.parent)
    # .parents iterates over every ancestor of the node.
    print('*' * 50)
    print(list(enumerate(soup.span.parents)))


    # Sibling navigation:
    # next_sibling / previous_sibling give the single node right after / before.
    # next_siblings / previous_siblings iterate over ALL following / preceding
    # siblings respectively.
    print("-" * 50)
    print('Next sibling:', soup.a.next_sibling)
    print('Previous sibling:', soup.a.previous_sibling)
    # Plural labels so these lines are distinguishable from the two above.
    print('Next siblings:', list(enumerate(soup.a.next_siblings)))
    print('Previous siblings:', list(enumerate(soup.a.previous_siblings)))

    # Extracting information:
    # For a single node, read .string, .attrs, etc. directly.
    # For an iterator, convert it to a list first, pick an element, and then
    # read .string / .attrs on that element.
    print('++分割线++' * 10)
    print(soup.a.next_sibling.string)
    print(soup.a.previous_sibling.string)
    # Materialise the ancestor chain once and reuse it for both lookups.
    a_ancestors = list(soup.a.parents)
    print(a_ancestors[0])
    print(a_ancestors[0].attrs['class'])
    

    相关文章

      网友评论

          本文标题:BeautifulSoup包

          本文链接:https://www.haomeiwen.com/subject/sfafgftx.html