美文网首页
Beautiful soup使用

Beautiful soup使用

作者: _Haimei | 来源:发表于2018-10-25 14:25 被阅读64次
    | 解析器 | 使用方法 | 优势 | 劣势 |
    | --- | --- | --- | --- |
    | Python标准库 | BeautifulSoup(markup, "html.parser") | Python的内置标准库、执行速度适中、文档容错能力强 | Python 2.7.3及Python 3.2.2之前的版本文档容错能力差 |
    | lxml HTML解析器 | BeautifulSoup(markup, "lxml") | 速度快、文档容错能力强 | 需要安装C语言库 |
    | lxml XML解析器 | BeautifulSoup(markup, "xml") | 速度快、唯一支持XML的解析器 | 需要安装C语言库 |
    | html5lib | BeautifulSoup(markup, "html5lib") | 最好的容错性、以浏览器的方式解析文档、生成HTML5格式的文档 | 速度慢、需要额外安装html5lib库(纯Python实现,不依赖外部C扩展) |

    提取标签中的字符串

    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup('<p>Hello</p>','lxml')
    
    print(soup.p.string)
    
    Hello
    

    基本用法

    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(html,'lxml')
    
    soup.prettify()
    
    '<html>\n <head>\n  <title>\n   The Dormouse\'s story\n  </title>\n </head>\n <body>\n  <p class="title" name="dromouse">\n   <b>\n    The Dormouse\'s story\n   </b>\n  </p>\n  <p class="story">\n   Once upon a time there were three little sisters; and their names were\n   <a class="sister" href="http://example.com/elsie" id="link1">\n    <!-- Elsie -->\n   </a>\n   ,\n   <a class="sister" href="http://example.com/lacie" id="link2">\n    Lacie\n   </a>\n   and\n   <a class="sister" href="http://example.com/tillie" id="link3">\n    Tillie\n   </a>\n   ;\nand they lived at the bottom of a well.\n  </p>\n  <p class="story">\n   ...\n  </p>\n </body>\n</html>'
    
    soup.title.string
    
    "The Dormouse's story"
    

    节点选择器

    选择元素

    soup.title
    
    <title>The Dormouse's story</title>
    
    type(soup.title)
    
    bs4.element.Tag
    
    soup.title.string
    
    "The Dormouse's story"
    
    soup.head
    
    <head><title>The Dormouse's story</title></head>
    
    soup.p
    
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    

    提取信息

    soup.title.name
    
    'title'
    

    获取属性

    soup.p.attrs
    
    {'class': ['title'], 'name': 'dromouse'}
    
    soup.p.attrs['name']
    
    'dromouse'
    
    soup.p['name']
    
    'dromouse'
    
    soup.p['class']
    
    ['title']
    

    获取内容

    soup.p.string
    
    "The Dormouse's story"
    

    嵌套选择

    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    """
    
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(html,'lxml')
    
    soup.head.title
    
    <title>The Dormouse's story</title>
    
    type(soup.head.title)
    
    bs4.element.Tag
    
    soup.head.title.string
    
    "The Dormouse's story"
    

    关联选择

    子节点和子孙节点

    html = """
    <html>
        <head>
            <title>The Dormouse's story</title>
        </head>
        <body>
            <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>Elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
                and
                <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
    """
    
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(html,'lxml')
    
    soup.p.children
    
    <list_iterator at 0x107d82518>
    
    for i ,child in enumerate(soup.p.children):
        print(i,child)
    
    0 
                Once upon a time there were three little sisters; and their names were
                
    1 <a class="sister" href="http://example.com/elsie" id="link1">
    <span>Elsie</span>
    </a>
    2 
    
    3 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
    4  
                and
                
    5 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
    6 
                and they lived at the bottom of a well.
    
    soup.p.contents
    
    ['\n            Once upon a time there were three little sisters; and their names were\n            ',
     <a class="sister" href="http://example.com/elsie" id="link1">
     <span>Elsie</span>
     </a>,
     '\n',
     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
     ' \n            and\n            ',
     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>,
     '\n            and they lived at the bottom of a well.\n        ']
    
    for i ,child in enumerate(soup.p.descendants):
        print(i,child)
    
    0 
                Once upon a time there were three little sisters; and their names were
                
    1 <a class="sister" href="http://example.com/elsie" id="link1">
    <span>Elsie</span>
    </a>
    2 
    
    3 <span>Elsie</span>
    4 Elsie
    5 
    
    6 
    
    7 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
    8 Lacie
    9  
                and
                
    10 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
    11 Tillie
    12 
                and they lived at the bottom of a well.
    
    soup.p.descendants
    
    <generator object descendants at 0x107d06518>
    

    父节点和祖先节点

    html = """
    <html>
        <head>
            <title>The Dormouse's story</title>
        </head>
        <body>
            <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>Elsie</span>
                </a>
            </p>
            <p class="story">...</p>
    """
    
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(html,'lxml')
    
    soup.a.parent
    
    <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a class="sister" href="http://example.com/elsie" id="link1">
    <span>Elsie</span>
    </a>
    </p>
    
    html = """
    <html>
        <body>
            <p class="story">
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>Elsie</span>
                </a>
            </p>
    """
    
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(html,'lxml')
    
    type(soup.a.parents)
    
    generator
    
    list(enumerate(soup.a.parents))
    
    [(0, <p class="story">
      <a class="sister" href="http://example.com/elsie" id="link1">
      <span>Elsie</span>
      </a>
      </p>), (1, <body>
      <p class="story">
      <a class="sister" href="http://example.com/elsie" id="link1">
      <span>Elsie</span>
      </a>
      </p>
      </body>), (2, <html>
      <body>
      <p class="story">
      <a class="sister" href="http://example.com/elsie" id="link1">
      <span>Elsie</span>
      </a>
      </p>
      </body></html>), (3, <html>
      <body>
      <p class="story">
      <a class="sister" href="http://example.com/elsie" id="link1">
      <span>Elsie</span>
      </a>
      </p>
      </body></html>)]
    

    兄弟节点

    
    html = """
    <html>
        <body>
            <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>Elsie</span>
                </a>
                Hello
                <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
                and
                <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
                and they lived at the bottom of a well.
            </p>
    """
    
    soup = BeautifulSoup(html,'lxml')
    
    soup.a.next_sibling
    
    '\n            Hello\n            '
    
    soup.a.previous_sibling
    
    '\n            Once upon a time there were three little sisters; and their names were\n            '
    
    list(enumerate(soup.a.next_siblings))
    
    [(0, '\n            Hello\n            '),
     (1, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>),
     (2, ' \n            and\n            '),
     (3, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>),
     (4, '\n            and they lived at the bottom of a well.\n        ')]
    
    list(enumerate(soup.a.previous_siblings))
    
    [(0,
      '\n            Once upon a time there were three little sisters; and their names were\n            ')]
    

    next_sibling和previous_sibling分别获取节点的下一个和上一个兄弟节点,next_siblings和previous_siblings则分别返回后面和前面所有兄弟节点的生成器。注意兄弟节点不一定是标签,也可能是文本节点(如上例中的'\n            Hello\n            ')。

    提取信息

    
    html = """
    <html>
        <body>
            <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">Bob</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            </p>
    """
    
    soup = BeautifulSoup(html,'lxml')
    
    type(soup.a.next_sibling)
    
    bs4.element.Tag
    
    soup.a.next_sibling
    
    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
    
    soup.a.next_sibling.string
    
    'Lacie'
    
    type(soup.a.parents)
    
    generator
    
    list(soup.a.parents)[0]
    
    <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a class="sister" href="http://example.com/elsie" id="link1">Bob</a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
    </p>
    
    list(soup.a.parents)[0].attrs['class']
    
    ['story']
    

    如果返回结果是单个节点,那么可以直接调用string、attrs等属性获得其文本和属性;如果返回结果是多个节点的生成器,则可以转为列表后取出某个元素,然后再调用string、attrs等属性获取其对应节点的文本和属性。

    方法选择器

    find_all(name , attrs , recursive , text , **kwargs)

    name

    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(name='ul'))
    print(type(soup.find_all(name='ul')[0]))
    
    [<ul class="list" id="list-1">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    <li class="element">Jay</li>
    </ul>, <ul class="list list-small" id="list-2">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    </ul>]
    <class 'bs4.element.Tag'>
    
    for ul in soup.find_all(name='ul'):
        print(ul.find_all(name='li'))
    
    [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
    [<li class="element">Foo</li>, <li class="element">Bar</li>]
    
    for ul in soup.find_all(name='ul'):
        print(ul.find_all(name='li'))
        for li in ul.find_all(name='li'):
            print(li.string)
    
    [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
    Foo
    Bar
    Jay
    [<li class="element">Foo</li>, <li class="element">Bar</li>]
    Foo
    Bar
    

    attrs

    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(attrs={'id': 'list-1'}))
    print(soup.find_all(attrs={'name': 'elements'}))
    
    [<ul class="list" id="list-1" name="elements">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    <li class="element">Jay</li>
    </ul>]
    [<ul class="list" id="list-1" name="elements">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    <li class="element">Jay</li>
    </ul>]
    
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(id='list-1'))
    print(soup.find_all(class_='element'))
    
    [<ul class="list" id="list-1" name="elements">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    <li class="element">Jay</li>
    </ul>]
    [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
    

    text

    import re
    html='''
    <div class="panel">
        <div class="panel-body">
            <a>Hello, this is a link</a>
            <a>Hello, this is a link, too</a>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(text=re.compile('link')))
    
    ['Hello, this is a link', 'Hello, this is a link, too']
    

    find()

    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find(name='ul'))
    print(type(soup.find(name='ul')))
    print(soup.find(class_='list'))
    
    <ul class="list" id="list-1">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    <li class="element">Jay</li>
    </ul>
    <class 'bs4.element.Tag'>
    <ul class="list" id="list-1">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    <li class="element">Jay</li>
    </ul>
    

    1.find_parents()和find_parent():前者返回所有祖先节点,后者返回直接父节点。
    2.find_next_siblings()和find_next_sibling():前者返回后面所有的兄弟节点,后者返回后面第一个兄弟节点。
    3.find_previous_siblings()和find_previous_sibling():前者返回前面所有的兄弟节点,后者返回前面第一个兄弟节点。
    4.find_all_next()和find_next():前者返回节点后所有符合条件的节点,后者返回第一个符合条件的节点。
    5.find_all_previous()和find_previous():前者返回节点前所有符合条件的节点,后者返回节点前第一个符合条件的节点。

    CSS选择器

    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.select('.panel .panel-heading'))
    print(soup.select('ul li'))
    print(soup.select('#list-2 .element'))
    print(type(soup.select('ul')[0]))
    
    [<div class="panel-heading">
    <h4>Hello</h4>
    </div>]
    [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
    [<li class="element">Foo</li>, <li class="element">Bar</li>]
    <class 'bs4.element.Tag'>
    

    嵌套选择

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        print(ul.select('li'))
    
    [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
    [<li class="element">Foo</li>, <li class="element">Bar</li>]
    

    获取属性

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        print(ul['id'])
        print(ul.attrs['id'])
    
    list-1
    list-1
    list-2
    list-2
    

    获取文本

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    for li in soup.select('li'):
        print('Get Text:', li.get_text())
        print('String:', li.string)
    
    Get Text: Foo
    String: Foo
    Get Text: Bar
    String: Bar
    Get Text: Jay
    String: Jay
    Get Text: Foo
    String: Foo
    Get Text: Bar
    String: Bar
    

    原文:https://cuiqingcai.com/5548.html

    相关文章

      网友评论

          本文标题:Beautiful soup使用

          本文链接:https://www.haomeiwen.com/subject/lhzqtqtx.html