- 案例1
节点选择器
# Example 1: node selectors -- reach tags through attribute access on the soup object.
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story title</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Build the soup object, parsing the markup with the lxml parser
soup = BeautifulSoup(html, 'lxml')
# Node selectors: when several nodes match, only the FIRST matching node is selected!
print(soup.title.string)
print(soup.title)
print(soup.head)
print(soup.p)
print(soup.a)
print(soup.p.string)
# Same call as the first print above (repeated in the original notes)
print(soup.title.string)
# Node/tag name of the selected element
print(soup.title.name)
# Get all attributes of the node
print(soup.p.attrs)  # .attrs returns the attributes as a dict
print(soup.p.attrs['name'])  # note: some values are a string, others a list of strings
# Shorthand: index the tag directly by attribute name
print(soup.p['name'])  # dromouse
print(soup.p['class'])  # ['title']
# Nested selection: a selected node can itself be queried the same way
print(soup.head.title)
print(soup.head.title.string)
- 案例2
CSS选择器
# Example 2: CSS selectors -- query the tree with soup.select(), which returns
# a list of every matching tag.
from bs4 import BeautifulSoup
html = """
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class ="list" id="list-1" name="elements">
<li class ="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
"""
soup = BeautifulSoup(html, 'lxml')
# CSS selectors pick tags by their CSS properties (class, id, tag name, ...).
# Select tags with class "panel-heading" located under a tag with class "panel"
print(soup.select('.panel .panel-heading'))
# When several tags match, a list is returned
print(soup.select('ul li'))
print(soup.select('ul')[0])
# Tags with class "element" inside the tag whose id is "list-2"
print(soup.select('#list-2 .element'))
# Nested selection: each selected tag can be queried with select() again
for ul in soup.select('ul'):
    print(ul.select('li'))
    # Get an attribute: index the tag directly ...
    print(ul['id'])
    # ... or go through the .attrs dict
    print(ul.attrs['id'])
for li in soup.select('li'):
    # Get the text of each node
    print('Get text', li.get_text())
    print('String', li.string)
- 案例3
关联选择
# Example 3: relational selection -- navigate from a node to its children,
# descendants, parents and siblings.
from bs4 import BeautifulSoup
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id= "link1">
<span>Elsie</span>
</a>
I am here
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# Direct children: .contents returns them as a list
print(soup.p.contents)
print('x' * 50)
print(list(enumerate(soup.p.children)))
# .children also yields the direct children, but lazily (an iterator, not a list)
for i, child in enumerate(soup.p.children):
    print(i, child)
# All descendants: .descendants walks the children recursively, yielding every nested node
for i, child in enumerate(soup.p.descendants):
    print(i, child)
# Parent of a node (printed together with its inner content): use .parent
print(soup.span.parent)
# All ancestors: .parents yields them lazily
print('*' * 50)
print(list(enumerate(soup.span.parents)))
# Siblings
# next_sibling / previous_sibling return the next / previous sibling node
# next_siblings / previous_siblings yield all following / all preceding siblings lazily
print("-" * 50)
print('Next sibling:', soup.a.next_sibling)
print('Previous sibling:', soup.a.previous_sibling)
print('Next sibling:', list(enumerate(soup.a.next_siblings)))
print('Previous sibling:', list(enumerate(soup.a.previous_siblings)))
# Extracting information
# For a single node, read .string, .attrs, etc. directly.
# For a lazy result, convert it to a list, pick an element, then read .string / .attrs
# to get that node's text and attributes.
print('++分割线++' * 10)
print(soup.a.next_sibling.string)
print(soup.a.previous_sibling.string)
print(list(soup.a.parents)[0])
print(list(soup.a.parents)[0].attrs['class'])
网友评论