from lxml import etree
from bs4 import BeautifulSoup
import re
# Sample HTML document that every extraction demo in this file parses.
# Layout: one outer <div price="99.8"> wrapping four inner <div> blocks
# (a plain <ul>, the id="testid" block, an <h3> block, and the id="go" block).
html = """
<!DOCTYPE html>
<html>
<head>
<title>xpath test</title>
</head>
<body>
<div price="99.8">
<div>
<ul>
<li>时间</li>
<li>地点</li>
<li>任务</li>
</ul>
</div>
<div id='testid' data-h="first">
<h2>这里是个小标题</h2>
<ol>
<li data="one">1</li>
<li data="two">2</li>
<li data="three">3</li>
</ol>
<ul>
<li code="84">84</li>
<li code="104">104</li>
<li code="223">223</li>
</ul>
</div>
<div>
<h3>这里是H3的内容
<a href="http://www.baidu.com">百度一下</a>
<ul>
<li>test1</li>
<li>test2</li>
</ul>
</h3>
</div>
<div id="go">
<ul>
<li>1</li>
<li>2</li>
<li>3</li>
<li>4</li>
<li>5</li>
<li>6</li>
<li>7</li>
<li>8</li>
<li>9</li>
<li>10</li>
</ul>
</div>
</div>
</body>
</html>
"""
def title():
    """Print the page <title> text, extracted in three different ways.

    Runs against the module-level ``html`` sample using lxml XPath
    (absolute and relative paths), a BeautifulSoup CSS selector, and
    regular expressions (with and without a precompiled pattern).
    Results are printed; nothing is returned.
    """
    # 1) XPath. text() is needed to pull out the text node, and xpath()
    #    always returns a list.
    tree = etree.HTML(html)
    absolute_hits = tree.xpath('/html/head/title/text()')
    print('用xpath绝对路径方法提取title:', absolute_hits)
    # A leading "//" searches the whole document instead of starting at the
    # root, so this yields the same result as the absolute path above.
    relative_hits = tree.xpath('//head/title/text()')
    print('用xpath相对路径方法提取title:', relative_hits)

    # 2) BeautifulSoup. select() takes a CSS selector (bare tag name,
    #    ".class", "#id", combinable) and also returns a list, so take the
    #    first match before calling get_text().
    document = BeautifulSoup(html, 'lxml')
    title_tags = document.select('title')
    print('用BeautifulSoup方法提取title:', title_tags[0].get_text())

    # 3) Regular expressions. The lazy group (.*?) is the captured text and
    #    re.S lets "." match across newlines.
    title_pattern = re.compile(r'<title>(.*?)</title>', re.S)
    print('用正则表达式方法提取title:', title_pattern.findall(html))
    # Same search without an explicit re.compile() step.
    print(
        '用正则表达式跳过re.compile提取title:',
        re.findall(r'<title>(.*?)</title>', html),
    )
def price():
    """Print the ``price`` attribute of the outer <div>, extracted three ways.

    Runs against the module-level ``html`` sample using lxml XPath,
    BeautifulSoup attribute access, and a compiled regular expression.
    Results are printed; nothing is returned.
    """
    # 1) XPath. Starting from the div with id "testid", ancestor-or-self::div
    #    selects that div plus every enclosing div; @price then keeps only
    #    the values of those that carry the attribute.
    #    Alternatives that work on this document:
    #      '/html/body/div/@price'
    #      '/html/body/child::*/@price'    (child::* = all child elements)
    #      '/html/body/child::div/@price'  (child::div = child div elements)
    #      '//@price'                      (only one price attribute exists)
    #      "//div[@id='testid']/ancestor::div/@price"
    tree = etree.HTML(html)
    price_xpath = tree.xpath("//div[@id='testid']/ancestor-or-self::div/@price")
    print('用xpath方法提取price:', price_xpath)

    # 2) BeautifulSoup. soup.div is the first <div> in the document, which is
    #    the one holding the price attribute. soup.find('div') and
    #    soup.select('div')[0] reach the same tag.
    soup = BeautifulSoup(html, 'lxml')
    price_soup = soup.div.attrs['price']
    print('用BeautifulSoup方法提取price:', price_soup)

    # 3) Regular expression with a precompiled pattern. The label uses the
    #    same wording title() prints for its compiled-pattern result; the
    #    previous text claimed re.compile was skipped, which it is not.
    price_pattern = re.compile(r'<div price="(.*?)">', re.S)
    price_re = price_pattern.findall(html)
    print('用正则表达式方法提取price:', price_re)
# Extract the text of the li items under the ul of the first div.
def ul_li():
    """Print the <li> texts of the first inner <div>'s <ul>, three ways.

    Runs against the module-level ``html`` sample using lxml XPath,
    BeautifulSoup text extraction, and a compiled regular expression.
    Results are printed; nothing is returned.
    """
    # 1) XPath. preceding:: selects every node that comes before the
    #    id="testid" div in document order, so only the first list's <li>
    #    elements match. Alternatives that work on this document:
    #      '//div/div[1]/ul/child::*/text()'
    #      '//div/div[1]/ul/li/text()'
    #      "//div[@id='testid']/preceding::div/ul/li/text()"
    tree = etree.HTML(html)
    items_xpath = tree.xpath("//div[@id='testid']/preceding::li/text()")
    print('用xpath方法提取ul标签下的li的内容:', items_xpath)

    # 2) BeautifulSoup. Take all text of the first nested div, trim the
    #    surrounding whitespace, then split on newlines (one <li> per line in
    #    the sample). Another option:
    #      [li.get_text() for li in soup.select('ul')[0].select('li')]
    soup = BeautifulSoup(html, 'lxml')
    block_text = soup.div.div.get_text()
    items_soup = block_text.strip().split('\n')
    print('用BeautifulSoup方法提取ul_li:', items_soup)

    # 3) Regular expression with a precompiled pattern. re.S lets ".*?" span
    #    the newlines between tags. The dot in "99.8" is escaped so it only
    #    matches a literal ".". The label uses the same wording title()
    #    prints for its compiled-pattern result.
    items_pattern = re.compile(
        r'<div price="99\.8">.*?<div>.*?<ul>'
        r'.*?<li>(.*?)</li>.*?<li>(.*?)</li>.*?<li>(.*?)</li>',
        re.S,
    )
    items_re = items_pattern.findall(html)
    print('用正则表达式方法提取ul_li:', items_re)
def first_id():
    """Print the ``id`` of the second inner <div>, extracted three ways.

    Runs against the module-level ``html`` sample using lxml XPath,
    BeautifulSoup CSS selection, and a compiled regular expression.
    Results are printed; nothing is returned.
    """
    # 1) XPath. div[2] is the second <div> child of a <div>; @id reads its
    #    id attribute.
    tree = etree.HTML(html)
    id_xpath = tree.xpath('//div/div[2]/@id')
    print('用xpath方法提取first_id的内容:', id_xpath)

    # 2) BeautifulSoup. select('div') lists every div in document order:
    #    index 0 is the outer div, 1 the first inner div, 2 the target.
    soup = BeautifulSoup(html, 'lxml')
    id_soup = soup.select('div')[2].attrs['id']
    print('用BeautifulSoup方法提取first_id:', id_soup)

    # 3) Regular expression with a precompiled pattern, anchored on the
    #    data-h="first" attribute that follows the id. The label uses the
    #    same wording title() prints for its compiled-pattern result.
    id_pattern = re.compile(r"<div id='(.*?)' data-h=\"first\">", re.S)
    id_re = id_pattern.findall(html)
    print('用正则表达式方法提取first_id:', id_re)
def h2():
    """Print the text of the <h2> heading, extracted three ways.

    Runs against the module-level ``html`` sample using lxml XPath,
    BeautifulSoup tag navigation, and a compiled regular expression.
    Results are printed; nothing is returned.
    """
    # 1) XPath. The <h2> is a child of the second <div> nested in a <div>.
    tree = etree.HTML(html)
    heading_xpath = tree.xpath('//div/div[2]/h2/text()')
    print('用xpath方法提取h2的内容:', heading_xpath)

    # 2) BeautifulSoup. soup.div.h2 is the first <h2> found under the first
    #    <div>; soup.select('h2')[0].get_text() gives the same text.
    soup = BeautifulSoup(html, 'lxml')
    heading_soup = soup.div.h2.get_text()
    print('用BeautifulSoup方法提取h2:', heading_soup)

    # 3) Regular expression with a precompiled pattern. The label uses the
    #    same wording title() prints for its compiled-pattern result.
    heading_pattern = re.compile(r'<h2>(.*?)</h2>', re.S)
    heading_re = heading_pattern.findall(html)
    print('用正则表达式方法提取h2:', heading_re)
def main():
    """Run every extraction demo in order: title, price, list items, id, h2."""
    demos = (title, price, ul_li, first_id, h2)
    for run_demo in demos:
        run_demo()
# Run the demos only when the file is executed as a script, not on import.
# The guard must compare the special __name__ variable with '__main__';
# a bare `name` is undefined and raises NameError.
if __name__ == '__main__':
    main()
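# NOTE: "网友评论" (reader comments) is leftover text from the web page this
# script was copied from; it is not part of the program.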