Getting started: let's use lxml to parse some HTML code. A simple example:
# lxml_test.py
# use lxml's etree module
from lxml import etree

# note: the last <li> deliberately has no closing </li> tag
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''

# etree.HTML() turns the string into an Element object,
# which has an xpath() method
html_element = etree.HTML(text)

# serialize the HTML document back to a string
# (encoding="unicode" returns str rather than bytes)
result = etree.tostring(html_element, encoding="unicode")
print(result)
Output:
<html><body>
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</body></html>
lxml automatically repairs the HTML: in this example it not only closed the missing li tag, but also added the surrounding body and html tags.
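To see the repair behaviour in isolation, here is a minimal sketch; the broken fragment and the expected output are illustrative assumptions rather than part of the original example:
# repair_sketch.py -- minimal sketch of lxml's HTML auto-repair
from lxml import etree

broken = '<li>one<li>two'  # unclosed tags, no <html>/<body> (made-up fragment)
print(etree.tostring(etree.HTML(broken), encoding='unicode'))
# prints roughly: <html><body><li>one</li><li>two</li></body></html>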
Reading from a file: besides parsing a string directly, lxml can also read content from a file. Create a hello.html file:
<!-- hello.html -->
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
Use the etree.parse() method to read the file.
# lxml_parse.py
from lxml import etree

# read the external file hello.html, passing an explicit HTMLParser so the
# document is parsed as HTML (and wrapped in <html>/<body> as shown below)
html = etree.parse('./hello.html', etree.HTMLParser())
result = etree.tostring(html, pretty_print=True, encoding='unicode')
print(result)
The output is the same as before:
<html><body>
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</body></html>
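etree.parse() is not limited to file paths; it also accepts file-like objects. A minimal sketch (the StringIO content below is just an illustrative assumption):
# parse_filelike_sketch.py -- sketch: parsing from a file-like object
from io import StringIO
from lxml import etree

doc = etree.parse(StringIO('<div><p>hello</p></div>'))
print(etree.tostring(doc, encoding='unicode'))  # <div><p>hello</p></div>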
XPath examples
1. Get all the <li> tags
# xpath_li.py
from lxml import etree

html = etree.parse('hello.html')
print(type(html))       # show the return type of etree.parse()

result = html.xpath('//li')
print(result)           # the list of <li> Element objects
print(len(result))
print(type(result))
print(type(result[0]))
Output:
<class 'lxml.etree._ElementTree'>
[<Element li at 0x1014e0e18>, <Element li at 0x1014e0ef0>, <Element li at 0x1014e0f38>, <Element li at 0x1014e0f80>, <Element li at 0x1014e0fc8>]
5
<class 'list'>
<class 'lxml.etree._Element'>
2. Get all the class attributes of the <li> tags
# xpath_li.py
from lxml import etree
html = etree.parse('hello.html')
result = html.xpath('//li/@class')
print(result)
Result:
['item-0', 'item-1', 'item-inactive', 'item-1', 'item-0']
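The same attribute values can also be read per element with .get(); a small sketch using the same hello.html:
# attrib_get_sketch.py -- sketch: element-level attribute access
from lxml import etree

html = etree.parse('hello.html')
print([li.get('class') for li in html.xpath('//li')])
# ['item-0', 'item-1', 'item-inactive', 'item-1', 'item-0']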
3. Get the <a> tags under the <li> tags whose href is link1.html
# xpath_li.py
from lxml import etree
html = etree.parse('hello.html')
result = html.xpath('//li/a[@href="link1.html"]')
print(result)
Result:
[<Element a at 0x10ffaae18>]
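For partial matches, XPath's contains() function can be used instead of an exact comparison; a small sketch against the same file:
# xpath_contains_sketch.py -- sketch: partial attribute matching
from lxml import etree

html = etree.parse('hello.html')
print(html.xpath('//li/a[contains(@href, "link")]'))
# matches all five <a> elements, since every href starts with "link"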
4. Get all the <span> tags under the <li> tags
# xpath_li.py
from lxml import etree

html = etree.parse('hello.html')
# result = html.xpath('//li/span')
# Note: the line above would be wrong, because / selects direct children
# only, and <span> is not a direct child of <li> (it sits inside <a>),
# so a double slash is needed:
result = html.xpath('//li//span')
print(result)
Result:
[<Element span at 0x10d698e18>]
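A quick check of the comment above, contrasting the two expressions:
# slash_vs_double_slash_sketch.py -- sketch: / vs //
from lxml import etree

html = etree.parse('hello.html')
print(html.xpath('//li/span'))    # [] -- <span> is not a direct child of <li>
print(html.xpath('//li//span'))   # [<Element span at 0x...>]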
5. Get all the class attributes inside the <a> tags under the <li> tags
# xpath_li.py
from lxml import etree
html = etree.parse('hello.html')
result = html.xpath('//li/a//@class')
print(result)
Result:
['bold']
6. Get the href of the <a> under the last <li>
# xpath_li.py
from lxml import etree

html = etree.parse('hello.html')
result = html.xpath('//li[last()]/a/@href')
# the [last()] predicate selects the last element
print(result)
Result:
['link5.html']
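Other positional predicates work the same way; for example, position() can select the first few elements (a small sketch):
# xpath_position_sketch.py -- sketch: the position() predicate
from lxml import etree

html = etree.parse('hello.html')
print(html.xpath('//li[position()<3]/a/@href'))
# ['link1.html', 'link2.html']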
7. Get the content of the second-to-last element
# xpath_li.py
from lxml import etree

html = etree.parse('hello.html')
result = html.xpath('//li[last()-1]/a')
# the .text attribute gives an element's text content
print(result[0].text)
Result:
fourth item
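The same text can also be pulled out directly inside the XPath expression with text(), which returns strings instead of elements (a small sketch):
# xpath_text_sketch.py -- sketch: text() inside the expression
from lxml import etree

html = etree.parse('hello.html')
print(html.xpath('//li[last()-1]/a/text()'))   # ['fourth item']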
8. Get the tag name of the element whose class value is bold
# xpath_li.py
from lxml import etree

html = etree.parse('hello.html')
result = html.xpath('//*[@class="bold"]')
# the .tag attribute gives the tag name
print(result[0].tag)
Result:
span
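Besides .tag, the matched element also exposes its attributes and text (a small sketch):
# element_attrib_sketch.py -- sketch: .attrib and .text on the match
from lxml import etree

html = etree.parse('hello.html')
elem = html.xpath('//*[@class="bold"]')[0]
print(elem.tag)      # span
print(elem.attrib)   # {'class': 'bold'}
print(elem.text)     # third item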
A full example
from lxml import etree
from fake_useragent import UserAgent
import requests


def qidianSpider(start_url):
    get_novel_list_by_url(start_url)


def get_novel_list_by_url(req_url):
    req_header = {'User-Agent': UserAgent().random}
    response = requests.get(url=req_url, headers=req_header)
    if response.status_code == 200:
        html_element = etree.HTML(response.text)
        noval_lis = html_element.xpath('//ul[@class="all-img-list cf"]/li')
        for noval_li in noval_lis:
            coverImage = noval_li.xpath('./div[@class="book-img-box"]/a/img/@src')[0]
            title = noval_li.xpath('./div[@class="book-mid-info"]/h4/a/text()')[0]
            author = noval_li.xpath('.//a[@class="name"]/text()')[0]
            nub = noval_li.xpath('.//span/text()')[0]
            # the word count is rendered with an obfuscated custom font,
            # e.g. <span class="jIVjQgqX">𘟄𘟆𘟁𘟉𘟈𘟆</span>
            noval_type = noval_li.xpath('.//p/a[2]/text()')[0]
            noval_type_small = noval_li.xpath('.//p/a[3]/text()')[0]
            tit_url = noval_li.xpath('.//h4/a/@href')[0]
            print(coverImage, title, author, noval_type, noval_type_small, nub)
            novel_chapter(tit_url)
            break  # only handle the first novel while testing


def novel_chapter(url):
    req_url = 'https:' + url + '#Catalog'
    print(req_url)
    # fetch the chapter page before parsing it
    response = requests.get(url=req_url, headers={'User-Agent': UserAgent().random})
    html_chapter_element = etree.HTML(response.text)
    chapter_lis = html_chapter_element.xpath('//ul[@class="cf"]/li')[0]
    print(chapter_lis)


if __name__ == '__main__':
    url = 'https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=1'
    qidianSpider(url)