pyquery
简介:pyquery同样是一个强大的网页解析工具,它提供了和jQuery类似的语法来解析HTML文档,支持CSS选择器,使用非常方便。
安装:
pip install pyquery
pyquery基本用法
1.初始化
1.字符串初始化
from pyquery import PyQuery as pq

# Markup that the PyQuery object below is built from.
html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''

# Initialise from a string: the result is a PyQuery object.
doc = pq(html)
# Calling the object with a CSS selector yields another PyQuery object.
li = doc('li')
print(li)
结果:
- url初始化
from pyquery import PyQuery as pq

# Initialise from a URL by passing the address through the url keyword.
doc = pq(url='http://www.baidu.com')
head = doc('head')
print(head)
- 文件初始化
from pyquery import PyQuery as pq

# Initialise from a local file by passing its path through the filename keyword.
doc = pq(filename='demo.html')
li_nodes = doc('li')
print(li_nodes)
2.基本css样式选择器
from pyquery import PyQuery as pq

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''

doc = pq(html)
# Descendant selector: li elements inside .list inside #container.
nested_li = doc('#container .list li')
print(nested_li)
结果:
3.查找元素
1.子元素
from pyquery import PyQuery as pq

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''

doc = pq(html)
ul_list = doc('ul.list')
print(type(ul_list))
print(ul_list)

# find() searches the descendants of the selection.
descendants = ul_list.find('li')
print(type(descendants))
print(descendants)

# children() only looks at direct children, here filtered by class.
item1_children = ul_list.children('.item-1')
print(item1_children)
find():查找后代元素,必须添加选择器
children():查找子元素,可以不加选择器
2.父元素
from pyquery import PyQuery as pq

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)
list_node = doc('.list')

# parent() returns only the direct parent element.
direct_parent = list_node.parent()
print(type(direct_parent))
print(direct_parent)

# parents() looks through all ancestors; the selector restricts the result.
wrap_ancestors = list_node.parents('.wrap')
print(type(wrap_ancestors))
print(wrap_ancestors)
parent(): 只找直接父元素
parents():查找所有的父元素,可加限制条件css选择器
3.兄弟节点
from pyquery import PyQuery as pq

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)
active_item = doc('.list .item-0.active')

# Without an argument siblings() returns every sibling node.
all_siblings = active_item.siblings()
print(all_siblings)

# With a selector only the siblings matching it are returned.
active_siblings = active_item.siblings('.active')
print(active_siblings)
结果:
siblings():查找兄弟节点,可加限制条件
4.遍历
1.单个元素
from pyquery import PyQuery as pq

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)

# 1. Single element: only one li in this document carries both classes,
# so the selection can be printed directly.
active_li = doc('.item-0.active')
print(active_li)
2.多个元素
# 2. Multiple elements: iterate the selection through items().
li2 = doc('li')
# Named li_items (not iter) so the builtin iter() is not shadowed.
li_items = li2.items()
print(type(li_items))  # generator
for i in li_items:
    print(i)
    print(type(i))  # each item is itself a PyQuery object
结果:
5.获取信息
1.获取属性
from pyquery import PyQuery as pq

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)
result = doc('.list .item-1 a')

# Reading an attribute straight from the selection gives a single string;
# with several matches it is the value of the first one.
href = result.attr('href')
print(type(href))  # str
print(href)  # link2.html

# To read the attribute of every match, go through items().
links = doc('.item-0 a')
for link in links.items():
    print(link.attr('href'))
结果:
2.获取文本
# text() called on the selection itself prints the text of the matches at once.
print(result.text())

# Going through items() prints the text of each matched element separately.
for node in result.items():
    print(node.text())
3.获取HTML
# Show the selection, then the result of html() called on the selection itself.
result2 = doc('.list .item-1')
print(result2)
inner = result2.html()
print(inner)

# Going through items() prints the inner HTML of each matched element.
for node in result2.items():
    print(node.html())
6.DOM操作
- addClass、removeClass
添加和移除class属性
from pyquery import PyQuery as pq

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)
active_li = doc('.item-0.active')
print(active_li)

# Remove the class from the element and show the changed markup.
active_li.removeClass('active')
print(active_li)

# Add the class back and show the markup again.
active_li.addClass('active')
print(active_li)
结果:
- attr、css
设置属性和css样式
from pyquery import PyQuery as pq

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)
active_li = doc('.item-0.active')
print(active_li)

# attr() with two arguments sets an attribute on the element.
active_li.attr('name', 'link')
print(active_li)

# css() with two arguments sets a CSS property on the element.
active_li.css('font-size', '14px')
print(active_li)
结果:
- remove
from pyquery import PyQuery as pq

html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''

doc = pq(html)
wrapper = doc('.wrap')
# Text of the wrapper before the paragraph is removed.
print(wrapper.text())

# Remove the <p> child, then print the wrapper text again.
paragraph = wrapper.find('p')
paragraph.remove()
print(wrapper.text())
结果:
7.伪类选择器
from pyquery import PyQuery as pq

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''

doc = pq(html)

# li that is the first child of its parent.
first_li = doc('li:first-child')
print(first_li)

# li that is the last child of its parent.
last_li = doc('li:last-child')
print(last_li)

# li that is the second child of its parent.
second_li = doc('li:nth-child(2)')
print(second_li)

# :gt(2) uses a zero-based index, so this selects the li elements
# after the third one (index greater than 2).
later_li = doc('li:gt(2)')
print(later_li)

# li at even child positions (2nd, 4th, ...).
even_li = doc('li:nth-child(2n)')
print(even_li)

# li whose text contains the given string.
matching_li = doc('li:contains(second)')
print(matching_li)
print(matching_li.text())
结果:
实战---爬取百度校花(pyquery版)
整体思路:
代码:
import os
import requests
import time
from pyquery import PyQuery as pq
class Baidu(object):
    """Crawl a Baidu Tieba forum and save the images found in its threads.

    The crawl starts at the list page of forum ``name`` at offset ``pn``,
    follows every thread link on that page, downloads each ``img.BDE_Image``
    picture into the ``images`` directory, then advances the offset by
    ``PAGE_STEP`` and repeats until a list page yields no thread links.
    """

    # Seconds to wait for a response; requests has no timeout by default.
    TIMEOUT = 10
    # Amount added to the ``pn`` offset after each list page.
    PAGE_STEP = 50

    def __init__(self, name, pn):
        """Store the forum name and start offset and build the first URL.

        Args:
            name: Forum keyword placed in the ``kw`` query parameter.
            pn: Offset placed in the ``pn`` query parameter of the first page.
        """
        self.name = name
        self.pn = pn
        self.url = self._list_url(pn)
        # User-Agent of an old browser without JavaScript support
        # (presumably so the server answers with plain HTML - verify).
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0) '
        }

    def _list_url(self, pn):
        """Return the list-page URL of this forum at offset *pn*."""
        return f'http://tieba.baidu.com/f?ie=utf-8&kw={self.name}&pn={pn}'

    def get_data(self, url):
        """Send a GET request to *url* and return the response body as bytes."""
        resp = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
        return resp.content

    def list_page(self, data):
        """Parse a list page and compute the URL of the next list page.

        Args:
            data: Raw bytes of the list page.

        Returns:
            A tuple ``(detail_list, next_url)`` where ``detail_list`` is a list
            of ``{'title': ..., 'url': ...}`` dicts, one per thread link, and
            ``next_url`` is the list-page URL at the advanced offset.
        """
        # Keep a copy of the latest list page on disk.
        with open('baidu.html', 'wb') as f:
            f.write(data)

        list_doc = pq(data)
        # Anchor tags of the thread titles on the list page.
        node_list = list_doc('div.threadlist_title.pull_left.j_th_tit a')

        detail_list = []
        for item in node_list.items():
            href = item.attr('href')
            if not href:
                # An anchor without href cannot be followed.
                continue
            detail_list.append({
                'title': item.text(),
                'url': 'http://tieba.baidu.com' + href,
            })

        # Advance the offset and build the next page for the SAME forum; the
        # keyword is no longer hard-coded in the next-page URL.
        self.pn += self.PAGE_STEP
        next_url = self._list_url(self.pn)
        return detail_list, next_url

    def detail_page(self, data):
        """Parse a thread page and return the ``src`` of every posted image.

        Args:
            data: Raw bytes of the thread page.

        Returns:
            List of image URLs; images without a ``src`` attribute are skipped.
        """
        detail_doc = pq(data)
        img_url_list = []
        for img in detail_doc('img.BDE_Image').items():
            src = img.attr('src')
            if src:
                img_url_list.append(src)
        # Show the image addresses found on this page.
        print(img_url_list)
        return img_url_list

    def download(self, image_list):
        """Download every URL in *image_list* into the ``images`` directory.

        Each file is named after the last path segment of its URL.
        """
        os.makedirs('images', exist_ok=True)
        for image_url in image_list:
            base_name = image_url.split('/')[-1]
            if not base_name:
                # URL ends with '/', so there is no file name to write to.
                continue
            file_name = os.path.join('images', base_name)
            image_resp = self.get_data(image_url)
            with open(file_name, 'wb') as f:
                f.write(image_resp)

    def run(self):
        """Crawl list pages one after another until no thread links are found."""
        next_url = self.url
        resp = self.get_data(next_url)
        while resp:
            detail_list, next_url = self.list_page(resp)
            if not detail_list:
                # No thread links on this page: stop instead of requesting
                # ever larger offsets forever.
                break
            for detail in detail_list:
                detail_resp = self.get_data(detail['url'])
                img_url_list = self.detail_page(detail_resp)
                self.download(img_url_list)
            resp = self.get_data(next_url)
if __name__ == '__main__':
    # Script entry point: crawl the forum below, starting at offset 50.
    spider = Baidu('校花吧', 50)
    spider.run()
网友评论