一、常用xpath表达式
#找到class属性值为btn的div标签
//div[@class="btn"]
#找到class属性值为song的div的直系子标签ul下的第二个子标签li下的直系子标签a
//div[@class="song"]/ul/li[2]/a
#找到href属性值为空且class属性值为music的a标签
//a[@href="" and @class="music"]
//div[contains(@class, "ng")]
//div[starts-with(@class, "so")]
# /text() 表示只获取某个标签下的直系文本内容(不包含子标签中的文本)
# //text() 表示获取某个标签下的文本内容以及所有子孙标签下的文本内容
//div[@class="song"]/p[1]/text()
//div[@class="music"]//text()
# 提取div里的所有文字,深层嵌套的全部文字
data = selector.xpath('//div[@id="test3"]')[0]
info = data.xpath('string(.)')
//div[@class="music"]//li[2]/a/@href
二、python使用xpath表达式的步骤
1.下载:pip install lxml
2.导包:from lxml import etree
3.将html文档或者xml文档转换成一个etree对象,然后调用对象中的方法查找指定的节点
3.1 本地文件:tree = etree.parse(文件名)
tree.xpath("xpath表达式")
3.2 网络数据:tree = etree.HTML(网页内容字符串)
tree.xpath("xpath表达式")
三、案例
import requests
from lxml import etree
# Case 1: scrape the titles and bodies of every joke on one listing page.
url = 'http://www.haoduanzi.com/category-10_2.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
}
# A timeout keeps the script from hanging forever on an unresponsive server.
url_content = requests.get(url, headers=headers, timeout=10).text

# Parse the downloaded HTML string into an element tree for XPath queries.
tree = etree.HTML(url_content)

# Titles of all jokes on the current page.
title_list = tree.xpath('//div[@class="log cate10 auth1"]/h3/a/text()')

# One <div> element per joke; each is queried again relative to itself.
ele_div_list = tree.xpath('//div[@class="log cate10 auth1"]')

text_list = []  # accumulates one body string per joke
for ele in ele_div_list:
    # All text nodes inside this joke's content <div>. This is kept in its
    # own variable: reusing `text_list` here would overwrite the accumulator
    # on every iteration and lose the previously collected jokes.
    fragments = ele.xpath('./div[@class="cont"]//text()')
    # Merge the text fragments into a single string (str(list) would give
    # the list's repr, brackets and quotes included, rather than the text).
    text_str = ''.join(fragments)
    text_list.append(text_str)

print(title_list)
print(text_list)
import requests
from lxml import etree
def _first(nodes, default=''):
    """Return the first XPath result in *nodes*, or *default* when it is empty."""
    return nodes[0] if nodes else default


# Case 2: search zhipin.com for a job keyword and visit each result's
# detail page to extract its fields.
job = input('enter a job:')
url = 'https://www.zhipin.com/job_detail/?'
param = {
    'query': job
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}
# Step 1: fetch the search-result page. The timeout prevents an
# unresponsive server from blocking the script indefinitely.
response = requests.get(url=url, params=param, headers=headers, timeout=10)
page_text = response.text

# Step 2: turn the HTML string into an element tree.
tree = etree.HTML(page_text)

# Step 3: select one <li> per job posting with XPath.
li_list = tree.xpath('//div[@class="job-list"]/ul/li')

# Each <li> is an Element, so it can run XPath queries relative to itself.
for li in li_list:
    # "./" restricts the query to the fragment represented by this <li>.
    hrefs = li.xpath("./div/div[1]/h3/a/@href")
    if not hrefs:
        # No link in this entry: skip it instead of raising IndexError.
        continue
    job_url = "https://www.zhipin.com" + hrefs[0]

    # Request the detail page for this posting (a second, separate page).
    secondPage_text = requests.get(url=job_url, headers=headers, timeout=10).text
    # A distinct name avoids rebinding the search-page `tree` mid-loop.
    detail_tree = etree.HTML(secondPage_text)

    # Extract the fields. _first() yields '' when a node is missing, so an
    # unexpected page layout does not abort the whole crawl.
    jobName = _first(detail_tree.xpath('//div[@class="info-primary"]/div[2]/h1/text()'))
    salary = _first(detail_tree.xpath('//div[@class="info-primary"]/div[2]/span/text()')).strip('\n\t')
    detail = _first(detail_tree.xpath('//div[@class="info-primary"]/p//text()'))
    company = _first(detail_tree.xpath('//div[@class="info-company"]/h3/a/text()'))
    jobDesc = _first(detail_tree.xpath('//div[@class="detail-content"]/div[1]/div//text()'))
    # TODO: persist the extracted fields (not implemented in this example).
网友评论