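# XPath: a language for locating information in an XML document; it is used to traverse the document and extract elements and attributes.
# XML was designed for transporting data; it is a markup language whose structure closely resembles HTML.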
"""
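Common XPath syntax
nodename: selects the child elements named nodename of the current node
/: selects starting from the root node
//: selects matching nodes anywhere in the document, regardless of position
.: selects the current node
..: selects the parent of the current node
@: selects an attribute of an element
a/@href: the href attribute of <a> elements
a/text(): the text content of <a> elements
a[@class='123']: <a> elements whose class attribute is '123'
a[@id='123']: <a> elements whose id attribute is '123'
a[@id='123'][last()]: the last <a> element whose id is '123'
a[@id='123'][position()<3]: the first two <a> elements whose id is '123'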
"""
#http://www.budejie.com/audio/1
#http://www.budejie.com/audio/2
import os
import re
from urllib.parse import urlsplit

import requests
from lxml import etree
def load_page_data(url):
    """Fetch a listing page, parse it, and keep following numbered pages.

    Starting from ``url``, each page is requested and its HTML is handed to
    ``parse_page_data``. While the request returns HTTP 200 and the parser
    reports that the page contained entries, the first number found in the
    response URL is incremented by one to build the next page's URL.

    :param url: URL of the first listing page to crawl, e.g.
        ``http://www.budejie.com/audio/1``.
    :return: None
    """
    req_heard = {
        'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    page_number = re.compile(r'\d+')

    # Loop instead of recursing once per page, so a long run of pages cannot
    # exhaust the interpreter's recursion limit.
    while True:
        response = requests.get(url, headers=req_heard)
        if response.status_code != 200:
            return
        print('请求成功')
        # with open('page.html','w') as file:
        #     file.write(response.text)

        if not parse_page_data(response.text):
            # The parser found no entries, so stop paging.
            return

        match = page_number.search(response.url)
        if match is None:
            # No number in the URL, so there is no "next page" to derive.
            return
        next_page = int(match.group()) + 1
        # count=1 rewrites only the number that was parsed just above.
        url = page_number.sub(str(next_page), response.url, count=1)
def parse_page_data(html):
    """Extract the entries of a listing page with XPath and download each one.

    For every ``<li>`` matched on the page a dict is built with the keys
    ``name``, ``publishtime``, ``content``, ``dianzanshu``, ``chapingshu``,
    ``tupian`` and ``url``, and the dict is passed to ``download_audio_list``
    together with its ``url`` value.

    :param html: HTML source of a listing page.
    :return: True when at least one list item was matched, otherwise False.
    """
    # etree.HTML(html) parses the markup and returns the root element.
    html_element = etree.HTML(html)
    if html_element is None:
        # Guard for input the parser could not turn into a tree.
        return False

    autio_list = html_element.xpath('//div[@class="j-r-c"]/div[@class="j-r-list"]/ul/li')
    print(autio_list)
    print(len(autio_list))

    # Output key -> XPath expression evaluated relative to one list item.
    field_paths = (
        ('name', './/a[@class="u-user-name"]/text()'),
        ('publishtime', './/span[@class="u-time f-ib f-fr"]/text()'),
        ('content', './/div[@class="j-r-list-c-desc"]/text()'),
        ('dianzanshu', './/li[@class="j-r-list-tool-l-up"]/span/text()'),
        ('chapingshu', './/li[@class="j-r-list-tool-l-down "]/span/text()'),
        ('tupian', './/div[@class=" j-audio"]/@data-poster'),
        ('url', './/div[@class=" j-audio"]/@data-mp3'),
    )

    for autio in autio_list:
        autio_data = {}
        for key, expression in field_paths:
            matches = autio.xpath(expression)
            # A list item may lack a field; store None rather than letting
            # an unconditional [0] raise IndexError and abort the whole page.
            autio_data[key] = matches[0] if matches else None

        if autio_data['url'] is None:
            # Without an audio URL there is nothing to download for this item.
            continue
        download_audio_list(autio_data['url'], autio_data)

    return len(autio_list) > 0
def download_audio_list(url,audiodata):
    """Download one audio file into ``baisi/`` and record where it was saved.

    On an HTTP 200 response the body is written to ``baisi/<filename>``,
    ``audiodata['localpath']`` is set to that path, and ``audiodata`` is
    passed to ``save_data_to_db``. Any other status code does nothing.

    :param url: address of the audio file to download.
    :param audiodata: dict describing the entry; updated in place with the
        ``'localpath'`` key on success.
    :return: None
    """
    req_heard = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    response = requests.get(url, headers=req_heard)
    if response.status_code != 200:
        return
    print(response.url,'下载成功')

    # Name the file after the last path segment of the URL. The former slice
    # response.url[-17:0] is always empty, which left no file name at all.
    filename = os.path.basename(urlsplit(response.url).path)
    if not filename:
        # URL path ends with '/': fall back to the URL tail, made path-safe.
        filename = response.url[-17:].replace('/', '_')

    # The target directory may not exist yet on a first run.
    os.makedirs('baisi', exist_ok=True)
    localpath = 'baisi/'+filename

    # response.content is bytes, so the file has to be opened in binary mode;
    # writing bytes to a text-mode file raises TypeError.
    with open(localpath,'wb') as file:
        file.write(response.content)

    audiodata['localpath'] = localpath
    save_data_to_db(audiodata)
def save_data_to_db(audio):
    """Handle one finished record; at present it is only printed to stdout.

    :param audio: the record to output.
    :return: None
    """
    record = audio
    print(record)
if __name__ == '__main__':
    # Script entry point: start crawling from page 1 of the audio listing.
    load_page_data('http://www.budejie.com/audio/1')
# 网友评论  (stray "user comments" text copied from a web page; not part of the script)