python爬虫之xpath使用基本操作

作者: Pickupthesmokes | 来源:发表于2018-12-23 16:36 被阅读0次

python爬虫之xpath使用基本操作
Python爬虫(十三)_案例：使用XPath的爬虫
2019-01-18srcrapy框架xpath和css选择器语
爬虫实战1.3.2 页面解析之Xpath
2019-05-29 图书比价工具
爬虫处理——结构化数据操作
爬虫入门(2)-使用Xpath抓取信息
python爬虫之xpath
爬虫-python-scrapy框架基本命令
Scrapyd部署爬虫

  # xpath: 在 xml 文档中查找信息的语言，用于对 xml 文档中的节点进行遍历并提取属性

  # xml: 设计目的是为了传输数据，结构与 html 特别相似，是一种标记语言
  """
  xpath:常见语法
  nodename:节点名称，选取此节点的所有子节点
  /:从根节点开始查找
  //: 匹配节点不考虑节点的位置。

  .:选取当前节点
  ..:选取当前节点的父节点
  @: 用来取标签的属性
  a/@href 取a标签的href属性
  a/text() 取a标签的文本
  a[@class='123'] 根据class属性寻找标签
  a[@id='123'] 根据id属性寻找标签

  a[@id='123'][last()] 取最后一个id为123的a标签

  a[@id='123'][position()<3] 取前两个id为123的a标签

  """

  #http://www.budejie.com/audio/1
  #http://www.budejie.com/audio/2

  import requests

  from lxml import etree

  import re

  def load_page_data(url):
      """Crawl listing pages one after another, starting at ``url``.

      Each page is fetched and handed to ``parse_page_data``. While that
      function reports that the page contained items, the trailing page
      number in the URL is incremented and the next page is fetched.
      Crawling stops on the first non-200 response, the first page without
      items, or a URL that does not end in a page number.

      :param url: URL of the first listing page; expected to end in a
          numeric page index such as ``.../audio/1``.
      :return: None
      """
      req_heard = {
          'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
      }

      # Raw string avoids the invalid '\d' escape; the look-ahead pins the
      # match to the trailing page number so other digits in the URL
      # (host, port, ...) are never rewritten.
      page_pattern = re.compile(r'\d+(?=/?$)')

      # Iterate instead of recursing: one stack frame per page would hit the
      # interpreter's recursion limit on a long crawl.
      while True:
          # A timeout keeps a stalled server from hanging the crawl forever.
          response = requests.get(url, headers=req_heard, timeout=10)

          if response.status_code != 200:
              return

          print('请求成功')

          if not parse_page_data(response.text):
              # An empty page means we ran past the last page.
              return

          cur_page = page_pattern.search(response.url)

          if cur_page is None:
              # No page number to increment, so there is no "next" page.
              return

          next_page = int(cur_page.group()) + 1

          url = page_pattern.sub(str(next_page), response.url, count=1)

  def parse_page_data(html):
      """Extract every audio entry from one listing page and download it.

      The page is parsed with lxml and each ``<li>`` of the ``j-r-list``
      block is turned into a dict with the keys ``name``, ``publishtime``,
      ``content``, ``dianzanshu``, ``chapingshu``, ``tupian`` and ``url``.
      A field whose node is missing is stored as ``None`` instead of raising
      ``IndexError``, so one irregular entry no longer aborts the page.
      Entries without an audio URL are skipped; all others are passed to
      ``download_audio_list``.

      :param html: HTML source of the listing page.
      :return: True when the page contained at least one list item,
          False otherwise.
      """
      # etree.HTML(html) builds the element tree used for the XPath queries.
      html_element = etree.HTML(html)

      if html_element is None:
          # NOTE(review): some lxml versions appear to return None for an
          # empty document -- treat that as "no items"; confirm against the
          # installed version.
          return False

      autio_list = html_element.xpath('//div[@class="j-r-c"]/div[@class="j-r-list"]/ul/li')

      print(autio_list)

      print(len(autio_list))

      # Output key -> XPath evaluated relative to a single <li> entry.
      field_paths = {
          'name': './/a[@class="u-user-name"]/text()',
          'publishtime': './/span[@class="u-time  f-ib f-fr"]/text()',
          'content': './/div[@class="j-r-list-c-desc"]/text()',
          'dianzanshu': './/li[@class="j-r-list-tool-l-up"]/span/text()',
          'chapingshu': './/li[@class="j-r-list-tool-l-down "]/span/text()',
          'tupian': './/div[@class=" j-audio"]/@data-poster',
          'url': './/div[@class=" j-audio"]/@data-mp3',
      }

      def first_or_none(node, path):
          # First XPath hit, or None when the node is absent.
          found = node.xpath(path)
          return found[0] if found else None

      for autio in autio_list:

          autio_data = {
              key: first_or_none(autio, path)
              for key, path in field_paths.items()
          }

          if autio_data['url'] is None:
              # Nothing to download for this entry.
              continue

          download_audio_list(autio_data['url'],autio_data)

      return bool(autio_list)

  def download_audio_list(url,audiodata):
      """Download one audio file and record where it was stored.

      On a 200 response the body is written in binary mode to
      ``baisi/<filename>``, the path is stored under
      ``audiodata['localpath']`` and the record is handed to
      ``save_data_to_db``. Any other status code is ignored.

      :param url: URL of the audio file.
      :param audiodata: dict describing the entry; mutated in place to add
          the ``localpath`` key.
      :return: None
      """
      # Function-scope import so this block does not depend on the
      # file-level import header.
      import os

      req_heard = {
          'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
      }

      # A timeout keeps a stalled download from hanging the crawl forever.
      response = requests.get(url, headers=req_heard, timeout=10)

      if response.status_code != 200:
          return

      print(response.url,'下载成功')

      # The original slice [-17:0] is always empty. Use the last path
      # segment of the URL (query string dropped) as the file name.
      filename = response.url.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1]

      # Make sure the target directory exists before opening the file.
      os.makedirs('baisi', exist_ok=True)

      local_path = 'baisi/'+filename

      # response.content is bytes, so the file must be opened in binary mode.
      with open(local_path,'wb') as file:

          file.write(response.content)

      audiodata['localpath'] = local_path

      save_data_to_db(audiodata)

  def save_data_to_db(audio):
      """Persistence hook for one scraped record.

      Currently a placeholder: the record is only echoed to standard output.

      :param audio: the record to store.
      :return: None
      """
      # No storage backend is wired up in this block; printing is all it does.
      print(audio)


  if __name__ == '__main__':
      # Script entry point: start crawling from the first audio listing page.
      # Both statements now share one indentation level; the original call
      # was indented one column short, which is an IndentationError.
      start_url = 'http://www.budejie.com/audio/1'

      load_page_data(start_url)