美文网首页
python爬虫之xpath使用基本操作

python爬虫之xpath使用基本操作

作者: Pickupthesmokes | 来源:发表于2018-12-23 16:36 被阅读0次
      # xpath:在xml中查找信息对xml中的文档信息进行遍历和属性的提取
    
      # xml设计目的是为了传输数据，结构和html特别相似，是一种标记语言
      """
      xpath:常见语法
      nodename:节点名称,选取此节点的所有子节点
      /:从根节点开始查找
      //: 匹配节点不考虑节点的位置。
    
      .:选取当前节点
      ..:选取当前节点的父节点
      @:用来去标签的属性
      a/@href 取a标签的href属性
      a/text() 取a标签文本
      a[@class='123'] 根据class变迁属性寻找标签
      a[@id='123'] 根据id属性寻找标签
    
      a[@id='123'][last()] 取最后一个id为123的a标签
    
      a[@id='123'][position()<2] 取前两个id为123的a标签
    
      """
    
      #http://www.budejie.com/audio/1
      #http://www.budejie.com/audio/2
    
      import requests
    
      from lxml import etree
    
      import re
    
      def load_page_data(url):
          """Fetch one audio listing page, parse it, and recurse to the next page.

          Crawling stops when ``parse_page_data`` reports that the page
          contained no items (falsy status), i.e. we ran past the last page.

          :param url: listing-page URL ending in a page number,
                      e.g. http://www.budejie.com/audio/1
          """
          req_heard = {
              'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
          }

          response = requests.get(url,headers=req_heard)

          if response.status_code == 200:

              print('请求成功')

              # True when the page yielded at least one audio item.
              status = parse_page_data(response.text)

              if status:
                  # Raw string: '\d' is an invalid escape in a plain string
                  # literal (DeprecationWarning / SyntaxWarning on modern Python).
                  pattern = re.compile(r'\d+')

                  # The first run of digits in the URL is the page number.
                  cur_page = re.search(pattern,response.url).group()

                  next_page = int(cur_page)+1

                  # Rebuild the URL with the incremented page number.
                  next_page_url = re.sub(pattern,str(next_page),response.url)

                  load_page_data(next_page_url)
    
      def parse_page_data(html):
          """Extract one audio record per list item using xpath and hand each
          record to ``download_audio_list``.

          :param html: raw HTML text of a listing page
          :return: True if at least one item was found on the page, else False
          """
          # etree.HTML(html) parses the markup into an element tree.
          html_element = etree.HTML(html)

          autio_list = html_element.xpath('//div[@class="j-r-c"]/div[@class="j-r-list"]/ul/li')

          print(autio_list)

          print(len(autio_list))

          for autio in autio_list:

              autio_data = {}

              # Uploader's display name.
              autio_data['name'] = autio.xpath('.//a[@class="u-user-name"]/text()')[0]

              # Publish timestamp (class string contains two spaces in the page source).
              autio_data['publishtime'] = autio.xpath('.//span[@class="u-time  f-ib f-fr"]/text()')[0]

              # Item description text.
              autio_data['content']=autio.xpath('.//div[@class="j-r-list-c-desc"]/text()')[0]

              # Up-vote count.
              autio_data['dianzanshu']=autio.xpath('.//li[@class="j-r-list-tool-l-up"]/span/text()')[0]

              # Down-vote count (trailing space in class is present in the page source).
              autio_data['chapingshu']=autio.xpath('.//li[@class="j-r-list-tool-l-down "]/span/text()')[0]

              # Cover image URL from the data-poster attribute.
              autio_data['tupian']= autio.xpath('.//div[@class=" j-audio"]/@data-poster')[0]

              # Direct mp3 URL from the data-mp3 attribute.
              autio_data['url']=autio.xpath('.//div[@class=" j-audio"]/@data-mp3')[0]

              download_audio_list(autio_data['url'],autio_data)

          # Original had an unreachable print(autio_data) after the return;
          # removed, and the if/else collapsed to a single boolean expression.
          return len(autio_list) > 0
    
      def download_audio_list(url,audiodata):
          """Download one mp3 file, record its local path, and persist the record.

          :param url: direct mp3 URL (taken from the data-mp3 attribute)
          :param audiodata: metadata dict for the item; gains a 'localpath'
                            key when the download succeeds
          """
          req_heard = {
              'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
          }

          response = requests.get(url, headers=req_heard)

          if response.status_code == 200:

              print(response.url,'下载成功')

              # BUG FIX: the original slice [-17:0] always produced an empty
              # string (stop index 0), so every file was written to 'baisi/'.
              # Keep the last 17 characters of the URL as the file name.
              filename = response.url[-17:]

              # BUG FIX: response.content is bytes, so the file must be opened
              # in binary mode; text mode 'w' raised TypeError on write.
              with open('baisi/'+filename,'wb') as file:

                  file.write(response.content)

                  audiodata['localpath']='baisi/'+filename

              save_data_to_db(audiodata)
    
      def save_data_to_db(audio):
          """Persist one audio record.

          Placeholder implementation: echoes the record to stdout so the
          pipeline can be traced; swap in real DB logic here later.
          """
          print(audio)
    
    
      if __name__ == '__main__':
          # Entry point: crawl forward from the first audio listing page.
          # (Fixed the original's inconsistent indentation on the call line,
          # which raised IndentationError.)
          start_url = 'http://www.budejie.com/audio/1'

          load_page_data(start_url)

    相关文章

      网友评论

          本文标题:python爬虫之xpath使用基本操作

          本文链接:https://www.haomeiwen.com/subject/krunkqtx.html