Python爬虫之requests百思不得姐声音爬取

作者: Pickupthesmokes | 来源:发表于2018-12-23 16:24 被阅读0次

python爬虫之requests百思不得其姐声音爬取
Python网络爬虫之爬取百思不得姐视频并保存至文件
python 爬虫练习（一）
Python爬虫实战之爬取链家广州房价_03存储
Python爬虫之爬取不得姐图片
Python爬虫之爬取不得姐段子
Python 爬虫（正则匹配爬取百思不得姐）
Python爬虫之BeautifulSoup库的入门与使用
使用requests，lxml爬取百思不得姐
爬取音悦台任意艺人的mv（最高画质）+批量下载

import requests,re

from lxml.html import etree

def duanzispaider(url):
    """Placeholder entry point; currently does nothing.

    Nothing in this script calls it. The working entry point is
    ``load_page_data``.

    Args:
        url: Ignored.

    Returns:
        None.
    """
    pass

def _next_page_url(url):
    """Return *url* with its trailing page number incremented, or None if it has none."""
    match = re.search(r'(\d+)(/?)$', url)
    if match is None:
        return None
    next_page = int(match.group(1)) + 1
    # Replace only the trailing number so that multi-digit pages (10, 11, ...)
    # and digits elsewhere in the URL are left untouched.
    return url[:match.start()] + str(next_page) + match.group(2)


def load_page_data(url):
    """Fetch a listing page, parse it, and keep following the next page.

    Starting at *url*, each page is downloaded and handed to
    ``parse_page_data``. Crawling stops at the first response whose status
    is not 200, at the first page for which ``parse_page_data`` returns a
    falsy value, or when the URL does not end in a page number.

    Args:
        url: URL of the first listing page; expected to end in a page
            number, e.g. ``http://www.budejie.com/audio/1``.

    Returns:
        None.
    """
    req_headers = {
        'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    # Iterate instead of recursing once per page, so a long listing cannot
    # exhaust the interpreter's recursion limit.
    while url:
        # The timeout keeps a stalled connection from hanging the crawl.
        response = requests.get(url, headers=req_headers, timeout=10)
        if response.status_code != 200:
            break

        if not parse_page_data(response.text):
            break

        # Use the final URL (after any redirects) as the base for the next page.
        url = _next_page_url(response.url)

def _first_or_default(node, path, default=''):
    """Return the first result of evaluating XPath *path* on *node*, or *default* if empty."""
    found = node.xpath(path)
    return found[0] if found else default


def parse_page_data(html):
    """Extract every audio entry from a listing page and download its audio.

    For each list item a dict is built with the keys ``name``, ``content``,
    ``publishtime``, ``zannum``, ``chaping``, ``fengmian`` and ``yinpin``.
    A field whose XPath matches nothing is stored as an empty string. When
    ``yinpin`` (the mp3 URL) is present, the entry is passed to
    ``download_autio_by_url``.

    Args:
        html: HTML source of the listing page.

    Returns:
        True if at least one list item was found on the page, else False.
    """
    html_element = etree.HTML(html)
    # NOTE(review): the parser may yield None for an empty document - treat
    # that as "no items" rather than failing on .xpath().
    if html_element is None:
        return False

    autio_list = html_element.xpath('//div[@class="j-r-c"]/div[@class="j-r-list"]/ul/li')

    for autio in autio_list:
        # The odd spacing inside the class values below is intentional: the
        # XPath equality test must match the attribute value exactly.
        autio_data = {
            'name': _first_or_default(autio, './/a[@class="u-user-name"]/text()'),
            'content': _first_or_default(autio, './/div[@class="j-r-list-c-desc"]/text()'),
            'publishtime': _first_or_default(autio, './/span[@class="u-time  f-ib f-fr"]/text()'),
            'zannum': _first_or_default(autio, './/li[@class="j-r-list-tool-l-up"]/span/text()'),
            'chaping': _first_or_default(autio, './/li[@class="j-r-list-tool-l-down "]/span/text()'),
            'fengmian': _first_or_default(autio, './/div[@class=" j-audio"]/@data-poster'),
            'yinpin': _first_or_default(autio, './/div[@class=" j-audio"]/@data-mp3'),
        }

        # Entries without an audio URL have nothing to download.
        if autio_data['yinpin']:
            download_autio_by_url(autio_data['yinpin'], autio_data)

    # The caller uses this flag to decide whether to request the next page.
    return len(autio_list) > 0

def download_autio_by_url(url,autio_data):
    """Download one audio file into ``duanzi/`` and record where it was saved.

    On a 200 response the body is written to ``duanzi/<filename>`` and that
    path is stored in ``autio_data['lujing']``. Any other status code leaves
    ``autio_data`` untouched.

    Args:
        url: URL of the audio file.
        autio_data: Dict describing the entry; mutated in place.

    Returns:
        None.
    """
    import os
    from urllib.parse import urlsplit

    req_header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    # The timeout keeps a stalled download from hanging the crawl.
    response = requests.get(url,headers=req_header, timeout=30)

    if response.status_code == 200:
        # Name the file after the last path segment of the final URL. Fall
        # back to the URL's tail (with separators neutralised) if the path
        # has no usable segment.
        filename = os.path.basename(urlsplit(response.url).path)
        if not filename:
            filename = response.url[-17:].replace('/', '_')

        print(filename)

        # open() does not create missing directories, so make sure the
        # target directory exists first.
        os.makedirs('duanzi', exist_ok=True)
        path = 'duanzi/'+filename

        with open(path,'wb') as file:
            file.write(response.content)

        autio_data['lujing'] = path

if __name__ == '__main__':
    # Start at page 1 of the audio listing; load_page_data moves on to the
    # following pages by itself.
    url = 'http://www.budejie.com/audio/1'

    load_page_data(url)