美文网首页
2019-下载喜马拉雅音频

2019-下载喜马拉雅音频

作者: berrycam | 来源:发表于2019-08-01 16:49 被阅读0次
    
    #coding=utf-8
    '''
    author : berrycam
    time : 2019.8.1
    '''
    
    import os
    import requests
    from lxml import etree
    
    
    class DownloadMedia(object):
        '''
        Download the audio tracks of a Ximalaya album.

        The album page is scraped for its title (used as the download folder
        name), its page count and its track links. Each track is then resolved
        through the per-track JSON endpoint and saved as an ``.m4a`` file.
        '''

        # Characters that Windows and/or POSIX file systems reject in a name.
        _INVALID_NAME_CHARS = '\\/:*?"<>|'

        def __init__(self, target, timeout=30):
            '''
            :param target: URL of the album page to download
            :param timeout: seconds to wait on each HTTP request; without a
                            timeout a stalled connection would block forever
            '''
            self.__target_url = target
            self.__timeout = timeout
            self.__headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/73.0.3683.75 Safari/537.36'}

        @staticmethod
        def _safe_name(name, fallback='untitled'):
            '''Return ``name`` made safe for use as a file or folder name.

            Path separators, reserved punctuation and control characters are
            replaced by ``_``; surrounding whitespace and trailing dots are
            dropped. ``fallback`` is returned when nothing usable is left.
            '''
            cleaned = ''.join(
                '_' if (ch in DownloadMedia._INVALID_NAME_CHARS or ord(ch) < 32) else ch
                for ch in (name or ''))
            cleaned = cleaned.strip().rstrip('. ')
            return cleaned or fallback

        def _page_url(self, num):
            '''Return the URL of page ``num`` of the album's track list.'''
            base = self.__target_url
            # Plain concatenation only works when the album URL ends in '/'.
            if not base.endswith('/'):
                base += '/'
            return '{}p{}'.format(base, num)

        @staticmethod
        def _page_count(elements):
            '''Return the number of track-list pages, defaulting to 1.

            The count is read from the second-to-last pager item (presumably
            the last item is the "next page" control -- confirm against the
            live markup). Albums without a pager have a single page.
            '''
            nav_items = elements.xpath('//*[@id="anchor_sound_list"]/div[2]/div/nav/ul/li')
            if len(nav_items) < 2:
                return 1
            labels = nav_items[-2].xpath('./a/span')
            try:
                return max(1, int(labels[0].text))
            except (IndexError, TypeError, ValueError):
                return 1

        def get_url_list(self):
            '''
            Collect the JSON URLs of every track in the album.

            Also creates the download folder, named after the album title.

            :return: tuple ``(media_url_list, folder_path)``
            :raises IndexError: if the album title cannot be found in the page
            '''
            resp_home_page = requests.get(self.__target_url, headers=self.__headers,
                                          timeout=self.__timeout)
            elements = etree.HTML(resp_home_page.text)
            # Create the download folder from the (sanitised) album title.
            title_nodes = elements.xpath(
                '//*[@id="root"]/main/section/div/div[2]/div[1]/div[1]/div[2]/div[2]/h1')
            if not title_nodes or not title_nodes[0].text:
                raise IndexError('album title not found on {}'.format(self.__target_url))
            folder_path = self._safe_name(title_nodes[0].text)
            os.makedirs(folder_path, exist_ok=True)
            # Visit every page of the track list and gather the track ids.
            media_url_list = []
            for num in range(1, self._page_count(elements) + 1):
                response = requests.get(self._page_url(num), headers=self.__headers,
                                        timeout=self.__timeout).text
                page = etree.HTML(response)
                for li in page.xpath('//*[@id="anchor_sound_list"]/div[2]/ul/li'):
                    hrefs = li.xpath('./div[2]/a/@href')
                    if not hrefs:
                        # List item without a track link: nothing to download.
                        continue
                    # The track id is the last path segment of the link.
                    track_id = hrefs[0].strip().rstrip('/').split('/')[-1]
                    if track_id:
                        media_url_list.append(
                            'http://www.ximalaya.com/tracks/{}.json'.format(track_id))
            return media_url_list, folder_path

        def down_media_m4a(self, media_url_list, folder_path):
            '''
            Download every track and store it as ``<title>.m4a``.

            A track that cannot be resolved or fetched is reported on stdout and
            skipped, so one bad entry does not abort the whole album.

            :param media_url_list: track JSON URLs as returned by get_url_list
            :param folder_path: existing directory the files are written to
            '''
            for url in media_url_list:
                resp = requests.get(url, headers=self.__headers, timeout=self.__timeout)
                if resp.status_code != 200:
                    print('error :%d' % resp.status_code)
                    continue
                try:
                    json_dict = resp.json()
                except ValueError:
                    json_dict = None
                if not isinstance(json_dict, dict):
                    print('error : invalid track data from %s' % url)
                    continue
                play_path = json_dict.get('play_path_64')
                if not play_path:
                    print('error : no audio address in %s' % url)
                    continue
                # Fall back to the track id from the URL when the title is missing.
                track_id = url.rsplit('/', 1)[-1].rsplit('.', 1)[0]
                title = self._safe_name(json_dict.get('title'), fallback=track_id or 'untitled')
                file_path = os.path.join(folder_path, title + '.m4a')
                resp_data = requests.get(play_path, headers=self.__headers,
                                         timeout=self.__timeout)
                if resp_data.status_code == 200:
                    with open(file_path, 'wb') as f:
                        f.write(resp_data.content)
                    print(' %s  finish'% file_path)
                else:
                    print('error :%d' % resp_data.status_code)
    
    
    if __name__ == '__main__':
        # Album page to download; replace this URL (or read one from the
        # user) to fetch a different album.
        album_url = 'https://www.ximalaya.com/yinyue/16162468/'
        downloader = DownloadMedia(target=album_url)
        # First collect the track list and output folder, then fetch the audio.
        track_urls, output_dir = downloader.get_url_list()
        downloader.down_media_m4a(track_urls, output_dir)
    

    相关文章

      网友评论

          本文标题:2019-下载喜马拉雅音频

          本文链接:https://www.haomeiwen.com/subject/jddcdctx.html