美文网首页python自学
Python例子之《郁莉为你讲古诗》音频离线

Python例子之《郁莉为你讲古诗》音频离线

作者: By_syk | 来源:发表于2017-09-26 14:07 被阅读31次

    https://www.lizhi.fm/1804846/album/28562879269936667

    郁莉为你讲故事 - 荔枝FM

    Python 脚本代码:

    # download audio resource from 荔枝FM
    # author: By_syk <By_syk@163.com>
    # date: 2017-09-26
    
    import os
    import re
    from urllib import request
    
    import math
    
    FOLDER_SAVE = r'E:/Download/LizhiFm/'
    URL_BASE = r'https://www.lizhi.fm'
    
    
    def download_page(url):
        """Fetch *url* and return the response body decoded as UTF-8.

        Args:
            url: Address of the page to download.

        Returns:
            The page content as ``str``.
        """
        # The context manager closes the response even if read/decode raises.
        with request.urlopen(url) as res:
            return res.read().decode('utf-8')
    
    
    def download_audio(folder_save, audio_name, url_audio):
        """Download one audio file into *folder_save* unless it already exists.

        The file name is the audio title with whitespace runs collapsed to
        ``_`` and characters that are illegal in file names replaced by ``_``,
        followed by the suffix returned by get_res_suffix().

        Args:
            folder_save: Destination folder path, including the trailing separator.
            audio_name: Title of the audio, used as the base file name.
            url_audio: Direct URL of the audio resource.
        """
        audio_name = re.sub(r'\s+', ' ', audio_name.strip()).replace(' ', '_')
        # Titles are free text; path separators and other reserved characters
        # would make open() fail or write outside the album folder.
        audio_name = re.sub(r'[\\/:*?"<>|]', '_', audio_name)
        file_audio = folder_save + audio_name + get_res_suffix(url_audio)

        if os.path.exists(file_audio):
            return

        # Read the whole body before creating the file, so a failed download
        # never leaves a partial file that the exists-check would skip later.
        with request.urlopen(url_audio) as res:
            data = res.read()
        with open(file_audio, 'wb') as file:
            file.write(data)
    
    
    def parse_album_name(url_page, html_content):
        """Extract the display name from an album page or a user page.

        The URL decides which heading pattern is tried against the HTML.

        Returns:
            The captured name, or None when the URL is neither kind of page
            or the expected heading is not present in *html_content*.
        """
        heading_rules = (
            (is_album_page, r'<h1 class="radioName">.+?>(.+?)<'),
            (is_user_page, r'<h1 class="user-info-name">FM\d+ (.+?)<'),
        )
        for applies_to, heading_pattern in heading_rules:
            if not applies_to(url_page):
                continue
            found = re.search(heading_pattern, html_content)
            if found:
                return found.group(1)
        return None
    
    
    def parse_all_page_url(url_first, html_content):
        """List the URLs of all pages of an album, starting with *url_first*.

        Links are taken from the first ``<div class="page...">`` block in the
        HTML; each href is prefixed with URL_BASE. Without such a block the
        result is just ``[url_first]``.
        """
        pager = re.search(r'<div class="page.+?</div>', html_content, re.S)
        if pager is None:
            return [url_first]

        page_urls = [url_first]
        for href in re.findall(r'<a href="([^"]+?)">', pager.group()):
            page_urls.append(URL_BASE + href)
        return page_urls
    
    
    def parse_audios(html_content):
        """Collect the audio entries found in a page's HTML.

        Returns:
            A list of ``(title, duration, url)`` string tuples, one per anchor
            carrying ``title``, ``data-duration`` and ``data-url`` attributes
            (in that order) on a single line.
        """
        audio_pattern = re.compile(
            r'<a.+?title="(.+?)".+?data-duration="(\d+)".+?data-url="(.+?)"')
        return [hit.groups() for hit in audio_pattern.finditer(html_content)]
    
    
    def get_res_suffix(url_res):
        """Return the file extension of a resource URL, including the leading dot.

        Only the path component of the URL is inspected, so a query string or
        fragment never ends up in the suffix.

        Args:
            url_res: URL of the resource, e.g. ``http://host/dir/name.mp3``.

        Returns:
            The extension such as ``'.mp3'``, or ``''`` when the path has none.
        """
        # Imported locally: these names are not imported at module level.
        import posixpath
        from urllib.parse import urlsplit

        # URL paths always use '/', hence posixpath rather than os.path.
        return posixpath.splitext(urlsplit(url_res).path)[1]
    
    
    def is_album_page(url_page):
        """Check whether *url_page* has the form ``https://www.lizhi.fm/<digits>/album/<digits>``.

        Returns:
            The regex match object when it does, otherwise None.
        """
        album_url = re.compile(r'^https://www\.lizhi\.fm/\d+/album/\d+$')
        return album_url.match(url_page)
    
    
    def is_user_page(url_page):
        """Check whether *url_page* has the form ``https://www.lizhi.fm/user/<digits>``.

        Returns:
            The regex match object when it does, otherwise None.
        """
        user_url = re.compile(r'^https://www\.lizhi\.fm/user/\d+$')
        return user_url.match(url_page)
    
    
    def readable_sec(sec):
        """Format a duration given in seconds as ``'<H>h<M>m'`` or ``'<M>m'``.

        The hour part is only present from one hour upward; seconds beyond the
        last whole minute are dropped.

        Args:
            sec: Duration in seconds.

        Returns:
            The formatted text, e.g. ``'11h1m'`` or ``'59m'``.
        """
        # >= so that exactly one hour renders as '1h0m' rather than '60m'.
        if sec >= 60 * 60:
            hours, sec = divmod(sec, 60 * 60)
            return str(hours) + 'h' + str(sec // 60) + 'm'
        return str(sec // 60) + 'm'
    
    
    def readable_kb(kb):
        """Format a size given in kilobytes using the largest fitting unit.

        Values are rounded up: to whole KB or MB, and to one decimal for GB.

        Args:
            kb: Size in kilobytes.

        Returns:
            Text such as ``'500KB'``, ``'659MB'`` or ``'1.5GB'``.
        """
        # >= so the exact thresholds switch units ('1MB', not '1024KB').
        if kb >= 1024 * 1024:
            # Round up to one decimal, matching the ceil used for MB and KB.
            # Multiplying before dividing keeps whole tenths exact for int input.
            return '%.1fGB' % (math.ceil(kb * 10 / (1024 * 1024)) / 10)
        if kb >= 1024:
            return str(math.ceil(kb / 1024)) + 'MB'
        return str(math.ceil(kb)) + 'KB'
    
    
    def download_all(url_html_album):
        """Download every audio listed on a lizhi.fm album or user page.

        Steps: fetch the first page, derive the album name, create the save
        folder under FOLDER_SAVE, walk all pagination pages to collect the
        audio entries, de-duplicate them by URL, print a summary, then download
        each audio in turn. Progress is reported with print().

        Args:
            url_html_album: URL of the album or user page to take offline.

        Returns:
            None. Returns early after printing an error when no album name can
            be parsed for the URL.
        """
        first_page = download_page(url_html_album)

        album_name = parse_album_name(url_html_album, first_page)
        if album_name is None:
            print('err. invalid url.')
            return
        print('album name:', album_name)

        folder_save_album = FOLDER_SAVE + album_name + '/'
        os.makedirs(folder_save_album, exist_ok=True)
        print('save folder:', folder_save_album)

        print('fetching all audios...')
        all_audios = []
        for url_page in parse_all_page_url(url_html_album, first_page):
            # The page list starts with the URL we already fetched; reuse that
            # content instead of downloading the same page a second time.
            if url_page == url_html_album:
                page = first_page
            else:
                page = download_page(url_page)
            all_audios += parse_audios(page)

        # Key by audio URL so an entry that shows up on several pages is
        # counted and downloaded only once; the first title seen wins.
        all_audio_dict = {}
        total_audio_duration = 0
        for title, duration, url_audio in all_audios:
            if url_audio in all_audio_dict:
                continue
            all_audio_dict[url_audio] = title
            total_audio_duration += int(duration)
        # The size is only an estimate: 17 KB per second of audio.
        print(len(all_audio_dict), 'audios found, about', readable_sec(total_audio_duration), '/',
              readable_kb(total_audio_duration * 17))

        for index, (url_audio, title) in enumerate(all_audio_dict.items(), 1):
            print('downloading audio %d: %s' % (index, title))
            download_audio(folder_save_album, title, url_audio)

        print('all done')
    
    # Guarded so that importing this module does not start a download.
    if __name__ == '__main__':
        download_all('https://www.lizhi.fm/1804846/album/28562879269936667')
    

    输入输出:

    python3 lizhi_fm_offline.py
    album name: 郁莉为你讲古诗(1)
    save folder: E:/Download/LizhiFm/郁莉为你讲古诗(1)/
    fetching all audios...
    100 audios found, about 11h1m / 659MB
    downloading audio 1: 《月夜》--思念的翅膀==郁莉为你讲古诗
    downloading audio 2: 《迢迢牵牛星》--天上人间==郁莉为你讲古诗
    ...omitted
    all done
    
    全部音频文件

    拓展

    为了方便离线荔枝FM平台的其他音频资源,稍加修改代码作为通用脚本:

    # download_all('https://www.lizhi.fm/1804846/album/28562879269936667')
    if __name__ == '__main__':
        # Prompt for the album/user page URL and pass it on with surrounding whitespace removed.
        download_all(input('url: ').strip())
    

    离线周建龙的《鬼吹灯全集》,启动脚本后输入其链接即可:

    python3 lizhi_fm_offline.py
    url: https://www.lizhi.fm/user/2617184632410917420
    

    本文代码仅作学习交流之用,请勿用于其他用途。

    相关文章

      网友评论

        本文标题:Python例子之《郁莉为你讲古诗》音频离线

        本文链接:https://www.haomeiwen.com/subject/qihsextx.html