美文网首页
day04 多线程和多进程自动爬取虾米音乐并存储到本地的mp3

day04 多线程和多进程自动爬取虾米音乐并存储到本地的mp3

作者: LittleBear_6c91 | 来源:发表于2019-04-12 20:41 被阅读0次
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver import ActionChains
    import time
    from lxml import etree
    from kaisha import str2url
    import requests
    import threading # 多线程
    from multiprocessing import Process  #多进程
    
    # Configure the Chrome browser options.
    chrome_options = webdriver.ChromeOptions()
    # Uncomment the next line to run Chrome headless (without a visible window).
    # chrome_options.add_argument('--headless')
    # NOTE(review): Chrome is launched here at import time. With multiprocessing's
    # "spawn" start method, child processes re-import the main module, which would
    # presumably run this line again in every worker -- confirm on the target platform.
    browser = webdriver.Chrome(options=chrome_options)
    
    # Explicit-wait helper bound to the browser (second argument is the timeout, 5);
    # none of the functions visible in this file reference it.
    wait = WebDriverWait(browser, 5)
    
    def get_page():
        """Open the Xiami chart page in the shared browser and return its HTML.

        Returns:
            The browser's page source, read after a fixed two-second pause
            that follows the navigation.
        """
        browser.get('https://www.xiami.com/chart')
        # Fixed pause before reading the source, presumably to let the page render.
        time.sleep(2)
        return browser.page_source
    
    def save_mp3_with_threads(data_title, data_mp3):
        """Save every song to disk using one thread per song.

        Args:
            data_title: Song titles; one thread is started per entry.
            data_mp3: Values index-aligned with ``data_title``; each one is
                converted with ``str2url`` and the result is handed to
                ``save_mp3`` as the download URL.

        The threads are started but not joined here.
        """
        for index, song_title in enumerate(data_title):
            download_url = str2url(data_mp3[index])
            worker = threading.Thread(target=save_mp3, args=(download_url, song_title))
            worker.start()
    
    def save_mp3_with_process(data_title, data_mp3):
        """Save every song to disk using one child process per song.

        Args:
            data_title: Song titles; one process is started per entry.
            data_mp3: Values index-aligned with ``data_title``; each one is
                converted with ``str2url`` and the result is handed to
                ``save_mp3`` as the download URL.

        The processes are started but not joined here.
        """
        for position, song_title in enumerate(data_title):
            download_url = str2url(data_mp3[position])
            Process(target=save_mp3, args=(download_url, song_title)).start()
    
    
    def parse_page(html):
        """Pull song titles and mp3 values out of the chart HTML and save them.

        Args:
            html: HTML text of the chart page.

        Reads the ``data-title`` and ``data-mp3`` attributes of every
        ``<tr class="songwrapper">`` row and passes both lists on to
        ``save_mp3_with_process``.
        """
        document = etree.HTML(html)
        titles = document.xpath('//tr[@class="songwrapper"]/@data-title')
        mp3_values = document.xpath('//tr[@class="songwrapper"]/@data-mp3')

        # One process per song. save_mp3_with_threads takes the same arguments
        # and can be called here instead to use threads.
        save_mp3_with_process(titles, mp3_values)
    
    def save_mp3(mp3_url, title):
        """Download one mp3 and write it to ``./mp3/<title>.mp3``.

        Args:
            mp3_url: URL of the mp3 file to fetch.
            title: Song title, used as the file name. Path separators in it
                are replaced with ``_`` so the file always lands directly
                inside ``./mp3``.

        Nothing is written when the response status is not 200.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        import os  # local import: os is not among the imports visible in this file

        print(title)
        # A timeout keeps a stalled server from hanging this worker forever.
        response = requests.get(mp3_url, timeout=30)
        if response.status_code != 200:
            return

        # A title such as "AC/DC" would otherwise be treated as a nested path.
        safe_title = title.replace('/', '_').replace('\\', '_')
        # Create the target directory on first use; exist_ok tolerates several
        # workers reaching this line at the same time.
        os.makedirs('./mp3', exist_ok=True)
        with open('./mp3/%s.mp3' % safe_title, 'wb') as f:
            f.write(response.content)
    
    def main():
        """Fetch the chart page and pass its HTML to ``parse_page``."""
        parse_page(get_page())
    
    # Run the scraper only when this file is executed as a script. The guard also
    # matters for multiprocessing: its documentation requires the entry point to be
    # protected this way so that child processes which import the main module
    # (e.g. under the "spawn" start method) do not call main() again.
    if __name__ == '__main__':
        main()
    

    相关文章

      网友评论

          本文标题:day04 多线程和多进程自动爬取虾米音乐并存储到本地的mp3

          本文链接:https://www.haomeiwen.com/subject/ombhwqtx.html