Scraping Audio from a Website

By 流星一剑 | Published 2018-03-13 10:07

Goal: scrape audio from Ximalaya, save it locally under D:\temp_ximalaya_audio, and organize the files into folders by program (channel) and episode.

Note: the analysis approach follows https://www.jianshu.com/p/fc2e83c6583c.
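
The whole script below hinges on one observation from that analysis: every Ximalaya track id found on a channel page can be resolved through the JSON endpoint http://www.ximalaya.com/tracks/{id}.json, whose response carries the audio URL in "play_path" and the episode name in "title". A minimal sketch of that lookup follows; the track id 123456 is only a placeholder, and the endpoint reflects the 2018-era site, so it may no longer respond this way.

    import requests

    # Placeholder track id; real ids come from the sound_ids attribute of a channel page.
    track_id = 123456
    meta = requests.get('http://www.ximalaya.com/tracks/{}.json'.format(track_id)).json()
    print(meta['title'])      # episode title, used below as the local file name
    print(meta['play_path'])  # direct URL of the audio file to download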

    __author__ = 'tony'

    import json
    import random
    import time
    import urllib.request
    import pymongo   # only needed by the commented-out MongoDB block below
    import requests
    import aiohttp   # not used in this script
    import asyncio   # not used in this script
    from bs4 import BeautifulSoup
    from lxml import etree
    import os
    import shutil

    filePath = "D:\\temp_ximalaya_audio"
    channelFilePath = ""

    # Initialize the output directory
    if os.path.isdir(filePath):
        shutil.rmtree(filePath)   # recursively remove the directory tree
    elif os.path.isfile(filePath):
        os.remove(filePath)       # remove a file of the same name
    os.makedirs(filePath)         # create the directory

    # MongoDB (optional, currently disabled)
    # clients = pymongo.MongoClient('localhost')
    # db = clients["XiMaLaYa"]
    # col1 = db["album2"]
    # col2 = db["detaile2"]

    UA_LIST = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

    headers1 = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Proxy-Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': random.choice(UA_LIST)
    }

    headers2 = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://www.ximalaya.com/dq/all/2',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': random.choice(UA_LIST)
    }

    def get_url():
        # start_urls = ['http://www.ximalaya.com/dq/all/{}'.format(num) for num in range(1, 85)]
        start_urls = ["http://www.ximalaya.com/dq/all/1/"]
        print(start_urls)
        for start_url in start_urls:
            print(start_url)
            print("===============begin html=============")
            html = requests.get(start_url, headers=headers1).text
            print("html = {}".format(html))
            print("===============end html=============")
            print("===============begin soup=============")
            soup = BeautifulSoup(html, 'lxml')
            print(soup)
            print("===============end soup=============")
            for item in soup.find_all(class_="albumfaceOutter"):
                print("================begin item================")
                print(item)
                print("================end item=========================")
                print("================begin content================")
                content = {
                    'href': item.a['href'],
                    'title': item.img['alt'],
                    'img_url': item.img['src']
                }
                print(content)
                print("================end content=========================")
                # col1.insert(content)
                print('Wrote one channel: ' + item.a['href'])
                subchannel = item.a['href']
                print("============begin subchannel===================")
                print(subchannel)
                subchannelArr = subchannel.split("/")
                print(subchannelArr)
                # channelFilePath = subchannelArr[len(subchannelArr) - 2]
                channelFilePath = content['title']
                print(channelFilePath)
                channelFilePath = filePath + os.sep + channelFilePath
                print(channelFilePath)
                if os.path.isdir(channelFilePath):
                    shutil.rmtree(channelFilePath)   # recursively remove the directory tree
                elif os.path.isfile(channelFilePath):
                    os.remove(channelFilePath)       # remove a file of the same name
                os.makedirs(channelFilePath)         # create the channel directory
                print("============end subchannel===================")
                print(content)
                another(channelFilePath, item.a['href'])
                time.sleep(1)

    def another(channelFilePath, url):
        print("=======================begin another html=======================")
        html = requests.get(url, headers=headers2).text
        print(html)
        print("=======================end another html=======================")
        print("=======================begin another ifanother=======================")
        ifanother = etree.HTML(html).xpath('//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
        print(ifanother)
        print("=======================end another ifanother=======================")
        if len(ifanother):
            num = ifanother[0]
            print('This channel has ' + num + ' pages')
            # Page 1 is the bare channel URL, fetched after this loop, so start from page 2.
            for n in range(2, int(num) + 1):
                print('Parsing page {} of {}'.format(n, num))
                url2 = url + '?page={}'.format(n)
                print(url)
                print(url2)
                get_m4a(channelFilePath, url2)
        get_m4a(channelFilePath, url)

    def get_m4a(channelFilePath, url):
        time.sleep(1)
        html = requests.get(url, headers=headers2).text
        print("==============begin get_m4a====================")
        # The channel page lists all of its track ids in the sound_ids attribute.
        numlist = etree.HTML(html).xpath('//div[@class="personal_body"]/@sound_ids')[0].split(',')
        print(numlist)
        print("==============end get_m4a====================")
        for i in numlist:
            print("==============begin get_m4a murl====================")
            murl = 'http://www.ximalaya.com/tracks/{}.json'.format(i)
            print(murl)
            print(channelFilePath)
            filePath_tracks = channelFilePath
            '''
            filePath_tracks = channelFilePath + os.sep + i
            if os.path.isdir(filePath_tracks):
                shutil.rmtree(filePath_tracks)   # recursively remove the directory tree
            elif os.path.isfile(filePath_tracks):
                os.remove(filePath_tracks)       # remove a file of the same name
            os.makedirs(filePath_tracks)         # create the directory
            '''
            print("==============end get_m4a murl====================")
            print("==============begin get_m4a html====================")
            html = requests.get(murl, headers=headers1).text
            print(html)
            print("==============end get_m4a html====================")
            print("==============begin get_m4a dic====================")
            dic = json.loads(html)
            print(dic)
            print("==============end get_m4a dic====================")
            print("==============begin get_m4a getdata====================")
            imageUrl = dic["play_path"]  # despite the name, this is the URL of the audio file
            print(imageUrl)
            imgData = urllib.request.urlopen(imageUrl).read()
            print("==============end get_m4a getdata====================")
            print("==============begin get_m4a savedata====================")
            # iamgeUrlArr = imageUrl.split("/")
            # imgFilePath = iamgeUrlArr[len(iamgeUrlArr) - 1]
            postfixArr = imageUrl.split(".")
            postfix = postfixArr[len(postfixArr) - 1]
            imgFilePath = dic['title'] + "." + postfix
            imgFilePath = filePath_tracks + os.sep + imgFilePath  # + getTimeStr() + ".jpg"
            imageFile = open(imgFilePath, "wb")
            imageFile.write(imgData)
            imageFile.close()
            print("==============end get_m4a savedata====================")
            print("Downloaded", imgFilePath, "successfully, saved to: " + imgFilePath)
            print("==============end get_m4a dic====================")
            # col2.insert(dic)
            # print(murl + ' data was inserted into mongodb')

    if __name__ == '__main__':
        print("begin")
        get_url()
        print("end")
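
One caveat with the listing above: the episode title from the JSON response is used directly as a Windows file name, and titles containing characters such as \ / : * ? " < > | will make open() fail. A small helper along the lines of the sketch below could be dropped in before building imgFilePath; the sanitize_filename name and the replacement character are my own choice, not part of the original script.

    import re

    def sanitize_filename(name):
        # Replace characters that Windows does not allow in file names.
        return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

    # Possible usage inside get_m4a, before building the save path:
    # imgFilePath = sanitize_filename(dic['title']) + "." + postfix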
