美文网首页我爱编程
抓取网站音频

抓取网站音频

作者: 流星一剑 | 来源:发表于2018-03-13 10:07 被阅读0次

要求:抓取喜马拉雅音频存储到本地 D:\temp_ximalaya_audio,并按节目和章节分类存储。

特殊说明:分析思路参考 https://www.jianshu.com/p/fc2e83c6583c

__author__ ='tony'

import asyncio
import json
import os
import random
import shutil
import time
import urllib.request

import aiohttp
import pymongo
import requests
from bs4 import BeautifulSoup
from lxml import etree

# Root directory that receives every downloaded audio file.
filePath = "D:\\temp_ximalaya_audio"

# Module-level placeholder; get_url() assigns its own local per-album path.
channelFilePath = ""

# Initialise the output directory: remove whatever currently occupies the
# path (a directory tree or a plain file), then create it afresh so every
# run starts from an empty directory.
if os.path.isdir(filePath):
    shutil.rmtree(filePath)  # recursively delete the directory tree
elif os.path.isfile(filePath):
    os.remove(filePath)  # delete the plain file
os.makedirs(filePath)  # create the directory

# mongodb

#clients = pymongo.MongoClient('localhost')

#db = clients["XiMaLaYa"]

#col1 = db["album2"]

#col2 = db["detaile2"]

# Pool of browser User-Agent strings; the header dicts below draw one entry
# with random.choice(). Several entries appear twice, which makes
# random.choice() proportionally more likely to pick them.
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Request headers without a Referer; sent by get_url() for the listing page
# and by get_m4a() for the per-track JSON. The User-Agent is drawn from
# UA_LIST once, when this dict is built.
headers1 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(UA_LIST),
}

# Request headers carrying a fixed Referer; sent by another() and get_m4a()
# when fetching album pages. The User-Agent is drawn from UA_LIST once, when
# this dict is built.
headers2 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://www.ximalaya.com/dq/all/2',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(UA_LIST),
}

def get_url():
    """Crawl the album listing page(s) and download every album found.

    Each URL in ``start_urls`` is fetched with ``headers1`` and parsed with
    BeautifulSoup. Every element of class ``albumfaceOutter`` is treated as
    one album: its link, title (the image ``alt``) and cover URL are
    collected, a directory named after the title is (re)created under
    ``filePath``, and ``another`` is called with that directory and the
    album link. The function sleeps one second after each listing page.

    The title comes from the remote page and ends up in a path that is
    deleted with ``shutil.rmtree``, so it is sanitised first; when nothing
    usable remains, a segment of the album URL is used instead.
    """
    # To crawl every listing page instead of only the first one, use:
    # start_urls = ['http://www.ximalaya.com/dq/all/{}'.format(num) for num in range(1, 85)]
    start_urls = ["http://www.ximalaya.com/dq/all/1/"]
    print(start_urls)

    # Path separators and the other characters Windows rejects in file names
    # are replaced by "_" before a title is used as a directory name.
    illegal = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    for start_url in start_urls:
        print(start_url)
        print("===============begin html=============")
        html = requests.get(start_url, headers=headers1).text
        print("html = {}".format(html))
        print("===============end html=============")
        print("===============begin soup=============")
        soup = BeautifulSoup(html, 'lxml')
        print(soup)
        print("===============end soup=============")

        for item in soup.find_all(class_="albumfaceOutter"):
            print("================begin item================")
            print(item)
            print("================end item=========================")
            print("================begin content================")
            content = {
                'href': item.a['href'],
                'title': item.img['alt'],
                'img_url': item.img['src'],
            }
            print(content)
            print("================end content=========================")
            print('写入一个频道' + item.a['href'])

            subchannel = item.a['href']
            print("============begin subchannel===================")
            print(subchannel)
            subchannelArr = subchannel.split("/")
            print(subchannelArr)

            # Surrounding dots/spaces are stripped as well: a title of "" or
            # ".." would otherwise make the rmtree below hit filePath itself
            # or its parent directory.
            channelFilePath = content['title'].translate(illegal).strip(" .")
            if not channelFilePath:
                # No usable title: use the last usable segment of the link.
                for segment in reversed(subchannelArr):
                    channelFilePath = segment.translate(illegal).strip(" .")
                    if channelFilePath:
                        break
                else:
                    channelFilePath = "untitled"
            print(channelFilePath)
            channelFilePath = filePath + os.sep + channelFilePath
            print(channelFilePath)

            # Start from an empty directory for this album.
            if os.path.isdir(channelFilePath):
                shutil.rmtree(channelFilePath)  # recursively delete the directory tree
            elif os.path.isfile(channelFilePath):
                os.remove(channelFilePath)  # delete the plain file
            os.makedirs(channelFilePath)  # create the directory
            print("============end subchannel===================")

            print(content)
            another(channelFilePath, item.a['href'])

        # Pause between listing pages.
        time.sleep(1)

def another(channelFilePath, url):
    """Download all pages of one album into ``channelFilePath``.

    The album page at ``url`` is fetched with ``headers2``. The
    ``data-page`` attribute of the second-to-last link inside the
    ``pagingBar_wrapper`` div is taken as the number of pages. When it is
    present, ``get_m4a`` is called for ``url?page=1`` through
    ``url?page=<num>``; when the page has no paging bar, ``get_m4a`` is
    called once for ``url`` itself.

    Args:
        channelFilePath: Directory that receives the album's audio files.
        url: Address of the album's first page.
    """
    print("=======================begin another html=======================")
    html = requests.get(url, headers=headers2).text
    print(html)
    print("=======================end another html=======================")

    print("=======================begin another ifanother=======================")
    ifanother = etree.HTML(html).xpath('//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
    print(ifanother)
    print("=======================end another ifanother=======================")

    if not ifanother:
        # Single-page album: there is no paging bar to read a count from.
        get_m4a(channelFilePath, url)
        return

    num = ifanother[0]
    print('本频道资源存在' + num + '个页面')
    # The upper bound is inclusive so the last page is visited too; the bare
    # url is not fetched again because ?page=1 already covers it.
    for n in range(1, int(num) + 1):
        print('开始解析{}个中的第{}个页面'.format(num, n))
        url2 = url + '?page={}'.format(n)
        print(url)
        print(url2)
        get_m4a(channelFilePath, url2)

def get_m4a(channelFilePath, url):
    """Download every track listed on one album page.

    After a one-second pause the page at ``url`` is fetched with
    ``headers2``. The comma-separated ids in the ``sound_ids`` attribute of
    the ``personal_body`` div are read, and for each id the JSON document at
    ``http://www.ximalaya.com/tracks/<id>.json`` is requested with
    ``headers1``. The file behind its ``play_path`` value is downloaded and
    written to ``channelFilePath`` as ``<title>.<extension>``, where the
    extension is whatever follows the last dot of the download URL.

    Pages without a ``sound_ids`` attribute and tracks without a
    ``play_path`` are skipped instead of aborting the crawl.

    Args:
        channelFilePath: Existing directory that receives the audio files.
        url: Address of the album page to process.
    """
    time.sleep(1)
    html = requests.get(url, headers=headers2).text

    print("==============begin get_m4a====================")
    sound_ids = etree.HTML(html).xpath('//div[@class="personal_body"]/@sound_ids')
    # An empty attribute would split into [''] and produce a bogus request,
    # so blank ids are dropped.
    numlist = [s for s in sound_ids[0].split(',') if s.strip()] if sound_ids else []
    print(numlist)
    print("==============end get_m4a====================")

    # Track titles come from the remote service and are used as file names:
    # replace path separators and the characters Windows rejects.
    illegal = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    for i in numlist:
        print("==============begin get_m4a murl====================")
        murl = 'http://www.ximalaya.com/tracks/{}.json'.format(i)
        print(murl)
        print(channelFilePath)
        print("==============begin get_m4a murl====================")

        print("==============begin get_m4a html====================")
        html = requests.get(murl, headers=headers1).text
        print(html)
        print("==============end get_m4a html====================")

        print("==============begin get_m4a dic====================")
        dic = json.loads(html)
        print(dic)
        print("==============end get_m4a dic====================")

        print("==============begin get_m4a getdata====================")
        audio_url = dic.get("play_path")
        print(audio_url)
        if not audio_url:
            # Nothing to download for this track; carry on with the rest.
            print("skip track {}: no play_path".format(i))
            continue
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(audio_url) as response:
            audio_data = response.read()
        print("==============end get_m4a getdata====================")

        print("==============begin get_m4a savedata====================")
        postfix = audio_url.split(".")[-1]
        # Fall back to the track id when the title sanitises to nothing.
        title = dic['title'].translate(illegal).strip(" .") or i
        imgFilePath = channelFilePath + os.sep + title + "." + postfix
        # "with" guarantees the file is closed even if the write fails.
        with open(imgFilePath, "wb") as audio_file:
            audio_file.write(audio_data)
        print("==============end get_m4a savedata====================")

        print("下载文件", imgFilePath, "成功,另存路径:" + imgFilePath)
        print("==============end get_m4a dic====================")

def main():
    """Run the crawl, printing a marker before it starts and after it ends."""
    print("begin")
    get_url()
    print("end")


# Only start crawling when the file is executed as a script.
if __name__ == '__main__':
    main()

相关文章

网友评论

    本文标题:抓取网站音频

    本文链接:https://www.haomeiwen.com/subject/ojlmfftx.html