抓取网站音频

抓取网站音频

作者: 流星一剑 | 来源:发表于2018-03-13 10:07 被阅读0次

抓取网站音频
简单的Python爬虫应用_学习笔记003
Python爬虫入门到入职03：全量抓取
python库学习之Requests
Scrapy抓取小说网站存储成json
Python:使用selenium模拟浏览器抓取数据
js逆向之企名片公司信息的抓取
三步走，教你定制自己的个性python爬虫，代码都省了有木有~
网站动态抓取
【Python爬虫作业】- 第十八次使用bs4模块抓取手机号网

要求：抓取喜马拉雅音频存储到本地 D:\temp_ximalaya_audio，并按节目和章节分类存储。

特殊说明：分析思路参考 https://www.jianshu.com/p/fc2e83c6583c

__author__ ='tony'

import asyncio
import json
import os
import random
import shutil
import time
import urllib.request

import aiohttp
import pymongo
import requests
from bs4 import BeautifulSoup
from lxml import etree

# Root directory on the local disk that receives every downloaded audio file.
filePath = "D:\\temp_ximalaya_audio"

# Module-level placeholder; get_url() assigns its own local of the same name.
channelFilePath = ""

# Start every run from an empty output directory: remove whatever currently
# occupies the path (a plain file or a whole directory tree), then recreate it.
if os.path.isfile(filePath):
    os.remove(filePath)
elif os.path.isdir(filePath):
    shutil.rmtree(filePath)
os.makedirs(filePath)

# mongodb

#clients = pymongo.MongoClient('localhost')

#db = clients["XiMaLaYa"]

#col1 = db["album2"]

#col2 = db["detaile2"]

# Pool of browser User-Agent strings; one is drawn with random.choice() for
# each header set.  Every entry is distinct so that each string is equally
# likely to be picked (the list previously repeated 16 of its entries).
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
]

# Request headers without a Referer.  Used for the album listing pages and
# for the per-track JSON metadata requests.  The User-Agent is drawn once,
# when this module is loaded, and then reused for every request.
headers1 = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
    "Cache-Control": "max-age=0",
    "Proxy-Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": random.choice(UA_LIST),
}

# Request headers that additionally carry a Referer pointing at a listing
# page.  Used when fetching album pages.  The User-Agent is drawn
# independently of headers1, once, when this module is loaded.
headers2 = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
    "Cache-Control": "max-age=0",
    "Proxy-Connection": "keep-alive",
    "Referer": "http://www.ximalaya.com/dq/all/2",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": random.choice(UA_LIST),
}

def _safe_dir_name(title):
    """Return *title* rewritten so it can be used as one Windows directory name.

    The characters ``\\ / : * ? " < > |`` and the ASCII control characters are
    replaced by ``_``; surrounding whitespace and trailing dots/spaces are
    removed.  ``"untitled"`` is returned when nothing usable is left.
    """
    forbidden = {ch: "_" for ch in '\\/:*?"<>|'}
    forbidden.update({chr(code): "_" for code in range(32)})
    cleaned = str(title).translate(str.maketrans(forbidden)).strip().rstrip(" .")
    return cleaned or "untitled"


def get_url():
    """Crawl the album listing pages and download every album found on them.

    Each listing page is fetched and parsed; every element with class
    ``albumfaceOutter`` is treated as one album.  Its link, title and cover
    URL are read, a directory named after the (sanitised) title is recreated
    empty under ``filePath``, and ``another`` is called with that directory
    and the album link.

    Raises:
        requests.RequestException: if a listing page cannot be fetched
            (including when the 30 second timeout expires).
    """
    # Only the first listing page is crawled.  The full catalogue would be
    # 'http://www.ximalaya.com/dq/all/{}'.format(num) for num in range(1, 85).
    start_urls = ["http://www.ximalaya.com/dq/all/1/"]
    print(start_urls)

    for start_url in start_urls:
        print(start_url)
        print("===============begin html=============")
        # A timeout keeps one stalled connection from hanging the whole crawl.
        html = requests.get(start_url, headers=headers1, timeout=30).text
        print("html = {}".format(html))
        print("===============end html=============")

        print("===============begin soup=============")
        soup = BeautifulSoup(html, 'lxml')
        print(soup)
        print("===============end soup=============")

        for item in soup.find_all(class_="albumfaceOutter"):
            print("================begin item================")
            print(item)
            print("================end item=========================")

            if item.a is None or item.img is None:
                # No link or no cover image: there is nothing to read from
                # this element, so skip it instead of failing on None[...].
                continue

            print("================begin content================")
            content = {
                'href': item.a['href'],
                'title': item.img['alt'],
                'img_url': item.img['src']
            }
            print(content)
            print("================end content=========================")

            print('写入一个频道' + content['href'])
            subchannel = content['href']

            print("============begin subchannel===================")
            print(subchannel)
            subchannelArr = subchannel.split("/")
            print(subchannelArr)

            # The album title becomes the directory name, so characters that
            # are not allowed in a path component must be replaced first.
            channelFilePath = _safe_dir_name(content['title'])
            print(channelFilePath)
            channelFilePath = filePath + os.sep + channelFilePath
            print(channelFilePath)

            # Recreate the album directory empty, whatever was there before.
            if os.path.isdir(channelFilePath):
                shutil.rmtree(channelFilePath)
            elif os.path.isfile(channelFilePath):
                os.remove(channelFilePath)
            os.makedirs(channelFilePath)
            print("============end subchannel===================")

            print(content)
            another(channelFilePath, subchannel)

        # Pause between listing pages to keep the request rate low.
        time.sleep(1)

def another(channelFilePath, url):
    """Download the tracks of one album, following its pagination.

    The album page at *url* is fetched and the ``data-page`` attribute of the
    second-to-last link inside the ``pagingBar_wrapper`` div is read as the
    number of pages.  When it is present, pages ``1 .. num`` are each handed
    to ``get_m4a`` as ``url?page=n``; when the album has no paging bar,
    *url* itself is handed to ``get_m4a``.

    Args:
        channelFilePath: Directory that receives this album's audio files.
        url: Address of the album's first page.

    Raises:
        requests.RequestException: if the album page cannot be fetched
            (including when the 30 second timeout expires).
    """
    print("=======================begin another html=======================")
    html = requests.get(url, headers=headers2, timeout=30).text
    print(html)
    print("=======================end another html=======================")

    print("=======================begin another ifanother=======================")
    ifanother = etree.HTML(html).xpath('//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
    print(ifanother)
    print("=======================end another ifanother=======================")

    if not ifanother:
        # Single-page album: no paging bar, so the page at `url` is all there is.
        get_m4a(channelFilePath, url)
        return

    num = ifanother[0]
    print('本频道资源存在' + num + '个页面')
    # range() excludes its upper bound, so +1 is needed to reach the last
    # page.  Page 1 is requested as ?page=1 here, which is why the bare `url`
    # is not fetched again afterwards (NOTE(review): assumes ?page=1 serves
    # the same track list as the bare album URL).
    for n in range(1, int(num) + 1):
        print('开始解析{}个中的第{}个页面'.format(num, n))
        url2 = url + '?page={}'.format(n)
        print(url)
        print(url2)
        get_m4a(channelFilePath, url2)

def _safe_file_name(name):
    """Return *name* rewritten so it can be used as one Windows file name.

    The characters ``\\ / : * ? " < > |`` and the ASCII control characters are
    replaced by ``_``; surrounding whitespace and trailing dots/spaces are
    removed.  ``"untitled"`` is returned when nothing usable is left.
    """
    forbidden = {ch: "_" for ch in '\\/:*?"<>|'}
    forbidden.update({chr(code): "_" for code in range(32)})
    cleaned = str(name).translate(str.maketrans(forbidden)).strip().rstrip(" .")
    return cleaned or "untitled"


def get_m4a(channelFilePath, url):
    """Download every track referenced by one album page.

    The page at *url* is fetched and the comma-separated ``sound_ids``
    attribute of its ``personal_body`` div is read.  For each id the track
    metadata is requested from ``http://www.ximalaya.com/tracks/<id>.json``,
    the file at its ``play_path`` is downloaded, and the bytes are written to
    ``<channelFilePath>/<title>.<extension>``, where the extension is the text
    after the last ``.`` of ``play_path``.  A track whose metadata is not
    valid JSON or has no ``play_path`` is reported and skipped.

    Args:
        channelFilePath: Existing directory that receives the audio files.
        url: Address of the album page to scan.

    Raises:
        requests.RequestException: if the album page or a metadata request
            fails (including when the 30 second timeout expires).
        urllib.error.URLError: if an audio file cannot be downloaded.
        OSError: if an audio file cannot be written.
    """
    time.sleep(1)  # throttle: wait before every album-page request
    html = requests.get(url, headers=headers2, timeout=30).text

    print("==============begin get_m4a====================")
    sound_id_attrs = etree.HTML(html).xpath('//div[@class="personal_body"]/@sound_ids')
    # The xpath result is empty when the page has no such attribute; guard
    # before indexing, and drop empty ids produced by stray commas.
    if sound_id_attrs:
        numlist = [i.strip() for i in sound_id_attrs[0].split(',') if i.strip()]
    else:
        numlist = []
    print(numlist)
    print("==============end get_m4a====================")

    for i in numlist:
        print("==============begin get_m4a murl====================")
        murl = 'http://www.ximalaya.com/tracks/{}.json'.format(i)
        print(murl)
        print(channelFilePath)
        print("==============end get_m4a murl====================")

        print("==============begin get_m4a html====================")
        html = requests.get(murl, headers=headers1, timeout=30).text
        print(html)
        print("==============end get_m4a html====================")

        print("==============begin get_m4a dic====================")
        try:
            dic = json.loads(html)
        except ValueError:
            # Not JSON (for example an error page): skip this one track
            # rather than aborting the rest of the album.
            print("skip track {}: metadata is not valid JSON".format(i))
            continue
        print(dic)
        print("==============end get_m4a dic====================")

        print("==============begin get_m4a getdata====================")
        audio_url = dic.get("play_path") if isinstance(dic, dict) else None
        if not audio_url:
            print("skip track {}: metadata has no play_path".format(i))
            continue
        print(audio_url)
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(audio_url, timeout=60) as response:
            audio_data = response.read()
        print("==============end get_m4a getdata====================")

        print("==============begin get_m4a savedata====================")
        postfix = audio_url.rsplit(".", 1)[-1]
        # Fall back to the track id when the metadata carries no title; the
        # title is sanitised because it becomes the file name.
        title = dic.get('title') or i
        audio_path = os.path.join(channelFilePath, _safe_file_name(title) + "." + postfix)
        with open(audio_path, "wb") as audio_file:
            audio_file.write(audio_data)
        print("==============end get_m4a savedata====================")

        print("下载文件", audio_path, "成功,另存路径:" + audio_path)

# Script entry point: run the crawl only when this file is executed directly,
# printing a marker before and after.
if __name__ == "__main__":
    print("begin")
    get_url()
    print("end")

相关文章

抓取网站音频
要求：抓取喜马拉雅音频存储到本地D:\temp_ximalaya_audio，并按节目和章节分类存储。特殊说明：...
简单的Python爬虫应用_学习笔记003
从京东网站抓取_007 接上篇内容从京东网站抓取_006 组织成列表的形式返回内容商品价格的抓取京东网站价...
Python爬虫入门到入职03：全量抓取
全量抓取是一种常见的抓取方式，针对目标网站进行批量抓取，需要我们进行翻页操作，遍历整个网站。本章知识点：网页中...
python库学习之Requests
基本使用抓取网页（文本）抓取二进制数据（图片，音频，视频）如果要保存图片
Scrapy抓取小说网站存储成json
抓取的目标网站是小说网站：笔趣看小说名为元尊http://biqukan.com/0_790/ 思路;先抓取...
Python:使用selenium模拟浏览器抓取数据
有些js动态网站和反抓取的网站，对requests的检查比较严格, 使用requests包很难抓取数据。这时可以考...
js逆向之企名片公司信息的抓取
最近应公司需求抓取企名片这个网站，抓取创业项目模块的信息，经过抓取分析发现这网站存在js加密，下面就让我们一步步的...
三步走，教你定制自己的个性python爬虫，代码都省了有木有~
想抓取各大招聘网站上的职位信息吗，想抓取各大电商网站上的商品信息吗，想抓取1024上各种不可描述吗？看这里，看这里...
网站动态抓取
今天研究了一下动态网站的抓取，这里用到的是选股宝这个网站：https://xuangubao.cn/ 这个小案例是...
【Python爬虫作业】- 第十八次使用bs4模块抓取手机号网
一、bs4模块解析的用法复习二、抓取逻辑的复习三、http://www.51hao.cc/ 网站全站抓取抓取...

网友评论

我爱编程

本文标题：抓取网站音频

本文链接：https://www.haomeiwen.com/subject/ojlmfftx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！

栏目导航

热点阅读

我爱编程

关于我们|服务条款|联系我们|抓取网站音频|投稿指南|网站地图|RSS订阅|排版工具|手机版

提供经典美文摘抄,优美散文欣赏,现代诗歌精选,短篇小说,心情随笔,表白情书范文,故事会在线阅读欣赏

Copyright © 2014-2023 Haomeiwen.com All Rights Reserved. 好美文阅读网版权所有

备案信息：桂公网安备 45052102000051号 · 桂ICP备13007215号-3

本站所收录作品、热点评论等信息部分来源互联网，目的只是为了系统归纳学习和传递资讯

所有作品版权归原创作者所有，与本站立场无关，如不慎侵犯了你的权益，请联系我们告知，我们将做删除处理！