# 实现思路参考 (implementation reference):
# 版权声明: 本文为CSDN博主「python爬虫人工智能大数据」的原创文章, 遵循CC 4.0 by-sa版权协议, 转载请附上原文出处链接及本声明。
# 原文链接: https://blog.csdn.net/weixin_41666747/article/details/82723781
# 免责声明 (disclaimer):
# 声明:
# 本栏内容只供学习使用, 如有违反法律, 与本人无关。
# 文章内容仅供参考, 如因本人发布的作品内容涉及版权或存在其他问题, 请联系我进行删除。
import re
from copy import deepcopy
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from selenium import webdriver
# Entry point: the playlist discovery page listing all categories.
start_url = "http://music.163.com/discover/playlist"
# Request headers: randomized UA plus a fixed session cookie and referer so
# the site serves the normal desktop pages.
headers = {
    'User-Agent' : str(UserAgent().random),
    'Cookie': '_ntes_nnid=4eb2f3b7db0ddea87dbd349abfc53cb9,1560555139',
    'Referer': 'http://music.163.com/'
}
# Accumulators filled by the crawl: category dicts, then one dict per playlist.
category_list = []
total_playlist_list = []
# 获取网页
def get_page(url):
    """Fetch *url* and return the raw response body as bytes.

    Returns None on any request failure (network error or non-2xx status)
    instead of raising, so callers must check for None.
    """
    try:
        res = requests.get(url, headers=headers)
        res.raise_for_status()
        # We return raw bytes (res.content); the encoding assignment only
        # matters if a caller ever switches to res.text.
        res.encoding = res.apparent_encoding
        return res.content
    except requests.RequestException as exc:
        # Catch only request-layer errors — a bare `except:` also hid
        # programming bugs (e.g. NameError) behind "requests error".
        print('requests error', exc)
        return None
def get_category_list():
    """Populate the module-level ``category_list`` from the discover page.

    Each appended dict holds the broad grouping (``b_cate``), the
    sub-category name (``s_cate``) and the absolute category URL
    (``a_href``).
    """
    res = get_page(start_url)
    if res is None:
        # get_page reports failures by returning None; without this guard
        # BeautifulSoup(None) raised a TypeError.
        return
    bsObj = BeautifulSoup(res, "html.parser")
    dl_list = bsObj.select('div.bd dl.f-cb')
    for dl in dl_list:
        b_cate = dl.find('dt').get_text()  # grouping principle, e.g. language/genre
        a_list = dl.find('dd').find_all('a')  # sub-categories under that principle
        for a in a_list:
            category_list.append({
                'b_cate': b_cate,
                's_cate': a.get_text(),
                # Links in the page are site-relative; make them absolute.
                'a_href': "http://music.163.com" + a['href'],
            })
def get_playlist_list(item):
    """Collect every playlist under the category described by *item*.

    Walks the category's pages via the "next page" link and appends one
    deep copy of *item* per playlist (extended with title/href/author
    fields) to the module-level ``total_playlist_list``.

    The original implementation recursed once per page (RecursionError
    risk on long categories) and crashed with TypeError when the
    next-page anchor was missing; this version iterates and guards both.
    """
    while item.get('a_href'):
        scate_resp = get_page(item['a_href'])
        if scate_resp is None:
            return  # request failed; keep what we have so far
        scate_bs = BeautifulSoup(scate_resp, "html.parser")
        playlist_list = []
        for li in scate_bs.select('ul#m-pl-container li'):
            dec_a = li.find('p', {'class': 'dec'}).find('a')
            item['playlist_title'] = dec_a['title']
            item["playlist_href"] = "http://music.163.com" + dec_a['href']
            author_a = li.find_all('p')[-1].find('a')
            item['author_name'] = author_a['title']
            item['author_href'] = "http://music.163.com" + author_a['href']
            # deepcopy: `item` is reused/mutated each iteration, so store
            # an independent snapshot.
            playlist_list.append(deepcopy(item))
        total_playlist_list.extend(playlist_list)
        # Follow pagination; the last page links "下一页" to
        # javascript:void(0), and some pages omit the anchor entirely.
        next_a = scate_bs.find('a', text='下一页')
        if next_a is None:
            return
        next_url = next_a.get('href')
        if not next_url or next_url == 'javascript:void(0)':
            return
        item['a_href'] = "http://music.163.com" + next_url
def get_palylist_info(playlist):
    """Enrich *playlist* with detail-page fields and return it.

    Adds cover image, publish date, favorite/share/play counts,
    description and the track listing. Fields whose element is missing
    from the page are set to None instead of crashing.

    (The 'palylist' typo in the name is kept for caller compatibility.)
    """
    if playlist.get('playlist_href') is not None:
        palylist_res = get_page(playlist['playlist_href'])
        if palylist_res is None:
            # Request failed; return the playlist unenriched.
            return playlist
        # Decode once — the original decoded the same bytes per regex.
        html = palylist_res.decode('utf-8')
        # Cover image URL is embedded in an inline JSON fragment.
        covers = re.findall("\"images\":.*?\[\"(.*?)\"\],", html)
        playlist['covers'] = covers[0] if covers else None
        # Publish date (now loaded dynamically, so this often misses).
        create_time = re.findall("\"pubData\":\"(.*?)\"", html)
        playlist['create_time'] = create_time[0] if create_time else None
        playlist_bs = BeautifulSoup(palylist_res, 'html.parser')
        # Each find() may return None for a missing element; the original
        # then raised TypeError on ['data-count'] / AttributeError on
        # get_text(). Guard every lookup.
        fav = playlist_bs.find('a', {'data-res-action': 'fav'})
        playlist['favorited_times'] = fav['data-count'] if fav else None
        share = playlist_bs.find('a', {'data-res-action': 'share'})
        playlist['shared_times'] = share['data-count'] if share else None
        desc = playlist_bs.find('p', {'id': 'album-desc-more'})
        playlist['desc'] = desc.get_text() if desc else None
        played = playlist_bs.find('strong', {'id': 'play-count'})
        playlist['played_times'] = played.get_text() if played else None
        # Track table is rendered client-side; scraped via Selenium.
        playlist['tracks'] = get_playlist_tracks(playlist['playlist_href'])
    return playlist
def get_playlist_tracks(href):
    """Scrape the track table of the playlist page at *href* with Selenium.

    The track list lives inside the "g_iframe" frame and is rendered by
    JavaScript, so a real browser is required (see
    http://www.51testing.com/html/87/300987-831171.html for the frame
    switch). Returns a list of dicts with name/duration/singer/album_name.
    """
    driver = webdriver.Firefox(firefox_binary='D:/Mozilla Firefox/firefox.exe')
    try:
        driver.get(href)
        driver.switch_to.frame("g_iframe")
        playlist_tracks = []
        for tr in driver.find_elements_by_xpath("//tbody/tr"):
            track = {
                'name': tr.find_element_by_xpath("./td[2]/div/div/div/span/a/b").get_attribute("title"),
                'duration': tr.find_element_by_xpath("./td[3]/span").text,
                'singer': tr.find_element_by_xpath("./td[4]/div").get_attribute("title"),
                'album_name': tr.find_element_by_xpath("./td[5]/div/a").get_attribute("title"),
            }
            playlist_tracks.append(track)
        return playlist_tracks
    finally:
        # quit() (not close()) tears down the whole browser session, and
        # the finally guarantees it even when a lookup above raises —
        # the original leaked a Firefox process on any error.
        driver.quit()
if __name__ == "__main__":
    # Script entry point: build the category index, expand each category
    # into its playlists, then print the enriched info for every playlist.
    get_category_list()
    for cl in category_list:
        get_playlist_list(cl)
    for pl in total_playlist_list:
        print(get_palylist_info(pl))
# 网友评论 (reader comments — stray footer from the scraped blog page)