Learning Notes: Crawling All NetEase Cloud Music Playlists

Author: modao233 | Published 2019-08-19 21:07

The implementation approach follows this reference:

Copyright notice: the referenced article is an original work by the CSDN blogger 「python爬虫人工智能大数据」, released under the CC 4.0 BY-SA license; reposts must include the original source link and this notice.
Original link: https://blog.csdn.net/weixin_41666747/article/details/82723781

Disclaimer

This column is for learning purposes only; I am not responsible for any use of it that violates the law.

The content is for reference only. If anything published here raises copyright or other concerns, please contact me and I will remove it.

import re
from copy import deepcopy
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from selenium import webdriver

start_url = "http://music.163.com/discover/playlist"
headers = {
    'User-Agent': str(UserAgent().random),   # random User-Agent from fake_useragent
    'Cookie': '_ntes_nnid=4eb2f3b7db0ddea87dbd349abfc53cb9,1560555139',   # cookie captured from the author's own session
    'Referer': 'http://music.163.com/'
}

category_list = []        # playlist categories scraped from the discover page
total_playlist_list = []  # all playlists collected across every category

# Fetch a page and return its raw bytes (None on failure)
def get_page(url):
    try:
        res = requests.get(url, headers=headers)
        res.raise_for_status()
        res.encoding = res.apparent_encoding
        return res.content
    except requests.RequestException as e:
        print('requests error:', e)
        return None

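Individual requests will occasionally fail on a long crawl. A minimal retry wrapper around get_page is sketched below; it is not part of the original article, and the retry count and delay are arbitrary choices:

import time

def get_page_with_retry(url, retries=3, delay=2):
    # Try the request up to `retries` times, sleeping `delay` seconds between attempts
    for attempt in range(retries):
        content = get_page(url)
        if content is not None:
            return content
        time.sleep(delay)
    return None
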
def get_category_list():
    res = get_page(start_url)
    bsObj = BeautifulSoup(res, "html.parser")
    dl_list = bsObj.select('div.bd dl.f-cb')

    for dl in dl_list:
        b_cate = dl.find('dt').get_text()   # the grouping principle of this category block (e.g. by language)
        a_list = dl.find('dd').find_all('a')    # the sub-categories under that grouping
        for a in a_list:
            item = {}
            item['b_cate'] = b_cate
            item['s_cate'] = a.get_text()
            item['a_href'] = "http://music.163.com" + a['href']     # link to this sub-category's playlist page
            category_list.append(item)


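Each element appended to category_list is a small dict describing one sub-category; the values below are hypothetical examples to show the shape, not actual scraped output:

# One category item (values are illustrative):
# {
#     'b_cate': '语种',   # grouping principle of the block ("language")
#     's_cate': '华语',   # sub-category label ("Mandarin")
#     'a_href': 'http://music.163.com/discover/playlist/?cat=华语'
# }
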
def get_playlist_list(item):
    playlist_list = []
    if item['a_href'] is not None:
        scate_resp = get_page(item['a_href'])
        scate_bs = BeautifulSoup(scate_resp, "html.parser")
        li_list = scate_bs.select('ul#m-pl-container li')
        for li in li_list:
            # playlist title
            item['playlist_title'] = li.find('p', {'class': 'dec'}).find('a')['title']
            # playlist link
            item["playlist_href"] = "http://music.163.com" + li.find('p', {'class': 'dec'}).find('a')['href']
            # playlist author
            item['author_name'] = li.find_all('p')[-1].find('a')['title']
            # playlist author link
            item['author_href'] = "http://music.163.com" + li.find_all('p')[-1].find('a')['href']
            # deepcopy so the next iteration does not overwrite this entry
            playlist_list.append(deepcopy(item))
        total_playlist_list.extend(playlist_list)
        # link to the next page of playlists; on the last page it is 'javascript:void(0)'
        next_a = scate_bs.find('a', text='下一页')
        if next_a is not None and next_a['href'] != 'javascript:void(0)':
            item['a_href'] = "http://music.163.com" + next_a['href']
            # recurse into the next page
            return get_playlist_list(item)

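The recursion above nests one call per page, so a category with many pages grows the call stack. A simplified iterative sketch (keeping only the title and link fields for brevity) would loop instead:

def get_playlist_list_iter(item):
    # Iterative equivalent of get_playlist_list: loop over pages instead of recursing
    url = item['a_href']
    while url is not None:
        scate_bs = BeautifulSoup(get_page(url), "html.parser")
        for li in scate_bs.select('ul#m-pl-container li'):
            dec_a = li.find('p', {'class': 'dec'}).find('a')
            entry = dict(item)   # shallow copy is enough for this flat dict
            entry['playlist_title'] = dec_a['title']
            entry['playlist_href'] = "http://music.163.com" + dec_a['href']
            total_playlist_list.append(entry)
        next_a = scate_bs.find('a', text='下一页')
        url = None
        if next_a is not None and next_a['href'] != 'javascript:void(0)':
            url = "http://music.163.com" + next_a['href']
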
def get_playlist_info(playlist):
    if playlist['playlist_href'] is not None:
        playlist_res = get_page(playlist['playlist_href'])
        if playlist_res is None:
            return None
        playlist_html = playlist_res.decode('utf-8')
        # playlist cover image
        covers = re.findall(r'"images":.*?\["(.*?)"\],', playlist_html)
        playlist['covers'] = covers[0] if len(covers) > 0 else None
        # creation time (the page now loads this dynamically, so the regex may find nothing)
        create_time = re.findall(r'"pubData":"(.*?)"', playlist_html)
        playlist['create_time'] = create_time[0] if len(create_time) > 0 else None
        playlist_bs = BeautifulSoup(playlist_res, 'html.parser')
        # times the playlist was favorited
        playlist['favorited_times'] = playlist_bs.find('a', {'data-res-action': 'fav'})['data-count']
        # times the playlist was shared
        playlist['shared_times'] = playlist_bs.find('a', {'data-res-action': 'share'})['data-count']
        # playlist description (absent on some playlists)
        desc_p = playlist_bs.find('p', {'id': 'album-desc-more'})
        playlist['desc'] = desc_p.get_text() if desc_p is not None else None
        # playlist play count
        playlist['played_times'] = playlist_bs.find('strong', {'id': 'play-count'}).get_text()
        # track information contained in the playlist
        playlist['tracks'] = get_playlist_tracks(playlist['playlist_href'])
        # return the enriched dict for printing
        return playlist

def get_playlist_tracks(href):
    # the Firefox binary path below is the author's local installation path
    driver = webdriver.Firefox(firefox_binary='D:/Mozilla Firefox/firefox.exe')
    driver.get(href)
    # the track table is rendered inside an iframe, so switch into it first
    # (background on frame switching: http://www.51testing.com/html/87/300987-831171.html)
    driver.switch_to.frame("g_iframe")
    tr_list = driver.find_elements_by_xpath("//tbody/tr")
    playlist_tracks = []
    for tr in tr_list:
        track = {}
        track['name'] = tr.find_element_by_xpath("./td[2]/div/div/div/span/a/b").get_attribute("title")
        track['duration'] = tr.find_element_by_xpath("./td[3]/span").text
        track['singer'] = tr.find_element_by_xpath("./td[4]/div").get_attribute("title")
        track['album_name'] = tr.find_element_by_xpath("./td[5]/div/a").get_attribute("title")
        playlist_tracks.append(track)
    driver.quit()   # quit() ends the browser session; close() would only close the window
    return playlist_tracks

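Launching a visible browser for every playlist is slow. If your Selenium and geckodriver versions support it, Firefox can run headless; a sketch, assuming the same local binary path as above:

options = webdriver.FirefoxOptions()
options.add_argument('-headless')   # run Firefox without opening a window
driver = webdriver.Firefox(firefox_binary='D:/Mozilla Firefox/firefox.exe', options=options)

Reusing one driver instance across all calls to get_playlist_tracks, instead of starting and quitting a browser per playlist, would also cut the per-playlist overhead considerably.
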
get_category_list()
for cl in category_list:
    get_playlist_list(cl)
# print each collected playlist once, after all categories have been crawled,
# rather than re-printing earlier playlists on every category iteration
for pl in total_playlist_list:
    print(get_playlist_info(pl))

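Printing to stdout loses the data once the terminal closes. A minimal way to persist the results (the filename playlists.json is an arbitrary choice):

import json

with open('playlists.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps Chinese titles readable in the output file
    json.dump(total_playlist_list, f, ensure_ascii=False, indent=2)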