美文网首页Python
爬取网易云音乐

爬取网易云音乐

作者: heheddff | 来源:发表于2018-12-14 17:30 被阅读0次

    效果图


    spider_music.py:主程序

    # coding=gbk
    from download import Download
    from url_manager import Url_Manager
    from html_parser import Html_Parser
    from save import Save
    from set_text_color import Set_Color
    
    
    class Spider_Music():
        """Crawl one NetEase Cloud Music playlist and save every song it lists.

        The class wires together the downloader, URL manager, HTML parser,
        file saver and console colour helper, and drives them from the
        work-queue loop in ``craw``.
        """

        # Maximum number of download attempts per URL before it is abandoned.
        # Without a bound, a permanently failing URL would be re-queued forever.
        max_retries = 3

        def __init__(self):
            self.download = Download()
            self.url_manager = Url_Manager()
            self.html_parser = Html_Parser()
            self.save = Save()
            self.set_color = Set_Color()

        def craw(self,url):
            """Process ``url`` and every song URL discovered from it.

            The seed URL is queued under the placeholder name ``'temp'``.
            Each queued entry is downloaded, parsed for further entries, and
            (for songs) written to disk.

            Args:
                url: Playlist page URL used to seed the queue.
            """
            self.url_manager.addurl({'url':url,'name':'temp'})
            attempts = {}   # url -> number of failed download attempts so far

            while self.url_manager.checknewurllength>0:
                newurl = self.url_manager.geturl()

                # Skip songs whose target file already exists on disk.
                if self.save.checkfile(newurl['name']):
                    self.set_color.printDarkRed("{} 已下载!\n".format(newurl['name']))
                    continue

                print("开始下载 {} {}".format(newurl['name'],newurl['url']))
                htmlcontent = self.download.download(newurl['url'])

                if htmlcontent['htmlcontents'] is None:
                    # Download failed: put the entry back on the queue (it has
                    # to leave the "done" list first, otherwise addurl rejects
                    # it), but only while the retry budget lasts.
                    failed = attempts.get(newurl['url'],0) + 1
                    attempts[newurl['url']] = failed
                    if failed < self.max_retries:
                        self.url_manager.delUrl(newurl)
                        self.url_manager.addurl(newurl)
                    else:
                        print("下载失败 {} ".format(newurl['name']))
                    # Nothing to parse or save for a failed download.
                    continue

                parsed = self.html_parser.parser(htmlcontent)
                if parsed is None:
                    # Parser could not produce a (urls, content) pair.
                    print("下载失败 {} ".format(newurl['name']))
                    continue
                newurls,result = parsed

                self.url_manager.addurls(newurls)
                self.save.save(result,newurl['name'])
                print("下载完成 {} ".format(newurl['name']))
            print("共下载{}首歌曲".format(self.save.count))

        def main(self):
            """Start crawling the hard-coded playlist."""
            self.craw('https://music.163.com/#/playlist?id=2492536378')
    
    # Script entry point: build the crawler and start it on the playlist URL
    # hard-coded in Spider_Music.main().
    # NOTE(review): these statements also run when the module is imported;
    # consider an `if __name__ == "__main__":` guard.
    spider = Spider_Music()
    spider.main()
    

    download.py:负责下载

    # coding=gbk
    import re
    import requests
    from selenium import webdriver
    import random
    from bs4 import BeautifulSoup
    from selenium.webdriver.chrome.options import Options
    
    
    class Download():
        """Fetch playlist pages and song audio from NetEase Cloud Music.

        ``download(url)`` classifies the URL and returns a dict with two keys:

        * ``'identify'``: ``1`` for a song download URL, ``2`` for a playlist
          page, ``False`` when the URL matches neither pattern.
        * ``'htmlcontents'``: the audio bytes (song), the rendered page source
          (playlist), ``''`` when the URL was not recognised, or ``None`` when
          the fetch failed.
        """

        # User-Agent strings rotated between song requests.
        __uas = [
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
        "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
        ]

        # "ip:port" proxy entries. Empty by default: the proxy feature is
        # switched off (see get_ip), and requests then go out directly.
        __ips = []

        # Headers used when resolving a song URL to its real location.
        headers = {
                #"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                'Referer':'http://music.163.com/',
                'Host':'music.163.com',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.9"
                }

        # URL classifiers, compiled once (raw strings avoid invalid-escape warnings).
        __playlist_re = re.compile(r"playlist\?id=\d+")            # playlist page
        __song_re = re.compile(r"song/media/outer/url\?id=\d+")    # song download URL

        # Path of the chromedriver executable; override on the class or instance.
        driver_path = "D:\\tools\\chromedriver_win32\\chromedriver.exe"

        # Seconds to wait on any HTTP request before giving up.
        timeout = 30

        def __init__(self):
            self.url = ''
            #self.__ips = self.get_ip()

        def download(self,url):
            """Fetch ``url`` and return the result dict described on the class."""
            self.url = url
            return self.patterns

        @property
        def patterns(self):
            """Classify ``self.url`` and fetch it with the matching strategy."""
            res = {
                'identify':False,
                'htmlcontents':'',
            }

            if self.__song_re.search(self.url):
                res['identify'] = 1
                res['htmlcontents'] = self.getmusic()   # mp3 bytes
            elif self.__playlist_re.search(self.url):
                res['identify'] = 2
                res['htmlcontents'] = self.geturl()     # rendered page source

            return res

        @staticmethod
        def _host_from_url(url):
            """Return the network location (host[:port]) part of ``url``."""
            # str.strip('http://') removes *characters*, not a prefix, and
            # would mangle hosts that start with h, t or p.
            from urllib.parse import urlparse
            return urlparse(url).netloc

        # No reliable free proxy source was found, so this is currently unused.
        def get_ip(self):
            """Scrape a free-proxy listing and return entries as ``"ip:port"``.

            Returns an empty list when the expected table is not on the page.
            """
            url = "https://www.kuaidaili.com/free/inha/1/"
            res = requests.get(url,timeout=self.timeout)
            soup = BeautifulSoup(res.text,'html.parser')
            listing = soup.find(id="list")
            tbody = listing.find('tbody') if listing is not None else None
            if tbody is None:
                return []
            data = str(tbody.find_all('tr'))
            ip_compile= re.compile(r'<td data-title="IP">(\d+\.\d+\.\d+\.\d+)</td>')    # IP column
            port_compile = re.compile(r'<td data-title="PORT">(\d+)</td>')              # port column
            ip = ip_compile.findall(data)
            port = port_compile.findall(data)
            return [":".join(i) for i in zip(ip,port)]  # e.g. 115.112.88.23:8080

        def getmusic(self):
            """Download the audio behind ``self.url``.

            Returns:
                The response body as bytes, or ``None`` if any request failed
                or the server answered with an HTTP error status.
            """
            try:
                url = self.getrealurl()
                headers = {
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                    "Accept-Encoding": "gzip, deflate",
                    "Accept-Language": "zh-CN,zh;q=0.9",
                    "Host": self._host_from_url(url),
                    # random.choice covers the whole list; randint(0,6)
                    # silently skipped the last entry.
                    "User-Agent": random.choice(self.__uas)
                    }
                # Only route through a proxy when one is configured. Choosing
                # from the empty default list raised IndexError on every call.
                proxies = None
                if self.__ips:
                    ip = random.choice(self.__ips)
                    proxies = {
                        'http':'http://'+ip,
                        'https':'http://'+ip
                            }
                res = requests.get(url,headers=headers,proxies=proxies,timeout=self.timeout)
                # Treat 4xx/5xx as a failure rather than saving an error page.
                res.raise_for_status()
            except requests.RequestException as e:
                print(e)
                return
            else:
                return res.content

        def getrealurl(self):
            """Follow redirects from ``self.url`` and return the final URL."""
            res = requests.get(self.url,headers=self.headers,timeout=self.timeout)
            return res.url

        def geturl(self):
            """Render ``self.url`` in headless Chrome and return the frame's HTML.

            Returns:
                The page source after switching into the ``contentFrame``
                frame, or ``None`` if the browser could not be driven.
            """
            brower = None
            try:
                chrome_options = Options()
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--disable-gpu')
                # NOTE(review): passing the driver path positionally matches
                # the Selenium release this was written for; confirm against
                # the installed version.
                brower = webdriver.Chrome(self.driver_path,options=chrome_options)
                brower.get(self.url)
                # switch_to.frame accepts the frame's name directly.
                brower.switch_to.frame("contentFrame")
                return brower.page_source
            except Exception as e:
                # Boundary handler: any browser failure is reported as None.
                print(e)
                return
            finally:
                # Always shut the browser down; otherwise every playlist fetch
                # leaks a Chrome process.
                if brower is not None:
                    brower.quit()
            
    #d=Download()
    #print(d.get_ip())
    #res = d.download('http://music.163.com/song/media/outer/url?id=28160459.mp3')
    #print(res['identify'])
    #print(res['htmlcontents'])
    

    html_parser.py:负责网页内容解析

    from bs4 import BeautifulSoup
    
    class Html_Parser():
        """Turn a download result into ``(new_urls, content_to_save)``."""

        # Template for a song's direct download URL; the placeholder receives
        # the query string ("id=...") taken from the song link.
        baseurl = "http://music.163.com/song/media/outer/url?{}.mp3"

        def parser(self,res):
            """Dispatch on the kind of download result.

            Args:
                res: Dict with an ``'identify'`` marker and ``'htmlcontents'``.

            Returns:
                ``(None, content)`` when ``identify`` is ``1`` (nothing new to
                crawl, content is to be saved); otherwise
                ``(list_of_url_dicts, False)`` extracted from the page HTML.
            """
            if res.get('identify') == 1:
                return None,res.get('htmlcontents',False)

            return self.geturls(res.get('htmlcontents'))

        def geturls(self,htmlcontent):
            """Extract song entries from the first table of a playlist page.

            Always returns a ``(list, False)`` pair so callers can unpack the
            result unconditionally; rows that do not have the expected
            structure are reported and skipped instead of discarding the
            whole page.

            Args:
                htmlcontent: Page HTML; ``None`` or empty yields no entries.

            Returns:
                ``(entries, False)`` where each entry is
                ``{'url': download_url, 'name': song_title}``.
            """
            newsurl = []
            if not htmlcontent:
                return newsurl,False

            soup = BeautifulSoup(htmlcontent,'html.parser')
            table = soup.find('table')
            if table is None:
                return newsurl,False

            # The first row is skipped, as in the original slice [1:].
            for row in table.find_all('tr')[1:]:
                try:
                    anchor = row.find_all('td')[1].find('a')
                    url = self.baseurl.format(anchor['href'].split('?')[-1])
                    name = anchor.find('b')['title']
                except (AttributeError,IndexError,KeyError,TypeError) as e:
                    # Missing cell, link, <b> tag or attribute in this row.
                    print(e)
                    continue
                newsurl.append({'url':url,'name':name})

            return newsurl,False
    

    url_manager.py:URL 管理器

    # coding=gbk
    class Url_Manager():
        """Track which URL entries are still pending and which were handed out."""

        def __init__(self):
            # Per-instance queues. As class attributes the lists were shared
            # by every Url_Manager instance.
            self.__newurl = []   # entries not yet handed out
            self.__oldurl = []   # entries already handed out

        def addurl(self,url):
            """Queue ``url`` unless it is ``None`` or already known."""
            if url is None:
                return
            if self.checkurl(url):
                self.__newurl.append(url)

        def addurls(self,urls):
            """Queue every entry of ``urls``; ``None`` is ignored."""
            if urls is None:
                return

            for url in urls:
                self.addurl(url)

        def geturl(self):
            """Pop the most recently queued entry and mark it as handed out.

            Raises:
                IndexError: If no entry is pending.
            """
            newurl = self.__newurl.pop()
            self.__oldurl.append(newurl)
            return newurl

        def delUrl(self,url):
            """Forget that ``url`` was handed out, so it can be queued again."""
            if url in self.__oldurl:
                self.__oldurl.remove(url)

        @property
        def checknewurllength(self):
            """Number of entries still pending."""
            return len(self.__newurl)

        def checkurl(self,url):
            """Return ``True`` if ``url`` is neither pending nor handed out."""
            return url not in self.__newurl and url not in self.__oldurl
    

    save.py:保存下载内容

    # coding=gbk
    import os
    
    class Save():
        """Write downloaded songs into ``path`` and count the successes."""

        path="./download/"
        count = 0

        # Characters that Windows does not allow in file names:
        # '\\', '/', ':', '*', '?', '"', '<', '>', '|'
        _forbidden_chars = str.maketrans('','','\\/:*?"<>|')

        def __init__(self):
            self.mkdir(self.path)

        def save(self,contents,name):
            """Write ``contents`` to the file derived from ``name``.

            Nothing happens when either argument is empty. The data goes to a
            ``.part`` file first and is renamed on success, so a failed write
            never leaves a truncated file that ``checkfile`` would later
            report as already downloaded.

            Args:
                contents: Bytes to store.
                name: Song title used to build the file name.
            """
            if not (contents and name):
                return

            target = self.remove_special_characters(name)
            partial = target + ".part"
            try:
                with open(partial,'wb') as f:
                    f.write(contents)
                os.replace(partial,target)
            except (OSError,TypeError) as e:
                # OSError: file system problem; TypeError: contents not bytes.
                print(e)
                try:
                    os.remove(partial)
                except OSError:
                    pass    # nothing was created, nothing to clean up
            else:
                self.count+=1

        # Create the directory that receives the downloads.
        def mkdir(self,path):
            """Create ``path`` (and parents) if it does not exist yet."""
            os.makedirs(path,exist_ok=True)

        # Avoid downloading the same song twice.
        def checkfile(self,name):
            """Return ``True`` if the file for ``name`` already exists.

            The placeholder name ``'temp'`` (used for the seed page) is never
            considered downloaded.
            """
            if name == 'temp':
                return False
            return os.path.exists(self.remove_special_characters(name))

        # Make sure the file can be created on Windows.
        def remove_special_characters(self,string):
            """Build the target path ``<path>/<sanitised name>.mp3``.

            Forbidden characters are removed and surrounding whitespace is
            trimmed. Only trailing slashes of ``path`` are dropped, so an
            absolute ``path`` keeps its leading slash.
            """
            cleaned = string.translate(self._forbidden_chars).strip()
            return self.path.rstrip('/') + '/' + cleaned + ".mp3"
    

    set_text_color.py:设置 cmd 窗口显示颜色

    # coding=gbk
    #参考地址https://blog.csdn.net/wy_97/article/details/79663014
    
    import ctypes,sys
    class Set_Color():
        """Print coloured text in a Windows console (cmd) window.

        Each ``print*`` method switches the console text attribute, writes the
        message to ``sys.stdout`` and then calls ``resetColor``. On platforms
        without ``ctypes.windll`` the colour switch is skipped and the text is
        written unchanged.

        Example::

            c = Set_Color()
            c.printDarkRed('printDarkRed\n')
        """

        STD_INPUT_HANDLE = -10
        STD_OUTPUT_HANDLE = -11
        STD_ERROR_HANDLE = -12

        # A colour attribute is two hex digits, each 0~f: the high digit is the
        # background colour, the low digit is the text colour. Values can be
        # combined with bitwise OR; the result is still one of the 16 colours.

        # Windows CMD text colours
        FOREGROUND_BLACK = 0x00 # black.
        FOREGROUND_DARKBLUE = 0x01 # dark blue.
        FOREGROUND_DARKGREEN = 0x02 # dark green.
        FOREGROUND_DARKSKYBLUE = 0x03 # dark skyblue.
        FOREGROUND_DARKRED = 0x04 # dark red.
        FOREGROUND_DARKPINK = 0x05 # dark pink.
        FOREGROUND_DARKYELLOW = 0x06 # dark yellow.
        FOREGROUND_DARKWHITE = 0x07 # dark white.
        FOREGROUND_DARKGRAY = 0x08 # dark gray.
        FOREGROUND_BLUE = 0x09 # blue.
        FOREGROUND_GREEN = 0x0a # green.
        FOREGROUND_SKYBLUE = 0x0b # skyblue.
        FOREGROUND_RED = 0x0c # red.
        FOREGROUND_PINK = 0x0d # pink.
        FOREGROUND_YELLOW = 0x0e # yellow.
        FOREGROUND_WHITE = 0x0f # white.

        # Windows CMD background colours. The two dark entries used to be
        # named BACKGROUND_BLUE / BACKGROUND_GREEN and were overwritten by the
        # bright definitions further down.
        BACKGROUND_DARKBLUE = 0x10 # dark blue.
        BACKGROUND_DARKGREEN = 0x20 # dark green.
        BACKGROUND_DARKSKYBLUE = 0x30 # dark skyblue.
        BACKGROUND_DARKRED = 0x40 # dark red.
        BACKGROUND_DARKPINK = 0x50 # dark pink.
        BACKGROUND_DARKYELLOW = 0x60 # dark yellow.
        BACKGROUND_DARKWHITE = 0x70 # dark white.
        BACKGROUND_DARKGRAY = 0x80 # dark gray.
        BACKGROUND_BLUE = 0x90 # blue.
        BACKGROUND_GREEN = 0xa0 # green.
        BACKGROUND_SKYBLUE = 0xb0 # skyblue.
        BACKGROUND_RED = 0xc0 # red.
        BACKGROUND_PINK = 0xd0 # pink.
        BACKGROUND_YELLOW = 0xe0 # yellow.
        BACKGROUND_WHITE = 0xf0 # white.

        # ctypes.windll only exists on Windows; looking it up unconditionally
        # made the whole module unimportable elsewhere.
        _kernel32 = ctypes.windll.kernel32 if hasattr(ctypes,"windll") else None

        # Handle of the console's standard output (None without a Windows console API).
        std_out_handle = _kernel32.GetStdHandle(STD_OUTPUT_HANDLE) if _kernel32 else None

        def set_cmd_text_color(self,color, handle=False):
            """Set the console text attribute to ``color``.

            Args:
                color: Attribute value (foreground OR background constant).
                handle: Console handle to use; a falsy value selects the
                    standard output handle.

            Returns:
                The result of ``SetConsoleTextAttribute``, or ``False`` when
                the Windows console API is not available.
            """
            if self._kernel32 is None:
                return False
            target = handle if handle else self.std_out_handle
            return self._kernel32.SetConsoleTextAttribute(target, color)

        # Reset to bright green, the colour this script uses for normal output.
        def resetColor(self):
            self.set_cmd_text_color(self.FOREGROUND_GREEN)

        # Reset to white (red | green | blue).
        def resetDefault(self):
            self.set_cmd_text_color(self.FOREGROUND_RED | self.FOREGROUND_GREEN | self.FOREGROUND_BLUE)

        def _write_colored(self,color,mess):
            """Write ``mess`` with attribute ``color``, then reset the colour."""
            self.set_cmd_text_color(color)
            sys.stdout.write(mess)
            # Flush before resetting so buffered text is emitted while the
            # requested colour is still active.
            sys.stdout.flush()
            self.resetColor()

        ###############################################################

        # dark blue
        def printDarkBlue(self,mess):
            self._write_colored(self.FOREGROUND_DARKBLUE,mess)

        # dark green
        def printDarkGreen(self,mess):
            self._write_colored(self.FOREGROUND_DARKGREEN,mess)

        # dark sky blue (the `self` parameter was missing)
        def printDarkSkyBlue(self,mess):
            self._write_colored(self.FOREGROUND_DARKSKYBLUE,mess)

        # dark red
        def printDarkRed(self,mess):
            self._write_colored(self.FOREGROUND_DARKRED,mess)

        # dark pink
        def printDarkPink(self,mess):
            self._write_colored(self.FOREGROUND_DARKPINK,mess)

        # dark yellow
        def printDarkYellow(self,mess):
            self._write_colored(self.FOREGROUND_DARKYELLOW,mess)

        # dark white
        def printDarkWhite(self,mess):
            self._write_colored(self.FOREGROUND_DARKWHITE,mess)

        # dark gray
        def printDarkGray(self,mess):
            self._write_colored(self.FOREGROUND_DARKGRAY,mess)

        # blue
        def printBlue(self,mess):
            self._write_colored(self.FOREGROUND_BLUE,mess)

        # green
        def printGreen(self,mess):
            self._write_colored(self.FOREGROUND_GREEN,mess)

        # sky blue
        def printSkyBlue(self,mess):
            self._write_colored(self.FOREGROUND_SKYBLUE,mess)

        # red
        def printRed(self,mess):
            self._write_colored(self.FOREGROUND_RED,mess)

        # pink
        def printPink(self,mess):
            self._write_colored(self.FOREGROUND_PINK,mess)

        # yellow
        def printYellow(self,mess):
            self._write_colored(self.FOREGROUND_YELLOW,mess)

        # white
        def printWhite(self,mess):
            self._write_colored(self.FOREGROUND_WHITE,mess)

        ##################################################

        # black text on a white background
        def printWhiteBlack(self,mess):
            self._write_colored(self.FOREGROUND_BLACK | self.BACKGROUND_WHITE,mess)

        # black text on a white background, using the literal attribute value
        def printWhiteBlack_2(self,mess):
            self._write_colored(0xf0,mess)

        # red text on a yellow background (the constants were referenced
        # without `self.` and raised NameError)
        def printYellowRed(self,mess):
            self._write_colored(self.BACKGROUND_YELLOW | self.FOREGROUND_RED,mess)
    #c = Set_Color()
    #c.printDarkRed(u'printDarkRed:暗红色文字\n')
    

    相关文章

      网友评论

        本文标题:爬取网易云音乐

        本文链接:https://www.haomeiwen.com/subject/vggghqtx.html