美文网首页
爬虫-豆瓣音乐top250信息

爬虫-豆瓣音乐top250信息

作者: 我问你瓜保熟吗 | 来源:发表于2018-08-31 12:00 被阅读12次

    介绍:爬取豆瓣音乐TOP250的数据,练习到了MongoDB,正则表达式,lxml

    import requests
    from lxml import etree
    import re
    import time
    import pymongo
    
    # HTTP headers for outgoing requests; supplies a desktop-browser User-Agent.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/68.0.3440.84 Safari/537.36'
        ),
    }

    # Running count of records handled so far.
    x = 0

    # Connect to the MongoDB server on localhost and select the target
    # database and collection.
    client = pymongo.MongoClient(host='localhost', port=27017)
    mydb = client['mydb']
    musictop = mydb['musictop']
    
    def get_url_music(url):
        """Fetch one listing page and process each album detail URL on it.

        Args:
            url: Address of a Top-250 listing page.

        Every selected detail URL is handed to ``get_url_info``. Network
        errors and timeouts raised by ``requests`` propagate to the caller.
        """
        # A timeout keeps a stalled connection from blocking the crawl forever.
        html = requests.get(url, headers=headers, timeout=10)
        selector = etree.HTML(html.text)
        # "@href" selects the attribute value; "text()" would select tag content.
        music_urls = selector.xpath('//*[@id="content"]//div//tr//a/@href')
        # Keep every other link. NOTE(review): presumably each row links to
        # the same album twice -- confirm against the page markup.
        music_urls = music_urls[0::2]

        for music_url in music_urls:
            get_url_info(music_url)
    
    
    def _first_or_unknown(matches):
        """Return the first item of *matches*, or "未知" when it is empty."""
        return matches[0] if matches else "未知"


    def get_url_info(music_url):
        """Scrape one album detail page and store its fields in MongoDB.

        Args:
            music_url: Address of an album detail page.

        Extracts name, performer, genre, release time, publisher and score,
        inserts them as one document into ``musictop``, increments the global
        counter ``x`` and prints the counter with the stored document.

        Fields read with regular expressions fall back to "未知" when the page
        does not contain them. A missing title or score still raises
        ``IndexError``; network errors and timeouts from ``requests`` propagate.
        """
        global x

        # Send the same headers as the listing request (the original omitted
        # them here) and bound the wait so one dead connection cannot hang
        # the crawl.
        html = requests.get(music_url, headers=headers, timeout=10)
        page = html.text
        selector = etree.HTML(page)

        name = selector.xpath('//*[@id="wrapper"]/h1/span/text()')[0]
        score = selector.xpath('//*[@id="interest_sectl"]//strong/text()')[0]

        # ".*?" matches non-greedily; re.S lets "." match newlines as well.
        # "&nbsp;" is the HTML entity that follows each field label.
        author = _first_or_unknown(
            re.findall('表演者:.*?>(.*?)</a>', page, re.S))
        style = _first_or_unknown(
            re.findall('流派:</span>&nbsp;(.*?)<br>?', page, re.S)).strip()
        pubtime = _first_or_unknown(
            re.findall('发行时间:</span>&nbsp;(.*?)<br>?', page, re.S)).strip()
        publisher = _first_or_unknown(
            re.findall('出版者:</span>&nbsp;(.*?)<br>?', page, re.S)).strip()

        # One document per album. 'publisher' was extracted but never stored
        # in the original; it is now included.
        info = {
            'name': name,
            'author': author,
            'style': style,
            'time': pubtime,
            'score': score,
            'publisher': publisher,
        }

        # Persist the record.
        musictop.insert_one(info)

        x += 1
        print(x, info)
    
    
    if __name__ == '__main__':
        # Visit the 10 listing pages (start offsets 0, 25, ..., 225),
        # pausing half a second after each page.
        for offset in range(0, 250, 25):
            page_url = 'https://music.douban.com/top250?start={}'.format(offset)
            get_url_music(page_url)
            time.sleep(0.5)
    
    正在爬.png 已经存入到MongoDB中.png






    来自:从零开始学python网络爬虫

    相关文章

      网友评论

          本文标题:爬虫-豆瓣音乐top250信息

          本文链接:https://www.haomeiwen.com/subject/ygjswftx.html