爬虫-豆瓣音乐top250信息

作者: 我问你瓜保熟吗 | 来源:发表于2018-08-31 12:00 被阅读12次

爬虫-豆瓣音乐top250信息
Python学习
练习：豆瓣电影TOP250爬虫
Python第三天（spider_豆瓣）
豆瓣爬虫实践-python版
用Xpath爬取豆瓣音乐top250遇到的问题
爬虫爬取豆瓣top250
IR03-利用Scrapy爬取豆瓣电影Top250
用23行代码爬取豆瓣音乐top250
Python学习

介绍：爬取豆瓣音乐TOP250的数据，练习到了MongoDB、正则表达式和lxml

import requests
from lxml import etree
import re
import time
import pymongo

# Running count of records stored so far; incremented by get_url_info.
x = 0

# Connect to the MongoDB server on localhost:27017 and select the
# 'musictop' collection of the 'mydb' database, which receives the records.
client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
musictop = mydb['musictop']

# HTTP headers passed to requests.get; carries a browser-like User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
}

def get_url_music(url):
    """Fetch one Top-250 listing page and scrape every album linked from it.

    Args:
        url: Address of a listing page.

    Each album URL found on the page is handed to get_url_info, which
    downloads the detail page and stores the record.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # Without a timeout, requests waits forever on a stalled connection and
    # the whole crawl hangs.
    response = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(response.text)

    # "@href" selects the attribute value; text() would select the tag content.
    links = selector.xpath('//*[@id="content"]//div//tr//a/@href')

    # Keep every second match. Presumably each table row links the same album
    # twice -- TODO confirm against the page markup.
    for music_url in links[0::2]:
        get_url_info(music_url)


def _first_or_unknown(matches):
    """Return the first match with surrounding whitespace removed, or "未知" if there is none."""
    return matches[0].strip() if matches else "未知"


def get_url_info(music_url):
    """Download one album detail page, extract its fields and store them.

    Args:
        music_url: Address of the album's detail page.

    The extracted record (name, author, style, time, publisher, score) is
    inserted into the ``musictop`` collection, the module-level counter ``x``
    is incremented, and the counter and record are printed.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
        IndexError: If the name, performer or score cannot be found in the page.
    """
    global x

    # Send the same headers as the listing request and bound the wait so a
    # stalled connection cannot hang the crawl.
    response = requests.get(music_url, headers=headers, timeout=10)
    page = response.text
    selector = etree.HTML(page)

    # Name, performer and score are treated as mandatory: a missing one raises
    # IndexError rather than storing a record that cannot be identified.
    name = selector.xpath('//*[@id="wrapper"]/h1/span/text()')[0]
    # ".*?" is non-greedy; re.S lets "." match newlines as well.
    author = re.findall('表演者:.*?>(.*?)</a>', page, re.S)[0]
    score = selector.xpath('//*[@id="interest_sectl"]//strong/text()')[0]

    # Optional fields fall back to "未知" when the page does not list them.
    style = _first_or_unknown(re.findall('流派:</span>&nbsp;(.*?)<br>?', page, re.S))
    pubtime = _first_or_unknown(re.findall('发行时间:</span>&nbsp;(.*?)<br>?', page, re.S))
    publisher = _first_or_unknown(re.findall('出版者:</span>&nbsp;(.*?)<br>?', page, re.S))

    # One dictionary per album.
    info = {
        'name': name,
        'author': author,
        'style': style,
        'time': pubtime,
        'publisher': publisher,
        'score': score,
    }

    # Store the record in the database.
    musictop.insert_one(info)

    x += 1
    print(x, info)


if __name__ == '__main__':
    # Visit the 10 listing pages (offsets 0, 25, ..., 225), pausing half a
    # second after each page.
    page_template = 'https://music.douban.com/top250?start={}'
    for offset in range(0, 250, 25):
        get_url_music(page_template.format(offset))
        time.sleep(0.5)