介绍:爬取豆瓣音乐TOP250的数据,练习了MongoDB、正则表达式和lxml
import requests
from lxml import etree
import re
import time
import pymongo
# Running count of albums stored so far; incremented by get_url_info.
x = 0

# Connect to the MongoDB server on localhost:27017 and select the
# database and collection that receive the scraped records.
client = pymongo.MongoClient(host='localhost', port=27017)
mydb = client['mydb']
musictop = mydb['musictop']

# Browser-like User-Agent sent along with page requests.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.84 Safari/537.36'
    ),
}
# Collect the album URLs listed on one chart page and scrape each of them.
def get_url_music(url, timeout=10):
    """Fetch one chart listing page and scrape every album it links to.

    Args:
        url: Address of a listing page.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so that a stalled connection cannot hang the crawl forever.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged if the
            page cannot be fetched within ``timeout``.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    selector = etree.HTML(response.text)

    # "@href" selects an attribute value; "text()" would select tag content.
    links = selector.xpath('//*[@id="content"]//div//tr//a/@href')

    # Only every other link is followed.
    # NOTE(review): presumably each table row links to its album twice
    # (e.g. cover and title), giving 25 albums per page -- confirm against
    # the live markup.
    for music_url in links[0::2]:
        get_url_info(music_url)
# Pull one labelled value ("<label></span> value<br") out of the page source.
def _labelled_field(page, label, default="未知"):
    """Return the stripped text that follows ``label</span>`` up to ``<br``.

    Args:
        page: Raw HTML text of the album page.
        label: Literal label text, including its trailing colon.
        default: Value returned when the label is not found.

    NOTE(review): the separator between the label and the value is accepted
    as plain whitespace and/or ``&nbsp;`` entities; a single literal space
    (the only form accepted before) still matches. Confirm against a live
    page which form the site actually emits.
    """
    pattern = re.escape(label) + r'</span>(?:&nbsp;|\s)*(.*?)<br'
    # re.S lets "." span newlines, since the value may sit on its own line.
    matches = re.findall(pattern, page, re.S)
    return matches[0].strip() if matches else default


# Extract the detailed information of a single album and store it.
def get_url_info(music_url, timeout=10):
    """Scrape one album page, insert the record into MongoDB and print it.

    The stored document has the keys ``name``, ``author``, ``style``,
    ``time``, ``publisher`` and ``score``. ``author``, ``style``, ``time``
    and ``publisher`` fall back to "未知" when the page does not contain
    them.

    Args:
        music_url: Address of the album's detail page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Raises:
        IndexError: If the page has no title or no score element. This is
            left as a hard failure so that an unexpected or blocked page is
            not stored as a record full of placeholders.
        requests.exceptions.RequestException: Propagated unchanged if the
            page cannot be fetched within ``timeout``.
    """
    global x
    unknown = "未知"

    # Send the same headers as the listing request so both requests present
    # themselves to the site in the same way.
    response = requests.get(music_url, headers=headers, timeout=timeout)
    page = response.text
    selector = etree.HTML(page)

    name = selector.xpath('//*[@id="wrapper"]/h1/span/text()')[0]

    # ".*?" is non-greedy; re.S lets "." match newlines as well.
    author_matches = re.findall(r'表演者:.*?>(.*?)</a>', page, re.S)
    author = author_matches[0] if author_matches else unknown

    style = _labelled_field(page, '流派:', unknown)
    pubtime = _labelled_field(page, '发行时间:', unknown)
    publisher = _labelled_field(page, '出版者:', unknown)

    score = selector.xpath('//*[@id="interest_sectl"]//strong/text()')[0]

    # Each album is stored as one dictionary/document.
    info = {
        'name': name,
        'author': author,
        'style': style,
        'time': pubtime,
        'publisher': publisher,
        'score': score,
    }

    # Insert the record, then report progress with the running counter.
    musictop.insert_one(info)
    x += 1
    print(x, info)
if __name__ == '__main__':
    # Walk the 10 chart pages (start offsets 0, 25, ..., 225), pausing for
    # half a second after each page.
    page_template = 'https://music.douban.com/top250?start={}'
    for offset in range(0, 250, 25):
        get_url_music(page_template.format(offset))
        time.sleep(0.5)
正在爬.png
已经存入到MongoDB中.png
来自:《从零开始学Python网络爬虫》
网友评论