最近因毕业论文已经写完,人比较闲,便琢磨着找点事做。正好在听歌时冒出个念头:网易云音乐上哪首歌的评论最多?这一下子像打开了潘多拉魔盒,人变得异常亢奋,赶紧打开浏览器搜索,这才发现还没人干过这件无聊的事。于是秉着老祖宗留下的八字真言“自己动手,丰衣足食”,决定自己用 Python 写个小爬虫。
# coding=utf-8
# 爬虫,爬取网易云音乐中歌曲的评论数,并给出最高评论数歌曲的编号。
import urllib
import re
import time
import random
import urllib2
def gethtml(url):
    """Fetch *url* and return the raw (undecoded) response body.

    The request carries a randomly chosen browser User-Agent plus fixed
    Content-type and Referer headers so that it looks like a browser visit.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as read from the connection.

    Raises:
        Whatever ``urllib2.urlopen`` raises for network or HTTP errors.
    """
    # Header *values* only: add_header() supplies the "User-Agent" name, so
    # the strings must not repeat it. Every entry ends with a comma so that
    # adjacent literals are not silently concatenated into one string.
    user_agent = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    ]
    request = urllib2.Request(url)
    # Mimic a browser as closely as possible to reduce the chance of being
    # blocked by the server.
    request.add_header('Content-type', 'application/json')
    request.add_header("User-Agent", random.choice(user_agent))
    request.add_header('Referer', 'http://music.163.com/')
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the connection even if read() fails; the crawler issues a
        # very large number of requests.
        response.close()
def get_title(html):
    """Return the text captured directly after the first ``<title>`` tag.

    Args:
        html: Decoded (unicode) page source.

    Returns:
        The longest run of characters from the pattern's character class
        that follows ``<title>``; may be an empty string.

    Raises:
        AttributeError: if *html* contains no ``<title>`` tag at all.
    """
    # The class was meant to cover Chinese, English, Japanese and Korean.
    # NOTE(review): in a Python string literal "\xAC00" is the character
    # "\xAC" followed by "00" (not U+AC00), so the class really contains the
    # range "0".."\xD7". That range includes digits and "<", which is why a
    # captured title can end with "<" -- main() compares against exactly
    # such a value. Do not "fix" the pattern without updating main().
    title_pattern = u"<title>([\xAC00-\xD7A3\x3130-\x318F\u0800-\u4e00\u4e00-\u9fa5A-Za-z\s_]*)"
    return re.search(title_pattern, html).group(1)
def get_cnum(html):
    """Extract the comment count shown on a song page.

    Args:
        html: Page source to search.

    Returns:
        The digits following ``id="cnt_comment_count">`` as a string
        (possibly empty when no digit follows), or the integer ``0`` when
        that marker is absent from *html*.
    """
    found = re.search(r'id="cnt_comment_count">(\d*)', html)
    # Note the mixed return type: int 0 for "no marker", str otherwise.
    return found.group(1) if found is not None else 0
def main(id, file):
    """Fetch one song page and append ``title,id,comment_count`` to *file*.

    Args:
        id: Numeric song id, appended to the song URL.
        file: Open, writable file object that receives one line per song.

    Pages whose captured title equals the fixed site title below are
    skipped (presumably the generic title served for ids without a song --
    TODO confirm).
    """
    # A short random sleep used to sit here for throttling; it was disabled
    # in the original and stays disabled.
    page = gethtml("http://music.163.com/song?id=" + str(id)).decode("utf8")
    title = get_title(page)
    cnum = get_cnum(page)
    encoded_title = unicode.encode(title, 'utf-8')
    if encoded_title == "网易云音乐 听见好时光<":
        return
    file.write(",".join([encoded_title, str(id), str(cnum)]) + "\n")
    # NOTE(review): the source lost its indentation; the progress print is
    # assumed to belong to the "real song" branch -- confirm.
    print("%s %s" % (title, cnum))
# Driver: crawl song ids from `num` up to 999999, appending one line per
# song to the current output file and switching to a new output file every
# 100000 ids.
num = 1
file = open(r"e:\python\output\comment.txt", "a")
print("begin........")
try:
    for id in range(num, 1000000):
        try:
            main(id, file)
        except Exception as err:
            # Best effort: report the failed id, back off for a few seconds
            # and continue with the next one. Catching Exception (instead of
            # a bare except) lets Ctrl-C / SystemExit stop the crawl.
            print("skip id %d: %r" % (id, err))
            time.sleep(random.randint(3, 10))
        # Rotate outside the inner try so that a failure on the boundary id
        # does not leave the next 100000 ids in the old file.
        if id % 100000 == 0:
            file.close()
            path = r"e:\python\output\comment_" + str(id // 10000) + ".txt"
            file = open(path, "a")
finally:
    # Always flush and close the current output file, even on interruption.
    file.close()
网友评论