平台:VS Code
语言:Python
代码链接:https://pan.baidu.com/s/1boIMDaB
from urllib import request
import re
class Spider:
    """Scrape a Panda TV category page and print streamers ranked by viewers.

    The pipeline run by :meth:`go` is: download the page, extract every
    ``video-info`` block, pull the streamer name and viewer-count text out
    of each block, clean the values, sort by viewer count (descending) and
    print the ranking.

    Class attributes (``url`` and the three ``*_pattern`` regex strings) are
    read through the class name, e.g. ``Spider.url``.
    """

    # Target URL of the category page to scrape.
    url = 'https://www.panda.tv/cate/lol'

    # `*?` is a non-greedy quantifier: it matches as little HTML as possible.
    # ()  -> capture the matched text
    # []  -> define the set of characters that may match
    # {}  -> specify how many repetitions to match
    # Raw strings keep `\s` / `\S` from being read as (invalid) string escapes.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    name_pattern = r'></i>([\s\S]*?)</span>'
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download ``Spider.url`` and return the page decoded as UTF-8 text."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Spider.url) as response:
            raw = response.read()
        # The body arrives as bytes; convert it to str.
        return str(raw, encoding='utf-8')

    def __analysis(self, htmls):
        """Extract raw name/number matches from every ``video-info`` block.

        Returns a list of dicts ``{'name': [...], 'number': [...]}`` where
        each value is the (possibly empty) list returned by ``re.findall``.
        """
        anchors = []
        # re.findall returns a list with one captured string per block.
        for block in re.findall(Spider.root_pattern, htmls):
            anchors.append({
                'name': re.findall(Spider.name_pattern, block),
                'number': re.findall(Spider.number_pattern, block),
            })
        return anchors

    def __refine(self, anchors):
        """Clean the scraped entries.

        Takes the first match of each field and strips surrounding
        whitespace from the name. Entries that lack a name or a number are
        skipped instead of raising ``IndexError``. Returns a lazy iterable
        of ``{'name': str, 'number': str}`` dicts.
        """
        return (
            {
                'name': anchor['name'][0].strip(),
                'number': anchor['number'][0],
            }
            for anchor in anchors
            if anchor['name'] and anchor['number']
        )

    def __sort_seed(self, anchor):
        """Return the numeric sort key for one entry.

        Parses the first (optionally decimal) number found in
        ``anchor['number']``. When the text contains '万' (ten thousand) the
        value is multiplied by 10000. Text without any digits sorts as 0.0.
        """
        text = anchor['number']
        # Accept a fractional part so that "1.5万" becomes 15000, not 10000.
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            return 0.0
        number = float(match.group())
        # Convert counts written with the '万' unit into plain numbers.
        if '万' in text:
            number *= 10000
        return number

    def __sort(self, anchors):
        """Return the entries sorted by viewer count, largest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __show(self, anchors):
        """Print one ``rank / name / number`` line per entry, ranks from 1."""
        for rank, anchor in enumerate(anchors, 1):
            print("rank: " + str(rank)
                  + " name: " + anchor['name']
                  + " number: " + anchor['number'])

    def go(self):
        """Run the whole pipeline: fetch, parse, refine, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)
# Class variables are comparable to Java static variables and must be accessed through the class name.
# Private (double-underscore) methods are meant to be called only from within the class itself.
# Create a spider and run its pipeline; go() fetches the page, parses it,
# refines and sorts the entries, then prints the ranking.
spider = Spider()
spider.go()
data:image/s3,"s3://crabby-images/25af2/25af2ed5fa310254a7a87e4cbf297ba1a1b405e0" alt=""
网友评论