美文网首页
慕课学习爬虫实战

慕课学习爬虫实战

作者: CrazyCat_007 | 来源:发表于2019-02-01 09:48 被阅读0次

    爬虫前奏:

    明确目的

    找到数据对应的网页;

    分析网页的结构,找到数据所在的标签位置

    模拟HTTP请求,向服务器发送这个请求,获取到服务器返回给我们的HTML

    用正则表达式提取我们要的数据(名字,人气)

    import re

    from urllib import request

    class Spider():
        """Scrape a listing page and print its entries ranked by viewer count.

        The pipeline run by ``go()`` is: download the page, cut it into
        per-entry HTML fragments, pull the name and viewer-count text out of
        each fragment, sort by the numeric value of the count (descending) and
        print the ranking.

        The URL and the regular expressions are class attributes and are read
        through ``self``, so a subclass or an instance may override them.
        """

        # Page that is downloaded and parsed.
        # NOTE(review): the target site may no longer be reachable - confirm.
        url = 'https://www.panda.tv/cate/lol'

        # Captures the inner HTML of each entry's "video-info" block.
        root_pattern = r'<div class="video-info">([\s\S]*?)</div>'

        # Captures the text between a closing </i> and the next </span>
        # (used as the entry name).
        name_pattern = r'</i>([\s\S]*?)</span>'

        # Captures the viewer-count text, e.g. '300' or '1.5万'.
        number_pattern = r'<i class="video-station-num">([\s\S]*?)</i>'

        # ---- private helpers ----

        def __fetch_content(self):
            """Download ``url`` and return the response body decoded as UTF-8."""
            # The context manager closes the HTTP response even if read() fails.
            with request.urlopen(self.url) as response:
                raw = response.read()
            return str(raw, encoding='utf-8')

        def __analysis(self, htmls):
            """Split the page into entries and extract raw regex matches.

            Returns a list of dicts ``{'name': [...], 'number': [...]}`` where
            each value is the list of matches found in one entry fragment
            (possibly empty).
            """
            return [
                {
                    'name': re.findall(self.name_pattern, fragment),
                    'number': re.findall(self.number_pattern, fragment),
                }
                for fragment in re.findall(self.root_pattern, htmls)
            ]

        def __refine(self, anchors):
            """Reduce each entry's match lists to single cleaned-up strings.

            Takes the first match of each field and strips surrounding
            whitespace from the name. Entries where either field had no match
            are skipped, so one malformed fragment cannot abort the whole run
            with an IndexError.
            """
            refined = []
            for anchor in anchors:
                names, numbers = anchor['name'], anchor['number']
                if not names or not numbers:
                    continue
                refined.append({'name': names[0].strip(), 'number': numbers[0]})
            return refined

        def __sort(self, anchors):
            """Return the entries sorted by viewer count, largest first."""
            return sorted(anchors, key=self.__sort_seed, reverse=True)

        def __sort_seed(self, anchor):
            """Convert an entry's viewer-count text into a float sort key.

            The first (optionally decimal) number in the text is used, and it
            is multiplied by 10000 when the text contains the '万' unit, so
            '1.5万' yields 15000.0. Text without any digits yields 0.0, which
            places such entries last in the descending ranking.
            """
            text = anchor['number']
            # r'\d+(?:\.\d+)?' keeps the fractional part; a bare r'\d*' would
            # stop at the decimal point and can also match the empty string.
            match = re.search(r'\d+(?:\.\d+)?', text)
            if match is None:
                return 0.0
            number = float(match.group())
            if '万' in text:
                number *= 10000
            return number

        def __show(self, anchors):
            """Print one 'rank<N>:<name>      <number>' line per entry."""
            for rank, anchor in enumerate(anchors, 1):
                print('rank' + str(rank)
                      + ':' + anchor['name']
                      + '      ' + anchor['number'])

        # ---- entry point ----

        def go(self):
            """Run the full fetch -> parse -> refine -> sort -> print pipeline."""
            htmls = self.__fetch_content()
            anchors = self.__analysis(htmls)
            anchors = list(self.__refine(anchors))
            anchors = self.__sort(anchors)
            self.__show(anchors)

    # Script entry: build the scraper. These statements run as soon as the
    # file is executed.
    spider = Spider()

    # go() downloads Spider.url (network access required), extracts and ranks
    # the entries, then prints the ranking to stdout.
    spider.go()

    相关文章

      网友评论

          本文标题:慕课学习爬虫实战

          本文链接:https://www.haomeiwen.com/subject/nsbrsqtx.html