糗事百科 (Qiushibaike) Spider

Author: Albert_Sun | Published 2017-06-13 16:55

    A 糗事百科 spider that scrapes jokes (段子) from the site's text section. Note that this is Python 2 code (it relies on urllib2 and raw_input).

    # -*- coding:utf-8 -*-
    import urllib2
    import re
    
    
    # Spider class for 糗事百科
    class QSBK:
        def __init__(self):
            self.pageIndex = 1  # next page to fetch
            self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64)'  # pretend to be a browser
            self.headers = {'User-Agent': self.user_agent}
            self.stories = []  # buffered pages of stories
            self.enable = False  # whether the spider keeps running
    
        # Fetch the HTML of one page of the text section
        def getPage(self, pageIndex):
            try:
                url = "http://www.qiushibaike.com/text/page/" + str(pageIndex)
                request = urllib2.Request(url, headers=self.headers)
                response = urllib2.urlopen(request)
                pageCode = response.read().decode('utf-8')
                return pageCode
            except urllib2.URLError as e:
                if hasattr(e, "reason"):
                    print u"Failed to connect to 糗事百科, reason:", e.reason
                return None
    
        # Parse one page and extract (author, content, vote count) for every story on it
        def getPageItems(self, pageIndex):
            pageCode = self.getPage(pageIndex)
            if not pageCode:
                print "Failed to load the page...."
                return None
            pattern = re.compile('<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<div class="content">.*?'
                                 '<span>(.*?)</span>.*?<div class="stats">.*?class="number">(.*?)</i>', re.S)
            items = re.findall(pattern, pageCode)
            pageStories = []
            for author, content, votes in items:
                pageStories.append([author.strip(), content.strip(), votes.strip()])
            return pageStories
    
        # Load the next page and buffer its stories; stop the spider if nothing can be fetched
        def loadPage(self):
            if self.enable and len(self.stories) < 2:
                pageStories = self.getPageItems(self.pageIndex)
                if pageStories:
                    self.stories.append(pageStories)
                    self.pageIndex += 1
                else:
                    # Nothing could be fetched: stop instead of retrying forever
                    self.enable = False
    
        # Print one story per Enter key press; typing Q quits
        def getOneStory(self, pageStories, page):
            # Walk through the stories of one page
            for story in pageStories:
                # Wait for user input
                user_input = raw_input()
                # Quit if the user types Q
                if user_input == "Q":
                    self.enable = False
                    return
                print u"Page %d\tAuthor: %s\tVotes: %s\n%s" % (page, story[0], story[2], story[1])
    
        # Entry point
        def start(self):
            print u"Loading 糗事百科: press Enter for the next story, type Q to quit"
            self.enable = True
            # Local counter for the page currently being shown
            nowPage = 0
            while self.enable:
                if len(self.stories) > 0:
                    # Take the oldest buffered page and show its stories one by one
                    pageStories = self.stories[0]
                    nowPage += 1
                    del self.stories[0]
                    self.getOneStory(pageStories, nowPage)
                else:
                    self.loadPage()


    spider = QSBK()
    spider.start()
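
    To see what the extraction pattern above actually captures, here is a small standalone sketch that runs the same regular expression against a hand-written HTML fragment. The fragment only mimics the structure the pattern expects (an author block, a content block, and a stats block); it is not real 糗事百科 markup, and the values in it are made up for illustration.

    # Standalone sketch of the extraction step.
    # The fragment below is hand-written to match the shape the pattern expects;
    # it is not taken from the real site.
    import re

    sample = '''
    <div class="author clearfix"><h2>SomeUser</h2></div>
    <div class="content"><span>An example joke body goes here.</span></div>
    <div class="stats"><i class="number">123</i></div>
    '''

    pattern = re.compile('<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<div class="content">.*?'
                         '<span>(.*?)</span>.*?<div class="stats">.*?class="number">(.*?)</i>', re.S)

    for author, content, votes in re.findall(pattern, sample):
        # Prints: SomeUser | 123 | An example joke body goes here.
        print author.strip(), "|", votes.strip(), "|", content.strip()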
    
    

    References

    1. The blogger's original tutorial: http://cuiqingcai.com/1052.html
