美文网首页
(python2.7)实现糗百爬虫

(python2.7)实现糗百爬虫

作者: ssins | 来源:发表于2016-09-13 22:05 被阅读0次

    先贴代码,文章以后有空再写,注释也以后再写,bug也以后再调,参考文献也以后再贴吧,就这样了(葛优躺)

    文件1:main.py

    # -*- coding:utf-8 -*-
    import urllib
    import qsbk
    
    spider = qsbk.QsbkSpider()
    spider.section='8hr'
    spider.loadSomePages(10)
    while True:
        article = spider.getRandomArticle()
        if not article:
            break
        print '[ page',article['pageIndex'],'artical',article['articleIndex'],']\n',\
            '< Article by', article['author'], '>\n', article['text'],'\n< God Comment >\n',\
            article['cmtMan'], article['cmt']
        print 'pause enter to get next article'
        input = raw_input()
        if(input in ['q','Q']):
            break
    

    文件2:qsbk.py

    __author__ = 'ssins'
    # -*- coding:utf-8 -*-
    import urllib
    import urllib2
    import re
    import random
    from bs4 import BeautifulSoup
    
    class QsbkSpider:
        """Scraper for qiushibaike.com.

        Downloads listing pages of a chosen site section, extracts the
        text-only articles (image posts are skipped) together with the top
        "god comment", and hands them out one at a time in random order via
        getRandomArticle().
        """

        def __init__(self):
            # Next page number to fetch (1-based). Pages past maxPageIndex
            # are never requested.
            self._pageIndex = 1
            self.maxPageIndex = 35
            # Old IE user agent: the site serves simpler markup to legacy browsers.
            self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            self._qsbkUrl = 'http://www.qiushibaike.com/'
            # Site section to scrape; must be one of _sections below.
            self.section = '8hr'
            self._sections = ['8hr','hot','imgrank','text','history','pic','textnew']

            self._headers = {'User-Agent' : self.user_agent}
            # One list of article dicts per successfully loaded page.
            self._stories = []
            self.enable = False

        def getPageUrl(self,section,pageIndex):
            """Return the listing URL for section/pageIndex, or None when the
            section is unknown or the page number is out of [1, maxPageIndex]."""
            if section not in self._sections or pageIndex < 1 or pageIndex > self.maxPageIndex:
                return None
            url = self._qsbkUrl + section + '/page/' + str(pageIndex)
            return url

        def getPageInfo(self, url):
            """Fetch url and return the raw HTML bytes, or None on any failure
            (bad/None url, network error, HTTP error)."""
            try:
                request = urllib2.Request(url, headers = self._headers)
                response = urllib2.urlopen(request)
                html = response.read()
                return html
            except Exception:
                # Was a bare except; narrowed so KeyboardInterrupt/SystemExit
                # still propagate. Any fetch problem maps to None (best effort).
                return None

        def find_article_span(self,tag):
            """BeautifulSoup match function: a <span> with no <img>/<h2> child,
            i.e. the span holding the article's plain text."""
            if tag.name != 'span':
                return False
            for child in tag.children:
                if child.name in ['img', 'h2']:
                    return False
            return True

        def getPageArticles(self,section,pageIndex):
            """Scrape one listing page into self._stories.

            Returns True on success (one list of article dicts appended),
            None when the page could not be fetched, False when parsing failed.
            Image posts (class "thumb") are skipped.
            """
            pageCode = self.getPageInfo(self.getPageUrl(section,pageIndex))
            if not pageCode:
                return None
            pageCode = str(pageCode)
            soup = BeautifulSoup(pageCode, 'lxml')
            articles = soup.find_all('div', class_='article block untagged mb15')
            articlesDictionaryList = []
            try:
                for tmpArt in articles:
                    article = str(tmpArt)
                    if re.search('class="thumb"', article):
                        # Image post -- skip it, we only keep text articles.
                        continue
                    # Turn <br/> into real newlines so .string keeps line breaks.
                    article = re.sub(re.compile('<br/>'), "\n", article)
                    soupArticle = BeautifulSoup(article, 'lxml')
                    author = soupArticle.h2.string
                    text = soupArticle.find(self.find_article_span).string
                    cmtMan = 'no God Comment'
                    cmt = ''
                    try:
                        cmtMan = soupArticle.find('span', class_='cmt-name').string
                        cmt = soupArticle.find('div', class_='main-text').string
                    except AttributeError:
                        # find() returned None: article has no god comment.
                        # Was a bare except; AttributeError is the expected case.
                        pass
                    articlesDictionaryList.append({
                        'author': author,
                        'text': text,
                        'cmtMan': cmtMan,
                        'cmt': cmt,
                    })
                self._stories.append(articlesDictionaryList)
            except Exception:
                # Unexpected page structure (missing h2/span, parser error).
                # Was a bare except; narrowed so interrupts still propagate.
                return False
            return True

        def loadNextPage(self):
            """Scrape the next page of self.section; advance the page counter
            only on success. Returns True/False."""
            if self._pageIndex > self.maxPageIndex:
                return False
            if self.getPageArticles(self.section, self._pageIndex):
                self._pageIndex += 1
                return True
            return False

        def loadSomePages(self, pageNums):
            """Try to load up to pageNums further pages (failed attempts retry
            the same page on the next iteration)."""
            for _ in range(pageNums):
                self.loadNextPage()

        def getRandomArticle(self):
            """Return a random loaded article dict, annotated in place with its
            1-based 'pageIndex'/'articleIndex', or None when nothing is loaded."""
            if len(self._stories) < 1:
                return None
            pageIndex = random.randint(0, len(self._stories) - 1)
            articleIndex = random.randint(0, len(self._stories[pageIndex]) - 1)
            article = self._stories[pageIndex][articleIndex]
            article['pageIndex'] = pageIndex + 1
            article['articleIndex'] = articleIndex + 1
            return article

    相关文章

      网友评论

          本文标题:(python2.7)实现糗百爬虫

          本文链接:https://www.haomeiwen.com/subject/qlgjettx.html