美文网首页
爬取糗事百科

爬取糗事百科

作者: 飞行员suke | 来源:发表于2017-05-03 14:46 被阅读0次

    糗事百科纯文抓取,包括用户id,用户头像,糗事内容

    #coding:utf-8
    import sys
    import string
    import urllib2
    import sqlite3
    import re
    import time
    
    
    class HTMLTool:
        """Convert a small HTML fragment to plain text.

        ``replace_char`` removes whitespace, link and image tags, maps a few
        structural tags to newlines/tabs, drops every remaining tag and finally
        decodes the HTML entities listed in ``replaceTab``.
        """

        # Non-greedy match for a tab, a newline, a space, a hyperlink tag or an
        # image tag; all of these are removed outright.
        BgnCharToNoneRex = re.compile('(\t|\n| |<a.*?>|<img.*?>)')
        # Non-greedy match for any remaining <...> tag.
        EndCharToNoneRex = re.compile("<.*?>")

        # Non-greedy match for any tag that starts with "<p".
        BgnPartRex = re.compile("<p.*?>")
        CharToNewLineRex = re.compile('(<br/>|</p>|<tr>|<div>|</div>)')
        CharToNextTabRex = re.compile("<td>")

        # HTML entities and the literal characters they stand for.
        # "&amp;" must stay last: decoding it earlier would turn "&amp;lt;"
        # into "&lt;" and then into "<", i.e. decode the text twice.
        replaceTab = [
            ("&lt;", "<"),
            ("&gt;", ">"),
            ("&quot;", "\""),
            ("&nbsp;", " "),
            ("&amp;", "&"),
        ]

        def replace_char(self, x):
            """Return ``x`` with markup stripped and entities decoded.

            Steps, in order: drop whitespace/link/image tags, replace each
            ``<p...>`` with a newline plus four spaces, replace line-breaking
            tags with a newline, replace ``<td>`` with a tab, drop all other
            tags, then apply every (entity, character) pair of ``replaceTab``.
            """
            x = self.BgnCharToNoneRex.sub("", x)
            x = self.BgnPartRex.sub("\n    ", x)
            x = self.CharToNewLineRex.sub("\n", x)
            x = self.CharToNextTabRex.sub("\t", x)
            x = self.EndCharToNoneRex.sub("", x)

            for entity, char in self.replaceTab:
                x = x.replace(entity, char)
            return x
    
    
    class SQLITETool:
        """Thin helper that runs single SQL statements against a SQLite file.

        Every call opens its own connection to ``databaseName`` and closes it
        again before returning.
        """

        def __init__(self, databaseName):
            """Remember the database path and make sure the file can be opened."""
            self.databaseName = databaseName
            self.create_db()

        def create_db(self):
            """Open and immediately close a connection to the database."""
            conn = sqlite3.connect(self.databaseName)
            conn.close()

        def execute_table(self, sql, params=()):
            """Execute one SQL statement and commit it.

            sql    -- the statement text; may contain ``?`` placeholders.
            params -- optional sequence of values bound to the placeholders.
                      Prefer this over formatting values into ``sql``.

            Errors are reported with ``print`` and swallowed (best-effort, as
            before); a failed statement is rolled back instead of committed.
            """
            conn = sqlite3.connect(self.databaseName)
            try:
                cursor = conn.cursor()
                try:
                    cursor.execute(sql, params)
                    conn.commit()
                except Exception as e:
                    conn.rollback()
                    print(Exception, ":", e)
                finally:
                    cursor.close()
            finally:
                # Always release the connection, even if cursor creation failed.
                conn.close()
    
    
    class QiuBaiSpider:
        """Fetch listing pages from m.qiushibaike.com and store posts in SQLite.

        For every post found on a page, the user id, the avatar URL and the
        cleaned text content are inserted into ``text_table``.
        """

        def __init__(self):
            self.myTool = HTMLTool()
            self.sqlTool = SQLITETool("qiubai.db")
            # Body of the most recently fetched page ("" when the fetch failed).
            self.nowPage = ""
            self.pageNumber = 1
            print("create")

        @staticmethod
        def _sql_quote(value):
            """Return ``value`` as a single-quoted SQL string literal."""
            # SQL escapes a single quote inside a literal by doubling it.
            return "'" + value.replace("'", "''") + "'"

        def pageHandle(self, page):
            """Extract every post from ``page`` and insert it into ``text_table``.

            Does nothing when ``page`` is empty or ``None``.
            """
            if not page:
                return

            # Groups: user id, avatar URL (without the leading "//"), content.
            # NOTE(review): the <img src="//..."> part was reconstructed from a
            # garbled copy of this pattern -- confirm against the live markup.
            myItems = re.findall(
                r'<a href="/users/(.*?)/".*?rel.*?<img src="//(.*?)".*?<span>(.*?)</span>.*?</div>',
                page, re.S)
            for user_id, icon_url, raw_content in myItems:
                content = self.myTool.replace_char(raw_content)
                # Only byte strings need decoding; text is used as-is.
                if isinstance(content, bytes):
                    content = content.decode('utf-8')
                # All three values are quoted and escaped so that a quote in the
                # scraped text cannot break (or alter) the statement.
                sql = 'insert into text_table(user_id, icon_url, content) values({},{},{});'.format(
                    self._sql_quote(user_id),
                    self._sql_quote(icon_url),
                    self._sql_quote(content))
                self.sqlTool.execute_table(sql)

        def getPageNumber(self, page):
            """Return the page count read from the pagination list of ``page``.

            The count is taken from the seventh ``<li>`` item of the
            ``<ul class="pagination">`` block. Returns 0 when the block is
            missing, has fewer than seven items, or the item is not a number.
            """
            myMatch = re.search(r'<ul class="pagination">(.*?)</ul>', page or "", re.S)
            if myMatch is None:
                return 0
            myItems = re.findall(r'<li>.*?<span .*?>(.*?)</span>.*?</li>', myMatch.group(1), re.S)
            if len(myItems) <= 6:
                return 0
            value = self.myTool.replace_char(myItems[6])
            print("count=" + value)
            if value.isdigit():
                # Convert the cleaned value, i.e. the one that was validated.
                return int(value)
            return 0

        def getSinglePage(self, kindName, page):
            """Download page number ``page`` of category ``kindName``.

            On success the body is stored in ``self.nowPage`` and returned.
            On failure the error is printed, ``self.nowPage`` is left empty and
            ``None`` is returned.
            """
            myUrl = "http://m.qiushibaike.com/{}/page/".format(kindName) + str(page)
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {'User-Agent': user_agent}
            # Reset first so a failed download never leaves the previous page
            # behind to be processed (and inserted) a second time.
            self.nowPage = ""
            try:
                req = urllib2.Request(myUrl, headers=headers)
                myResponse = urllib2.urlopen(req)
                try:
                    self.nowPage = myResponse.read()
                finally:
                    myResponse.close()
                return self.nowPage
            except Exception as e:
                print(Exception, ":", e)
                return

        def getAllPageOfKind(self, kindName):
            """Crawl every page of category ``kindName`` and store its posts.

            The first page is fetched to learn the page count; the remaining
            pages are then fetched with a 0.5 s pause after each one. Any error
            is printed and ends the crawl.
            """
            try:
                # Fetch the first page to discover how many pages there are.
                self.getSinglePage(kindName, 1)
                pageNum = self.getPageNumber(self.nowPage)
                self.pageHandle(self.nowPage)
                for i in range(2, pageNum + 1):
                    print("page=" + str(i))
                    self.getSinglePage(kindName, i)
                    self.pageHandle(self.nowPage)
                    time.sleep(0.5)
            except Exception as e:
                print(Exception, ":", e)
    
    
    if __name__ == '__main__':
        # Python 2 only: make UTF-8 the implicit str/unicode conversion codec.
        # reload() is needed because site.py removes setdefaultencoding at startup.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        spider = QiuBaiSpider()
        # IF NOT EXISTS keeps a second run from failing on the existing table.
        sql = "CREATE TABLE IF NOT EXISTS text_table(caseid integer PRIMARY KEY autoincrement, user_id char(15),icon_url char(128),content char(512));"
        spider.sqlTool.execute_table(sql)
        # Crawl the "hot" category.
        spider.getAllPageOfKind("hot")

    相关文章

      网友评论

          本文标题:爬取糗事百科

          本文链接:https://www.haomeiwen.com/subject/jmgktxtx.html