
Scraping Baidu Tieba with Python

Author: 有苦向瓜诉说 | Published 2017-03-24 22:34

    1. Crawl any Baidu Tieba thread

    2. Let the caller specify whether to fetch only the original poster's (楼主) posts; the sketch right after this list shows the URL parameter involved

    3. Parse the fetched content and save it to a file
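
    The "only the original poster" switch maps to Tieba's see_lz query parameter, and pn selects the page number; as a rough sketch of that URL scheme (reusing the example thread from the full script below), a single page can be fetched like this:

    import requests

    # Minimal sketch: fetch one page of the example thread used in the full script.
    # see_lz=1 asks Tieba for only the original poster's posts; pn is the page number.
    url='https://tieba.baidu.com/p/3138733512?see_lz=1&pn=1'
    headers={'User-Agent':'Mozilla/5.0'}  # shortened User-Agent; the full script sends a fuller value
    r=requests.get(url,headers=headers,timeout=30)
    r.encoding='utf-8'
    print(r.status_code,len(r.text))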

    import re
    import requests
    from bs4 import BeautifulSoup
    
    class TiebaSpider(object):
    
        def __init__(self,see_lz):
            # see_lz=1 fetches only the original poster's posts, see_lz=0 fetches every reply
            self.see_lz=see_lz
            # running floor counter kept on the instance so numbering continues across pages
            self.floor=0
    
        def getHTMLText(self,url,pageNumber):
            # Fetch one page of the thread; url already ends with '&pn=', so the
            # page number is simply appended to it
            try:
                headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
                r=requests.get(url+str(pageNumber),timeout=30,headers=headers)
                r.raise_for_status()
                r.encoding='utf-8'
                return r.text
            except requests.RequestException:
                return 'ERROR'
    
        def getTitle(self,html):
            # The thread title sits in an <h3 class="core_title_txt ..."> element;
            # fall back to a generic file name if the pattern is not found
            try:
                title=re.search(r'<h3 class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>',html)
                return title.group(1).strip()
            except AttributeError:
                return '百度贴吧'
    
        def getContent(self,html):
            # Each post body lives in a <div> carrying the d_post_content class; matching on
            # that single class avoids depending on the exact, whitespace-sensitive class string
            floors=[]
            soup=BeautifulSoup(html,'html.parser')
            contents=soup.find_all('div',class_='d_post_content')
            for content in contents:
                floors.append(content.get_text())
            return floors
    
        def getPageNumber(self,html):
            # The <li class="l_reply_num"> element holds two <span>s: the reply count
            # and the total page count; the second one is what we need
            soup=BeautifulSoup(html,'html.parser')
            li=soup.find('li',attrs={'class':'l_reply_num'})
            pageNumber=li.find_all('span')[1].string
            return int(pageNumber)
    
        def writeFile(self,title,contents):
            # Append this page's posts to <title>.txt, one numbered floor ('楼') per post
            if title is None or contents is None:
                return 'ERROR'
            with open(title+'.txt','a',encoding='utf-8') as f:
                for item in contents:
                    self.floor=self.floor+1
                    floorline=str(self.floor)+'楼------------------------------------------------------------------------------'+'\n'
                    f.write(floorline+item+'\n'+'\n')
    
        def start(self):
            # Example thread; the first page also gives us the title and the total page count
            start_url='https://tieba.baidu.com/p/3138733512?'
            url=start_url+'see_lz='+str(self.see_lz)+'&pn='
            html=self.getHTMLText(url,1)
            title=self.getTitle(html)
            pageNumber=self.getPageNumber(html)
            contents=self.getContent(html)
            self.writeFile(title,contents)
            # Fetch and save the remaining pages, printing the progress ('当前进度') in place
            for i in range(2,pageNumber+1):
                print('\r当前进度{:.2f}%'.format(i*100/pageNumber),end='')
                html=self.getHTMLText(url,i)
                contents=self.getContent(html)
                self.writeFile(title,contents)
    
    baidu=TiebaSpider(1)
    baidu.start()
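
    TiebaSpider(1) passes see_lz=1, so only the original poster's posts are saved; running it writes a text file named after the thread title, with each post preceded by its numbered floor marker, while the console shows the crawling progress.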
