
Sharing a Baidu Tieba Crawler

Author: yousa_ | Published 2019-09-26 13:41

    For a recent work requirement I needed to collect a batch of data suspected to be fraud-related, and Baidu Tieba is a good platform to look at.
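    Everything below boils down to fetching a forum's list pages: a Tieba forum URL has the form http://tieba.baidu.com/f?kw=<URL-encoded forum name>&ie=utf-8, and the pn parameter pages through the forum 50 posts at a time, which is why both scripts append &pn= with multiples of 50. A minimal sketch of that URL scheme (the tieba_page_urls helper is my own illustration; the 虚拟币 keyword is just the example forum used later in the script):

    # -*- coding: utf-8 -*-
    # Sketch: build the list-page URLs for one Tieba forum (50 posts per page).
    from urllib.parse import quote

    def tieba_page_urls(forum_name, pages):
        base = "http://tieba.baidu.com/f?kw={}&ie=utf-8".format(quote(forum_name))
        return [base + "&pn=" + str(i * 50) for i in range(pages)]

    for u in tieba_page_urls("虚拟币", 3):
        print(u)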

    # -*- coding: utf-8 -*-
    import requests
    import time
    from bs4 import BeautifulSoup
     
    import io
    import sys
    #sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gbk')  # change the default encoding of standard output
    # Sample HTML below is taken from the 生活大爆炸 (The Big Bang Theory) bar
    '''
     # Title & thread link:
        <a rel="noreferrer" href="/p/4788526595" title="我的人物设计和制作" target="_blank" class="j_th_tit ">我的人物设计和制作</a>
        
    # Poster:
        <span class="tb_icon_author " title="主题作者: 新日落" data-field="{"user_id":2137596235}"><i class="icon_author"></i><span class="frs-author-name-wrap"><a rel="noreferrer" data-field="{"un":"\u65b0\u65e5\u843d"}" class="frs-author-name j_user_card " href="/home/main/?un=%E6%96%B0%E6%97%A5%E8%90%BD&ie=utf-8&fr=frs" target="_blank">新日落</a></span><span class="icon_wrap  icon_wrap_theme1 frs_bright_icons "></span>    </span>
    # Post date:
      <span class="pull-right is_show_create_time" title="创建时间">2016-09</span>
      
      
    # Reply count:
        <div class="col2_left j_threadlist_li_left">
    <span class="threadlist_rep_num center_text" title="回复">73</span>
        </div>
    '''
    
    
    
    # Generic fetcher: download a page and return its text
    def getHtml(url):
        try:
            r = requests.get(url, timeout=30)
            # Raise an HTTPError for any non-200 status code
            r.raise_for_status()
            # Force the correct encoding
            # r.encoding = r.apparent_encoding
            r.encoding = "utf-8"
            # Return the page text
            return r.text
        except requests.RequestException:
            # Return an empty string so the caller simply finds no threads
            return ""
     
     
     
    # Parse a list page's HTML, extract each thread's info, and return it as a list of dicts
    def get_content(url):
        # List that will hold every thread's info
        contents = []

        # Download the page
        html = getHtml(url)

        # Parse the page content with BeautifulSoup (lxml parser)
        soup = BeautifulSoup(html, 'lxml')

        # Every thread is an <li> whose class is ' j_thread_list clearfix' (note the leading space)
        liTags = soup.find_all('li', attrs={'class': ' j_thread_list clearfix'})
        print(len(liTags))
     
        # Loop over the thread <li> elements
        for li in liTags:

            # Collect each thread's fields in a dict
            content = {}

            # Catch exceptions so one malformed entry doesn't stop the whole run
            try:
                # Thread title (.strip() removes surrounding whitespace)
                content['title'] = li.find('a', attrs={"class": "j_th_tit"}).text.strip()
                print(content['title'])

                # Thread link, taken from the href attribute of the <a> tag
                content['link'] = "http://tieba.baidu.com/" + li.find('a', attrs={"class": "j_th_tit"})["href"]
                print(content['link'])

                # First line of the post body (title + first line together carry more information)
                content['content'] = li.find('div', attrs={"class": 'threadlist_abs threadlist_abs_onlyline '}).text.strip()
                print(content['content'])

                # Poster
                content['author'] = li.find('span', attrs={"class": 'tb_icon_author '}).text.strip()
                print(content['author'])

                # Reply count
                content['responseNum'] = li.find('span', attrs={'class': 'threadlist_rep_num center_text'}).text.strip()
                print(content['responseNum'])

                # Creation time
                content['creatTime'] = li.find('span', attrs={"class": 'pull-right is_show_create_time'}).text.strip()
                print(content['creatTime'])

                # Append the dict to the result list
                contents.append(content)

            except Exception:
                print('Failed to parse one list entry')

        # Return the collected data
        return contents
     
     
    def writeTxt(object_dir, content):

        # Open with encoding='utf-8' (matching getHtml) or the output may come out garbled;
        # a plain open("data.txt", 'a+') would use the platform's default encoding.
        with open(object_dir, 'a+', encoding='utf-8') as f:
            for c in content:
                # f.write('标题: {} \t 链接:{} \t 发帖人:{} \t 发帖时间:{} \t 回复数量: {} \n'.format(
                #         c['title'], c['link'], c['author'], c['creatTime'], c['responseNum']))
                # f.write('标题: {} \t 内容: {} \t \n'.format(
                #     c['title'], c['content']))
                f.write('{}。{}\n链接:{}\n'.format(
                    c['title'], c['content'], c['link']))
    
    
    def main(url, page, object_dir='data.txt'):
        url_list=[]
        # Build the list of page URLs to crawl (Tieba paginates with &pn=, 50 posts per page)
        for i in range(0, page):
            url_list.append(url + '&pn=' + str(i * 50))
     
        for u in url_list:
            content=get_content(u)
            writeTxt(object_dir, content)
     
    if __name__=="__main__":
        # url = "https://tieba.baidu.com/f?ie=utf-8&kw" \
        #       "=%E6%B8%B8%E6%88%8F%E5%8F%B7&fr=search"
        # page = 20
        # object_dir = '游戏账号.txt'
    
        # url = "https://tieba.baidu.com/f?ie=utf-8&kw=%E7%82%" \
        #       "AB%E8%88%9E%E8%B4%A6%E5%8F%B7%E4%BA%A4%E6%98%93"
        # page = 20
        # object_dir = '炫舞账号交易.txt'
    
        # url = "https://tieba.baidu.com/f?ie=utf-8&kw=" \
        #       "lol%E8%B4%A6%E5%8F%B7%E4%BA%A4%E6%98%93"
        # page = 20
        # object_dir = '英雄联盟账号交易.txt'
    
        # main() appends the &pn= page offset itself, so the base URL carries no pn parameter
        url = "http://tieba.baidu.com/f?" \
              "kw=%E8%99%9A%E6%8B%9F%E5%B8%81&ie=utf-8"
        page = 200
        object_dir = '虚拟币.txt'
    
        main(url,page, object_dir)
        # get_content("https://tieba.baidu.com/f?ie=utf-8&kw=%E6%B8%B8%E6%88%8F%E5%8F%B7&fr=search")
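
    For reference, writeTxt above stores each thread as two lines: "标题。内容" followed by "链接:<url>". If the data is needed back as structured records later, a small reader like the one below works (the read_records helper and the data.txt default are my own illustration, not part of the original script):

    # -*- coding: utf-8 -*-
    # Sketch: read back the two-line records written by writeTxt above.
    def read_records(path='data.txt'):
        records = []
        with open(path, encoding='utf-8') as f:
            lines = [line.rstrip('\n') for line in f]
        # Each record is exactly two lines: "<title>。<content>" then "链接:<link>"
        for i in range(0, len(lines) - 1, 2):
            records.append({
                'text': lines[i],
                'link': lines[i + 1].replace('链接:', '', 1),
            })
        return records

    if __name__ == '__main__':
        for r in read_records():
            print(r['link'], r['text'][:30])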
     
     
     
    

    Another version, which follows each thread's link and saves every reply in the thread:

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # author: ShidongDu time2019/9/6
    
    import os
    import codecs
    import json
    import urllib
    import urllib.request
    from lxml import etree
    
    class Spider:
        def __init__(self, pages, url, dir):
            # self.pages = int(input('请输入需要爬取的页数(请输入50的倍数数字):'))
            self.pages = pages
            # self.url = 'http://tieba.baidu.com/f?kw=%E6%95%B4%E5%AE%B9&ie=utf-8&pn='
            self.dir = dir
            self.url = url
            self.ua_header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    
    
        def tiebaSpider(self):
            # self.url ends with "&pn="; pn=0 is the first page, and the offset advances 50 posts per page
            for page in range(0, self.pages, 50):
                url = self.url + str(page)
                # Fetch the list page and follow every thread link on it
                self.loadPage(url)
    
    
        # Fetch one forum list page and visit every thread linked from it
        def loadPage(self, url):
            req = urllib.request.Request(url, headers=self.ua_header)
            html = urllib.request.urlopen(req).read().decode('utf8', errors='replace')
    
            # Parse the HTML with lxml
            selector = etree.HTML(html)
            # print(selector)
            links = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
    
            for link in links:
                link = "http://tieba.baidu.com" + link
                self.loadImages(link)
    
    
        # Fetch one thread and save every reply in it (despite the name, it saves reply text, not images)
        def loadImages(self, link):
            req = urllib.request.Request(link, headers= self.ua_header)
            html = urllib.request.urlopen(req).read().decode('utf8',errors='replace')
    
            selector = etree.HTML(html)
    
            # Extract the thread title plus each reply's author, time, and content
            title = selector.xpath('//div[@class="left_section"]//div/h1/text()')[0]
            # Each reply lives in its own post block
            content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
            reply = {}
            reply['reply_title'] = title
    
    
            for each_content in content_field:
                reply_info = json.loads(each_content.xpath('@data-field')[0])
                author = reply_info['author']['user_name']
                reply_time = reply_info['content']['date']
                content = each_content.xpath('div[@class="d_post_content_main"]/div/cc/div[starts-with(@id, "post_content") \
                                                    and contains(@class,"d_post_content j_d_post_content  clearfix")]')
                reply['reply_author'] = author
                reply['reply_content_time'] = reply_time
                reply['reply_content'] = content[0].xpath('string(.)').replace(' ', '')
                self.writeImages(self.dir, reply)
    
    
    
        # Name the output file after the thread title
        def writeImages(self, dir, reply):
            s_path = './' + dir + '/'
            if not os.path.isdir(s_path):
                os.mkdir(s_path)
            try:
                # Append each reply to a file named after the thread title
                with codecs.open(s_path + str(reply['reply_title']) + '.txt', 'a', encoding='utf-8') as file:
                    # file.write(reply['reply_author'] + ":" + reply['reply_content'] + '\n')
                    file.write(reply['reply_content'] + '\n')
            except Exception:
                # e.g. titles containing characters that are illegal in file names end up here
                print("oops~")
    # 5000 posts, i.e. 100 pages (50 posts per page)
    pages = 5000
    # url: the forum to crawl; it ends with "&pn=" so the page offset can be appended
    url = "http://tieba.baidu.com/f?kw=%E7%88%B1%E5%A5%87%E8%89%BAvip%E4%BC%9A%E5%91%98&ie=utf-8&pn="
    dir = "爱奇艺"
    spider = Spider(pages, url, dir)
    spider.tiebaSpider()
    print("OK!")
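
    One caveat with this second script: each output file is named after the thread title, and titles can contain characters that are not legal in file names (/, ?, *, and so on), which is one way to land in the "oops~" branch. If that becomes a problem, a sanitizer like the one below could be applied to reply['reply_title'] before calling codecs.open (the safe_filename helper is my own suggestion, not part of the original code):

    # -*- coding: utf-8 -*-
    # Sketch: strip characters that are illegal in file names before using
    # a thread title as a .txt file name.
    import re

    def safe_filename(title, max_len=100):
        cleaned = re.sub(r'[\\/:*?"<>|\r\n]', '_', title).strip()
        return cleaned[:max_len] or 'untitled'

    print(safe_filename('a/b?c'))  # -> a_b_c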
    
