Scraping posts from a Tieba forum and the full replies under each post


Author: 噫树 | Published 2017-08-01 17:26

    This program borrows from someone else's code. It walks the thread list of a Tieba forum, opens each post, and appends every reply (author and text) to a file named after the post title:

    # -*- coding:utf-8 -*-

    import os
    import json
    import urllib.request
    from lxml import etree

    class Spider:
        def __init__(self):
            self.pages = int(input('How many result pages to crawl (enter a multiple of 50): '))
            self.url = 'http://tieba.baidu.com/f?kw=%E6%95%B4%E5%AE%B9&ie=utf-8&pn='
            self.ua_header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    
    
        def tiebaSpider(self):
            # pn is an offset: 0 is the first page, 50 the second, and so on.
            for page in range(0, self.pages + 1, 50):
                url = self.url + str(page)
                # Crawl every post linked from this list page.
                self.loadPage(url)
    
    
        # Fetch a list page and follow each post link on it.
        def loadPage(self, url):
            req = urllib.request.Request(url, headers=self.ua_header)
            html = urllib.request.urlopen(req).read().decode('utf-8', errors='replace')

            # Parse the raw HTML into an lxml element tree.
            selector = etree.HTML(html)
            links = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')

            for link in links:
                link = "http://tieba.baidu.com" + link
                self.loadImages(link)
    
    
        # Fetch one post and extract its title plus every reply.
        # (The name is kept from the original code; it collects text, not images.)
        def loadImages(self, link):
            req = urllib.request.Request(link, headers=self.ua_header)
            html = urllib.request.urlopen(req).read().decode('utf-8', errors='replace')

            selector = etree.HTML(html)

            # The post title, then each reply's author, time, and body.
            title = selector.xpath('//div[@class="left_section"]//div/h1/text()')[0]
            # Each reply sits in one of these content blocks.
            content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
            reply = {}
            reply['reply_title'] = title

            for each_content in content_field:
                # Tieba embeds each reply's metadata as JSON in the data-field attribute.
                reply_info = json.loads(each_content.xpath('@data-field')[0])
                author = reply_info['author']['user_name']
                reply_time = reply_info['content']['date']
                content = each_content.xpath('div[@class="d_post_content_main"]/div/cc/div[starts-with(@id, "post_content") \
                                                    and contains(@class,"d_post_content j_d_post_content  clearfix")]')
                if not content:
                    # Skip blocks (e.g. ads) that carry no post body.
                    continue
                reply['reply_author'] = author
                reply['reply_content_time'] = reply_time
                reply['reply_content'] = content[0].xpath('string(.)').replace(' ', '')
                self.writeImages(reply)
    
    
    
        # Name the output file after the post title, one file per post.
        def writeImages(self, reply):
            s_path = './Baidu/'
            if not os.path.isdir(s_path):
                os.mkdir(s_path)
            # Titles may contain characters that are illegal in file names.
            filename = str(reply['reply_title']).replace('/', '_')
            with open(s_path + filename + '.txt', 'a', encoding='utf-8') as file:
                file.write(reply['reply_author'] + ":" + reply['reply_content'] + '\n')


    if __name__ == '__main__':
        spider = Spider()
        spider.tiebaSpider()
    
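    For reference, the data-field attribute that loadImages feeds to json.loads is where Tieba stores each reply's metadata. Below is a minimal, self-contained sketch of just that parsing step. The JSON literal is a trimmed, hypothetical example containing only the two keys the spider reads (author.user_name and content.date); the real attribute carries more fields, and its exact layout belongs to Tieba and may change.

    # -*- coding:utf-8 -*-
    import json

    # Hypothetical, trimmed value of a reply block's data-field attribute.
    data_field = '{"author": {"user_name": "some_user"}, "content": {"date": "2017-08-01 17:26"}}'

    info = json.loads(data_field)
    print(info['author']['user_name'])  # some_user
    print(info['content']['date'])      # 2017-08-01 17:26
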

    Sample output:

    (screenshot: 1501579580(1).jpg)
