美文网首页
爬取简书推荐作者的粉丝信息保存到mongodb数据库

爬取简书推荐作者的粉丝信息保存到mongodb数据库

作者: 半杯故事 | 来源:发表于2017-08-15 09:00 被阅读17次
    import requests,time
    from lxml import etree
    from pymongo import MongoClient
    
    
    def requestGet(url, timeout=10):
        """Download ``url`` and return it parsed as an lxml HTML tree.

        The request is sent with the module-level ``headers`` dict.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The result of ``etree.HTML`` applied to the response text.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out. The crawler functions that call this helper
                catch ``Exception``, so a timeout is reported, not fatal.
        """
        # Without an explicit timeout, requests can wait forever on a stalled
        # connection, which would freeze the whole crawl.
        r = requests.get(url, headers=headers, timeout=timeout)
        return etree.HTML(r.text)
    
    def get_fens_info(url,fens_count):
        """Crawl an author's follower pages and store each follower in MongoDB.

        For every follower entry found, one document with the keys ``name``
        and ``text`` is inserted into the module-level ``posts`` collection.

        Args:
            url: Address of the author's follower list (without the
                ``?page=`` query string).
            fens_count: The author's follower count; anything ``int()``
                accepts.

        Returns:
            None. Any exception is caught, printed, and ends the crawl for
            this author.
        """
        try:
            total = int(fens_count)
            # Exclusive upper bound for range(); the arithmetic assumes the
            # list shows 9 followers per page.
            fens_page = total//9+1 if total%9 == 0 else total//9+2
            print(fens_page)
            # The site only serves follower data for the first 100 pages, so
            # never request beyond page 100 -- and never request more pages
            # than the follower count implies.
            stop_page = min(fens_page, 101)
            for page in range(1, stop_page):
                print(page)
                select = requestGet(url+'?page={}'.format(page))

                infos = select.xpath('//div[@id="list-container"]//div[@class="info"]')
                for info in infos:
                    name = info.xpath('a/text()')[0]
                    # All text nodes of the entry's <div>, kept as a list.
                    text = info.xpath('div/text()')

                    print(name,text)
                    post_data = {
                        'name' : name,
                        'text' : text
                    }
                    # Store the follower record in the collection.
                    posts.insert_one(post_data)
        except Exception as e:
            print('get_fens_info函数解析错误 错误为:',e)
    
    def get_recommend_author_name():
        """Walk the recommended-author pages and process every author found.

        Pages are requested from ``base_url`` starting at page 1. For each
        author entry the name and profile URL are extracted and handed to
        ``get_recommend_author_info``. The walk ends at the first page that
        yields no author entries. Any exception is caught and printed.
        """
        try:
            page_index = 1
            while True:
                tree = requestGet(base_url + str(page_index))
                authors = tree.xpath('//div[@class="wrap"][position()>0]')
                print(page_index)

                # An empty page means we have run past the last one.
                if not authors:
                    break

                page_index += 1
                for author in authors:
                    name = author.xpath('a/h4/text()')[0]
                    print(name)
                    # The href is site-relative, so prefix the site root.
                    profile_url = jianshu + author.xpath('a/@href')[0]
                    get_recommend_author_info(name, profile_url)

        except Exception as e:
            print("get_recommend_author_name函数解析错误 错误为 ", e)
    
    
    def get_recommend_author_info(name,url):
        """Read an author's profile counters and crawl the author's followers.

        The first five ``meta-block`` elements of the profile page are read
        in order; the variable names suggest following, followers, articles,
        words and likes. The counters are printed, then the follower list is
        crawled through ``get_fens_info``.

        Args:
            name: Author name (not used inside this function).
            url: Address of the author's profile page.

        Returns:
            True on success; None if an exception was caught and printed.
        """
        try:
            page = requestGet(url)
            blocks = page.xpath('//div[@class="meta-block"][position()>0]')

            guanzhu_count = blocks[0].xpath('a/p/text()')[0]

            follower_block = blocks[1]
            fensi_count = follower_block.xpath('a/p/text()')[0]
            # The href is site-relative, so prefix the site root.
            fensi_url = jianshu + follower_block.xpath('a/@href')[0]

            wenzhang_count = blocks[2].xpath('a/p/text()')[0]
            # The last two counters are not wrapped in a link.
            zishu_count = blocks[3].xpath('p/text()')[0]
            xihuan_count = blocks[4].xpath('p/text()')[0]

            print(guanzhu_count, fensi_count, wenzhang_count, zishu_count, xihuan_count)
            get_fens_info(fensi_url,fensi_count)

        except Exception as e:
            print('get_recommend_author_info函数解析错误 错误为:',e)

        else:
            return True
    
    
    if __name__ == "__main__":
        # Script entry point: sets up the globals the crawler functions read
        # (jianshu, headers, base_url, posts), runs the crawl and prints the
        # elapsed time.

        # Site root, prepended to the site-relative links found in pages.
        jianshu = 'http://www.jianshu.com'
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
        headers = {
            'User-Agent': user_agent
        }

        # Recommended-author listing; the page number is appended to this.
        base_url = 'http://www.jianshu.com/recommendations/users?page='

        start = time.time()

        # Connect to MongoDB on localhost:27017. Only one client is created:
        # opening a second one and dropping the first leaks a connection.
        # The MongoDB URI form 'mongodb://localhost:27017' works as well.
        client = MongoClient('localhost', 27017)
        try:
            # Use the fens_db database ...
            db = client.fens_db
            # ... and its "posts" collection, where follower records go.
            posts = db.posts

            get_recommend_author_name()
        finally:
            # Release the connection even if the crawl aborts.
            client.close()

        end = time.time()
        print("总耗时 %0.3f" % (end - start))
    
    
    屏幕快照 2017-08-15 上午9.00.12.png

    相关文章

      网友评论

          本文标题:爬取简书推荐作者的粉丝信息保存到mongodb数据库

          本文链接:https://www.haomeiwen.com/subject/etkmrxtx.html