美文网首页
mongodb aggregate 聚合

mongodb aggregate 聚合

作者: 大富帅 | 来源:发表于2018-12-13 11:18 被阅读21次

    按包名分组, 统计每个包名的uuid数量,按数量大小倒叙排列

    db.InstallAPP.aggregate([
        {
        $group: 
            {
                _id: "$app_package",
                             total: {$addToSet: "$uuid"}
            }
        },
        {
        $project: {
                app_package: 1,
                size: {$size: "$total"}
            }
        },
        {
        $sort: {
            size: -1
            }
        }
    ])
    

    或者这样:

    db.InstallAPP.aggregate([
        {
        $group: 
            {
                _id: "$app_package",
                size: { $sum: 1 }
            }
        },
        {
        $sort: {
            size: -1
            }
        }
    ])
    

    同时使用and or

    chn_uuids = db.InstallAPP.find({                                                           
         "$or":[
             {"$and": [{"app_package": {"$regex": pattern}}, {"app_package": {"$ne": uc_package_    name}}]},
             {"app_package": channel_add}
         ],
         # "app_package": {"$regex": pattern, "$ne": uc_package_name},
         # 两个app_package并排写就会把两个条件的记录累加起来了
         # "app_package": {"$ne": uc_package_name},
         "time": {
             "$gte": start_time,
             "$lt": end_time
         }
     }).distinct('uuid')
    

    比较经典的案例:
    大数据的时候mongo有很多内存限制
    http://zackku.com/Mongo-exceed-size/
    https://docs.mongodb.com/manual/core/aggregation-pipeline-limits/
    https://docs.mongodb.com/manual/reference/limits/#BSON-Document-Size

    报错1:Exception: aggregation result exceeds maximum document size (16MB)
    报错2: BSONObj size: 21167864 (0x142FEF8) is invalid. Size must be between 0 and 16793600(16MB)

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    from connection import db_mongo
    from datetime import datetime, timedelta
    from base.date_utils import DateValid
    import argparse
    import time
    import traceback 
    import xlwt
    
    # link:  http://web.pm.netease.com/v6/my?issueId=44762
    
    db = db_mongo
    
    start_time = datetime(2018, 5, 12)
    end_time = datetime(2018, 5, 13)
    print start_time, end_time
    
    # 由于读取的数据过大,一开始用distinct()获取几十万的uuid时候,一致报错
    # 然后用返回cursor的方式,拿到了才迭代,就不报错了。而不是一次过把全部数据返回过来
    uuid_list_cursor = db.LaunchEmulator.aggregate([
            {"$match":
                {"time": 
                    {
                       "$gt": start_time,
                        "$lte": end_time
                    }
                }
            },
            {"$group":
                {"_id": "$uuid"}
            }
    ], allowDiskUse=True)
    uuid_list = []
    for i in uuid_list_cursor:
        uuid_list.append(i["_id"])
    print 'uuid总数', len(uuid_list)
    
    
    page = 0
    perpage = 5000
    # 0 5000
    # 5000*1 5000*2
    # 5000*2 5000*3
    
    ret_dict = {}
    while True:
        slice_uuids = uuid_list[perpage*page:int(perpage*(page+1))]
        print 'uuid长度',len(slice_uuids)
        if not slice_uuids:
            print '结束'
            break
        print '范围', perpage*page, perpage*(page+1)
    
        """
        需求是:把所有日活的uuid取出来,然后到Download表去查看他们的分辨率 
        但是,一个uuid可能对应多个Download表的记录,如果不去重的话,有可能一个uuid对应多条记录,就把
    多条分辨率都累加上去了。
        所以,首先要在日活表取出uuid列表(40W条),所以分辨率也能只有40W条。 那么查询Download表的时候,就要一个uuid,只取最新时间的那条分辨率记录。
     可以看到下面的筛选,第一个group 首先按uuid分组,按时间倒叙排序,把最新的一条分辨率对应到uuid,这样就能uuid->分辨率一一对应,不会出现多余的记录。
    这是断点看记录是
     [
    {"_id": "uuidxxxx1", screen: "{"width":500, "height":200},
    {"_id": "uuidxxxx2", screen: "{"width":500, "height":200}
    ]
    可以看出,这里还是文档集合,相当于还是mongodb里面的记录,所以!
    可以继续在这个基础上使用group!!!!
    可以看出第二个group by 是screen.width, screen.height, 说明group 是在上面的记录上做group的,理论上可以无限次group!!
    下面为了让输出结果是{width_height: 500}这样,使用了$project 选择显示的字段
    还用了concat 和 substr 来组合他们
        """
        results = db.Download.aggregate([
            {"$match":
                {
                    "uuid": {"$in": slice_uuids}
                }
            },
            {"$sort":
                {
                    "time": -1
                }
            
            },
            {"$group":
                {
                    "_id": "$uuid",
                    "screen": {"$first": "$machine.screen"}
                }
            },
        # ])
        # results = results.get("result")
        # print len(results)
        # for i in results:
        #     print i
        #     pass
        # break
            {"$group":
                {
                    "_id": {
                            # "width-height": {"$concat": ["", "__"] },
                            "width": "$screen.width" ,
                            "height": "$screen.height"
                           },
                    "count": {"$sum": 1}
                }
            },
            {"$sort":
                {
                    "count": -1
                }
            
            },
            # {"$match":
            #     {
            #         "count": {"$gt": 10}
            #     }
            # },
            {"$project":
                {
                    "_id": {"$concat": [
                                            {"$substr": ["$_id.width", 0, -1]},
                                             "_", 
                                            {"$substr": ["$_id.height", 0, -1]}
                                        ]
                            },
                    "count": 1
                }
            }
        ], allowDiskUse=True)
        page +=1
        # 处理结果
        ret = results
        for r in ret:
            _id = r['_id']
            count = r['count']
            if _id not in ret_dict.keys():
                ret_dict[_id] = count
            else:
                ret_dict[_id] += count
    
    sorted_dict = sorted(ret_dict.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
    for i in sorted_dict:
        if i[1] < 100:
            continue
        print  i
    

    统计某个版本的uuid 个数是多少

    # 这个是统计所有的package的uuid总数
     db.LaunchEmulator.aggregate([
        {$match: {
                time: {$gte: ISODate("2018-05-24T")}, 
                version: "1.25.2.1", 
                engine: "NEMU",
                package : "mumu-green"
            }
        },
        {$group:    
            
            {
                _id: null,
                uuid_list: {$addToSet: "$uuid"},
            }
        },
        {
        $project: {
             "size": {$size: "$uuid_list"}
            }
        }
    ])
    

    返回结果是

     { "_id" : null, "size" : 293 }
    
    # 这个是按版本来统计uuid个数
     db.LaunchEmulator.aggregate([
        {$match: {
                time: {$gte: ISODate("2018-05-24T")}, 
                version: "1.25.2.1", 
                engine: "NEMU",
            }
        },
        {$group:    
            
            {
                _id: {package: "$package"},
                uuid_list: {$addToSet: "$uuid"},
            }
        },
        {
        $project: {
             "size": {$size: "$uuid_list"}
            }
        }
    ])
    
    

    结果

    { "_id" : { "package" : "com.netease.soulofhunter.netease_simulator" }, "size" : 2 }
    { "_id" : { "package" : "mumu-green" }, "size" : 10 }
    { "_id" : { "package" : "com.miHoYo.enterprise.NGHSoD" }, "size" : 25 }
    { "_id" : { "package" : "com.bilibili.azurlane" }, "size" : 9 }
    { "_id" : { "package" : "" }, "size" : 15 }
    { "_id" : { "package" : "com.tencent.tmgp.pubgmhd" }, "size" : 1 }
    { "_id" : { "package" : "mumu" }, "size" : 217 }
    { "_id" : { "package" : "com.xd.ro.roapk" }, "size" : 6 }
    { "_id" : { "package" : "com.netease.zjz" }, "size" : 1 }
    { "_id" : { "package" : "com.netease.dwrg" }, "size" : 6 }
    { "_id" : { "package" : "com.netease.zgz" }, "size" : 2 }
    { "_id" : { "package" : "com.georgie.SoundWireFree" }, "size" : 1 }
    
    

    相关文章

      网友评论

          本文标题:mongodb aggregate 聚合

          本文链接:https://www.haomeiwen.com/subject/qqttdftx.html