美文网首页
mongodb aggregate 聚合

mongodb aggregate 聚合

作者: 大富帅 | 来源:发表于2018-12-13 11:18 被阅读21次

按包名分组, 统计每个包名的uuid数量,按数量大小倒叙排列

db.InstallAPP.aggregate([
    {
    $group: 
        {
            _id: "$app_package",
                         total: {$addToSet: "$uuid"}
        }
    },
    {
    $project: {
            app_package: 1,
            size: {$size: "$total"}
        }
    },
    {
    $sort: {
        size: -1
        }
    }
])

或者这样:

db.InstallAPP.aggregate([
    {
    $group: 
        {
            _id: "$app_package",
            size: { $sum: 1 }
        }
    },
    {
    $sort: {
        size: -1
        }
    }
])

同时使用and or

chn_uuids = db.InstallAPP.find({                                                           
     "$or":[
         {"$and": [{"app_package": {"$regex": pattern}}, {"app_package": {"$ne": uc_package_    name}}]},
         {"app_package": channel_add}
     ],
     # "app_package": {"$regex": pattern, "$ne": uc_package_name},
     # 两个app_package并排写就会把两个条件的记录累加起来了
     # "app_package": {"$ne": uc_package_name},
     "time": {
         "$gte": start_time,
         "$lt": end_time
     }
 }).distinct('uuid')

比较经典的案例:
大数据的时候mongo有很多内存限制
http://zackku.com/Mongo-exceed-size/
https://docs.mongodb.com/manual/core/aggregation-pipeline-limits/
https://docs.mongodb.com/manual/reference/limits/#BSON-Document-Size

报错1:Exception: aggregation result exceeds maximum document size (16MB)
报错2: BSONObj size: 21167864 (0x142FEF8) is invalid. Size must be between 0 and 16793600(16MB)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from connection import db_mongo
from datetime import datetime, timedelta
from base.date_utils import DateValid
import argparse
import time
import traceback 
import xlwt

# link:  http://web.pm.netease.com/v6/my?issueId=44762

db = db_mongo

start_time = datetime(2018, 5, 12)
end_time = datetime(2018, 5, 13)
print start_time, end_time

# 由于读取的数据过大,一开始用distinct()获取几十万的uuid时候,一致报错
# 然后用返回cursor的方式,拿到了才迭代,就不报错了。而不是一次过把全部数据返回过来
uuid_list_cursor = db.LaunchEmulator.aggregate([
        {"$match":
            {"time": 
                {
                   "$gt": start_time,
                    "$lte": end_time
                }
            }
        },
        {"$group":
            {"_id": "$uuid"}
        }
], allowDiskUse=True)
uuid_list = []
for i in uuid_list_cursor:
    uuid_list.append(i["_id"])
print 'uuid总数', len(uuid_list)


page = 0
perpage = 5000
# 0 5000
# 5000*1 5000*2
# 5000*2 5000*3

ret_dict = {}
while True:
    slice_uuids = uuid_list[perpage*page:int(perpage*(page+1))]
    print 'uuid长度',len(slice_uuids)
    if not slice_uuids:
        print '结束'
        break
    print '范围', perpage*page, perpage*(page+1)

    """
    需求是:把所有日活的uuid取出来,然后到Download表去查看他们的分辨率 
    但是,一个uuid可能对应多个Download表的记录,如果不去重的话,有可能一个uuid对应多条记录,就把
多条分辨率都累加上去了。
    所以,首先要在日活表取出uuid列表(40W条),所以分辨率也能只有40W条。 那么查询Download表的时候,就要一个uuid,只取最新时间的那条分辨率记录。
 可以看到下面的筛选,第一个group 首先按uuid分组,按时间倒叙排序,把最新的一条分辨率对应到uuid,这样就能uuid->分辨率一一对应,不会出现多余的记录。
这是断点看记录是
 [
{"_id": "uuidxxxx1", screen: "{"width":500, "height":200},
{"_id": "uuidxxxx2", screen: "{"width":500, "height":200}
]
可以看出,这里还是文档集合,相当于还是mongodb里面的记录,所以!
可以继续在这个基础上使用group!!!!
可以看出第二个group by 是screen.width, screen.height, 说明group 是在上面的记录上做group的,理论上可以无限次group!!
下面为了让输出结果是{width_height: 500}这样,使用了$project 选择显示的字段
还用了concat 和 substr 来组合他们
    """
    results = db.Download.aggregate([
        {"$match":
            {
                "uuid": {"$in": slice_uuids}
            }
        },
        {"$sort":
            {
                "time": -1
            }
        
        },
        {"$group":
            {
                "_id": "$uuid",
                "screen": {"$first": "$machine.screen"}
            }
        },
    # ])
    # results = results.get("result")
    # print len(results)
    # for i in results:
    #     print i
    #     pass
    # break
        {"$group":
            {
                "_id": {
                        # "width-height": {"$concat": ["", "__"] },
                        "width": "$screen.width" ,
                        "height": "$screen.height"
                       },
                "count": {"$sum": 1}
            }
        },
        {"$sort":
            {
                "count": -1
            }
        
        },
        # {"$match":
        #     {
        #         "count": {"$gt": 10}
        #     }
        # },
        {"$project":
            {
                "_id": {"$concat": [
                                        {"$substr": ["$_id.width", 0, -1]},
                                         "_", 
                                        {"$substr": ["$_id.height", 0, -1]}
                                    ]
                        },
                "count": 1
            }
        }
    ], allowDiskUse=True)
    page +=1
    # 处理结果
    ret = results
    for r in ret:
        _id = r['_id']
        count = r['count']
        if _id not in ret_dict.keys():
            ret_dict[_id] = count
        else:
            ret_dict[_id] += count

sorted_dict = sorted(ret_dict.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
for i in sorted_dict:
    if i[1] < 100:
        continue
    print  i

统计某个版本的uuid 个数是多少

# 这个是统计所有的package的uuid总数
 db.LaunchEmulator.aggregate([
    {$match: {
            time: {$gte: ISODate("2018-05-24T")}, 
            version: "1.25.2.1", 
            engine: "NEMU",
            package : "mumu-green"
        }
    },
    {$group:    
        
        {
            _id: null,
            uuid_list: {$addToSet: "$uuid"},
        }
    },
    {
    $project: {
         "size": {$size: "$uuid_list"}
        }
    }
])

返回结果是

 { "_id" : null, "size" : 293 }
# 这个是按版本来统计uuid个数
 db.LaunchEmulator.aggregate([
    {$match: {
            time: {$gte: ISODate("2018-05-24T")}, 
            version: "1.25.2.1", 
            engine: "NEMU",
        }
    },
    {$group:    
        
        {
            _id: {package: "$package"},
            uuid_list: {$addToSet: "$uuid"},
        }
    },
    {
    $project: {
         "size": {$size: "$uuid_list"}
        }
    }
])

结果

{ "_id" : { "package" : "com.netease.soulofhunter.netease_simulator" }, "size" : 2 }
{ "_id" : { "package" : "mumu-green" }, "size" : 10 }
{ "_id" : { "package" : "com.miHoYo.enterprise.NGHSoD" }, "size" : 25 }
{ "_id" : { "package" : "com.bilibili.azurlane" }, "size" : 9 }
{ "_id" : { "package" : "" }, "size" : 15 }
{ "_id" : { "package" : "com.tencent.tmgp.pubgmhd" }, "size" : 1 }
{ "_id" : { "package" : "mumu" }, "size" : 217 }
{ "_id" : { "package" : "com.xd.ro.roapk" }, "size" : 6 }
{ "_id" : { "package" : "com.netease.zjz" }, "size" : 1 }
{ "_id" : { "package" : "com.netease.dwrg" }, "size" : 6 }
{ "_id" : { "package" : "com.netease.zgz" }, "size" : 2 }
{ "_id" : { "package" : "com.georgie.SoundWireFree" }, "size" : 1 }

相关文章

网友评论

      本文标题:mongodb aggregate 聚合

      本文链接:https://www.haomeiwen.com/subject/qqttdftx.html