按包名分组, 统计每个包名的uuid数量,按数量大小倒叙排列
db.InstallAPP.aggregate([
{
$group:
{
_id: "$app_package",
total: {$addToSet: "$uuid"}
}
},
{
$project: {
app_package: 1,
size: {$size: "$total"}
}
},
{
$sort: {
size: -1
}
}
])
或者这样:
db.InstallAPP.aggregate([
{
$group:
{
_id: "$app_package",
size: { $sum: 1 }
}
},
{
$sort: {
size: -1
}
}
])
同时使用and or
chn_uuids = db.InstallAPP.find({
"$or":[
{"$and": [{"app_package": {"$regex": pattern}}, {"app_package": {"$ne": uc_package_ name}}]},
{"app_package": channel_add}
],
# "app_package": {"$regex": pattern, "$ne": uc_package_name},
# 两个app_package并排写就会把两个条件的记录累加起来了
# "app_package": {"$ne": uc_package_name},
"time": {
"$gte": start_time,
"$lt": end_time
}
}).distinct('uuid')
比较经典的案例:
大数据的时候mongo有很多内存限制
http://zackku.com/Mongo-exceed-size/
https://docs.mongodb.com/manual/core/aggregation-pipeline-limits/
https://docs.mongodb.com/manual/reference/limits/#BSON-Document-Size
报错1:Exception: aggregation result exceeds maximum document size (16MB)
报错2: BSONObj size: 21167864 (0x142FEF8) is invalid. Size must be between 0 and 16793600(16MB)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from connection import db_mongo
from datetime import datetime, timedelta
from base.date_utils import DateValid
import argparse
import time
import traceback
import xlwt
# link: http://web.pm.netease.com/v6/my?issueId=44762
db = db_mongo
start_time = datetime(2018, 5, 12)
end_time = datetime(2018, 5, 13)
print start_time, end_time
# 由于读取的数据过大,一开始用distinct()获取几十万的uuid时候,一致报错
# 然后用返回cursor的方式,拿到了才迭代,就不报错了。而不是一次过把全部数据返回过来
uuid_list_cursor = db.LaunchEmulator.aggregate([
{"$match":
{"time":
{
"$gt": start_time,
"$lte": end_time
}
}
},
{"$group":
{"_id": "$uuid"}
}
], allowDiskUse=True)
uuid_list = []
for i in uuid_list_cursor:
uuid_list.append(i["_id"])
print 'uuid总数', len(uuid_list)
page = 0
perpage = 5000
# 0 5000
# 5000*1 5000*2
# 5000*2 5000*3
ret_dict = {}
while True:
slice_uuids = uuid_list[perpage*page:int(perpage*(page+1))]
print 'uuid长度',len(slice_uuids)
if not slice_uuids:
print '结束'
break
print '范围', perpage*page, perpage*(page+1)
"""
需求是:把所有日活的uuid取出来,然后到Download表去查看他们的分辨率
但是,一个uuid可能对应多个Download表的记录,如果不去重的话,有可能一个uuid对应多条记录,就把
多条分辨率都累加上去了。
所以,首先要在日活表取出uuid列表(40W条),所以分辨率也能只有40W条。 那么查询Download表的时候,就要一个uuid,只取最新时间的那条分辨率记录。
可以看到下面的筛选,第一个group 首先按uuid分组,按时间倒叙排序,把最新的一条分辨率对应到uuid,这样就能uuid->分辨率一一对应,不会出现多余的记录。
这是断点看记录是
[
{"_id": "uuidxxxx1", screen: "{"width":500, "height":200},
{"_id": "uuidxxxx2", screen: "{"width":500, "height":200}
]
可以看出,这里还是文档集合,相当于还是mongodb里面的记录,所以!
可以继续在这个基础上使用group!!!!
可以看出第二个group by 是screen.width, screen.height, 说明group 是在上面的记录上做group的,理论上可以无限次group!!
下面为了让输出结果是{width_height: 500}这样,使用了$project 选择显示的字段
还用了concat 和 substr 来组合他们
"""
results = db.Download.aggregate([
{"$match":
{
"uuid": {"$in": slice_uuids}
}
},
{"$sort":
{
"time": -1
}
},
{"$group":
{
"_id": "$uuid",
"screen": {"$first": "$machine.screen"}
}
},
# ])
# results = results.get("result")
# print len(results)
# for i in results:
# print i
# pass
# break
{"$group":
{
"_id": {
# "width-height": {"$concat": ["", "__"] },
"width": "$screen.width" ,
"height": "$screen.height"
},
"count": {"$sum": 1}
}
},
{"$sort":
{
"count": -1
}
},
# {"$match":
# {
# "count": {"$gt": 10}
# }
# },
{"$project":
{
"_id": {"$concat": [
{"$substr": ["$_id.width", 0, -1]},
"_",
{"$substr": ["$_id.height", 0, -1]}
]
},
"count": 1
}
}
], allowDiskUse=True)
page +=1
# 处理结果
ret = results
for r in ret:
_id = r['_id']
count = r['count']
if _id not in ret_dict.keys():
ret_dict[_id] = count
else:
ret_dict[_id] += count
sorted_dict = sorted(ret_dict.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
for i in sorted_dict:
if i[1] < 100:
continue
print i
统计某个版本的uuid 个数是多少
# 这个是统计所有的package的uuid总数
db.LaunchEmulator.aggregate([
{$match: {
time: {$gte: ISODate("2018-05-24T")},
version: "1.25.2.1",
engine: "NEMU",
package : "mumu-green"
}
},
{$group:
{
_id: null,
uuid_list: {$addToSet: "$uuid"},
}
},
{
$project: {
"size": {$size: "$uuid_list"}
}
}
])
返回结果是
{ "_id" : null, "size" : 293 }
# 这个是按版本来统计uuid个数
db.LaunchEmulator.aggregate([
{$match: {
time: {$gte: ISODate("2018-05-24T")},
version: "1.25.2.1",
engine: "NEMU",
}
},
{$group:
{
_id: {package: "$package"},
uuid_list: {$addToSet: "$uuid"},
}
},
{
$project: {
"size": {$size: "$uuid_list"}
}
}
])
结果
{ "_id" : { "package" : "com.netease.soulofhunter.netease_simulator" }, "size" : 2 }
{ "_id" : { "package" : "mumu-green" }, "size" : 10 }
{ "_id" : { "package" : "com.miHoYo.enterprise.NGHSoD" }, "size" : 25 }
{ "_id" : { "package" : "com.bilibili.azurlane" }, "size" : 9 }
{ "_id" : { "package" : "" }, "size" : 15 }
{ "_id" : { "package" : "com.tencent.tmgp.pubgmhd" }, "size" : 1 }
{ "_id" : { "package" : "mumu" }, "size" : 217 }
{ "_id" : { "package" : "com.xd.ro.roapk" }, "size" : 6 }
{ "_id" : { "package" : "com.netease.zjz" }, "size" : 1 }
{ "_id" : { "package" : "com.netease.dwrg" }, "size" : 6 }
{ "_id" : { "package" : "com.netease.zgz" }, "size" : 2 }
{ "_id" : { "package" : "com.georgie.SoundWireFree" }, "size" : 1 }
网友评论